1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7
8 #include <cstdlib>
9
10 #include <stdexcept>
11 #include <string>
12
13 #include "UniConversion.h"
14
15 using namespace Scintilla;
16
17 namespace Scintilla {
18
UTF8Length(const wchar_t * uptr,size_t tlen)19 size_t UTF8Length(const wchar_t *uptr, size_t tlen) noexcept {
20 size_t len = 0;
21 for (size_t i = 0; i < tlen && uptr[i];) {
22 const unsigned int uch = uptr[i];
23 if (uch < 0x80) {
24 len++;
25 } else if (uch < 0x800) {
26 len += 2;
27 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
28 (uch <= SURROGATE_TRAIL_LAST)) {
29 len += 4;
30 i++;
31 } else {
32 len += 3;
33 }
34 i++;
35 }
36 return len;
37 }
38
UTF8FromUTF16(const wchar_t * uptr,size_t tlen,char * putf,size_t len)39 void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) noexcept {
40 size_t k = 0;
41 for (size_t i = 0; i < tlen && uptr[i];) {
42 const unsigned int uch = uptr[i];
43 if (uch < 0x80) {
44 putf[k++] = static_cast<char>(uch);
45 } else if (uch < 0x800) {
46 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
47 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
48 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
49 (uch <= SURROGATE_TRAIL_LAST)) {
50 // Half a surrogate pair
51 i++;
52 const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
53 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
54 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
55 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
56 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
57 } else {
58 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
59 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
60 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
61 }
62 i++;
63 }
64 if (k < len)
65 putf[k] = '\0';
66 }
67
// Encode the single code point uch as UTF-8 into putf and append a NUL.
// Between 1 and 4 encoded bytes are written followed by the terminator,
// so putf must have room for 5 bytes.
void UTF8FromUTF32Character(int uch, char *putf) noexcept {
	char *out = putf;
	if (uch < 0x80) {
		// Single byte
		*out++ = static_cast<char>(uch);
	} else {
		if (uch < 0x800) {
			// Two byte lead
			*out++ = static_cast<char>(0xC0 | (uch >> 6));
		} else {
			if (uch < 0x10000) {
				// Three byte lead
				*out++ = static_cast<char>(0xE0 | (uch >> 12));
			} else {
				// Four byte lead and its first trail byte
				*out++ = static_cast<char>(0xF0 | (uch >> 18));
				*out++ = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
			}
			// Second to last trail byte shared by 3 and 4 byte forms
			*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		}
		// Final trail byte shared by all multi-byte forms
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	}
	*out = '\0';
}
87
UTF16Length(const char * s,size_t len)88 size_t UTF16Length(const char *s, size_t len) noexcept {
89 size_t ulen = 0;
90 for (size_t i = 0; i < len;) {
91 const unsigned char ch = s[i];
92 const unsigned int byteCount = UTF8BytesOfLead[ch];
93 const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
94 i += byteCount;
95 ulen += (i > len) ? 1 : utf16Len;
96 }
97 return ulen;
98 }
99
// Return the 6 value bits of a UTF-8 trail byte.
// A trail byte has the form 0b10xxxxxx: the marker in the top 2 bits is dropped.
constexpr unsigned char TrailByteValue(unsigned char c) {
	return static_cast<unsigned char>(c & 0x3F);
}
105
UTF16FromUTF8(const char * s,size_t len,wchar_t * tbuf,size_t tlen)106 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
107 size_t ui = 0;
108 for (size_t i = 0; i < len;) {
109 unsigned char ch = s[i];
110 const unsigned int byteCount = UTF8BytesOfLead[ch];
111 unsigned int value;
112
113 if (i + byteCount > len) {
114 // Trying to read past end but still have space to write
115 if (ui < tlen) {
116 tbuf[ui] = ch;
117 ui++;
118 }
119 break;
120 }
121
122 const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
123 if (ui + outLen > tlen) {
124 throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
125 }
126
127 i++;
128 switch (byteCount) {
129 case 1:
130 tbuf[ui] = ch;
131 break;
132 case 2:
133 value = (ch & 0x1F) << 6;
134 ch = s[i++];
135 value += TrailByteValue(ch);
136 tbuf[ui] = static_cast<wchar_t>(value);
137 break;
138 case 3:
139 value = (ch & 0xF) << 12;
140 ch = s[i++];
141 value += (TrailByteValue(ch) << 6);
142 ch = s[i++];
143 value += TrailByteValue(ch);
144 tbuf[ui] = static_cast<wchar_t>(value);
145 break;
146 default:
147 // Outside the BMP so need two surrogates
148 value = (ch & 0x7) << 18;
149 ch = s[i++];
150 value += TrailByteValue(ch) << 12;
151 ch = s[i++];
152 value += TrailByteValue(ch) << 6;
153 ch = s[i++];
154 value += TrailByteValue(ch);
155 tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
156 ui++;
157 tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
158 break;
159 }
160 ui++;
161 }
162 return ui;
163 }
164
UTF32Length(const char * s,size_t len)165 size_t UTF32Length(const char *s, size_t len) noexcept {
166 size_t ulen = 0;
167 for (size_t i = 0; i < len;) {
168 const unsigned char ch = s[i];
169 const unsigned int byteCount = UTF8BytesOfLead[ch];
170 i += byteCount;
171 ulen++;
172 }
173 return ulen;
174 }
175
UTF32FromUTF8(const char * s,size_t len,unsigned int * tbuf,size_t tlen)176 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
177 size_t ui = 0;
178 for (size_t i = 0; i < len;) {
179 unsigned char ch = s[i];
180 const unsigned int byteCount = UTF8BytesOfLead[ch];
181 unsigned int value;
182
183 if (i + byteCount > len) {
184 // Trying to read past end but still have space to write
185 if (ui < tlen) {
186 tbuf[ui] = ch;
187 ui++;
188 }
189 break;
190 }
191
192 if (ui == tlen) {
193 throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
194 }
195
196 i++;
197 switch (byteCount) {
198 case 1:
199 value = ch;
200 break;
201 case 2:
202 value = (ch & 0x1F) << 6;
203 ch = s[i++];
204 value += TrailByteValue(ch);
205 break;
206 case 3:
207 value = (ch & 0xF) << 12;
208 ch = s[i++];
209 value += TrailByteValue(ch) << 6;
210 ch = s[i++];
211 value += TrailByteValue(ch);
212 break;
213 default:
214 value = (ch & 0x7) << 18;
215 ch = s[i++];
216 value += TrailByteValue(ch) << 12;
217 ch = s[i++];
218 value += TrailByteValue(ch) << 6;
219 ch = s[i++];
220 value += TrailByteValue(ch);
221 break;
222 }
223 tbuf[ui] = value;
224 ui++;
225 }
226 return ui;
227 }
228
WStringFromUTF8(const char * s,size_t len)229 std::wstring WStringFromUTF8(const char *s, size_t len) {
230 #ifdef _WIN32
231 const size_t len16 = UTF16Length(s, len);
232 std::wstring ws(len16, 0);
233 UTF16FromUTF8(s, len, &ws[0], len16);
234 return ws;
235 #else
236 const size_t len32 = UTF32Length(s, len);
237 std::wstring ws(len32, 0);
238 UTF32FromUTF8(s, len, reinterpret_cast<unsigned int *>(&ws[0]), len32);
239 return ws;
240 #endif
241 }
242
UTF16FromUTF32Character(unsigned int val,wchar_t * tbuf)243 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
244 if (val < SUPPLEMENTAL_PLANE_FIRST) {
245 tbuf[0] = static_cast<wchar_t>(val);
246 return 1;
247 } else {
248 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
249 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
250 return 2;
251 }
252 }
253
// Length in bytes of the UTF-8 sequence introduced by each possible byte value.
// Bytes that can not start a multi-byte sequence are given a length of 1 so
// that scanning code always advances: this covers trail bytes (0x80-0xBF),
// 0xC0 and 0xC1, and 0xF5-0xFF.
const unsigned char UTF8BytesOfLead[256] = {
	// 0x00 - 0x7F: ASCII
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x00
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x10
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x20
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x30
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x40
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x50
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x60
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x70
	// 0x80 - 0xBF: trail bytes
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x80
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0x90
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0xA0
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0xB0
	// 0xC0 - 0xDF: 2 byte leads, except 0xC0 and 0xC1
	1, 1, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,	// 0xC0
	2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,	// 0xD0
	// 0xE0 - 0xEF: 3 byte leads
	3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,	// 0xE0
	// 0xF0 - 0xF4: 4 byte leads, 0xF5 - 0xFF: never valid
	4, 4, 4, 4,  4, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,	// 0xF0
};
272
273 // Return both the width of the first character in the string and a status
274 // saying whether it is valid or invalid.
275 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
276 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
277 // reasonably treated as code points in some circumstances. They will, however,
278 // not have associated glyphs.
UTF8Classify(const unsigned char * us,size_t len)279 int UTF8Classify(const unsigned char *us, size_t len) noexcept {
280 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
281 if (us[0] < 0x80) {
282 // ASCII
283 return 1;
284 }
285
286 const size_t byteCount = UTF8BytesOfLead[us[0]];
287 if (byteCount == 1 || byteCount > len) {
288 // Invalid lead byte
289 return UTF8MaskInvalid | 1;
290 }
291
292 if (!UTF8IsTrailByte(us[1])) {
293 // Invalid trail byte
294 return UTF8MaskInvalid | 1;
295 }
296
297 switch (byteCount) {
298 case 2:
299 return 2;
300
301 case 3:
302 if (UTF8IsTrailByte(us[2])) {
303 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
304 // Overlong
305 return UTF8MaskInvalid | 1;
306 }
307 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
308 // Surrogate
309 return UTF8MaskInvalid | 1;
310 }
311 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
312 // U+FFFE non-character - 3 bytes long
313 return UTF8MaskInvalid | 3;
314 }
315 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
316 // U+FFFF non-character - 3 bytes long
317 return UTF8MaskInvalid | 3;
318 }
319 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
320 // U+FDD0 .. U+FDEF
321 return UTF8MaskInvalid | 3;
322 }
323 return 3;
324 }
325 break;
326
327 default:
328 if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
329 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
330 // *FFFE or *FFFF non-character
331 return UTF8MaskInvalid | 4;
332 }
333 if (*us == 0xf4) {
334 // Check if encoding a value beyond the last Unicode character 10FFFF
335 if (us[1] > 0x8f) {
336 return UTF8MaskInvalid | 1;
337 }
338 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
339 // Overlong
340 return UTF8MaskInvalid | 1;
341 }
342 return 4;
343 }
344 break;
345 }
346
347 return UTF8MaskInvalid | 1;
348 }
349
UTF8DrawBytes(const unsigned char * us,int len)350 int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
351 const int utf8StatusNext = UTF8Classify(us, len);
352 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
353 }
354
UTF8IsValid(const char * s,size_t len)355 bool UTF8IsValid(const char *s, size_t len) noexcept {
356 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
357 size_t remaining = len;
358 while (remaining > 0) {
359 const int utf8Status = UTF8Classify(us, remaining);
360 if (utf8Status & UTF8MaskInvalid) {
361 return false;
362 } else {
363 const int lenChar = utf8Status & UTF8MaskWidth;
364 us += lenChar;
365 remaining -= lenChar;
366 }
367 }
368 return remaining == 0;
369 }
370
371 // Replace invalid bytes in UTF-8 with the replacement character
FixInvalidUTF8(const std::string & text)372 std::string FixInvalidUTF8(const std::string &text) {
373 std::string result;
374 const char *s = text.c_str();
375 size_t remaining = text.size();
376 while (remaining > 0) {
377 const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
378 if (utf8Status & UTF8MaskInvalid) {
379 // Replacement character 0xFFFD = UTF8:"efbfbd".
380 result.append("\xef\xbf\xbd");
381 s++;
382 remaining--;
383 } else {
384 const size_t len = utf8Status & UTF8MaskWidth;
385 result.append(s, len);
386 s += len;
387 remaining -= len;
388 }
389 }
390 return result;
391 }
392
393 }
394