// Scintilla source code edit control
/** @file UniConversion.cxx
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
7 
8 #include <cstdlib>
9 
10 #include <stdexcept>
11 #include <string>
12 #include <string_view>
13 
14 #include "UniConversion.h"
15 
16 using namespace Scintilla;
17 
18 namespace Scintilla {
19 
UTF8Length(std::wstring_view wsv)20 size_t UTF8Length(std::wstring_view wsv) noexcept {
21 	size_t len = 0;
22 	for (size_t i = 0; i < wsv.length() && wsv[i];) {
23 		const unsigned int uch = wsv[i];
24 		if (uch < 0x80) {
25 			len++;
26 		} else if (uch < 0x800) {
27 			len += 2;
28 		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
29 			(uch <= SURROGATE_TRAIL_LAST)) {
30 			len += 4;
31 			i++;
32 		} else {
33 			len += 3;
34 		}
35 		i++;
36 	}
37 	return len;
38 }
39 
UTF8PositionFromUTF16Position(std::string_view u8Text,size_t positionUTF16)40 size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept {
41 	size_t positionUTF8 = 0;
42 	for (size_t lengthUTF16 = 0; (positionUTF8 < u8Text.length()) && (lengthUTF16 < positionUTF16);) {
43 		const unsigned char uch = u8Text[positionUTF8];
44 		const unsigned int byteCount = UTF8BytesOfLead[uch];
45 		lengthUTF16 += UTF16LengthFromUTF8ByteCount(byteCount);
46 		positionUTF8 += byteCount;
47 	}
48 
49 	return positionUTF8;
50 }
51 
UTF8FromUTF16(std::wstring_view wsv,char * putf,size_t len)52 void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept {
53 	size_t k = 0;
54 	for (size_t i = 0; i < wsv.length() && wsv[i];) {
55 		const unsigned int uch = wsv[i];
56 		if (uch < 0x80) {
57 			putf[k++] = static_cast<char>(uch);
58 		} else if (uch < 0x800) {
59 			putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
60 			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
61 		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
62 			(uch <= SURROGATE_TRAIL_LAST)) {
63 			// Half a surrogate pair
64 			i++;
65 			const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (wsv[i] & 0x3ff);
66 			putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
67 			putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
68 			putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
69 			putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
70 		} else {
71 			putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
72 			putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
73 			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
74 		}
75 		i++;
76 	}
77 	if (k < len)
78 		putf[k] = '\0';
79 }
80 
// Encode the single code point uch as UTF-8 into putf and append a NUL.
// Writes 1 to 4 encoded bytes plus the terminator, so putf needs room for 5 chars.
// No validation is performed on uch.
void UTF8FromUTF32Character(int uch, char *putf) noexcept {
	// Build a continuation byte (0b10xxxxxx) from the low 6 bits of the argument.
	const auto continuation = [](int bits) noexcept {
		return static_cast<char>(0x80 | (bits & 0x3f));
	};

	char *out = putf;
	if (uch < 0x80) {
		*out++ = static_cast<char>(uch);
	} else if (uch < 0x800) {
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = continuation(uch);
	} else if (uch < 0x10000) {
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = continuation(uch >> 6);
		*out++ = continuation(uch);
	} else {
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = continuation(uch >> 12);
		*out++ = continuation(uch >> 6);
		*out++ = continuation(uch);
	}
	*out = '\0';
}
100 
UTF16Length(std::string_view svu8)101 size_t UTF16Length(std::string_view svu8) noexcept {
102 	size_t ulen = 0;
103 	for (size_t i = 0; i< svu8.length();) {
104 		const unsigned char ch = svu8[i];
105 		const unsigned int byteCount = UTF8BytesOfLead[ch];
106 		const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
107 		i += byteCount;
108 		ulen += (i > svu8.length()) ? 1 : utf16Len;
109 	}
110 	return ulen;
111 }
112 
// Extract the payload of a UTF-8 trail byte.
// A trail byte has the form 0b10xxxxxx: the top two bits are the marker
// and the low six bits carry the value, which is what is returned.
constexpr unsigned char TrailByteValue(unsigned char c) {
	constexpr unsigned char payloadMask = 0x3F;
	return c & payloadMask;
}
118 
UTF16FromUTF8(std::string_view svu8,wchar_t * tbuf,size_t tlen)119 size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen) {
120 	size_t ui = 0;
121 	for (size_t i = 0; i < svu8.length();) {
122 		unsigned char ch = svu8[i];
123 		const unsigned int byteCount = UTF8BytesOfLead[ch];
124 		unsigned int value;
125 
126 		if (i + byteCount > svu8.length()) {
127 			// Trying to read past end but still have space to write
128 			if (ui < tlen) {
129 				tbuf[ui] = ch;
130 				ui++;
131 			}
132 			break;
133 		}
134 
135 		const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
136 		if (ui + outLen > tlen) {
137 			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
138 		}
139 
140 		i++;
141 		switch (byteCount) {
142 		case 1:
143 			tbuf[ui] = ch;
144 			break;
145 		case 2:
146 			value = (ch & 0x1F) << 6;
147 			ch = svu8[i++];
148 			value += TrailByteValue(ch);
149 			tbuf[ui] = static_cast<wchar_t>(value);
150 			break;
151 		case 3:
152 			value = (ch & 0xF) << 12;
153 			ch = svu8[i++];
154 			value += (TrailByteValue(ch) << 6);
155 			ch = svu8[i++];
156 			value += TrailByteValue(ch);
157 			tbuf[ui] = static_cast<wchar_t>(value);
158 			break;
159 		default:
160 			// Outside the BMP so need two surrogates
161 			value = (ch & 0x7) << 18;
162 			ch = svu8[i++];
163 			value += TrailByteValue(ch) << 12;
164 			ch = svu8[i++];
165 			value += TrailByteValue(ch) << 6;
166 			ch = svu8[i++];
167 			value += TrailByteValue(ch);
168 			tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
169 			ui++;
170 			tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
171 			break;
172 		}
173 		ui++;
174 	}
175 	return ui;
176 }
177 
UTF32Length(std::string_view svu8)178 size_t UTF32Length(std::string_view svu8) noexcept {
179 	size_t ulen = 0;
180 	for (size_t i = 0; i < svu8.length();) {
181 		const unsigned char ch = svu8[i];
182 		const unsigned int byteCount = UTF8BytesOfLead[ch];
183 		i += byteCount;
184 		ulen++;
185 	}
186 	return ulen;
187 }
188 
UTF32FromUTF8(std::string_view svu8,unsigned int * tbuf,size_t tlen)189 size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen) {
190 	size_t ui = 0;
191 	for (size_t i = 0; i < svu8.length();) {
192 		unsigned char ch = svu8[i];
193 		const unsigned int byteCount = UTF8BytesOfLead[ch];
194 		unsigned int value;
195 
196 		if (i + byteCount > svu8.length()) {
197 			// Trying to read past end but still have space to write
198 			if (ui < tlen) {
199 				tbuf[ui] = ch;
200 				ui++;
201 			}
202 			break;
203 		}
204 
205 		if (ui == tlen) {
206 			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
207 		}
208 
209 		i++;
210 		switch (byteCount) {
211 		case 1:
212 			value = ch;
213 			break;
214 		case 2:
215 			value = (ch & 0x1F) << 6;
216 			ch = svu8[i++];
217 			value += TrailByteValue(ch);
218 			break;
219 		case 3:
220 			value = (ch & 0xF) << 12;
221 			ch = svu8[i++];
222 			value += TrailByteValue(ch) << 6;
223 			ch = svu8[i++];
224 			value += TrailByteValue(ch);
225 			break;
226 		default:
227 			value = (ch & 0x7) << 18;
228 			ch = svu8[i++];
229 			value += TrailByteValue(ch) << 12;
230 			ch = svu8[i++];
231 			value += TrailByteValue(ch) << 6;
232 			ch = svu8[i++];
233 			value += TrailByteValue(ch);
234 			break;
235 		}
236 		tbuf[ui] = value;
237 		ui++;
238 	}
239 	return ui;
240 }
241 
WStringFromUTF8(std::string_view svu8)242 std::wstring WStringFromUTF8(std::string_view svu8) {
243 	if constexpr (sizeof(wchar_t) == 2) {
244 		const size_t len16 = UTF16Length(svu8);
245 		std::wstring ws(len16, 0);
246 		UTF16FromUTF8(svu8, &ws[0], len16);
247 		return ws;
248 	} else {
249 		const size_t len32 = UTF32Length(svu8);
250 		std::wstring ws(len32, 0);
251 		UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32);
252 		return ws;
253 	}
254 }
255 
UTF16FromUTF32Character(unsigned int val,wchar_t * tbuf)256 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
257 	if (val < SUPPLEMENTAL_PLANE_FIRST) {
258 		tbuf[0] = static_cast<wchar_t>(val);
259 		return 1;
260 	} else {
261 		tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
262 		tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
263 		return 2;
264 	}
265 }
266 
// Length in bytes of the UTF-8 sequence introduced by each possible byte value.
// Bytes that cannot start a multi-byte sequence - ASCII, the trail bytes 80-BF,
// C0, C1 and F5-FF - have an entry of 1 so that scanning code steps over them
// one byte at a time.
const unsigned char UTF8BytesOfLead[256] = {
	// 0x00 - 0x7F: ASCII, one byte each
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	// 0x80 - 0xBF: trail bytes, treated as width 1 when met as a lead
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	// 0xC0 - 0xDF: two-byte leads, except 0xC0 and 0xC1 which are width 1
	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	// 0xE0 - 0xEF: three-byte leads
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	// 0xF0 - 0xF4: four-byte leads; 0xF5 - 0xFF are width 1
	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
285 
// Return both the width of the first character in the string and a status
// saying whether it is valid or invalid.
// Most invalid sequences return a width of 1 so are treated as isolated bytes but
// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
// reasonably treated as code points in some circumstances. They will, however,
// not have associated glyphs.
UTF8Classify(const unsigned char * us,size_t len)292 int UTF8Classify(const unsigned char *us, size_t len) noexcept {
293 	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
294 	if (us[0] < 0x80) {
295 		// ASCII
296 		return 1;
297 	}
298 
299 	const size_t byteCount = UTF8BytesOfLead[us[0]];
300 	if (byteCount == 1 || byteCount > len) {
301 		// Invalid lead byte
302 		return UTF8MaskInvalid | 1;
303 	}
304 
305 	if (!UTF8IsTrailByte(us[1])) {
306 		// Invalid trail byte
307 		return UTF8MaskInvalid | 1;
308 	}
309 
310 	switch (byteCount) {
311 	case 2:
312 		return 2;
313 
314 	case 3:
315 		if (UTF8IsTrailByte(us[2])) {
316 			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
317 				// Overlong
318 				return UTF8MaskInvalid | 1;
319 			}
320 			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
321 				// Surrogate
322 				return UTF8MaskInvalid | 1;
323 			}
324 			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
325 				// U+FFFE non-character - 3 bytes long
326 				return UTF8MaskInvalid | 3;
327 			}
328 			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
329 				// U+FFFF non-character - 3 bytes long
330 				return UTF8MaskInvalid | 3;
331 			}
332 			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
333 				// U+FDD0 .. U+FDEF
334 				return UTF8MaskInvalid | 3;
335 			}
336 			return 3;
337 		}
338 		break;
339 
340 	default:
341 		if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
342 			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
343 				// *FFFE or *FFFF non-character
344 				return UTF8MaskInvalid | 4;
345 			}
346 			if (*us == 0xf4) {
347 				// Check if encoding a value beyond the last Unicode character 10FFFF
348 				if (us[1] > 0x8f) {
349 					return UTF8MaskInvalid | 1;
350 				}
351 			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
352 				// Overlong
353 				return UTF8MaskInvalid | 1;
354 			}
355 			return 4;
356 		}
357 		break;
358 	}
359 
360 	return UTF8MaskInvalid | 1;
361 }
362 
UTF8DrawBytes(const unsigned char * us,int len)363 int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
364 	const int utf8StatusNext = UTF8Classify(us, len);
365 	return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
366 }
367 
UTF8IsValid(std::string_view svu8)368 bool UTF8IsValid(std::string_view svu8) noexcept {
369 	const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data());
370 	size_t remaining = svu8.length();
371 	while (remaining > 0) {
372 		const int utf8Status = UTF8Classify(us, remaining);
373 		if (utf8Status & UTF8MaskInvalid) {
374 			return false;
375 		} else {
376 			const int lenChar = utf8Status & UTF8MaskWidth;
377 			us += lenChar;
378 			remaining -= lenChar;
379 		}
380 	}
381 	return remaining == 0;
382 }
383 
// Replace invalid bytes in UTF-8 with the replacement character
FixInvalidUTF8(const std::string & text)385 std::string FixInvalidUTF8(const std::string &text) {
386 	std::string result;
387 	const char *s = text.c_str();
388 	size_t remaining = text.size();
389 	while (remaining > 0) {
390 		const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
391 		if (utf8Status & UTF8MaskInvalid) {
392 			// Replacement character 0xFFFD = UTF8:"efbfbd".
393 			result.append("\xef\xbf\xbd");
394 			s++;
395 			remaining--;
396 		} else {
397 			const size_t len = utf8Status & UTF8MaskWidth;
398 			result.append(s, len);
399 			s += len;
400 			remaining -= len;
401 		}
402 	}
403 	return result;
404 }
405 
406 }
407