1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3  ** Functions to handle UTF-8 and UTF-16 strings.
4  **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
8 #include <cstdlib>
9 
10 #include <stdexcept>
11 #include <string>
12 
13 #include "UniConversion.h"
14 
15 using namespace Scintilla;
16 
17 namespace Scintilla {
18 
UTF8Length(const wchar_t * uptr,size_t tlen)19 size_t UTF8Length(const wchar_t *uptr, size_t tlen) noexcept {
20 	size_t len = 0;
21 	for (size_t i = 0; i < tlen && uptr[i];) {
22 		const unsigned int uch = uptr[i];
23 		if (uch < 0x80) {
24 			len++;
25 		} else if (uch < 0x800) {
26 			len += 2;
27 		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
28 			(uch <= SURROGATE_TRAIL_LAST)) {
29 			len += 4;
30 			i++;
31 		} else {
32 			len += 3;
33 		}
34 		i++;
35 	}
36 	return len;
37 }
38 
UTF8FromUTF16(const wchar_t * uptr,size_t tlen,char * putf,size_t len)39 void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) noexcept {
40 	size_t k = 0;
41 	for (size_t i = 0; i < tlen && uptr[i];) {
42 		const unsigned int uch = uptr[i];
43 		if (uch < 0x80) {
44 			putf[k++] = static_cast<char>(uch);
45 		} else if (uch < 0x800) {
46 			putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
47 			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
48 		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
49 			(uch <= SURROGATE_TRAIL_LAST)) {
50 			// Half a surrogate pair
51 			i++;
52 			const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
53 			putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
54 			putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
55 			putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
56 			putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
57 		} else {
58 			putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
59 			putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
60 			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
61 		}
62 		i++;
63 	}
64 	if (k < len)
65 		putf[k] = '\0';
66 }
67 
// Encode the single code point uch as UTF-8 into putf and append a NUL.
// Between 1 and 4 bytes are written before the terminator, so putf must have
// room for 5 chars. Values below 0x80 (including negative ones) are stored as
// a single byte; no validation of uch is performed.
void UTF8FromUTF32Character(int uch, char *putf) noexcept {
	char *out = putf;
	if (uch < 0x80) {
		*out++ = static_cast<char>(uch);
	} else {
		// Pick the lead byte marker and the number of trail bytes that follow it
		int trailBytes = 3;
		int leadMarker = 0xF0;
		if (uch < 0x800) {
			trailBytes = 1;
			leadMarker = 0xC0;
		} else if (uch < 0x10000) {
			trailBytes = 2;
			leadMarker = 0xE0;
		}
		*out++ = static_cast<char>(leadMarker | (uch >> (6 * trailBytes)));
		// Each trail byte carries 6 bits, most significant group first
		for (int shift = 6 * (trailBytes - 1); shift >= 0; shift -= 6) {
			*out++ = static_cast<char>(0x80 | ((uch >> shift) & 0x3f));
		}
	}
	*out = '\0';
}
87 
UTF16Length(const char * s,size_t len)88 size_t UTF16Length(const char *s, size_t len) noexcept {
89 	size_t ulen = 0;
90 	for (size_t i = 0; i < len;) {
91 		const unsigned char ch = s[i];
92 		const unsigned int byteCount = UTF8BytesOfLead[ch];
93 		const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
94 		i += byteCount;
95 		ulen += (i > len) ? 1 : utf16Len;
96 	}
97 	return ulen;
98 }
99 
// Extract the payload of a UTF-8 trail byte.
// A trail byte has the form 0b10xxxxxx: the two marker bits are discarded and
// the six value bits returned. The argument is not checked to be a trail byte.
constexpr unsigned char TrailByteValue(unsigned char c) {
	return static_cast<unsigned char>(c & 0x3FU);
}
105 
UTF16FromUTF8(const char * s,size_t len,wchar_t * tbuf,size_t tlen)106 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
107 	size_t ui = 0;
108 	for (size_t i = 0; i < len;) {
109 		unsigned char ch = s[i];
110 		const unsigned int byteCount = UTF8BytesOfLead[ch];
111 		unsigned int value;
112 
113 		if (i + byteCount > len) {
114 			// Trying to read past end but still have space to write
115 			if (ui < tlen) {
116 			tbuf[ui] = ch;
117 				ui++;
118 			}
119 			break;
120 		}
121 
122 		const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
123 		if (ui + outLen > tlen) {
124 			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
125 		}
126 
127 		i++;
128 		switch (byteCount) {
129 		case 1:
130 			tbuf[ui] = ch;
131 			break;
132 		case 2:
133 			value = (ch & 0x1F) << 6;
134 			ch = s[i++];
135 			value += TrailByteValue(ch);
136 			tbuf[ui] = static_cast<wchar_t>(value);
137 			break;
138 		case 3:
139 			value = (ch & 0xF) << 12;
140 			ch = s[i++];
141 			value += (TrailByteValue(ch) << 6);
142 			ch = s[i++];
143 			value += TrailByteValue(ch);
144 			tbuf[ui] = static_cast<wchar_t>(value);
145 			break;
146 		default:
147 			// Outside the BMP so need two surrogates
148 			value = (ch & 0x7) << 18;
149 			ch = s[i++];
150 			value += TrailByteValue(ch) << 12;
151 			ch = s[i++];
152 			value += TrailByteValue(ch) << 6;
153 			ch = s[i++];
154 			value += TrailByteValue(ch);
155 			tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
156 			ui++;
157 			tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
158 			break;
159 		}
160 		ui++;
161 	}
162 	return ui;
163 }
164 
UTF32Length(const char * s,size_t len)165 size_t UTF32Length(const char *s, size_t len) noexcept {
166 	size_t ulen = 0;
167 	for (size_t i = 0; i < len;) {
168 		const unsigned char ch = s[i];
169 		const unsigned int byteCount = UTF8BytesOfLead[ch];
170 		i += byteCount;
171 		ulen++;
172 	}
173 	return ulen;
174 }
175 
UTF32FromUTF8(const char * s,size_t len,unsigned int * tbuf,size_t tlen)176 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
177 	size_t ui = 0;
178 	for (size_t i = 0; i < len;) {
179 		unsigned char ch = s[i];
180 		const unsigned int byteCount = UTF8BytesOfLead[ch];
181 		unsigned int value;
182 
183 		if (i + byteCount > len) {
184 			// Trying to read past end but still have space to write
185 			if (ui < tlen) {
186 				tbuf[ui] = ch;
187 				ui++;
188 			}
189 			break;
190 		}
191 
192 		if (ui == tlen) {
193 			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
194 		}
195 
196 		i++;
197 		switch (byteCount) {
198 		case 1:
199 			value = ch;
200 			break;
201 		case 2:
202 			value = (ch & 0x1F) << 6;
203 			ch = s[i++];
204 			value += TrailByteValue(ch);
205 			break;
206 		case 3:
207 			value = (ch & 0xF) << 12;
208 			ch = s[i++];
209 			value += TrailByteValue(ch) << 6;
210 			ch = s[i++];
211 			value += TrailByteValue(ch);
212 			break;
213 		default:
214 			value = (ch & 0x7) << 18;
215 			ch = s[i++];
216 			value += TrailByteValue(ch) << 12;
217 			ch = s[i++];
218 			value += TrailByteValue(ch) << 6;
219 			ch = s[i++];
220 			value += TrailByteValue(ch);
221 			break;
222 		}
223 		tbuf[ui] = value;
224 		ui++;
225 	}
226 	return ui;
227 }
228 
WStringFromUTF8(const char * s,size_t len)229 std::wstring WStringFromUTF8(const char *s, size_t len) {
230 #ifdef _WIN32
231 		const size_t len16 = UTF16Length(s, len);
232 		std::wstring ws(len16, 0);
233 		UTF16FromUTF8(s, len, &ws[0], len16);
234 		return ws;
235 #else
236 		const size_t len32 = UTF32Length(s, len);
237 		std::wstring ws(len32, 0);
238 		UTF32FromUTF8(s, len, reinterpret_cast<unsigned int *>(&ws[0]), len32);
239 		return ws;
240 #endif
241 }
242 
UTF16FromUTF32Character(unsigned int val,wchar_t * tbuf)243 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
244 	if (val < SUPPLEMENTAL_PLANE_FIRST) {
245 		tbuf[0] = static_cast<wchar_t>(val);
246 		return 1;
247 	} else {
248 		tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
249 		tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
250 		return 2;
251 	}
252 }
253 
// Width in bytes of a UTF-8 sequence, indexed by the value of its first byte.
// Entries are 1 for ASCII (00 - 7F) and also 1 for every byte that cannot
// start a multi-byte sequence in this table: trail bytes (80 - BF), C0, C1 and
// F5 - FF. Lead bytes C2 - DF give 2, E0 - EF give 3 and F0 - F4 give 4.
const unsigned char UTF8BytesOfLead[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
};
272 
273 // Return both the width of the first character in the string and a status
274 // saying whether it is valid or invalid.
275 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
276 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
277 // reasonably treated as code points in some circumstances. They will, however,
278 // not have associated glyphs.
UTF8Classify(const unsigned char * us,size_t len)279 int UTF8Classify(const unsigned char *us, size_t len) noexcept {
280 	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
281 	if (us[0] < 0x80) {
282 		// ASCII
283 		return 1;
284 	}
285 
286 	const size_t byteCount = UTF8BytesOfLead[us[0]];
287 	if (byteCount == 1 || byteCount > len) {
288 		// Invalid lead byte
289 		return UTF8MaskInvalid | 1;
290 	}
291 
292 	if (!UTF8IsTrailByte(us[1])) {
293 		// Invalid trail byte
294 		return UTF8MaskInvalid | 1;
295 	}
296 
297 	switch (byteCount) {
298 	case 2:
299 		return 2;
300 
301 	case 3:
302 		if (UTF8IsTrailByte(us[2])) {
303 			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
304 				// Overlong
305 				return UTF8MaskInvalid | 1;
306 			}
307 			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
308 				// Surrogate
309 				return UTF8MaskInvalid | 1;
310 			}
311 			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
312 				// U+FFFE non-character - 3 bytes long
313 				return UTF8MaskInvalid | 3;
314 			}
315 			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
316 				// U+FFFF non-character - 3 bytes long
317 				return UTF8MaskInvalid | 3;
318 			}
319 			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
320 				// U+FDD0 .. U+FDEF
321 				return UTF8MaskInvalid | 3;
322 			}
323 			return 3;
324 		}
325 		break;
326 
327 	default:
328 		if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
329 			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
330 				// *FFFE or *FFFF non-character
331 				return UTF8MaskInvalid | 4;
332 			}
333 			if (*us == 0xf4) {
334 				// Check if encoding a value beyond the last Unicode character 10FFFF
335 				if (us[1] > 0x8f) {
336 					return UTF8MaskInvalid | 1;
337 				}
338 			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
339 				// Overlong
340 				return UTF8MaskInvalid | 1;
341 			}
342 			return 4;
343 		}
344 		break;
345 	}
346 
347 	return UTF8MaskInvalid | 1;
348 }
349 
UTF8DrawBytes(const unsigned char * us,int len)350 int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
351 	const int utf8StatusNext = UTF8Classify(us, len);
352 	return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
353 }
354 
UTF8IsValid(const char * s,size_t len)355 bool UTF8IsValid(const char *s, size_t len) noexcept {
356 	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
357 	size_t remaining = len;
358 	while (remaining > 0) {
359 		const int utf8Status = UTF8Classify(us, remaining);
360 		if (utf8Status & UTF8MaskInvalid) {
361 			return false;
362 		} else {
363 			const int lenChar = utf8Status & UTF8MaskWidth;
364 			us += lenChar;
365 			remaining -= lenChar;
366 		}
367 	}
368 	return remaining == 0;
369 }
370 
371 // Replace invalid bytes in UTF-8 with the replacement character
FixInvalidUTF8(const std::string & text)372 std::string FixInvalidUTF8(const std::string &text) {
373 	std::string result;
374 	const char *s = text.c_str();
375 	size_t remaining = text.size();
376 	while (remaining > 0) {
377 		const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
378 		if (utf8Status & UTF8MaskInvalid) {
379 			// Replacement character 0xFFFD = UTF8:"efbfbd".
380 			result.append("\xef\xbf\xbd");
381 			s++;
382 			remaining--;
383 		} else {
384 			const size_t len = utf8Status & UTF8MaskWidth;
385 			result.append(s, len);
386 			s += len;
387 			remaining -= len;
388 		}
389 	}
390 	return result;
391 }
392 
393 }
394