1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3  ** Functions to handle UTF-8 and UTF-16 strings.
4  **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
8 #include <cstdlib>
9 
10 #include <stdexcept>
11 #include <string>
12 
13 #include "UniConversion.h"
14 
15 using namespace Scintilla;
16 
17 namespace Scintilla {
18 
UTF8Length(const wchar_t * uptr,size_t tlen)19 size_t UTF8Length(const wchar_t *uptr, size_t tlen) {
20 	size_t len = 0;
21 	for (size_t i = 0; i < tlen && uptr[i];) {
22 		const unsigned int uch = uptr[i];
23 		if (uch < 0x80) {
24 			len++;
25 		} else if (uch < 0x800) {
26 			len += 2;
27 		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
28 			(uch <= SURROGATE_TRAIL_LAST)) {
29 			len += 4;
30 			i++;
31 		} else {
32 			len += 3;
33 		}
34 		i++;
35 	}
36 	return len;
37 }
38 
UTF8FromUTF16(const wchar_t * uptr,size_t tlen,char * putf,size_t len)39 void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {
40 	size_t k = 0;
41 	for (size_t i = 0; i < tlen && uptr[i];) {
42 		const unsigned int uch = uptr[i];
43 		if (uch < 0x80) {
44 			putf[k++] = static_cast<char>(uch);
45 		} else if (uch < 0x800) {
46 			putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
47 			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
48 		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
49 			(uch <= SURROGATE_TRAIL_LAST)) {
50 			// Half a surrogate pair
51 			i++;
52 			const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
53 			putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
54 			putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
55 			putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
56 			putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
57 		} else {
58 			putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
59 			putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
60 			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
61 		}
62 		i++;
63 	}
64 	if (k < len)
65 		putf[k] = '\0';
66 }
67 
// Encode a single character value as a NUL-terminated UTF-8 sequence in putf.
// Values below 0x80 produce 1 byte, below 0x800 2 bytes, below 0x10000 3 bytes
// and anything else 4 bytes, so putf must have room for up to 5 chars
// including the terminator.
void UTF8FromUTF32Character(int uch, char *putf) {
	char *out = putf;
	if (uch < 0x80) {
		*out++ = static_cast<char>(uch);
	} else {
		// Write the lead byte (and any upper continuation bytes) first;
		// every multi-byte form ends with the same low 6-bit continuation byte.
		if (uch < 0x800) {
			*out++ = static_cast<char>(0xC0 | (uch >> 6));
		} else {
			if (uch < 0x10000) {
				*out++ = static_cast<char>(0xE0 | (uch >> 12));
			} else {
				*out++ = static_cast<char>(0xF0 | (uch >> 18));
				*out++ = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
			}
			*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		}
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	}
	*out = '\0';
}
87 
UTF16Length(const char * s,size_t len)88 size_t UTF16Length(const char *s, size_t len) {
89 	size_t ulen = 0;
90 	for (size_t i = 0; i < len;) {
91 		const unsigned char ch = s[i];
92 		const unsigned int byteCount = UTF8BytesOfLead[ch];
93 		const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
94 		i += byteCount;
95 		ulen += (i > len) ? 1 : utf16Len;
96 	}
97 	return ulen;
98 }
99 
// Extract the payload of a UTF-8 trail byte.
// A trail byte has the form 10xxxxxx: the two marker bits are discarded and
// the six value bits are returned.
constexpr unsigned char TrailByteValue(unsigned char c) {
	return static_cast<unsigned char>(c & 0x3FU);
}
105 
UTF16FromUTF8(const char * s,size_t len,wchar_t * tbuf,size_t tlen)106 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
107 	size_t ui = 0;
108 	for (size_t i = 0; i < len;) {
109 		unsigned char ch = s[i];
110 		const unsigned int byteCount = UTF8BytesOfLead[ch];
111 		unsigned int value;
112 
113 		if (i + byteCount > len) {
114 			// Trying to read past end but still have space to write
115 			if (ui < tlen) {
116 			tbuf[ui] = ch;
117 				ui++;
118 			}
119 			break;
120 		}
121 
122 		const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
123 		if (ui + outLen > tlen) {
124 			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
125 		}
126 
127 		i++;
128 		switch (byteCount) {
129 		case 1:
130 			tbuf[ui] = ch;
131 			break;
132 		case 2:
133 			value = (ch & 0x1F) << 6;
134 			ch = s[i++];
135 			value += TrailByteValue(ch);
136 			tbuf[ui] = static_cast<wchar_t>(value);
137 			break;
138 		case 3:
139 			value = (ch & 0xF) << 12;
140 			ch = s[i++];
141 			value += (TrailByteValue(ch) << 6);
142 			ch = s[i++];
143 			value += TrailByteValue(ch);
144 			tbuf[ui] = static_cast<wchar_t>(value);
145 			break;
146 		default:
147 			// Outside the BMP so need two surrogates
148 			value = (ch & 0x7) << 18;
149 			ch = s[i++];
150 			value += TrailByteValue(ch) << 12;
151 			ch = s[i++];
152 			value += TrailByteValue(ch) << 6;
153 			ch = s[i++];
154 			value += TrailByteValue(ch);
155 			tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
156 			ui++;
157 			tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
158 			break;
159 		}
160 		ui++;
161 	}
162 	return ui;
163 }
164 
UTF32FromUTF8(const char * s,size_t len,unsigned int * tbuf,size_t tlen)165 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
166 	size_t ui = 0;
167 	for (size_t i = 0; i < len;) {
168 		unsigned char ch = s[i];
169 		const unsigned int byteCount = UTF8BytesOfLead[ch];
170 		unsigned int value;
171 
172 		if (i + byteCount > len) {
173 			// Trying to read past end but still have space to write
174 			if (ui < tlen) {
175 				tbuf[ui] = ch;
176 				ui++;
177 			}
178 			break;
179 		}
180 
181 		if (ui == tlen) {
182 			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
183 		}
184 
185 		i++;
186 		switch (byteCount) {
187 		case 1:
188 			value = ch;
189 			break;
190 		case 2:
191 			value = (ch & 0x1F) << 6;
192 			ch = s[i++];
193 			value += TrailByteValue(ch);
194 			break;
195 		case 3:
196 			value = (ch & 0xF) << 12;
197 			ch = s[i++];
198 			value += TrailByteValue(ch) << 6;
199 			ch = s[i++];
200 			value += TrailByteValue(ch);
201 			break;
202 		default:
203 			value = (ch & 0x7) << 18;
204 			ch = s[i++];
205 			value += TrailByteValue(ch) << 12;
206 			ch = s[i++];
207 			value += TrailByteValue(ch) << 6;
208 			ch = s[i++];
209 			value += TrailByteValue(ch);
210 			break;
211 		}
212 		tbuf[ui] = value;
213 		ui++;
214 	}
215 	return ui;
216 }
217 
UTF16FromUTF32Character(unsigned int val,wchar_t * tbuf)218 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
219 	if (val < SUPPLEMENTAL_PLANE_FIRST) {
220 		tbuf[0] = static_cast<wchar_t>(val);
221 		return 1;
222 	} else {
223 		tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
224 		tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
225 		return 2;
226 	}
227 }
228 
// Sequence length in bytes implied by each possible first byte of a UTF-8
// sequence. Bytes that cannot start a multi-byte sequence map to 1 so that
// they are stepped over one at a time.
const unsigned char UTF8BytesOfLead[256] = {
	// 0x00 .. 0x7F: ASCII, single byte
	/* 0x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 1x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 2x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 3x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	// 0x80 .. 0xBF: trail bytes, not valid as a lead
	/* 8x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 9x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* Ax */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* Bx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	// 0xC0 .. 0xDF: 2-byte leads, except 0xC0 and 0xC1 which map to 1
	/* Cx */ 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	// 0xE0 .. 0xEF: 3-byte leads
	/* Ex */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	// 0xF0 .. 0xF4: 4-byte leads; 0xF5 .. 0xFF map to 1
	/* Fx */ 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
247 
248 // Return both the width of the first character in the string and a status
249 // saying whether it is valid or invalid.
250 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
251 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
252 // reasonably treated as code points in some circumstances. They will, however,
253 // not have associated glyphs.
UTF8Classify(const unsigned char * us,size_t len)254 int UTF8Classify(const unsigned char *us, size_t len) noexcept {
255 	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
256 	if (us[0] < 0x80) {
257 		// ASCII
258 		return 1;
259 	}
260 
261 	const size_t byteCount = UTF8BytesOfLead[us[0]];
262 	if (byteCount == 1 || byteCount > len) {
263 		// Invalid lead byte
264 		return UTF8MaskInvalid | 1;
265 	}
266 
267 	if (!UTF8IsTrailByte(us[1])) {
268 		// Invalid trail byte
269 		return UTF8MaskInvalid | 1;
270 	}
271 
272 	switch (byteCount) {
273 	case 2:
274 		return 2;
275 
276 	case 3:
277 		if (UTF8IsTrailByte(us[2])) {
278 			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
279 				// Overlong
280 				return UTF8MaskInvalid | 1;
281 			}
282 			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
283 				// Surrogate
284 				return UTF8MaskInvalid | 1;
285 			}
286 			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
287 				// U+FFFE non-character - 3 bytes long
288 				return UTF8MaskInvalid | 3;
289 			}
290 			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
291 				// U+FFFF non-character - 3 bytes long
292 				return UTF8MaskInvalid | 3;
293 			}
294 			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
295 				// U+FDD0 .. U+FDEF
296 				return UTF8MaskInvalid | 3;
297 			}
298 			return 3;
299 		}
300 		break;
301 
302 	default:
303 		if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
304 			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
305 				// *FFFE or *FFFF non-character
306 				return UTF8MaskInvalid | 4;
307 			}
308 			if (*us == 0xf4) {
309 				// Check if encoding a value beyond the last Unicode character 10FFFF
310 				if (us[1] > 0x8f) {
311 					return UTF8MaskInvalid | 1;
312 				}
313 			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
314 				// Overlong
315 				return UTF8MaskInvalid | 1;
316 			}
317 			return 4;
318 		}
319 		break;
320 	}
321 
322 	return UTF8MaskInvalid | 1;
323 }
324 
UTF8DrawBytes(const unsigned char * us,int len)325 int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
326 	const int utf8StatusNext = UTF8Classify(us, len);
327 	return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
328 }
329 
UTF8IsValid(const char * s,size_t len)330 bool UTF8IsValid(const char *s, size_t len) noexcept {
331 	const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
332 	size_t remaining = len;
333 	while (remaining > 0) {
334 		const int utf8Status = UTF8Classify(us, remaining);
335 		if (utf8Status & UTF8MaskInvalid) {
336 			return false;
337 		} else {
338 			const int lenChar = utf8Status & UTF8MaskWidth;
339 			us += lenChar;
340 			remaining -= lenChar;
341 		}
342 	}
343 	return remaining == 0;
344 }
345 
346 // Replace invalid bytes in UTF-8 with the replacement character
FixInvalidUTF8(const std::string & text)347 std::string FixInvalidUTF8(const std::string &text) {
348 	std::string result;
349 	const char *s = text.c_str();
350 	size_t remaining = text.size();
351 	while (remaining > 0) {
352 		const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
353 		if (utf8Status & UTF8MaskInvalid) {
354 			// Replacement character 0xFFFD = UTF8:"efbfbd".
355 			result.append("\xef\xbf\xbd");
356 			s++;
357 			remaining--;
358 		} else {
359 			const size_t len = utf8Status & UTF8MaskWidth;
360 			result.append(s, len);
361 			s += len;
362 			remaining -= len;
363 		}
364 	}
365 	return result;
366 }
367 
368 }
369