1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7
8 #include <cstdlib>
9
10 #include <stdexcept>
11 #include <string>
12
13 #include "UniConversion.h"
14
15 using namespace Scintilla;
16
17 namespace Scintilla {
18
UTF8Length(const wchar_t * uptr,size_t tlen)19 size_t UTF8Length(const wchar_t *uptr, size_t tlen) {
20 size_t len = 0;
21 for (size_t i = 0; i < tlen && uptr[i];) {
22 const unsigned int uch = uptr[i];
23 if (uch < 0x80) {
24 len++;
25 } else if (uch < 0x800) {
26 len += 2;
27 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
28 (uch <= SURROGATE_TRAIL_LAST)) {
29 len += 4;
30 i++;
31 } else {
32 len += 3;
33 }
34 i++;
35 }
36 return len;
37 }
38
UTF8FromUTF16(const wchar_t * uptr,size_t tlen,char * putf,size_t len)39 void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len) {
40 size_t k = 0;
41 for (size_t i = 0; i < tlen && uptr[i];) {
42 const unsigned int uch = uptr[i];
43 if (uch < 0x80) {
44 putf[k++] = static_cast<char>(uch);
45 } else if (uch < 0x800) {
46 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
47 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
48 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
49 (uch <= SURROGATE_TRAIL_LAST)) {
50 // Half a surrogate pair
51 i++;
52 const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
53 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
54 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
55 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
56 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
57 } else {
58 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
59 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
60 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
61 }
62 i++;
63 }
64 if (k < len)
65 putf[k] = '\0';
66 }
67
/**
 * Encode a single character value as a NUL terminated UTF-8 sequence.
 * @param uch  Character value to encode.
 * @param putf Destination; receives 1 to 4 encoded bytes followed by a NUL,
 *             so it must have room for up to 5 bytes.
 */
void UTF8FromUTF32Character(int uch, char *putf) {
	char *out = putf;
	// Continuation bytes hold six bits of the value beneath a 10xxxxxx marker.
	const auto continuation = [uch](int shift) {
		return static_cast<char>(0x80 | ((uch >> shift) & 0x3f));
	};
	if (uch < 0x80) {
		*out++ = static_cast<char>(uch);
	} else if (uch < 0x800) {
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = continuation(0);
	} else if (uch < 0x10000) {
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = continuation(6);
		*out++ = continuation(0);
	} else {
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = continuation(12);
		*out++ = continuation(6);
		*out++ = continuation(0);
	}
	*out = '\0';
}
87
UTF16Length(const char * s,size_t len)88 size_t UTF16Length(const char *s, size_t len) {
89 size_t ulen = 0;
90 for (size_t i = 0; i < len;) {
91 const unsigned char ch = s[i];
92 const unsigned int byteCount = UTF8BytesOfLead[ch];
93 const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
94 i += byteCount;
95 ulen += (i > len) ? 1 : utf16Len;
96 }
97 return ulen;
98 }
99
/**
 * Extract the payload of a UTF-8 trail byte.
 * A trail byte has the form 10xxxxxx; only the six x bits are returned.
 * The top two bits are discarded without being checked.
 */
constexpr unsigned char TrailByteValue(unsigned char c) {
	return static_cast<unsigned char>(c & 0x3FU);
}
105
UTF16FromUTF8(const char * s,size_t len,wchar_t * tbuf,size_t tlen)106 size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen) {
107 size_t ui = 0;
108 for (size_t i = 0; i < len;) {
109 unsigned char ch = s[i];
110 const unsigned int byteCount = UTF8BytesOfLead[ch];
111 unsigned int value;
112
113 if (i + byteCount > len) {
114 // Trying to read past end but still have space to write
115 if (ui < tlen) {
116 tbuf[ui] = ch;
117 ui++;
118 }
119 break;
120 }
121
122 const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
123 if (ui + outLen > tlen) {
124 throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
125 }
126
127 i++;
128 switch (byteCount) {
129 case 1:
130 tbuf[ui] = ch;
131 break;
132 case 2:
133 value = (ch & 0x1F) << 6;
134 ch = s[i++];
135 value += TrailByteValue(ch);
136 tbuf[ui] = static_cast<wchar_t>(value);
137 break;
138 case 3:
139 value = (ch & 0xF) << 12;
140 ch = s[i++];
141 value += (TrailByteValue(ch) << 6);
142 ch = s[i++];
143 value += TrailByteValue(ch);
144 tbuf[ui] = static_cast<wchar_t>(value);
145 break;
146 default:
147 // Outside the BMP so need two surrogates
148 value = (ch & 0x7) << 18;
149 ch = s[i++];
150 value += TrailByteValue(ch) << 12;
151 ch = s[i++];
152 value += TrailByteValue(ch) << 6;
153 ch = s[i++];
154 value += TrailByteValue(ch);
155 tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
156 ui++;
157 tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
158 break;
159 }
160 ui++;
161 }
162 return ui;
163 }
164
UTF32FromUTF8(const char * s,size_t len,unsigned int * tbuf,size_t tlen)165 size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen) {
166 size_t ui = 0;
167 for (size_t i = 0; i < len;) {
168 unsigned char ch = s[i];
169 const unsigned int byteCount = UTF8BytesOfLead[ch];
170 unsigned int value;
171
172 if (i + byteCount > len) {
173 // Trying to read past end but still have space to write
174 if (ui < tlen) {
175 tbuf[ui] = ch;
176 ui++;
177 }
178 break;
179 }
180
181 if (ui == tlen) {
182 throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
183 }
184
185 i++;
186 switch (byteCount) {
187 case 1:
188 value = ch;
189 break;
190 case 2:
191 value = (ch & 0x1F) << 6;
192 ch = s[i++];
193 value += TrailByteValue(ch);
194 break;
195 case 3:
196 value = (ch & 0xF) << 12;
197 ch = s[i++];
198 value += TrailByteValue(ch) << 6;
199 ch = s[i++];
200 value += TrailByteValue(ch);
201 break;
202 default:
203 value = (ch & 0x7) << 18;
204 ch = s[i++];
205 value += TrailByteValue(ch) << 12;
206 ch = s[i++];
207 value += TrailByteValue(ch) << 6;
208 ch = s[i++];
209 value += TrailByteValue(ch);
210 break;
211 }
212 tbuf[ui] = value;
213 ui++;
214 }
215 return ui;
216 }
217
UTF16FromUTF32Character(unsigned int val,wchar_t * tbuf)218 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
219 if (val < SUPPLEMENTAL_PLANE_FIRST) {
220 tbuf[0] = static_cast<wchar_t>(val);
221 return 1;
222 } else {
223 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
224 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
225 return 2;
226 }
227 }
228
// Sequence length in bytes indexed by the first byte of a UTF-8 sequence.
// Bytes that cannot start a multi-byte sequence (ASCII, trail bytes 80-BF,
// C0, C1 and F5-FF) map to 1.
const unsigned char UTF8BytesOfLead[256] = {
	// 00 - 7F: single byte
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	// 80 - BF: trail bytes, treated as length 1 when found in lead position
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
	// C0 - DF: C0 and C1 map to 1, C2 - DF start 2 byte sequences
	1, 1, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
	2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
	// E0 - EF: start 3 byte sequences
	3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
	// F0 - FF: F0 - F4 start 4 byte sequences, F5 - FF map to 1
	4, 4, 4, 4,  4, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
};
247
248 // Return both the width of the first character in the string and a status
249 // saying whether it is valid or invalid.
250 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
251 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
252 // reasonably treated as code points in some circumstances. They will, however,
253 // not have associated glyphs.
UTF8Classify(const unsigned char * us,size_t len)254 int UTF8Classify(const unsigned char *us, size_t len) noexcept {
255 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
256 if (us[0] < 0x80) {
257 // ASCII
258 return 1;
259 }
260
261 const size_t byteCount = UTF8BytesOfLead[us[0]];
262 if (byteCount == 1 || byteCount > len) {
263 // Invalid lead byte
264 return UTF8MaskInvalid | 1;
265 }
266
267 if (!UTF8IsTrailByte(us[1])) {
268 // Invalid trail byte
269 return UTF8MaskInvalid | 1;
270 }
271
272 switch (byteCount) {
273 case 2:
274 return 2;
275
276 case 3:
277 if (UTF8IsTrailByte(us[2])) {
278 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
279 // Overlong
280 return UTF8MaskInvalid | 1;
281 }
282 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
283 // Surrogate
284 return UTF8MaskInvalid | 1;
285 }
286 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
287 // U+FFFE non-character - 3 bytes long
288 return UTF8MaskInvalid | 3;
289 }
290 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
291 // U+FFFF non-character - 3 bytes long
292 return UTF8MaskInvalid | 3;
293 }
294 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
295 // U+FDD0 .. U+FDEF
296 return UTF8MaskInvalid | 3;
297 }
298 return 3;
299 }
300 break;
301
302 default:
303 if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
304 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
305 // *FFFE or *FFFF non-character
306 return UTF8MaskInvalid | 4;
307 }
308 if (*us == 0xf4) {
309 // Check if encoding a value beyond the last Unicode character 10FFFF
310 if (us[1] > 0x8f) {
311 return UTF8MaskInvalid | 1;
312 }
313 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
314 // Overlong
315 return UTF8MaskInvalid | 1;
316 }
317 return 4;
318 }
319 break;
320 }
321
322 return UTF8MaskInvalid | 1;
323 }
324
UTF8DrawBytes(const unsigned char * us,int len)325 int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
326 const int utf8StatusNext = UTF8Classify(us, len);
327 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
328 }
329
UTF8IsValid(const char * s,size_t len)330 bool UTF8IsValid(const char *s, size_t len) noexcept {
331 const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
332 size_t remaining = len;
333 while (remaining > 0) {
334 const int utf8Status = UTF8Classify(us, remaining);
335 if (utf8Status & UTF8MaskInvalid) {
336 return false;
337 } else {
338 const int lenChar = utf8Status & UTF8MaskWidth;
339 us += lenChar;
340 remaining -= lenChar;
341 }
342 }
343 return remaining == 0;
344 }
345
346 // Replace invalid bytes in UTF-8 with the replacement character
FixInvalidUTF8(const std::string & text)347 std::string FixInvalidUTF8(const std::string &text) {
348 std::string result;
349 const char *s = text.c_str();
350 size_t remaining = text.size();
351 while (remaining > 0) {
352 const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
353 if (utf8Status & UTF8MaskInvalid) {
354 // Replacement character 0xFFFD = UTF8:"efbfbd".
355 result.append("\xef\xbf\xbd");
356 s++;
357 remaining--;
358 } else {
359 const size_t len = utf8Status & UTF8MaskWidth;
360 result.append(s, len);
361 s += len;
362 remaining -= len;
363 }
364 }
365 return result;
366 }
367
368 }
369