1
2 /* Compiler implementation of the D programming language
3 * Copyright (C) 2003-2021 by The D Language Foundation, All Rights Reserved
4 * written by Walter Bright
5 * http://www.digitalmars.com
6 * Distributed under the Boost Software License, Version 1.0.
7 * http://www.boost.org/LICENSE_1_0.txt
8 * https://github.com/D-Programming-Language/dmd/blob/master/src/utf.c
9 */
10
11 /// Description of UTF-8 in [1]. Unicode non-characters and private-use
12 /// code points described in [2],[4].
13 ///
14 /// References:
15 /// [1] http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
16 /// [2] http://en.wikipedia.org/wiki/Unicode
17 /// [3] http://unicode.org/faq/utf_bom.html
18 /// [4] http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
19
20 #include "utf.h"
21
/* Sequence length, in code units, implied by the first byte of a UTF-8
 * sequence; indexed by that byte. The lead-byte bit patterns are:
 *      0xxxxxxx                                                1
 *      110xxxxx 10xxxxxx                                       2
 *      1110xxxx 10xxxxxx 10xxxxxx                              3
 *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx                     4
 *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx            5
 *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   6
 * The 5 and 6 byte combinations are listed but are not valid encodings.
 * An entry of 0xFF marks a byte that has none of the lead patterns above:
 * 10xxxxxx, 0xFE and 0xFF.
 */
const unsigned UTF8_STRIDE[256] =
{
    // 0x00 - 0x7F: 0xxxxxxx
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    // 0x80 - 0xBF: 10xxxxxx
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 - 0xDF: 110xxxxx
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    // 0xE0 - 0xEF: 1110xxxx
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
    // 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFD, then 0xFE and 0xFF
    4,4,4,4,4,4,4,4, 5,5,5,5, 6,6, 0xFF,0xFF,
};
50
// Messages returned by utf_decodeChar() when decoding fails
const char UTF8_DECODE_OUTSIDE_CODE_SPACE[] =
    "Outside Unicode code space";
const char UTF8_DECODE_TRUNCATED_SEQUENCE[] =
    "Truncated UTF-8 sequence";
const char UTF8_DECODE_OVERLONG[] =
    "Overlong UTF-8 sequence";
const char UTF8_DECODE_INVALID_TRAILER[] =
    "Invalid trailing code unit";
const char UTF8_DECODE_INVALID_CODE_POINT[] =
    "Invalid code point decoded";

// Messages returned by utf_decodeWchar() when decoding fails
const char UTF16_DECODE_TRUNCATED_SEQUENCE[] =
    "Truncated UTF-16 sequence";
const char UTF16_DECODE_INVALID_SURROGATE[] =
    "Invalid low surrogate";
const char UTF16_DECODE_UNPAIRED_SURROGATE[] =
    "Unpaired surrogate";
const char UTF16_DECODE_INVALID_CODE_POINT[] =
    "Invalid code point decoded";
63
64 /// The Unicode code space is the range of code points [0x000000,0x10FFFF]
65 /// except the UTF-16 surrogate pairs in the range [0xD800,0xDFFF]
66 /// and non-characters (which end in 0xFFFE or 0xFFFF).
utf_isValidDchar(dchar_t c)67 bool utf_isValidDchar(dchar_t c)
68 {
69 // TODO: Whether non-char code points should be rejected is pending review
70 // largest character code point
71 if (c > 0x10FFFF)
72 return false;
73 // surrogate pairs
74 if (0xD800 <= c && c <= 0xDFFF)
75 return false;
76 // non-characters
77 if ((c & 0xFFFFFE) == 0x00FFFE)
78 return false;
79 return true;
80 }
81
82 /*******************************
83 * Return !=0 if unicode alpha.
84 * Use table from C99 Appendix D.
85 */
86
isUniAlpha(dchar_t c)87 bool isUniAlpha(dchar_t c)
88 {
89 size_t high = ALPHA_TABLE_LENGTH - 1;
90 // Shortcut search if c is out of range
91 size_t low
92 = (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[high][1] < c) ? high + 1 : 0;
93 // Binary search
94 while (low <= high)
95 {
96 size_t mid = (low + high) >> 1;
97 if (c < ALPHA_TABLE[mid][0])
98 high = mid - 1;
99 else if (ALPHA_TABLE[mid][1] < c)
100 low = mid + 1;
101 else
102 {
103 assert(ALPHA_TABLE[mid][0] <= c && c <= ALPHA_TABLE[mid][1]);
104 return true;
105 }
106 }
107 return false;
108 }
109
110 /**
111 * Returns the code length of c in code units.
112 */
113
utf_codeLengthChar(dchar_t c)114 int utf_codeLengthChar(dchar_t c)
115 {
116 if (c <= 0x7F)
117 return 1;
118 if (c <= 0x7FF)
119 return 2;
120 if (c <= 0xFFFF)
121 return 3;
122 if (c <= 0x10FFFF)
123 return 4;
124 assert(false);
125 return 6;
126 }
127
utf_codeLengthWchar(dchar_t c)128 int utf_codeLengthWchar(dchar_t c)
129 {
130 return c <= 0xFFFF ? 1 : 2;
131 }
132
133 /**
134 * Returns the code length of c in code units for the encoding.
135 * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32.
136 */
137
utf_codeLength(int sz,dchar_t c)138 int utf_codeLength(int sz, dchar_t c)
139 {
140 if (sz == 1)
141 return utf_codeLengthChar(c);
142 if (sz == 2)
143 return utf_codeLengthWchar(c);
144 assert(sz == 4);
145 return 1;
146 }
147
utf_encodeChar(utf8_t * s,dchar_t c)148 void utf_encodeChar(utf8_t *s, dchar_t c)
149 {
150 assert(s != NULL);
151 assert(utf_isValidDchar(c));
152 if (c <= 0x7F)
153 {
154 s[0] = static_cast<utf8_t>(c);
155 }
156 else if (c <= 0x07FF)
157 {
158 s[0] = static_cast<utf8_t>(0xC0 | (c >> 6));
159 s[1] = static_cast<utf8_t>(0x80 | (c & 0x3F));
160 }
161 else if (c <= 0xFFFF)
162 {
163 s[0] = static_cast<utf8_t>(0xE0 | (c >> 12));
164 s[1] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F));
165 s[2] = static_cast<utf8_t>(0x80 | (c & 0x3F));
166 }
167 else if (c <= 0x10FFFF)
168 {
169 s[0] = static_cast<utf8_t>(0xF0 | (c >> 18));
170 s[1] = static_cast<utf8_t>(0x80 | ((c >> 12) & 0x3F));
171 s[2] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F));
172 s[3] = static_cast<utf8_t>(0x80 | (c & 0x3F));
173 }
174 else
175 assert(0);
176 }
177
utf_encodeWchar(utf16_t * s,dchar_t c)178 void utf_encodeWchar(utf16_t *s, dchar_t c)
179 {
180 assert(s != NULL);
181 assert(utf_isValidDchar(c));
182 if (c <= 0xFFFF)
183 {
184 s[0] = static_cast<utf16_t>(c);
185 }
186 else
187 {
188 s[0] = static_cast<utf16_t>((((c - 0x010000) >> 10) & 0x03FF) + 0xD800);
189 s[1] = static_cast<utf16_t>(((c - 0x010000) & 0x03FF) + 0xDC00);
190 }
191 }
192
utf_encode(int sz,void * s,dchar_t c)193 void utf_encode(int sz, void *s, dchar_t c)
194 {
195 if (sz == 1)
196 utf_encodeChar((utf8_t *)s, c);
197 else if (sz == 2)
198 utf_encodeWchar((utf16_t *)s, c);
199 else
200 {
201 assert(sz == 4);
202 *((utf32_t *)s) = c;
203 }
204 }
205
206 /********************************************
207 * Decode a UTF-8 sequence as a single UTF-32 code point.
208 * Returns:
209 * NULL success
210 * !=NULL error message string
211 */
212
utf_decodeChar(utf8_t const * s,size_t len,size_t * pidx,dchar_t * presult)213 const char *utf_decodeChar(utf8_t const *s, size_t len, size_t *pidx, dchar_t *presult)
214 {
215 assert(s != NULL);
216 assert(pidx != NULL);
217 assert(presult != NULL);
218 size_t i = (*pidx)++;
219 assert(i < len);
220 utf8_t u = s[i];
221 // Pre-stage results for ASCII and error cases
222 *presult = u;
223
224 //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
225
226 // Get expected sequence length
227 size_t n = UTF8_STRIDE[u];
228 switch (n)
229 {
230 case 1: // ASCII
231 return UTF8_DECODE_OK;
232 case 2: case 3: case 4: // multi-byte UTF-8
233 break;
234 default: // 5- or 6-byte sequence
235 return UTF8_DECODE_OUTSIDE_CODE_SPACE;
236 }
237 if (len < i + n) // source too short
238 return UTF8_DECODE_TRUNCATED_SEQUENCE;
239
240 // Pick off 7 - n low bits from first code unit
241 utf32_t c = u & ((1 << (7 - n)) - 1);
242 /* The following combinations are overlong, and illegal:
243 * 1100000x (10xxxxxx)
244 * 11100000 100xxxxx (10xxxxxx)
245 * 11110000 1000xxxx (10xxxxxx 10xxxxxx)
246 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
247 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
248 */
249 utf8_t u2 = s[++i];
250 // overlong combination
251 if ((u & 0xFE) == 0xC0 ||
252 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
253 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
254 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
255 (u == 0xFC && (u2 & 0xFC) == 0x80))
256 return UTF8_DECODE_OVERLONG;
257 // Decode remaining bits
258 for (n += i - 1; i != n; ++i)
259 {
260 u = s[i];
261 if ((u & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
262 return UTF8_DECODE_INVALID_TRAILER;
263 c = (c << 6) | (u & 0x3F);
264 }
265 if (!utf_isValidDchar(c))
266 return UTF8_DECODE_INVALID_CODE_POINT;
267 *pidx = i;
268 *presult = c;
269 return UTF8_DECODE_OK;
270 }
271
272 /********************************************
273 * Decode a UTF-16 sequence as a single UTF-32 code point.
274 * Returns:
275 * NULL success
276 * !=NULL error message string
277 */
278
utf_decodeWchar(utf16_t const * s,size_t len,size_t * pidx,dchar_t * presult)279 const char *utf_decodeWchar(utf16_t const *s, size_t len, size_t *pidx, dchar_t *presult)
280 {
281 assert(s != NULL);
282 assert(pidx != NULL);
283 assert(presult != NULL);
284 size_t i = (*pidx)++;
285 assert(i < len);
286 // Pre-stage results for ASCII and error cases
287 utf32_t u = *presult = s[i];
288
289 if (u < 0x80) // ASCII
290 return UTF16_DECODE_OK;
291 if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair
292 { if (len <= i + 1)
293 return UTF16_DECODE_TRUNCATED_SEQUENCE;
294 utf16_t u2 = s[i + 1];
295 if (u2 < 0xDC00 || 0xDFFF < u)
296 return UTF16_DECODE_INVALID_SURROGATE;
297 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
298 ++*pidx;
299 }
300 else if (0xDC00 <= u && u <= 0xDFFF)
301 return UTF16_DECODE_UNPAIRED_SURROGATE;
302 if (!utf_isValidDchar(u))
303 return UTF16_DECODE_INVALID_CODE_POINT;
304 *presult = u;
305 return UTF16_DECODE_OK;
306 }
307