1 /**
2 * @file
3 * @brief Conversions between Unicode and local charsets, string
4 * manipulation functions that act on character types.
5 **/
6
7 #include "AppHdr.h"
8
9 #include "unicode.h"
10
11 #include <climits>
12 #include <clocale>
13 #include <cstdio>
14 #include <cstring>
15 #include <string>
16
17 #include "syscalls.h"
18
/**
 * Encode a single code point as UTF-8.
 *
 * Anything that is not a Unicode scalar value -- values above U+10FFFF and
 * the UTF-16 surrogate range U+D800..U+DFFF -- is written as U+FFFD
 * (the invalid char marker), so the output is always well-formed UTF-8.
 *
 * @param d  Output buffer. There must be at least 4 bytes free, NOT CHECKED!
 *           No terminating NUL is written.
 * @param s  The code point to encode.
 * @return   The number of bytes stored in d (1 to 4).
 **/
int wctoutf8(char *d, char32_t s)
{
    // Normalise unencodable values up front; U+FFFD itself then takes the
    // ordinary three-byte path below.
    if (s >= 0x110000 || (s >= 0xD800 && s <= 0xDFFF))
        s = 0xFFFD;

    if (s < 0x80)
    {
        d[0] = s;
        return 1;
    }
    if (s < 0x800)
    {
        d[0] = ( s >>  6)         | 0xc0;
        d[1] = ( s        & 0x3f) | 0x80;
        return 2;
    }
    if (s < 0x10000)
    {
        d[0] = ( s >> 12)         | 0xe0;
        d[1] = ((s >>  6) & 0x3f) | 0x80;
        d[2] = ( s        & 0x3f) | 0x80;
        return 3;
    }

    // Only U+10000..U+10FFFF can reach this point.
    d[0] = ( s >> 18)         | 0xf0;
    d[1] = ((s >> 12) & 0x3f) | 0x80;
    d[2] = ((s >>  6) & 0x3f) | 0x80;
    d[3] = ( s        & 0x3f) | 0x80;
    return 4;
}
52
/**
 * Decode the first code point of a NUL-terminated UTF-8 string.
 *
 * Malformed input never fails outright: *d is set to U+FFFD and the return
 * value says how many bytes to skip before trying again.
 *
 * @param d  Receives the decoded code point, 0 at the end of the string, or
 *           0xFFFD for anything invalid.
 * @param s  The bytes to decode.
 * @return   The number of bytes consumed; 0 only at the terminating NUL.
 **/
int utf8towc(char32_t *d, const char *s)
{
    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(s);
    const unsigned char lead = bytes[0];

    if (lead == 0)
    {
        *d = 0;
        return 0;
    }
    if (lead < 0x80)
    {   // plain ASCII
        *d = lead;
        return 1;
    }
    if (lead < 0xc0)
    {   // A tail byte with no lead in front of it: swallow the whole run of
        // tails as a single invalid character.
        *d = 0xFFFD;
        int skipped = 1;
        while ((bytes[skipped] & 0xc0) == 0x80)
            skipped++;
        return skipped;
    }

    int seq_len;
    char32_t cp;
    if (lead < 0xe0)
    {
        seq_len = 2;
        cp = lead & 0x1f;
    }
    else if (lead < 0xf0)
    {
        seq_len = 3;
        cp = lead & 0x0f;
    }
    else if (lead < 0xf8)
    {
        seq_len = 4;
        cp = lead & 0x07;
    }
    else
    {   // 0xf8 and up: the 5- and 6-byte forms would encode values outside
        // Unicode, and 0xfe/0xff are never valid; reject the lead alone.
        *d = 0xFFFD;
        return 1;
    }

    for (int pos = 1; pos < seq_len; ++pos)
    {
        const unsigned char tail = bytes[pos];
        if ((tail & 0xc0) != 0x80)
        {   // Sequence cut short; consume only what came before this byte.
            *d = 0xFFFD;
            return pos;
        }
        cp = (cp << 6) | (tail & 0x3f);
    }

    const bool illegal   = cp < 0xA0; // also catches overlong 2-byte forms
    const bool surrogate = cp >= 0xD800 && cp <= 0xDFFF;
    const bool overlong  = (seq_len == 3 && cp < 0x800)
                           || (seq_len == 4 && cp < 0x10000);
    const bool too_big   = cp > 0x10FFFF;

    *d = (illegal || surrogate || overlong || too_big) ? 0xFFFD : cp;
    return seq_len;
}
114
115 #ifdef TARGET_OS_WINDOWS
116 // don't pull in wstring templates on other systems
utf8_to_16(const char * s)117 wstring utf8_to_16(const char *s)
118 {
119 wstring d;
120 char32_t c;
121
122 while (int l = utf8towc(&c, s))
123 {
124 s += l;
125 if (c >= 0x10000)
126 {
127 c -= 0x10000;
128 d.push_back(0xD800 + (c >> 10));
129 d.push_back(0xDC00 + (c & 0x3FF));
130 }
131 else
132 d.push_back(c);
133 }
134 return d;
135 }
136 #endif
137
138 #ifndef TARGET_OS_WINDOWS
139 static
140 #endif
utf16_to_8(const utf16_t * s)141 string utf16_to_8(const utf16_t *s)
142 {
143 string d;
144 char32_t c;
145
146 while (*s)
147 {
148 if (*s >= 0xD800 && *s <= 0xDBFF)
149 if (s[1] >= 0xDC00 && s[1] <= 0xDFFF)
150 {
151 c = (((char32_t)s[0]) << 10) + s[1] - 0x35fdc00;
152 s++;
153 }
154 else
155 c = 0xFFFD; // leading surrogate without its tail
156 else if (*s >= 0xDC00 && *s <= 0xDFFF)
157 c = 0xFFFD; // unpaired trailing surrogate
158 else
159 c = *s;
160 s++;
161
162 char buf[4];
163 int l = wctoutf8(buf, c);
164 for (int i = 0; i < l; i++)
165 d.push_back(buf[i]);
166 }
167
168 return d;
169 }
170
utf8_to_mb(const char * s)171 string utf8_to_mb(const char *s)
172 {
173 #ifdef __ANDROID__
174 return s;
175 #else
176 string d;
177 char32_t c;
178 int l;
179 mbstate_t ps;
180
181 memset(&ps, 0, sizeof(ps));
182 while ((l = utf8towc(&c, s)))
183 {
184 s += l;
185
186 char buf[MB_LEN_MAX];
187 int r = wcrtomb(buf, c, &ps);
188 if (r != -1)
189 {
190 for (int i = 0; i < r; i++)
191 d.push_back(buf[i]);
192 }
193 else
194 d.push_back('?'); // TODO: try to transliterate
195 }
196 return d;
197 #endif
198 }
199
utf8_validate(const char * s)200 static string utf8_validate(const char *s)
201 {
202 string d;
203 char32_t c;
204 int l;
205
206 while ((l = utf8towc(&c, s)))
207 {
208 s += l;
209
210 char buf[4];
211 int r = wctoutf8(buf, c);
212 for (int i = 0; i < r; i++)
213 d.push_back(buf[i]);
214 }
215 return d;
216 }
217
mb_to_utf8(const char * s)218 string mb_to_utf8(const char *s)
219 {
220 #ifdef __ANDROID__
221 // Paranoia; all consumers already use the same code so this won't do
222 // anything new.
223 return utf8_validate(s);
224 #else
225 string d;
226 wchar_t c;
227 int l;
228 mbstate_t ps;
229
230 memset(&ps, 0, sizeof(ps));
231 // the input is zero-terminated, so third argument doesn't matter
232 while ((l = mbrtowc(&c, s, MB_LEN_MAX, &ps)))
233 {
234 if (l > 0)
235 s += l;
236 else
237 { // invalid input, mark it and try to recover
238 s++;
239 c = 0xFFFD;
240 }
241
242 char buf[4];
243 int r = wctoutf8(buf, c);
244 for (int i = 0; i < r; i++)
245 d.push_back(buf[i]);
246 }
247 return d;
248 #endif
249 }
250
/**
 * Check that the next len bytes read from f equal bytes[0..len-1].
 *
 * @param f      Stream to read from.
 * @param bytes  The expected byte values.
 * @param len    How many bytes to compare.
 * @return       true if everything matched, leaving the stream positioned
 *               just after the matched bytes. On the first mismatch the
 *               stream is rewound to the very start of the file and false
 *               is returned.
 **/
static bool _check_trail(FILE *f, const char* bytes, int len)
{
    for (int i = 0; i < len; ++i)
    {
        const int expected = static_cast<unsigned char>(bytes[i]);
        if (fgetc(f) == expected)
            continue;

        rewind(f);
        return false;
    }
    return true;
}
263
FileLineInput(const char * name)264 FileLineInput::FileLineInput(const char *name)
265 {
266 f = fopen_u(name, "r");
267 if (!f)
268 {
269 seen_eof = true;
270 return;
271 }
272 seen_eof = false;
273
274 bom = BOM_NORMAL;
275 int ch = fgetc(f);
276 switch (ch)
277 {
278 case 0xEF:
279 if (_check_trail(f, "\xBB\xBF", 2))
280 bom = BOM_UTF8;
281 break;
282 case 0xFE:
283 if (_check_trail(f, "\xFF", 1))
284 bom = BOM_UTF16BE;
285 break;
286 case 0xFF:
287 if (_check_trail(f, "\xFE\x00\x00", 3))
288 bom = BOM_UTF32LE;
289 else if (_check_trail(f, "\xFF\xFE", 2)) // rewound
290 bom = BOM_UTF16LE;
291 break;
292 case 0x00:
293 if (_check_trail(f, "\x00\xFE\xFF", 3))
294 bom = BOM_UTF32BE;
295 break;
296 default:
297 ungetc(ch, f);
298 }
299 }
300
~FileLineInput()301 FileLineInput::~FileLineInput()
302 {
303 if (f)
304 fclose(f);
305 }
306
get_line()307 string FileLineInput::get_line()
308 {
309 ASSERT(f);
310 vector<utf16_t> win;
311 string out;
312 char buf[512];
313 char32_t c;
314 int len;
315
316 switch (bom)
317 {
318 case BOM_NORMAL:
319 do
320 {
321 if (!fgets(buf, sizeof buf, f))
322 {
323 seen_eof = true;
324 break;
325 }
326 out += buf;
327 if (out[out.length() - 1] == '\n')
328 {
329 out.erase(out.length() - 1);
330 break;
331 }
332 } while (!seen_eof);
333 return mb_to_utf8(out.c_str());
334
335 case BOM_UTF8:
336 do
337 {
338 if (!fgets(buf, sizeof buf, f))
339 {
340 seen_eof = true;
341 break;
342 }
343 out += buf;
344 if (out[out.length() - 1] == '\n')
345 {
346 out.erase(out.length() - 1);
347 break;
348 }
349 } while (!seen_eof);
350 return utf8_validate(out.c_str());
351
352 case BOM_UTF16LE:
353 do
354 {
355 if (fread(buf, 2, 1, f) != 1)
356 {
357 seen_eof = true;
358 break;
359 }
360 c = ((uint32_t)((unsigned char)buf[0]))
361 | ((uint32_t)((unsigned char)buf[1])) << 8;
362 if (c == '\n')
363 break;
364 win.push_back(c);
365 }
366 while (!seen_eof);
367 win.push_back(0);
368 return utf16_to_8(&win[0]);
369
370 case BOM_UTF16BE:
371 do
372 {
373 if (fread(buf, 2, 1, f) != 1)
374 {
375 seen_eof = true;
376 break;
377 }
378 c = ((uint32_t)((unsigned char)buf[1]))
379 | ((uint32_t)((unsigned char)buf[0])) << 8;
380 if (c == '\n')
381 break;
382 win.push_back(c);
383 }
384 while (!seen_eof);
385 win.push_back(0);
386 return utf16_to_8(&win[0]);
387
388 case BOM_UTF32LE:
389 do
390 {
391 if (fread(buf, 4, 1, f) != 1)
392 {
393 seen_eof = true;
394 break;
395 }
396 c = ((uint32_t)((unsigned char)buf[0]))
397 | ((uint32_t)((unsigned char)buf[1])) << 8
398 | ((uint32_t)((unsigned char)buf[2])) << 16
399 | ((uint32_t)((unsigned char)buf[3])) << 24;
400 if (c == '\n')
401 break;
402 len = wctoutf8(buf, c);
403 for (int i = 0; i < len; i++)
404 out.push_back(buf[i]);
405 }
406 while (!seen_eof);
407 return out;
408
409 case BOM_UTF32BE:
410 do
411 {
412 if (fread(buf, 4, 1, f) != 1)
413 {
414 seen_eof = true;
415 break;
416 }
417 c = ((uint32_t)((unsigned char)buf[0])) << 24
418 | ((uint32_t)((unsigned char)buf[1])) << 16
419 | ((uint32_t)((unsigned char)buf[2])) << 8
420 | ((uint32_t)((unsigned char)buf[3]));
421 if (c == '\n')
422 break;
423 len = wctoutf8(buf, c);
424 for (int i = 0; i < len; i++)
425 out.push_back(buf[i]);
426 }
427 while (!seen_eof);
428 return out;
429 }
430
431 die("FileLineInput had a bad bom_type (%d)", bom);
432 }
433
UTF8FileLineInput(const char * name)434 UTF8FileLineInput::UTF8FileLineInput(const char *name)
435 {
436 f = fopen_u(name, "r");
437 if (!f)
438 {
439 seen_eof = true;
440 return;
441 }
442 seen_eof = false;
443 }
444
~UTF8FileLineInput()445 UTF8FileLineInput::~UTF8FileLineInput()
446 {
447 if (f)
448 fclose(f);
449 }
450
get_line()451 string UTF8FileLineInput::get_line()
452 {
453 ASSERT(f);
454 string out;
455 char buf[512];
456
457 do
458 {
459 if (!fgets(buf, sizeof buf, f))
460 {
461 seen_eof = true;
462 break;
463 }
464 out += buf;
465 if (out[out.length() - 1] == '\n')
466 {
467 out.erase(out.length() - 1);
468 break;
469 }
470 } while (!seen_eof);
471 return utf8_validate(out.c_str());
472 }
473
strwidth(const char * s)474 int strwidth(const char *s)
475 {
476 char32_t c;
477 int w = 0;
478
479 while (int l = utf8towc(&c, s))
480 {
481 s += l;
482 int cw = wcwidth(c);
483 if (cw != -1) // shouldn't ever happen
484 w += cw;
485 }
486
487 return w;
488 }
489
/**
 * Convenience overload of strwidth() for string objects; measurement is
 * done on s.c_str().
 *
 * @param s  UTF-8 input.
 * @return   The width as computed by strwidth(const char *).
 **/
int strwidth(const string &s)
{
    const char *raw = s.c_str();
    return strwidth(raw);
}
494
wclen(char32_t c)495 int wclen(char32_t c)
496 {
497 char dummy[4];
498 return wctoutf8(dummy, c);
499 }
500
prev_glyph(char * s,char * start)501 char *prev_glyph(char *s, char *start)
502 {
503 char32_t c;
504 do
505 {
506 // Find the start of the previous code point.
507 do
508 if (--s < start)
509 return 0;
510 while ((*s & 0xc0) == 0x80);
511 // If a combining one, continue.
512 utf8towc(&c, s);
513 } while (!wcwidth(c));
514 return s;
515 }
516
next_glyph(char * s)517 char *next_glyph(char *s)
518 {
519 char *s_cur;
520 char32_t c;
521 // Skip at least one character.
522 s += utf8towc(&c, s);
523 if (!c)
524 return 0;
525 do
526 {
527 s += utf8towc(&c, s_cur = s);
528 // And any combining ones after it.
529 }
530 while (c && !wcwidth(c));
531 return s_cur;
532 }
533
chop_string(const char * s,int width,bool spaces)534 string chop_string(const char *s, int width, bool spaces)
535 {
536 const char *s0 = s;
537 char32_t c;
538
539 while (int clen = utf8towc(&c, s))
540 {
541 int cw = wcwidth(c);
542 // Due to combining chars, we can't stop at merely reaching the
543 // target width, the next character needs to exceed it.
544 if (cw > width) // note: a CJK character might leave one space left
545 break;
546 if (cw >= 0) // should we assert on control chars instead?
547 width -= cw;
548 s += clen;
549 }
550
551 if (spaces && width)
552 return string(s0, s - s0) + string(width, ' ');
553 return string(s0, s - s0);;
554 }
555
/**
 * Convenience overload of chop_string() for string objects; the work is
 * done on s.c_str().
 *
 * @param s       UTF-8 input.
 * @param width   The number of columns available.
 * @param spaces  Whether to pad the result with trailing spaces.
 * @return        The result of chop_string(const char *, int, bool).
 **/
string chop_string(const string &s, int width, bool spaces)
{
    const char *text = s.c_str();
    return chop_string(text, width, spaces);
}
560