1 /**
2  * @file
3  * @brief Conversions between Unicode and local charsets, string
4  *        manipulation functions that act on character types.
5 **/
6 
7 #include "AppHdr.h"
8 
9 #include "unicode.h"
10 
11 #include <climits>
12 #include <clocale>
13 #include <cstdio>
14 #include <cstring>
15 #include <string>
16 
17 #include "syscalls.h"
18 
/**
 * Encode a single code point as UTF-8.
 *
 * Values that are not Unicode scalar values (above U+10FFFF, or UTF-16
 * surrogates U+D800..U+DFFF) are written as U+FFFD instead, so the output
 * is always well-formed UTF-8.
 *
 * @param d  Output buffer. There must be at least 4 bytes free, NOT CHECKED!
 *           No terminating NUL is written.
 * @param s  The code point to encode.
 * @return   The number of bytes written to d (1 to 4).
 */
int wctoutf8(char *d, char32_t s)
{
    // Not encodable: substitute the invalid char marker up front, so the
    // ladder below only ever sees scalar values.
    if (s >= 0x110000 || (s >= 0xD800 && s <= 0xDFFF))
        s = 0xFFFD;

    if (s < 0x80)
    {
        d[0] = (char)s;
        return 1;
    }
    if (s < 0x800)
    {
        d[0] = (char)(( s >>  6)         | 0xc0);
        d[1] = (char)(( s        & 0x3f) | 0x80);
        return 2;
    }
    if (s < 0x10000)
    {
        d[0] = (char)(( s >> 12)         | 0xe0);
        d[1] = (char)(((s >>  6) & 0x3f) | 0x80);
        d[2] = (char)(( s        & 0x3f) | 0x80);
        return 3;
    }

    // Only U+10000..U+10FFFF can reach this point.
    d[0] = (char)(( s >> 18)         | 0xf0);
    d[1] = (char)(((s >> 12) & 0x3f) | 0x80);
    d[2] = (char)(((s >>  6) & 0x3f) | 0x80);
    d[3] = (char)(( s        & 0x3f) | 0x80);
    return 4;
}
52 
/**
 * Decode one code point from the start of a NUL-terminated UTF-8 string.
 *
 * Malformed input never fails; it is reported as U+FFFD and the return
 * value says how many bytes to skip to resynchronise.
 *
 * @param d  Receives the decoded code point, 0 at the end of the string,
 *           or 0xFFFD for anything invalid.
 * @param s  The bytes to decode.
 * @return   The number of bytes consumed; 0 only at the terminating NUL.
 */
int utf8towc(char32_t *d, const char *s)
{
    const unsigned char *bytes = (const unsigned char *)s;
    const unsigned char lead = bytes[0];

    // End of string.
    if (lead == 0)
    {
        *d = 0;
        return 0;
    }

    // Plain ASCII.
    if (lead < 0x80)
    {
        *d = lead;
        return 1;
    }

    // 0x80..0xBF: continuation byte(s) without a lead byte. Swallow the
    // whole run and report it as a single invalid character.
    if (lead < 0xc0)
    {
        int skipped = 1;
        while ((bytes[skipped] & 0xc0) == 0x80)
            skipped++;
        *d = 0xFFFD;
        return skipped;
    }

    // The lead byte fixes the sequence length and supplies the top bits.
    int len;
    char32_t cp;
    if (lead < 0xe0)
    {
        len = 2;
        cp = lead & 0x1f;
    }
    else if (lead < 0xf0)
    {
        len = 3;
        cp = lead & 0x0f;
    }
    else if (lead < 0xf8)
    {
        len = 4;
        cp = lead & 0x07;
    }
    else
    {   // 0xf8..0xff: 5/6-byte forms are not Unicode, 0xfe/0xff never valid
        *d = 0xFFFD;
        return 1;
    }

    for (int k = 1; k < len; k++)
    {
        const unsigned char tail = bytes[k];
        if ((tail & 0xc0) != 0x80)
        {   // sequence cut short; consume only the bytes seen so far
            *d = 0xFFFD;
            return k;
        }
        cp = (cp << 6) | (tail & 0x3f);
    }

    const int acceptable =
           cp >= 0xA0                       // rejects 2-byte overlongs and C1
        && !(cp >= 0xD800 && cp <= 0xDFFF)  // UTF-16 surrogates
        && !(len == 3 && cp < 0x800)        // overlong 3-byte form
        && !(len == 4 && cp < 0x10000)      // overlong 4-byte form
        && cp <= 0x10FFFF;                  // outside Unicode

    *d = acceptable ? cp : 0xFFFD;
    return len;
}
114 
#ifdef TARGET_OS_WINDOWS
// don't pull in wstring templates on other systems
/**
 * Convert a NUL-terminated UTF-8 string to UTF-16.
 *
 * Decoding goes through utf8towc(), so whatever it reports for malformed
 * input is what ends up in the result.
 *
 * @param s  The UTF-8 input.
 * @return   The UTF-16 code units, with code points of 0x10000 and above
 *           split into a surrogate pair.
 */
wstring utf8_to_16(const char *s)
{
    wstring out;
    char32_t cp;

    for (int used = utf8towc(&cp, s); used; used = utf8towc(&cp, s))
    {
        s += used;

        if (cp < 0x10000)
        {
            out.push_back(cp);
            continue;
        }

        // Supplementary plane: 20 bits, split 10/10 over two code units.
        cp -= 0x10000;
        out.push_back(0xD800 + (cp >> 10));
        out.push_back(0xDC00 + (cp & 0x3FF));
    }
    return out;
}
#endif
137 
138 #ifndef TARGET_OS_WINDOWS
139 static
140 #endif
utf16_to_8(const utf16_t * s)141 string utf16_to_8(const utf16_t *s)
142 {
143     string d;
144     char32_t c;
145 
146     while (*s)
147     {
148         if (*s >= 0xD800 && *s <= 0xDBFF)
149             if (s[1] >= 0xDC00 && s[1] <= 0xDFFF)
150             {
151                 c = (((char32_t)s[0]) << 10) + s[1] - 0x35fdc00;
152                 s++;
153             }
154             else
155                 c = 0xFFFD; // leading surrogate without its tail
156         else if (*s >= 0xDC00 && *s <= 0xDFFF)
157             c = 0xFFFD;     // unpaired trailing surrogate
158         else
159             c = *s;
160         s++;
161 
162         char buf[4];
163         int l = wctoutf8(buf, c);
164         for (int i = 0; i < l; i++)
165             d.push_back(buf[i]);
166     }
167 
168     return d;
169 }
170 
utf8_to_mb(const char * s)171 string utf8_to_mb(const char *s)
172 {
173 #ifdef __ANDROID__
174     return s;
175 #else
176     string d;
177     char32_t c;
178     int l;
179     mbstate_t ps;
180 
181     memset(&ps, 0, sizeof(ps));
182     while ((l = utf8towc(&c, s)))
183     {
184         s += l;
185 
186         char buf[MB_LEN_MAX];
187         int r = wcrtomb(buf, c, &ps);
188         if (r != -1)
189         {
190             for (int i = 0; i < r; i++)
191                 d.push_back(buf[i]);
192         }
193         else
194             d.push_back('?'); // TODO: try to transliterate
195     }
196     return d;
197 #endif
198 }
199 
utf8_validate(const char * s)200 static string utf8_validate(const char *s)
201 {
202     string d;
203     char32_t c;
204     int l;
205 
206     while ((l = utf8towc(&c, s)))
207     {
208         s += l;
209 
210         char buf[4];
211         int r = wctoutf8(buf, c);
212         for (int i = 0; i < r; i++)
213             d.push_back(buf[i]);
214     }
215     return d;
216 }
217 
mb_to_utf8(const char * s)218 string mb_to_utf8(const char *s)
219 {
220 #ifdef __ANDROID__
221     // Paranoia; all consumers already use the same code so this won't do
222     // anything new.
223     return utf8_validate(s);
224 #else
225     string d;
226     wchar_t c;
227     int l;
228     mbstate_t ps;
229 
230     memset(&ps, 0, sizeof(ps));
231     // the input is zero-terminated, so third argument doesn't matter
232     while ((l = mbrtowc(&c, s, MB_LEN_MAX, &ps)))
233     {
234         if (l > 0)
235             s += l;
236         else
237         {   // invalid input, mark it and try to recover
238             s++;
239             c = 0xFFFD;
240         }
241 
242         char buf[4];
243         int r = wctoutf8(buf, c);
244         for (int i = 0; i < r; i++)
245             d.push_back(buf[i]);
246     }
247     return d;
248 #endif
249 }
250 
/**
 * Check whether the next len bytes read from f equal bytes.
 *
 * @param f      The stream to read from.
 * @param bytes  The expected bytes (may contain NULs; len decides the size).
 * @param len    How many bytes to compare.
 * @return       true if all bytes matched, leaving the stream just past
 *               them; false on the first mismatch (or EOF), in which case
 *               the stream has been rewound to its beginning.
 */
static bool _check_trail(FILE *f, const char* bytes, int len)
{
    for (; len; --len, ++bytes)
    {
        const int expected = (unsigned char)*bytes;
        if (fgetc(f) == expected)
            continue;

        // Not the signature we hoped for: start over from the top.
        rewind(f);
        return false;
    }
    return true;
}
263 
FileLineInput(const char * name)264 FileLineInput::FileLineInput(const char *name)
265 {
266     f = fopen_u(name, "r");
267     if (!f)
268     {
269         seen_eof = true;
270         return;
271     }
272     seen_eof = false;
273 
274     bom = BOM_NORMAL;
275     int ch = fgetc(f);
276     switch (ch)
277     {
278     case 0xEF:
279         if (_check_trail(f, "\xBB\xBF", 2))
280             bom = BOM_UTF8;
281         break;
282     case 0xFE:
283         if (_check_trail(f, "\xFF", 1))
284             bom = BOM_UTF16BE;
285         break;
286     case 0xFF:
287         if (_check_trail(f, "\xFE\x00\x00", 3))
288             bom = BOM_UTF32LE;
289         else if (_check_trail(f, "\xFF\xFE", 2)) // rewound
290             bom = BOM_UTF16LE;
291         break;
292     case 0x00:
293         if (_check_trail(f, "\x00\xFE\xFF", 3))
294             bom = BOM_UTF32BE;
295         break;
296     default:
297         ungetc(ch, f);
298     }
299 }
300 
~FileLineInput()301 FileLineInput::~FileLineInput()
302 {
303     if (f)
304         fclose(f);
305 }
306 
get_line()307 string FileLineInput::get_line()
308 {
309     ASSERT(f);
310     vector<utf16_t> win;
311     string out;
312     char buf[512];
313     char32_t c;
314     int len;
315 
316     switch (bom)
317     {
318     case BOM_NORMAL:
319         do
320         {
321             if (!fgets(buf, sizeof buf, f))
322             {
323                 seen_eof = true;
324                 break;
325             }
326             out += buf;
327             if (out[out.length() - 1] == '\n')
328             {
329                 out.erase(out.length() - 1);
330                 break;
331             }
332         } while (!seen_eof);
333         return mb_to_utf8(out.c_str());
334 
335     case BOM_UTF8:
336         do
337         {
338             if (!fgets(buf, sizeof buf, f))
339             {
340                 seen_eof = true;
341                 break;
342             }
343             out += buf;
344             if (out[out.length() - 1] == '\n')
345             {
346                 out.erase(out.length() - 1);
347                 break;
348             }
349         } while (!seen_eof);
350         return utf8_validate(out.c_str());
351 
352     case BOM_UTF16LE:
353         do
354         {
355             if (fread(buf, 2, 1, f) != 1)
356             {
357                 seen_eof = true;
358                 break;
359             }
360             c = ((uint32_t)((unsigned char)buf[0]))
361               | ((uint32_t)((unsigned char)buf[1])) << 8;
362             if (c == '\n')
363                 break;
364             win.push_back(c);
365         }
366         while (!seen_eof);
367         win.push_back(0);
368         return utf16_to_8(&win[0]);
369 
370     case BOM_UTF16BE:
371         do
372         {
373             if (fread(buf, 2, 1, f) != 1)
374             {
375                 seen_eof = true;
376                 break;
377             }
378             c = ((uint32_t)((unsigned char)buf[1]))
379               | ((uint32_t)((unsigned char)buf[0])) << 8;
380             if (c == '\n')
381                 break;
382             win.push_back(c);
383         }
384         while (!seen_eof);
385         win.push_back(0);
386         return utf16_to_8(&win[0]);
387 
388     case BOM_UTF32LE:
389         do
390         {
391             if (fread(buf, 4, 1, f) != 1)
392             {
393                 seen_eof = true;
394                 break;
395             }
396             c = ((uint32_t)((unsigned char)buf[0]))
397               | ((uint32_t)((unsigned char)buf[1])) << 8
398               | ((uint32_t)((unsigned char)buf[2])) << 16
399               | ((uint32_t)((unsigned char)buf[3])) << 24;
400             if (c == '\n')
401                 break;
402             len = wctoutf8(buf, c);
403             for (int i = 0; i < len; i++)
404                 out.push_back(buf[i]);
405         }
406         while (!seen_eof);
407         return out;
408 
409     case BOM_UTF32BE:
410         do
411         {
412             if (fread(buf, 4, 1, f) != 1)
413             {
414                 seen_eof = true;
415                 break;
416             }
417             c = ((uint32_t)((unsigned char)buf[0])) << 24
418               | ((uint32_t)((unsigned char)buf[1])) << 16
419               | ((uint32_t)((unsigned char)buf[2])) << 8
420               | ((uint32_t)((unsigned char)buf[3]));
421             if (c == '\n')
422                 break;
423             len = wctoutf8(buf, c);
424             for (int i = 0; i < len; i++)
425                 out.push_back(buf[i]);
426         }
427         while (!seen_eof);
428         return out;
429     }
430 
431     die("FileLineInput had a bad bom_type (%d)", bom);
432 }
433 
UTF8FileLineInput(const char * name)434 UTF8FileLineInput::UTF8FileLineInput(const char *name)
435 {
436     f = fopen_u(name, "r");
437     if (!f)
438     {
439         seen_eof = true;
440         return;
441     }
442     seen_eof = false;
443 }
444 
~UTF8FileLineInput()445 UTF8FileLineInput::~UTF8FileLineInput()
446 {
447     if (f)
448         fclose(f);
449 }
450 
get_line()451 string UTF8FileLineInput::get_line()
452 {
453     ASSERT(f);
454     string out;
455     char buf[512];
456 
457     do
458     {
459         if (!fgets(buf, sizeof buf, f))
460         {
461             seen_eof = true;
462             break;
463         }
464         out += buf;
465         if (out[out.length() - 1] == '\n')
466         {
467             out.erase(out.length() - 1);
468             break;
469         }
470     } while (!seen_eof);
471     return utf8_validate(out.c_str());
472 }
473 
strwidth(const char * s)474 int strwidth(const char *s)
475 {
476     char32_t c;
477     int w = 0;
478 
479     while (int l = utf8towc(&c, s))
480     {
481         s += l;
482         int cw = wcwidth(c);
483         if (cw != -1) // shouldn't ever happen
484             w += cw;
485     }
486 
487     return w;
488 }
489 
strwidth(const string & s)490 int strwidth(const string &s)
491 {
492     return strwidth(s.c_str());
493 }
494 
wclen(char32_t c)495 int wclen(char32_t c)
496 {
497     char dummy[4];
498     return wctoutf8(dummy, c);
499 }
500 
prev_glyph(char * s,char * start)501 char *prev_glyph(char *s, char *start)
502 {
503     char32_t c;
504     do
505     {
506         // Find the start of the previous code point.
507         do
508             if (--s < start)
509                 return 0;
510         while ((*s & 0xc0) == 0x80);
511         // If a combining one, continue.
512         utf8towc(&c, s);
513     } while (!wcwidth(c));
514     return s;
515 }
516 
next_glyph(char * s)517 char *next_glyph(char *s)
518 {
519     char *s_cur;
520     char32_t c;
521     // Skip at least one character.
522     s += utf8towc(&c, s);
523     if (!c)
524         return 0;
525     do
526     {
527         s += utf8towc(&c, s_cur = s);
528         // And any combining ones after it.
529     }
530     while (c && !wcwidth(c));
531     return s_cur;
532 }
533 
chop_string(const char * s,int width,bool spaces)534 string chop_string(const char *s, int width, bool spaces)
535 {
536     const char *s0 = s;
537     char32_t c;
538 
539     while (int clen = utf8towc(&c, s))
540     {
541         int cw = wcwidth(c);
542         // Due to combining chars, we can't stop at merely reaching the
543         // target width, the next character needs to exceed it.
544         if (cw > width) // note: a CJK character might leave one space left
545             break;
546         if (cw >= 0) // should we assert on control chars instead?
547             width -= cw;
548         s += clen;
549     }
550 
551     if (spaces && width)
552         return string(s0, s - s0) + string(width, ' ');
553     return string(s0, s - s0);;
554 }
555 
/// Overload of chop_string() for string; forwards to the C-string version.
string chop_string(const string &s, int width, bool spaces)
{
    const char *raw = s.c_str();
    return chop_string(raw, width, spaces);
}
560