1 //
2 // Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 //
7 #include "td/utils/utf8.h"
8 
9 #include "td/utils/logging.h"
10 #include "td/utils/unicode.h"
11 
12 namespace td {
13 
check_utf8(CSlice str)14 bool check_utf8(CSlice str) {
15   const char *data = str.data();
16   const char *data_end = data + str.size();
17   do {
18     uint32 a = static_cast<unsigned char>(*data++);
19     if ((a & 0x80) == 0) {
20       if (data == data_end + 1) {
21         return true;
22       }
23       continue;
24     }
25 
26 #define ENSURE(condition) \
27   if (!(condition)) {     \
28     return false;         \
29   }
30 
31     ENSURE((a & 0x40) != 0);
32 
33     uint32 b = static_cast<unsigned char>(*data++);
34     ENSURE((b & 0xc0) == 0x80);
35     if ((a & 0x20) == 0) {
36       ENSURE((a & 0x1e) > 0);
37       continue;
38     }
39 
40     uint32 c = static_cast<unsigned char>(*data++);
41     ENSURE((c & 0xc0) == 0x80);
42     if ((a & 0x10) == 0) {
43       uint32 x = (((a & 0x0f) << 6) | (b & 0x20));
44       ENSURE(x != 0 && x != 0x360);  // surrogates
45       continue;
46     }
47 
48     uint32 d = static_cast<unsigned char>(*data++);
49     ENSURE((d & 0xc0) == 0x80);
50     if ((a & 0x08) == 0) {
51       uint32 t = (((a & 0x07) << 6) | (b & 0x30));
52       ENSURE(0 < t && t < 0x110);  // end of unicode
53       continue;
54     }
55 
56     return false;
57 #undef ENSURE
58   } while (true);
59 
60   UNREACHABLE();
61   return false;
62 }
63 
// Appends the UTF-8 encoding of the code point ch to str.
//
// The encoded length is chosen from the value of ch alone: 1 byte up to 0x7f, 2 bytes up to
// 0x7ff, 3 bytes up to 0xffff and 4 bytes for anything larger. The value is not validated,
// so surrogates and values above 0x10ffff are encoded as they are.
void append_utf8_character(string &str, uint32 ch) {
  // Appends a continuation byte 10xxxxxx carrying the six bits of ch that start at bit `shift`.
  auto push_continuation = [&str, ch](int shift) {
    str.push_back(static_cast<char>(0x80 | ((ch >> shift) & 0x3f)));
  };
  // Appends a leading byte; converting a value above 0x7f to char is implementation-defined.
  auto push_lead = [&str](uint32 byte) {
    str.push_back(static_cast<char>(byte));
  };

  if (ch <= 0x7f) {
    push_lead(ch);
    return;
  }

  if (ch <= 0x7ff) {
    push_lead(0xc0 | (ch >> 6));
    push_continuation(0);
    return;
  }

  if (ch <= 0xffff) {
    push_lead(0xe0 | (ch >> 12));
    push_continuation(6);
    push_continuation(0);
    return;
  }

  push_lead(0xf0 | (ch >> 18));
  push_continuation(12);
  push_continuation(6);
  push_continuation(0);
}
81 
next_utf8_unsafe(const unsigned char * ptr,uint32 * code,const char * source)82 const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source) {
83   uint32 a = ptr[0];
84   if ((a & 0x80) == 0) {
85     if (code) {
86       *code = a;
87     }
88     return ptr + 1;
89   } else if ((a & 0x20) == 0) {
90     if (code) {
91       *code = ((a & 0x1f) << 6) | (ptr[1] & 0x3f);
92     }
93     return ptr + 2;
94   } else if ((a & 0x10) == 0) {
95     if (code) {
96       *code = ((a & 0x0f) << 12) | ((ptr[1] & 0x3f) << 6) | (ptr[2] & 0x3f);
97     }
98     return ptr + 3;
99   } else if ((a & 0x08) == 0) {
100     if (code) {
101       *code = ((a & 0x07) << 18) | ((ptr[1] & 0x3f) << 12) | ((ptr[2] & 0x3f) << 6) | (ptr[3] & 0x3f);
102     }
103     return ptr + 4;
104   }
105   LOG(FATAL) << a << " " << source;
106   if (code) {
107     *code = 0;
108   }
109   return ptr;
110 }
111 
utf8_to_lower(Slice str)112 string utf8_to_lower(Slice str) {
113   string result;
114   auto pos = str.ubegin();
115   auto end = str.uend();
116   while (pos != end) {
117     uint32 code;
118     pos = next_utf8_unsafe(pos, &code, "utf8_to_lower");
119     append_utf8_character(result, unicode_to_lower(code));
120   }
121   return result;
122 }
123 
124 }  // namespace td
125