1 //
2 // Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 //
7 #include "td/telegram/misc.h"
8 
9 #include "td/utils/algorithm.h"
10 #include "td/utils/common.h"
11 #include "td/utils/misc.h"
12 #include "td/utils/Slice.h"
13 #include "td/utils/utf8.h"
14 
15 #include <cstring>
16 #include <limits>
17 
18 namespace td {
19 
clean_name(string str,size_t max_length)20 string clean_name(string str, size_t max_length) {
21   str = strip_empty_characters(str, max_length);
22   size_t new_len = 0;
23   bool is_previous_space = false;
24   for (size_t i = 0; i < str.size(); i++) {
25     if (str[i] == ' ' || str[i] == '\n') {
26       if (!is_previous_space) {
27         str[new_len++] = ' ';
28         is_previous_space = true;
29       }
30       continue;
31     }
32     if (static_cast<unsigned char>(str[i]) == 0xC2 && static_cast<unsigned char>(str[i + 1]) == 0xA0) {  // &nbsp;
33       if (!is_previous_space) {
34         str[new_len++] = ' ';
35         is_previous_space = true;
36       }
37       i++;
38       continue;
39     }
40 
41     str[new_len++] = str[i];
42     is_previous_space = false;
43   }
44   str.resize(new_len);
45   return trim(str);
46 }
47 
clean_username(string str)48 string clean_username(string str) {
49   td::remove(str, '.');
50   to_lower_inplace(str);
51   return trim(str);
52 }
53 
replace_offending_characters(string & str)54 void replace_offending_characters(string &str) {
55   // "(\xe2\x80\x8f|\xe2\x80\x8e){N}(\xe2\x80\x8f|\xe2\x80\x8e)" -> "(\xe2\x80\x8c){N}$2"
56   auto s = MutableSlice(str).ubegin();
57   for (size_t pos = 0; pos < str.size(); pos++) {
58     if (s[pos] == 0xe2 && s[pos + 1] == 0x80 && (s[pos + 2] == 0x8e || s[pos + 2] == 0x8f)) {
59       while (s[pos + 3] == 0xe2 && s[pos + 4] == 0x80 && (s[pos + 5] == 0x8e || s[pos + 5] == 0x8f)) {
60         s[pos + 2] = static_cast<unsigned char>(0x8c);
61         pos += 3;
62       }
63       pos += 2;
64     }
65   }
66 }
67 
clean_input_string(string & str)68 bool clean_input_string(string &str) {
69   constexpr size_t LENGTH_LIMIT = 35000;  // server side limit
70   if (!check_utf8(str)) {
71     return false;
72   }
73 
74   size_t str_size = str.size();
75   size_t new_size = 0;
76   for (size_t pos = 0; pos < str_size; pos++) {
77     auto c = static_cast<unsigned char>(str[pos]);
78     switch (c) {
79       // remove control characters
80       case 0:
81       case 1:
82       case 2:
83       case 3:
84       case 4:
85       case 5:
86       case 6:
87       case 7:
88       case 8:
89       case 9:
90       // allow '\n'
91       case 11:
92       case 12:
93       // ignore '\r'
94       case 14:
95       case 15:
96       case 16:
97       case 17:
98       case 18:
99       case 19:
100       case 20:
101       case 21:
102       case 22:
103       case 23:
104       case 24:
105       case 25:
106       case 26:
107       case 27:
108       case 28:
109       case 29:
110       case 30:
111       case 31:
112       case 32:
113         str[new_size++] = ' ';
114         break;
115       case '\r':
116         // skip
117         break;
118       default:
119         // remove \xe2\x80[\xa8-\xae]
120         if (c == 0xe2 && pos + 2 < str_size) {
121           auto next = static_cast<unsigned char>(str[pos + 1]);
122           if (next == 0x80) {
123             next = static_cast<unsigned char>(str[pos + 2]);
124             if (0xa8 <= next && next <= 0xae) {
125               pos += 2;
126               break;
127             }
128           }
129         }
130         // remove vertical lines \xcc[\xb3\xbf\x8a]
131         if (c == 0xcc && pos + 1 < str_size) {
132           auto next = static_cast<unsigned char>(str[pos + 1]);
133           if (next == 0xb3 || next == 0xbf || next == 0x8a) {
134             pos++;
135             break;
136           }
137         }
138 
139         str[new_size++] = str[pos];
140         break;
141     }
142     if (new_size >= LENGTH_LIMIT - 3 && is_utf8_character_first_code_unit(str[new_size - 1])) {
143       new_size--;
144       break;
145     }
146   }
147 
148   str.resize(new_size);
149 
150   replace_offending_characters(str);
151 
152   return true;
153 }
154 
strip_empty_characters(string str,size_t max_length,bool strip_rtlo)155 string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
156   static const char *space_characters[] = {u8"\u1680", u8"\u180E", u8"\u2000", u8"\u2001", u8"\u2002",
157                                            u8"\u2003", u8"\u2004", u8"\u2005", u8"\u2006", u8"\u2007",
158                                            u8"\u2008", u8"\u2009", u8"\u200A", u8"\u202E", u8"\u202F",
159                                            u8"\u205F", u8"\u2800", u8"\u3000", u8"\uFFFC"};
160   static bool can_be_first[std::numeric_limits<unsigned char>::max() + 1];
161   static bool can_be_first_inited = [&] {
162     for (auto space_ch : space_characters) {
163       CHECK(std::strlen(space_ch) == 3);
164       can_be_first[static_cast<unsigned char>(space_ch[0])] = true;
165     }
166     return true;
167   }();
168   CHECK(can_be_first_inited);
169 
170   // replace all occurences of space characters with a space
171   size_t i = 0;
172   while (i < str.size() && !can_be_first[static_cast<unsigned char>(str[i])]) {
173     i++;
174   }
175   size_t new_len = i;
176   while (i < str.size()) {
177     if (can_be_first[static_cast<unsigned char>(str[i])] && i + 3 <= str.size()) {
178       bool found = false;
179       for (auto space_ch : space_characters) {
180         if (space_ch[0] == str[i] && space_ch[1] == str[i + 1] && space_ch[2] == str[i + 2]) {
181           if (static_cast<unsigned char>(str[i + 2]) != 0xAE || static_cast<unsigned char>(str[i + 1]) != 0x80 ||
182               static_cast<unsigned char>(str[i]) != 0xE2 || strip_rtlo) {
183             found = true;
184           }
185           break;
186         }
187       }
188       if (found) {
189         str[new_len++] = ' ';
190         i += 3;
191         continue;
192       }
193     }
194     str[new_len++] = str[i++];
195   }
196   Slice trimmed = trim(utf8_truncate(trim(Slice(str.c_str(), new_len)), max_length));
197 
198   // check if there is some non-empty character, empty characters:
199   // "\xE2\x80\x8B", ZERO WIDTH SPACE
200   // "\xE2\x80\x8C", ZERO WIDTH NON-JOINER
201   // "\xE2\x80\x8D", ZERO WIDTH JOINER
202   // "\xE2\x80\x8E", LEFT-TO-RIGHT MARK
203   // "\xE2\x80\x8F", RIGHT-TO-LEFT MARK
204   // "\xE2\x80\xAE", RIGHT-TO-LEFT OVERRIDE
205   // "\xEF\xBB\xBF", ZERO WIDTH NO-BREAK SPACE aka BYTE ORDER MARK
206   // "\xC2\xA0", NO-BREAK SPACE
207   for (i = 0;;) {
208     if (i == trimmed.size()) {
209       // if all characters are empty, return an empty string
210       return string();
211     }
212 
213     if (trimmed[i] == ' ' || trimmed[i] == '\n') {
214       i++;
215       continue;
216     }
217     if (static_cast<unsigned char>(trimmed[i]) == 0xE2 && static_cast<unsigned char>(trimmed[i + 1]) == 0x80) {
218       auto next = static_cast<unsigned char>(trimmed[i + 2]);
219       if ((0x8B <= next && next <= 0x8F) || next == 0xAE) {
220         i += 3;
221         continue;
222       }
223     }
224     if (static_cast<unsigned char>(trimmed[i]) == 0xEF && static_cast<unsigned char>(trimmed[i + 1]) == 0xBB &&
225         static_cast<unsigned char>(trimmed[i + 2]) == 0xBF) {
226       i += 3;
227       continue;
228     }
229     if (static_cast<unsigned char>(trimmed[i]) == 0xC2 && static_cast<unsigned char>(trimmed[i + 1]) == 0xA0) {
230       i += 2;
231       continue;
232     }
233     break;
234   }
235   return trimmed.str();
236 }
237 
is_empty_string(const string & str)238 bool is_empty_string(const string &str) {
239   return strip_empty_characters(str, str.size()).empty();
240 }
241 
// Folds a sequence of 64-bit numbers into a single 64-bit hash.
//
// Starting from 0, before each number is added the running value is scrambled
// with three xor-and-shift steps (>> 21, << 35, >> 4). All arithmetic is done
// on unsigned 64-bit values; the final value is converted to int64.
// An empty vector yields 0.
int64 get_vector_hash(const vector<uint64> &numbers) {
  uint64 hash = 0;
  for (size_t index = 0; index < numbers.size(); index++) {
    hash ^= hash >> 21;
    hash ^= hash << 35;
    hash ^= hash >> 4;
    hash += numbers[index];
  }
  return static_cast<int64>(hash);
}
252 
get_emoji_fingerprint(uint64 num)253 string get_emoji_fingerprint(uint64 num) {
254   static const vector<Slice> emojis{
255       u8"\U0001f609", u8"\U0001f60d", u8"\U0001f61b", u8"\U0001f62d", u8"\U0001f631", u8"\U0001f621", u8"\U0001f60e",
256       u8"\U0001f634", u8"\U0001f635", u8"\U0001f608", u8"\U0001f62c", u8"\U0001f607", u8"\U0001f60f", u8"\U0001f46e",
257       u8"\U0001f477", u8"\U0001f482", u8"\U0001f476", u8"\U0001f468", u8"\U0001f469", u8"\U0001f474", u8"\U0001f475",
258       u8"\U0001f63b", u8"\U0001f63d", u8"\U0001f640", u8"\U0001f47a", u8"\U0001f648", u8"\U0001f649", u8"\U0001f64a",
259       u8"\U0001f480", u8"\U0001f47d", u8"\U0001f4a9", u8"\U0001f525", u8"\U0001f4a5", u8"\U0001f4a4", u8"\U0001f442",
260       u8"\U0001f440", u8"\U0001f443", u8"\U0001f445", u8"\U0001f444", u8"\U0001f44d", u8"\U0001f44e", u8"\U0001f44c",
261       u8"\U0001f44a", u8"\u270c", u8"\u270b", u8"\U0001f450", u8"\U0001f446", u8"\U0001f447", u8"\U0001f449",
262       u8"\U0001f448", u8"\U0001f64f", u8"\U0001f44f", u8"\U0001f4aa", u8"\U0001f6b6", u8"\U0001f3c3", u8"\U0001f483",
263       u8"\U0001f46b", u8"\U0001f46a", u8"\U0001f46c", u8"\U0001f46d", u8"\U0001f485", u8"\U0001f3a9", u8"\U0001f451",
264       u8"\U0001f452", u8"\U0001f45f", u8"\U0001f45e", u8"\U0001f460", u8"\U0001f455", u8"\U0001f457", u8"\U0001f456",
265       u8"\U0001f459", u8"\U0001f45c", u8"\U0001f453", u8"\U0001f380", u8"\U0001f484", u8"\U0001f49b", u8"\U0001f499",
266       u8"\U0001f49c", u8"\U0001f49a", u8"\U0001f48d", u8"\U0001f48e", u8"\U0001f436", u8"\U0001f43a", u8"\U0001f431",
267       u8"\U0001f42d", u8"\U0001f439", u8"\U0001f430", u8"\U0001f438", u8"\U0001f42f", u8"\U0001f428", u8"\U0001f43b",
268       u8"\U0001f437", u8"\U0001f42e", u8"\U0001f417", u8"\U0001f434", u8"\U0001f411", u8"\U0001f418", u8"\U0001f43c",
269       u8"\U0001f427", u8"\U0001f425", u8"\U0001f414", u8"\U0001f40d", u8"\U0001f422", u8"\U0001f41b", u8"\U0001f41d",
270       u8"\U0001f41c", u8"\U0001f41e", u8"\U0001f40c", u8"\U0001f419", u8"\U0001f41a", u8"\U0001f41f", u8"\U0001f42c",
271       u8"\U0001f40b", u8"\U0001f410", u8"\U0001f40a", u8"\U0001f42b", u8"\U0001f340", u8"\U0001f339", u8"\U0001f33b",
272       u8"\U0001f341", u8"\U0001f33e", u8"\U0001f344", u8"\U0001f335", u8"\U0001f334", u8"\U0001f333", u8"\U0001f31e",
273       u8"\U0001f31a", u8"\U0001f319", u8"\U0001f30e", u8"\U0001f30b", u8"\u26a1", u8"\u2614", u8"\u2744", u8"\u26c4",
274       u8"\U0001f300", u8"\U0001f308", u8"\U0001f30a", u8"\U0001f393", u8"\U0001f386", u8"\U0001f383", u8"\U0001f47b",
275       u8"\U0001f385", u8"\U0001f384", u8"\U0001f381", u8"\U0001f388", u8"\U0001f52e", u8"\U0001f3a5", u8"\U0001f4f7",
276       u8"\U0001f4bf", u8"\U0001f4bb", u8"\u260e", u8"\U0001f4e1", u8"\U0001f4fa", u8"\U0001f4fb", u8"\U0001f509",
277       u8"\U0001f514", u8"\u23f3", u8"\u23f0", u8"\u231a", u8"\U0001f512", u8"\U0001f511", u8"\U0001f50e",
278       u8"\U0001f4a1", u8"\U0001f526", u8"\U0001f50c", u8"\U0001f50b", u8"\U0001f6bf", u8"\U0001f6bd", u8"\U0001f527",
279       u8"\U0001f528", u8"\U0001f6aa", u8"\U0001f6ac", u8"\U0001f4a3", u8"\U0001f52b", u8"\U0001f52a", u8"\U0001f48a",
280       u8"\U0001f489", u8"\U0001f4b0", u8"\U0001f4b5", u8"\U0001f4b3", u8"\u2709", u8"\U0001f4eb", u8"\U0001f4e6",
281       u8"\U0001f4c5", u8"\U0001f4c1", u8"\u2702", u8"\U0001f4cc", u8"\U0001f4ce", u8"\u2712", u8"\u270f",
282       u8"\U0001f4d0", u8"\U0001f4da", u8"\U0001f52c", u8"\U0001f52d", u8"\U0001f3a8", u8"\U0001f3ac", u8"\U0001f3a4",
283       u8"\U0001f3a7", u8"\U0001f3b5", u8"\U0001f3b9", u8"\U0001f3bb", u8"\U0001f3ba", u8"\U0001f3b8", u8"\U0001f47e",
284       u8"\U0001f3ae", u8"\U0001f0cf", u8"\U0001f3b2", u8"\U0001f3af", u8"\U0001f3c8", u8"\U0001f3c0", u8"\u26bd",
285       u8"\u26be", u8"\U0001f3be", u8"\U0001f3b1", u8"\U0001f3c9", u8"\U0001f3b3", u8"\U0001f3c1", u8"\U0001f3c7",
286       u8"\U0001f3c6", u8"\U0001f3ca", u8"\U0001f3c4", u8"\u2615", u8"\U0001f37c", u8"\U0001f37a", u8"\U0001f377",
287       u8"\U0001f374", u8"\U0001f355", u8"\U0001f354", u8"\U0001f35f", u8"\U0001f357", u8"\U0001f371", u8"\U0001f35a",
288       u8"\U0001f35c", u8"\U0001f361", u8"\U0001f373", u8"\U0001f35e", u8"\U0001f369", u8"\U0001f366", u8"\U0001f382",
289       u8"\U0001f370", u8"\U0001f36a", u8"\U0001f36b", u8"\U0001f36d", u8"\U0001f36f", u8"\U0001f34e", u8"\U0001f34f",
290       u8"\U0001f34a", u8"\U0001f34b", u8"\U0001f352", u8"\U0001f347", u8"\U0001f349", u8"\U0001f353", u8"\U0001f351",
291       u8"\U0001f34c", u8"\U0001f350", u8"\U0001f34d", u8"\U0001f346", u8"\U0001f345", u8"\U0001f33d", u8"\U0001f3e1",
292       u8"\U0001f3e5", u8"\U0001f3e6", u8"\u26ea", u8"\U0001f3f0", u8"\u26fa", u8"\U0001f3ed", u8"\U0001f5fb",
293       u8"\U0001f5fd", u8"\U0001f3a0", u8"\U0001f3a1", u8"\u26f2", u8"\U0001f3a2", u8"\U0001f6a2", u8"\U0001f6a4",
294       u8"\u2693", u8"\U0001f680", u8"\u2708", u8"\U0001f681", u8"\U0001f682", u8"\U0001f68b", u8"\U0001f68e",
295       u8"\U0001f68c", u8"\U0001f699", u8"\U0001f697", u8"\U0001f695", u8"\U0001f69b", u8"\U0001f6a8", u8"\U0001f694",
296       u8"\U0001f692", u8"\U0001f691", u8"\U0001f6b2", u8"\U0001f6a0", u8"\U0001f69c", u8"\U0001f6a6", u8"\u26a0",
297       u8"\U0001f6a7", u8"\u26fd", u8"\U0001f3b0", u8"\U0001f5ff", u8"\U0001f3aa", u8"\U0001f3ad",
298       u8"\U0001f1ef\U0001f1f5", u8"\U0001f1f0\U0001f1f7", u8"\U0001f1e9\U0001f1ea", u8"\U0001f1e8\U0001f1f3",
299       u8"\U0001f1fa\U0001f1f8", u8"\U0001f1eb\U0001f1f7", u8"\U0001f1ea\U0001f1f8", u8"\U0001f1ee\U0001f1f9",
300       u8"\U0001f1f7\U0001f1fa", u8"\U0001f1ec\U0001f1e7", u8"\u0031\u20e3", u8"\u0032\u20e3", u8"\u0033\u20e3",
301       u8"\u0034\u20e3", u8"\u0035\u20e3", u8"\u0036\u20e3", u8"\u0037\u20e3", u8"\u0038\u20e3", u8"\u0039\u20e3",
302       u8"\u0030\u20e3", u8"\U0001f51f", u8"\u2757", u8"\u2753", u8"\u2665", u8"\u2666", u8"\U0001f4af", u8"\U0001f517",
303       u8"\U0001f531", u8"\U0001f534", u8"\U0001f535", u8"\U0001f536",
304       // comment for clang-format
305       u8"\U0001f537"};
306 
307   return emojis[static_cast<size_t>((num & 0x7FFFFFFFFFFFFFFF) % emojis.size())].str();
308 }
309 
310 }  // namespace td
311