1 //
2 // Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 //
7 #include "td/telegram/misc.h"
8
9 #include "td/utils/algorithm.h"
10 #include "td/utils/common.h"
11 #include "td/utils/misc.h"
12 #include "td/utils/Slice.h"
13 #include "td/utils/utf8.h"
14
15 #include <cstring>
16 #include <limits>
17
18 namespace td {
19
clean_name(string str,size_t max_length)20 string clean_name(string str, size_t max_length) {
21 str = strip_empty_characters(str, max_length);
22 size_t new_len = 0;
23 bool is_previous_space = false;
24 for (size_t i = 0; i < str.size(); i++) {
25 if (str[i] == ' ' || str[i] == '\n') {
26 if (!is_previous_space) {
27 str[new_len++] = ' ';
28 is_previous_space = true;
29 }
30 continue;
31 }
32 if (static_cast<unsigned char>(str[i]) == 0xC2 && static_cast<unsigned char>(str[i + 1]) == 0xA0) { //
33 if (!is_previous_space) {
34 str[new_len++] = ' ';
35 is_previous_space = true;
36 }
37 i++;
38 continue;
39 }
40
41 str[new_len++] = str[i];
42 is_previous_space = false;
43 }
44 str.resize(new_len);
45 return trim(str);
46 }
47
// Returns a normalized copy of a username.
//
// The steps are applied in this order:
//   1. td::remove(str, '.')    - presumably erases every '.' from the string (helper is defined elsewhere);
//   2. to_lower_inplace(str)   - presumably lowercases the string in place (helper is defined elsewhere);
//   3. trim(str)               - presumably strips leading and trailing whitespace; its result is returned.
string clean_username(string str) {
  td::remove(str, '.');
  to_lower_inplace(str);
  return trim(str);
}
53
replace_offending_characters(string & str)54 void replace_offending_characters(string &str) {
55 // "(\xe2\x80\x8f|\xe2\x80\x8e){N}(\xe2\x80\x8f|\xe2\x80\x8e)" -> "(\xe2\x80\x8c){N}$2"
56 auto s = MutableSlice(str).ubegin();
57 for (size_t pos = 0; pos < str.size(); pos++) {
58 if (s[pos] == 0xe2 && s[pos + 1] == 0x80 && (s[pos + 2] == 0x8e || s[pos + 2] == 0x8f)) {
59 while (s[pos + 3] == 0xe2 && s[pos + 4] == 0x80 && (s[pos + 5] == 0x8e || s[pos + 5] == 0x8f)) {
60 s[pos + 2] = static_cast<unsigned char>(0x8c);
61 pos += 3;
62 }
63 pos += 2;
64 }
65 }
66 }
67
clean_input_string(string & str)68 bool clean_input_string(string &str) {
69 constexpr size_t LENGTH_LIMIT = 35000; // server side limit
70 if (!check_utf8(str)) {
71 return false;
72 }
73
74 size_t str_size = str.size();
75 size_t new_size = 0;
76 for (size_t pos = 0; pos < str_size; pos++) {
77 auto c = static_cast<unsigned char>(str[pos]);
78 switch (c) {
79 // remove control characters
80 case 0:
81 case 1:
82 case 2:
83 case 3:
84 case 4:
85 case 5:
86 case 6:
87 case 7:
88 case 8:
89 case 9:
90 // allow '\n'
91 case 11:
92 case 12:
93 // ignore '\r'
94 case 14:
95 case 15:
96 case 16:
97 case 17:
98 case 18:
99 case 19:
100 case 20:
101 case 21:
102 case 22:
103 case 23:
104 case 24:
105 case 25:
106 case 26:
107 case 27:
108 case 28:
109 case 29:
110 case 30:
111 case 31:
112 case 32:
113 str[new_size++] = ' ';
114 break;
115 case '\r':
116 // skip
117 break;
118 default:
119 // remove \xe2\x80[\xa8-\xae]
120 if (c == 0xe2 && pos + 2 < str_size) {
121 auto next = static_cast<unsigned char>(str[pos + 1]);
122 if (next == 0x80) {
123 next = static_cast<unsigned char>(str[pos + 2]);
124 if (0xa8 <= next && next <= 0xae) {
125 pos += 2;
126 break;
127 }
128 }
129 }
130 // remove vertical lines \xcc[\xb3\xbf\x8a]
131 if (c == 0xcc && pos + 1 < str_size) {
132 auto next = static_cast<unsigned char>(str[pos + 1]);
133 if (next == 0xb3 || next == 0xbf || next == 0x8a) {
134 pos++;
135 break;
136 }
137 }
138
139 str[new_size++] = str[pos];
140 break;
141 }
142 if (new_size >= LENGTH_LIMIT - 3 && is_utf8_character_first_code_unit(str[new_size - 1])) {
143 new_size--;
144 break;
145 }
146 }
147
148 str.resize(new_size);
149
150 replace_offending_characters(str);
151
152 return true;
153 }
154
// Replaces space-like Unicode characters with ASCII spaces, trims and truncates the string, and returns an empty
// string if nothing but "empty" characters is left.
//
// str        - the text to process; taken by value and compacted in place.
// max_length - limit passed to utf8_truncate() (presumably a number of UTF-8 characters - confirm against the
//              helper's declaration).
// strip_rtlo - if true, RIGHT-TO-LEFT OVERRIDE (U+202E) is also replaced with a space; otherwise it is kept.
//
// Returns the processed string, or an empty string if it consists only of the characters listed before the final
// loop below.
string strip_empty_characters(string str, size_t max_length, bool strip_rtlo) {
  // every entry is expected to be exactly 3 bytes long in UTF-8; this is verified by the CHECK below
  static const char *space_characters[] = {u8"\u1680", u8"\u180E", u8"\u2000", u8"\u2001", u8"\u2002",
                                           u8"\u2003", u8"\u2004", u8"\u2005", u8"\u2006", u8"\u2007",
                                           u8"\u2008", u8"\u2009", u8"\u200A", u8"\u202E", u8"\u202F",
                                           u8"\u205F", u8"\u2800", u8"\u3000", u8"\uFFFC"};
  // can_be_first[b] is true if some entry of space_characters begins with byte b; used as a cheap pre-filter
  static bool can_be_first[std::numeric_limits<unsigned char>::max() + 1];
  // the table is filled once, by the initializer of this function-local static
  static bool can_be_first_inited = [&] {
    for (auto space_ch : space_characters) {
      CHECK(std::strlen(space_ch) == 3);
      can_be_first[static_cast<unsigned char>(space_ch[0])] = true;
    }
    return true;
  }();
  CHECK(can_be_first_inited);

  // replace all occurrences of space characters with a space
  // first skip the prefix which can't contain a space character and therefore doesn't need to be rewritten
  size_t i = 0;
  while (i < str.size() && !can_be_first[static_cast<unsigned char>(str[i])]) {
    i++;
  }
  // i is the read position and new_len is the write position; new_len never exceeds i
  size_t new_len = i;
  while (i < str.size()) {
    if (can_be_first[static_cast<unsigned char>(str[i])] && i + 3 <= str.size()) {
      bool found = false;
      for (auto space_ch : space_characters) {
        if (space_ch[0] == str[i] && space_ch[1] == str[i + 1] && space_ch[2] == str[i + 2]) {
          // "\xE2\x80\xAE" (RIGHT-TO-LEFT OVERRIDE) is replaced only if strip_rtlo is set
          if (static_cast<unsigned char>(str[i + 2]) != 0xAE || static_cast<unsigned char>(str[i + 1]) != 0x80 ||
              static_cast<unsigned char>(str[i]) != 0xE2 || strip_rtlo) {
            found = true;
          }
          break;
        }
      }
      if (found) {
        // the 3-byte character shrinks to a single ' '
        str[new_len++] = ' ';
        i += 3;
        continue;
      }
    }
    str[new_len++] = str[i++];
  }
  // only the first new_len bytes of str are meaningful now; trimmed is a view into str's buffer
  Slice trimmed = trim(utf8_truncate(trim(Slice(str.c_str(), new_len)), max_length));

  // check if there is some non-empty character, empty characters:
  // "\xE2\x80\x8B", ZERO WIDTH SPACE
  // "\xE2\x80\x8C", ZERO WIDTH NON-JOINER
  // "\xE2\x80\x8D", ZERO WIDTH JOINER
  // "\xE2\x80\x8E", LEFT-TO-RIGHT MARK
  // "\xE2\x80\x8F", RIGHT-TO-LEFT MARK
  // "\xE2\x80\xAE", RIGHT-TO-LEFT OVERRIDE
  // "\xEF\xBB\xBF", ZERO WIDTH NO-BREAK SPACE aka BYTE ORDER MARK
  // "\xC2\xA0", NO-BREAK SPACE
  // ' ' and '\n' are skipped as well
  // NOTE(review): trimmed[i + 1] and trimmed[i + 2] are read without comparing against trimmed.size();
  // presumably this relies on the input being valid UTF-8 and on the underlying buffer being
  // NUL-terminated - confirm
  for (i = 0;;) {
    if (i == trimmed.size()) {
      // if all characters are empty, return an empty string
      return string();
    }

    if (trimmed[i] == ' ' || trimmed[i] == '\n') {
      i++;
      continue;
    }
    if (static_cast<unsigned char>(trimmed[i]) == 0xE2 && static_cast<unsigned char>(trimmed[i + 1]) == 0x80) {
      auto next = static_cast<unsigned char>(trimmed[i + 2]);
      if ((0x8B <= next && next <= 0x8F) || next == 0xAE) {
        i += 3;
        continue;
      }
    }
    if (static_cast<unsigned char>(trimmed[i]) == 0xEF && static_cast<unsigned char>(trimmed[i + 1]) == 0xBB &&
        static_cast<unsigned char>(trimmed[i + 2]) == 0xBF) {
      i += 3;
      continue;
    }
    if (static_cast<unsigned char>(trimmed[i]) == 0xC2 && static_cast<unsigned char>(trimmed[i + 1]) == 0xA0) {
      i += 2;
      continue;
    }
    // found a character which isn't considered empty
    break;
  }
  return trimmed.str();
}
237
is_empty_string(const string & str)238 bool is_empty_string(const string &str) {
239 return strip_empty_characters(str, str.size()).empty();
240 }
241
// Folds a list of 64-bit numbers into a single hash value.
//
// Starting from 0, for every number in order the accumulator is scrambled with three xor-shift steps
// (right by 21, left by 35, right by 4) and then the number is added with unsigned wrap-around.
// The final accumulator is returned reinterpreted as a signed 64-bit integer; an empty list hashes to 0.
int64 get_vector_hash(const vector<uint64> &numbers) {
  const auto scramble = [](uint64 state) {
    state ^= state >> 21;
    state ^= state << 35;
    state ^= state >> 4;
    return state;
  };

  uint64 hash = 0;
  for (const uint64 value : numbers) {
    hash = scramble(hash) + value;
  }
  return static_cast<int64>(hash);
}
252
// Maps a 64-bit number to one entry of a fixed table of emoji strings.
//
// The table is built once as a function-local static. The most significant bit of num is cleared and the
// remainder modulo the table size is used as the index, so the same num always yields the same emoji.
// Some entries consist of more than one code point (the flag pairs and the keycap sequences).
// The order of the table determines the result for a given num, so entries must not be reordered.
string get_emoji_fingerprint(uint64 num) {
  static const vector<Slice> emojis{
      u8"\U0001f609", u8"\U0001f60d", u8"\U0001f61b", u8"\U0001f62d", u8"\U0001f631", u8"\U0001f621", u8"\U0001f60e",
      u8"\U0001f634", u8"\U0001f635", u8"\U0001f608", u8"\U0001f62c", u8"\U0001f607", u8"\U0001f60f", u8"\U0001f46e",
      u8"\U0001f477", u8"\U0001f482", u8"\U0001f476", u8"\U0001f468", u8"\U0001f469", u8"\U0001f474", u8"\U0001f475",
      u8"\U0001f63b", u8"\U0001f63d", u8"\U0001f640", u8"\U0001f47a", u8"\U0001f648", u8"\U0001f649", u8"\U0001f64a",
      u8"\U0001f480", u8"\U0001f47d", u8"\U0001f4a9", u8"\U0001f525", u8"\U0001f4a5", u8"\U0001f4a4", u8"\U0001f442",
      u8"\U0001f440", u8"\U0001f443", u8"\U0001f445", u8"\U0001f444", u8"\U0001f44d", u8"\U0001f44e", u8"\U0001f44c",
      u8"\U0001f44a", u8"\u270c", u8"\u270b", u8"\U0001f450", u8"\U0001f446", u8"\U0001f447", u8"\U0001f449",
      u8"\U0001f448", u8"\U0001f64f", u8"\U0001f44f", u8"\U0001f4aa", u8"\U0001f6b6", u8"\U0001f3c3", u8"\U0001f483",
      u8"\U0001f46b", u8"\U0001f46a", u8"\U0001f46c", u8"\U0001f46d", u8"\U0001f485", u8"\U0001f3a9", u8"\U0001f451",
      u8"\U0001f452", u8"\U0001f45f", u8"\U0001f45e", u8"\U0001f460", u8"\U0001f455", u8"\U0001f457", u8"\U0001f456",
      u8"\U0001f459", u8"\U0001f45c", u8"\U0001f453", u8"\U0001f380", u8"\U0001f484", u8"\U0001f49b", u8"\U0001f499",
      u8"\U0001f49c", u8"\U0001f49a", u8"\U0001f48d", u8"\U0001f48e", u8"\U0001f436", u8"\U0001f43a", u8"\U0001f431",
      u8"\U0001f42d", u8"\U0001f439", u8"\U0001f430", u8"\U0001f438", u8"\U0001f42f", u8"\U0001f428", u8"\U0001f43b",
      u8"\U0001f437", u8"\U0001f42e", u8"\U0001f417", u8"\U0001f434", u8"\U0001f411", u8"\U0001f418", u8"\U0001f43c",
      u8"\U0001f427", u8"\U0001f425", u8"\U0001f414", u8"\U0001f40d", u8"\U0001f422", u8"\U0001f41b", u8"\U0001f41d",
      u8"\U0001f41c", u8"\U0001f41e", u8"\U0001f40c", u8"\U0001f419", u8"\U0001f41a", u8"\U0001f41f", u8"\U0001f42c",
      u8"\U0001f40b", u8"\U0001f410", u8"\U0001f40a", u8"\U0001f42b", u8"\U0001f340", u8"\U0001f339", u8"\U0001f33b",
      u8"\U0001f341", u8"\U0001f33e", u8"\U0001f344", u8"\U0001f335", u8"\U0001f334", u8"\U0001f333", u8"\U0001f31e",
      u8"\U0001f31a", u8"\U0001f319", u8"\U0001f30e", u8"\U0001f30b", u8"\u26a1", u8"\u2614", u8"\u2744", u8"\u26c4",
      u8"\U0001f300", u8"\U0001f308", u8"\U0001f30a", u8"\U0001f393", u8"\U0001f386", u8"\U0001f383", u8"\U0001f47b",
      u8"\U0001f385", u8"\U0001f384", u8"\U0001f381", u8"\U0001f388", u8"\U0001f52e", u8"\U0001f3a5", u8"\U0001f4f7",
      u8"\U0001f4bf", u8"\U0001f4bb", u8"\u260e", u8"\U0001f4e1", u8"\U0001f4fa", u8"\U0001f4fb", u8"\U0001f509",
      u8"\U0001f514", u8"\u23f3", u8"\u23f0", u8"\u231a", u8"\U0001f512", u8"\U0001f511", u8"\U0001f50e",
      u8"\U0001f4a1", u8"\U0001f526", u8"\U0001f50c", u8"\U0001f50b", u8"\U0001f6bf", u8"\U0001f6bd", u8"\U0001f527",
      u8"\U0001f528", u8"\U0001f6aa", u8"\U0001f6ac", u8"\U0001f4a3", u8"\U0001f52b", u8"\U0001f52a", u8"\U0001f48a",
      u8"\U0001f489", u8"\U0001f4b0", u8"\U0001f4b5", u8"\U0001f4b3", u8"\u2709", u8"\U0001f4eb", u8"\U0001f4e6",
      u8"\U0001f4c5", u8"\U0001f4c1", u8"\u2702", u8"\U0001f4cc", u8"\U0001f4ce", u8"\u2712", u8"\u270f",
      u8"\U0001f4d0", u8"\U0001f4da", u8"\U0001f52c", u8"\U0001f52d", u8"\U0001f3a8", u8"\U0001f3ac", u8"\U0001f3a4",
      u8"\U0001f3a7", u8"\U0001f3b5", u8"\U0001f3b9", u8"\U0001f3bb", u8"\U0001f3ba", u8"\U0001f3b8", u8"\U0001f47e",
      u8"\U0001f3ae", u8"\U0001f0cf", u8"\U0001f3b2", u8"\U0001f3af", u8"\U0001f3c8", u8"\U0001f3c0", u8"\u26bd",
      u8"\u26be", u8"\U0001f3be", u8"\U0001f3b1", u8"\U0001f3c9", u8"\U0001f3b3", u8"\U0001f3c1", u8"\U0001f3c7",
      u8"\U0001f3c6", u8"\U0001f3ca", u8"\U0001f3c4", u8"\u2615", u8"\U0001f37c", u8"\U0001f37a", u8"\U0001f377",
      u8"\U0001f374", u8"\U0001f355", u8"\U0001f354", u8"\U0001f35f", u8"\U0001f357", u8"\U0001f371", u8"\U0001f35a",
      u8"\U0001f35c", u8"\U0001f361", u8"\U0001f373", u8"\U0001f35e", u8"\U0001f369", u8"\U0001f366", u8"\U0001f382",
      u8"\U0001f370", u8"\U0001f36a", u8"\U0001f36b", u8"\U0001f36d", u8"\U0001f36f", u8"\U0001f34e", u8"\U0001f34f",
      u8"\U0001f34a", u8"\U0001f34b", u8"\U0001f352", u8"\U0001f347", u8"\U0001f349", u8"\U0001f353", u8"\U0001f351",
      u8"\U0001f34c", u8"\U0001f350", u8"\U0001f34d", u8"\U0001f346", u8"\U0001f345", u8"\U0001f33d", u8"\U0001f3e1",
      u8"\U0001f3e5", u8"\U0001f3e6", u8"\u26ea", u8"\U0001f3f0", u8"\u26fa", u8"\U0001f3ed", u8"\U0001f5fb",
      u8"\U0001f5fd", u8"\U0001f3a0", u8"\U0001f3a1", u8"\u26f2", u8"\U0001f3a2", u8"\U0001f6a2", u8"\U0001f6a4",
      u8"\u2693", u8"\U0001f680", u8"\u2708", u8"\U0001f681", u8"\U0001f682", u8"\U0001f68b", u8"\U0001f68e",
      u8"\U0001f68c", u8"\U0001f699", u8"\U0001f697", u8"\U0001f695", u8"\U0001f69b", u8"\U0001f6a8", u8"\U0001f694",
      u8"\U0001f692", u8"\U0001f691", u8"\U0001f6b2", u8"\U0001f6a0", u8"\U0001f69c", u8"\U0001f6a6", u8"\u26a0",
      u8"\U0001f6a7", u8"\u26fd", u8"\U0001f3b0", u8"\U0001f5ff", u8"\U0001f3aa", u8"\U0001f3ad",
      u8"\U0001f1ef\U0001f1f5", u8"\U0001f1f0\U0001f1f7", u8"\U0001f1e9\U0001f1ea", u8"\U0001f1e8\U0001f1f3",
      u8"\U0001f1fa\U0001f1f8", u8"\U0001f1eb\U0001f1f7", u8"\U0001f1ea\U0001f1f8", u8"\U0001f1ee\U0001f1f9",
      u8"\U0001f1f7\U0001f1fa", u8"\U0001f1ec\U0001f1e7", u8"\u0031\u20e3", u8"\u0032\u20e3", u8"\u0033\u20e3",
      u8"\u0034\u20e3", u8"\u0035\u20e3", u8"\u0036\u20e3", u8"\u0037\u20e3", u8"\u0038\u20e3", u8"\u0039\u20e3",
      u8"\u0030\u20e3", u8"\U0001f51f", u8"\u2757", u8"\u2753", u8"\u2665", u8"\u2666", u8"\U0001f4af", u8"\U0001f517",
      u8"\U0001f531", u8"\U0001f534", u8"\U0001f535", u8"\U0001f536",
      // comment for clang-format
      u8"\U0001f537"};

  // clear the most significant bit of num before reducing it modulo the table size
  return emojis[static_cast<size_t>((num & 0x7FFFFFFFFFFFFFFF) % emojis.size())].str();
}
309
310 } // namespace td
311