1 /*-
2 * Copyright 2021 Vsevolod Stakhov
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "html_url.hxx"
18 #include "libutil/str_util.h"
19 #include "libserver/url.h"
20 #include "libserver/logger.h"
21 #include "rspamd.h"
22
23 #include <unicode/idna.h>
24
25 namespace rspamd::html {
26
/**
 * Checks whether one hostname is a subdomain of the other, in either direction.
 *
 * Trailing dots are ignored (a name is never shrunk below one character).
 * The comparison is byte-wise, i.e. case-sensitive.
 *
 * @param t1 first hostname
 * @param t2 second hostname
 * @return true if the shorter name is a complete suffix of the longer one and
 * starts at a label boundary (is preceded by a dot) there; false for names of
 * equal length, unrelated names or empty input
 */
static auto
rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
{
	/* Drop trailing dots, but always keep at least one character */
	const auto strip_trailing_dots = [](std::string_view name) -> std::string_view {
		while (name.size() > 1 && name.back() == '.') {
			name.remove_suffix(1);
		}

		return name;
	};

	if (t1.empty() || t2.empty()) {
		/* Nothing to compare, and no pointer arithmetic on an empty buffer */
		return false;
	}

	const auto h1 = strip_trailing_dots(t1);
	const auto h2 = strip_trailing_dots(t2);

	if (h1.size() == h2.size()) {
		/* Same length: either identical or unrelated, never a subdomain */
		return false;
	}

	const auto longer = h1.size() > h2.size() ? h1 : h2;
	const auto shorter = h1.size() > h2.size() ? h2 : h1;
	const auto offset = longer.size() - shorter.size();

	/*
	 * The shorter name must match the tail of the longer one entirely
	 * (its first character included) and that tail must be preceded by
	 * a dot; `offset` is at least 1 here as the sizes differ.
	 */
	return longer[offset - 1] == '.' &&
		   longer.compare(offset, shorter.size(), shorter) == 0;
}
73
74
75 static auto
get_icu_idna_instance(void)76 get_icu_idna_instance(void) -> auto
77 {
78 auto uc_err = U_ZERO_ERROR;
79 static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err);
80
81 return udn;
82 }
83
84 static auto
convert_idna_hostname_maybe(rspamd_mempool_t * pool,struct rspamd_url * url,bool use_tld)85 convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld)
86 -> std::string_view
87 {
88 std::string_view ret = use_tld ?
89 std::string_view{rspamd_url_tld_unsafe (url), url->tldlen} :
90 std::string_view {rspamd_url_host_unsafe (url), url->hostlen};
91
92 /* Handle IDN url's */
93 if (ret.size() > 4 &&
94 rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
95 const auto buf_capacity = ret.size() * 2 + 1;
96 auto *idn_hbuf = (char *)rspamd_mempool_alloc (pool, buf_capacity);
97 icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int)buf_capacity};
98 /* We need to convert it to the normal value first */
99 icu::IDNAInfo info;
100 auto uc_err = U_ZERO_ERROR;
101 auto *udn = get_icu_idna_instance();
102 udn->nameToASCII_UTF8(icu::StringPiece(ret.data(), ret.size()),
103 byte_sink, info, uc_err);
104
105 if (uc_err == U_ZERO_ERROR && !info.hasErrors()) {
106 ret = std::string_view{idn_hbuf, (std::size_t)byte_sink.NumberOfBytesWritten()};
107 }
108 else {
109 msg_err_pool ("cannot convert to IDN: %s (0x%xd)",
110 u_errorName(uc_err), info.getErrors());
111 }
112 }
113
114 return ret;
115 };
116
sv_equals(std::string_view s1,std::string_view s2)117 constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto {
118 return (s1.size() == s2.size()) &&
119 std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
120 [](const auto c1, const auto c2) {
121 return g_ascii_tolower(c1) == g_ascii_tolower(c2);
122 });
123 }
124
125 constexpr auto
is_transfer_proto(struct rspamd_url * u)126 is_transfer_proto(struct rspamd_url *u) -> bool
127 {
128 return (u->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_FTP)) != 0;
129 }
130
131 auto
html_url_is_phished(rspamd_mempool_t * pool,struct rspamd_url * href_url,std::string_view text_data)132 html_url_is_phished(rspamd_mempool_t *pool,
133 struct rspamd_url *href_url,
134 std::string_view text_data) -> std::optional<rspamd_url *>
135 {
136 struct rspamd_url *text_url;
137 std::string_view disp_tok, href_tok;
138 goffset url_pos;
139 gchar *url_str = NULL;
140
141 auto sz = text_data.size();
142 const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
143 text_data = std::string_view(trimmed, sz);
144
145 if (text_data.size() > 4 &&
146 rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
147 RSPAMD_URL_FIND_ALL,
148 &url_pos, NULL) && url_str != nullptr) {
149
150 if (url_pos > 0) {
151 /*
152 * We have some url at some offset, so we need to check what is
153 * at the start of the text
154 */
155 return std::nullopt;
156 }
157
158 text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url);
159 auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
160 RSPAMD_URL_PARSE_TEXT);
161
162 if (rc == URI_ERRNO_OK) {
163 text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
164 href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
165
166 /* Check for phishing */
167 if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) {
168 disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
169 href_tok = convert_idna_hostname_maybe(pool, href_url, false);
170
171 if (!sv_equals(disp_tok, href_tok) &&
172 text_url->tldlen > 0 && href_url->tldlen > 0) {
173
174 /* Apply the same logic for TLD */
175 disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
176 href_tok = convert_idna_hostname_maybe(pool, href_url, true);
177
178 if (!sv_equals(disp_tok, href_tok)) {
179 /* Check if one url is a subdomain for another */
180
181 if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
182 href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
183 href_url->linked_url = text_url;
184 text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
185 }
186 }
187 }
188 }
189
190 return text_url;
191 }
192 else {
193 /*
194 * We have found something that looks like an url but it was
195 * not parsed correctly.
196 * Sometimes it means an obfuscation attempt, so we have to check
197 * what's inside of the text
198 */
199 gboolean obfuscation_found = FALSE;
200
201 if (text_data.size() > 4
202 && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 &&
203 rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) {
204 /* Clearly an obfuscation attempt */
205 obfuscation_found = TRUE;
206 }
207
208 msg_info_pool ("extract of url '%s' failed: %s; obfuscation detected: %s",
209 url_str,
210 rspamd_url_strerror(rc),
211 obfuscation_found ? "yes" : "no");
212
213 if (obfuscation_found) {
214 href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
215 }
216 }
217 }
218
219 return std::nullopt;
220 }
221
222 void
html_check_displayed_url(rspamd_mempool_t * pool,GList ** exceptions,void * url_set,std::string_view visible_part,goffset href_offset,struct rspamd_url * url)223 html_check_displayed_url(rspamd_mempool_t *pool,
224 GList **exceptions,
225 void *url_set,
226 std::string_view visible_part,
227 goffset href_offset,
228 struct rspamd_url *url)
229 {
230 struct rspamd_url *displayed_url = nullptr;
231 struct rspamd_url *turl;
232 struct rspamd_process_exception *ex;
233 guint saved_flags = 0;
234 gsize dlen;
235
236 if (visible_part.empty()) {
237 /* No dispalyed url, just some text within <a> tag */
238 return;
239 }
240
241 url->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
242 rspamd_strlcpy(url->visible_part,
243 visible_part.data(),
244 visible_part.size() + 1);
245 dlen = visible_part.size();
246
247 /* Strip unicode spaces from the start and the end */
248 url->visible_part = const_cast<char *>(
249 rspamd_string_unicode_trim_inplace(url->visible_part,
250 &dlen));
251 auto maybe_url = html_url_is_phished(pool, url,
252 {url->visible_part, dlen});
253
254 if (maybe_url) {
255 url->flags |= saved_flags;
256 displayed_url = maybe_url.value();
257 }
258
259 if (exceptions && displayed_url != nullptr) {
260 ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
261 ex->pos = href_offset;
262 ex->len = dlen;
263 ex->type = RSPAMD_EXCEPTION_URL;
264 ex->ptr = url;
265
266 *exceptions = g_list_prepend(*exceptions, ex);
267 }
268
269 if (displayed_url && url_set) {
270 turl = rspamd_url_set_add_or_return((khash_t (rspamd_url_hash) *)url_set, displayed_url);
271
272 if (turl != nullptr) {
273 /* Here, we assume the following:
274 * if we have a URL in the text part which
275 * is the same as displayed URL in the
276 * HTML part, we assume that it is also
277 * hint only.
278 */
279 if (turl->flags &
280 RSPAMD_URL_FLAG_FROM_TEXT) {
281 turl->flags |= displayed_url->flags;
282 turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
283 }
284
285 turl->count++;
286 }
287 else {
288 /* Already inserted by `rspamd_url_set_add_or_return` */
289 }
290 }
291
292 rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
293 }
294
295 auto
html_process_url(rspamd_mempool_t * pool,std::string_view & input)296 html_process_url(rspamd_mempool_t *pool, std::string_view &input)
297 -> std::optional<struct rspamd_url *>
298 {
299 struct rspamd_url *url;
300 guint saved_flags = 0;
301 gint rc;
302 const gchar *s, *prefix = "http://";
303 gchar *d;
304 gsize dlen;
305 gboolean has_bad_chars = FALSE, no_prefix = FALSE;
306 static const gchar hexdigests[] = "0123456789abcdef";
307
308 auto sz = input.length();
309 const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
310 input = {trimmed, sz};
311
312 const auto *start = input.data();
313 s = start;
314 dlen = 0;
315
316 for (auto i = 0; i < sz; i++) {
317 if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
318 dlen += 3;
319 }
320 else {
321 dlen++;
322 }
323 }
324
325 if (rspamd_substring_search(start, sz, "://", 3) == -1) {
326 if (sz >= sizeof("mailto:") &&
327 (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
328 memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
329 memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
330 /* Exclusion, has valid but 'strange' prefix */
331 }
332 else {
333 for (auto i = 0; i < sz; i++) {
334 if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
335 if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
336 prefix = "http:";
337 dlen += sizeof("http:") - 1;
338 no_prefix = TRUE;
339 }
340 else if (s[i] == '@') {
341 /* Likely email prefix */
342 prefix = "mailto://";
343 dlen += sizeof("mailto://") - 1;
344 no_prefix = TRUE;
345 }
346 else if (s[i] == ':' && i != 0) {
347 /* Special case */
348 no_prefix = FALSE;
349 }
350 else {
351 if (i == 0) {
352 /* No valid data */
353 return std::nullopt;
354 }
355 else {
356 no_prefix = TRUE;
357 dlen += strlen(prefix);
358 }
359 }
360
361 break;
362 }
363 }
364 }
365 }
366
367 auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
368 d = decoded;
369
370 if (no_prefix) {
371 gsize plen = strlen(prefix);
372 memcpy(d, prefix, plen);
373 d += plen;
374 }
375
376 /*
377 * We also need to remove all internal newlines, spaces
378 * and encode unsafe characters
379 */
380 for (auto i = 0; i < sz; i++) {
381 if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
382 continue;
383 }
384 else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
385 /* URL encode */
386 *d++ = '%';
387 *d++ = hexdigests[(s[i] >> 4) & 0xf];
388 *d++ = hexdigests[s[i] & 0xf];
389 has_bad_chars = TRUE;
390 }
391 else {
392 *d++ = s[i];
393 }
394 }
395
396 *d = '\0';
397 dlen = d - decoded;
398
399 url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
400 rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
401 rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
402
403 /* Filter some completely damaged urls */
404 if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
405 !((url->protocol & PROTOCOL_UNKNOWN))) {
406 url->flags |= saved_flags;
407
408 if (has_bad_chars) {
409 url->flags |= RSPAMD_URL_FLAG_OBSCURED;
410 }
411
412 if (no_prefix) {
413 url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
414
415 if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
416 /* Ignore urls with both no schema and no tld */
417 return std::nullopt;
418 }
419 }
420
421 decoded = url->string;
422
423 input = {decoded, url->urllen};
424
425 /* Spaces in href usually mean an attempt to obfuscate URL */
426 /* See https://github.com/vstakhov/rspamd/issues/593 */
427 #if 0
428 if (has_spaces) {
429 url->flags |= RSPAMD_URL_FLAG_OBSCURED;
430 }
431 #endif
432
433 return url;
434 }
435
436 return std::nullopt;
437 }
438
439 }