1 /*-
2  * Copyright 2021 Vsevolod Stakhov
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *   http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "html_url.hxx"
18 #include "libutil/str_util.h"
19 #include "libserver/url.h"
20 #include "libserver/logger.h"
21 #include "rspamd.h"
22 
23 #include <unicode/idna.h>
24 
25 namespace rspamd::html {
26 
/**
 * Checks whether one of two host names is a proper subdomain of the other.
 *
 * Trailing dots are ignored (at least one character of every name is always
 * kept). The shorter name must be equal to the tail of the longer one in
 * full, and the longer name must have a '.' right before that tail.
 * The comparison is case-sensitive and symmetric in its arguments.
 *
 * @param t1 first host name
 * @param t2 second host name
 * @return true if either name is a subdomain of the other; false for empty
 * names, for names of the same length and for unrelated names
 */
static auto
rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool
{
	if (t1.empty() || t2.empty()) {
		/* Nothing to compare, and no valid last character to look at */
		return false;
	}

	const auto strip_trailing_dots = [](std::string_view name) {
		while (name.size() > 1 && name.back() == '.') {
			name.remove_suffix(1);
		}

		return name;
	};

	t1 = strip_trailing_dots(t1);
	t2 = strip_trailing_dots(t2);

	if (t1.size() == t2.size()) {
		/* A proper subdomain is always strictly longer than its parent */
		return false;
	}

	const auto longer = t1.size() > t2.size() ? t1 : t2;
	const auto shorter = t1.size() > t2.size() ? t2 : t1;
	const auto tail_start = longer.size() - shorter.size();

	/*
	 * The whole shorter name (including its first character) must match,
	 * and the match must start at a label boundary of the longer name
	 */
	return longer[tail_start - 1] == '.' &&
		longer.substr(tail_start) == shorter;
}
73 
74 
75 static auto
get_icu_idna_instance(void)76 get_icu_idna_instance(void) -> auto
77 {
78 	auto uc_err = U_ZERO_ERROR;
79 	static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err);
80 
81 	return udn;
82 }
83 
84 static auto
convert_idna_hostname_maybe(rspamd_mempool_t * pool,struct rspamd_url * url,bool use_tld)85 convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld)
86 		-> std::string_view
87 {
88 	std::string_view ret = use_tld ?
89 			std::string_view{rspamd_url_tld_unsafe (url), url->tldlen} :
90 			std::string_view {rspamd_url_host_unsafe (url), url->hostlen};
91 
92 	/* Handle IDN url's */
93 	if (ret.size() > 4 &&
94 		rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) {
95 		const auto buf_capacity = ret.size() * 2 + 1;
96 		auto *idn_hbuf = (char *)rspamd_mempool_alloc (pool, buf_capacity);
97 		icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int)buf_capacity};
98 		/* We need to convert it to the normal value first */
99 		icu::IDNAInfo info;
100 		auto uc_err = U_ZERO_ERROR;
101 		auto *udn = get_icu_idna_instance();
102 		udn->nameToASCII_UTF8(icu::StringPiece(ret.data(), ret.size()),
103 				byte_sink, info, uc_err);
104 
105 		if (uc_err == U_ZERO_ERROR && !info.hasErrors()) {
106 			ret = std::string_view{idn_hbuf, (std::size_t)byte_sink.NumberOfBytesWritten()};
107 		}
108 		else {
109 			msg_err_pool ("cannot convert to IDN: %s (0x%xd)",
110 					u_errorName(uc_err), info.getErrors());
111 		}
112 	}
113 
114 	return ret;
115 };
116 
sv_equals(std::string_view s1,std::string_view s2)117 constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto {
118 	return (s1.size() == s2.size()) &&
119 		std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
120 				[](const auto c1, const auto c2) {
121 					return g_ascii_tolower(c1) == g_ascii_tolower(c2);
122 		});
123 }
124 
125 constexpr auto
is_transfer_proto(struct rspamd_url * u)126 is_transfer_proto(struct rspamd_url *u) -> bool
127 {
128 	return (u->protocol & (PROTOCOL_HTTP|PROTOCOL_HTTPS|PROTOCOL_FTP)) != 0;
129 }
130 
131 auto
html_url_is_phished(rspamd_mempool_t * pool,struct rspamd_url * href_url,std::string_view text_data)132 html_url_is_phished(rspamd_mempool_t *pool,
133 					struct rspamd_url *href_url,
134 					std::string_view text_data) -> std::optional<rspamd_url *>
135 {
136 	struct rspamd_url *text_url;
137 	std::string_view disp_tok, href_tok;
138 	goffset url_pos;
139 	gchar *url_str = NULL;
140 
141 	auto sz = text_data.size();
142 	const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz);
143 	text_data = std::string_view(trimmed, sz);
144 
145 	if (text_data.size() > 4 &&
146 		rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str,
147 				RSPAMD_URL_FIND_ALL,
148 				&url_pos, NULL) && url_str != nullptr) {
149 
150 		if (url_pos > 0) {
151 			/*
152 			 * We have some url at some offset, so we need to check what is
153 			 * at the start of the text
154 			 */
155 			return std::nullopt;
156 		}
157 
158 		text_url = rspamd_mempool_alloc0_type (pool, struct rspamd_url);
159 		auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
160 				RSPAMD_URL_PARSE_TEXT);
161 
162 		if (rc == URI_ERRNO_OK) {
163 			text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
164 			href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
165 
166 			/* Check for phishing */
167 			if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) {
168 				disp_tok = convert_idna_hostname_maybe(pool, text_url, false);
169 				href_tok = convert_idna_hostname_maybe(pool, href_url, false);
170 
171 				if (!sv_equals(disp_tok, href_tok) &&
172 					text_url->tldlen > 0 && href_url->tldlen > 0) {
173 
174 					/* Apply the same logic for TLD */
175 					disp_tok = convert_idna_hostname_maybe(pool, text_url, true);
176 					href_tok = convert_idna_hostname_maybe(pool, href_url, true);
177 
178 					if (!sv_equals(disp_tok, href_tok)) {
179 						/* Check if one url is a subdomain for another */
180 
181 						if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
182 							href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
183 							href_url->linked_url = text_url;
184 							text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
185 						}
186 					}
187 				}
188 			}
189 
190 			return text_url;
191 		}
192 		else {
193 			/*
194 			 * We have found something that looks like an url but it was
195 			 * not parsed correctly.
196 			 * Sometimes it means an obfuscation attempt, so we have to check
197 			 * what's inside of the text
198 			 */
199 			gboolean obfuscation_found = FALSE;
200 
201 			if (text_data.size() > 4
202 				&& g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 &&
203 				rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) {
204 				/* Clearly an obfuscation attempt */
205 				obfuscation_found = TRUE;
206 			}
207 
208 			msg_info_pool ("extract of url '%s' failed: %s; obfuscation detected: %s",
209 					url_str,
210 					rspamd_url_strerror(rc),
211 					obfuscation_found ? "yes" : "no");
212 
213 			if (obfuscation_found) {
214 				href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED;
215 			}
216 		}
217 	}
218 
219 	return std::nullopt;
220 }
221 
222 void
html_check_displayed_url(rspamd_mempool_t * pool,GList ** exceptions,void * url_set,std::string_view visible_part,goffset href_offset,struct rspamd_url * url)223 html_check_displayed_url(rspamd_mempool_t *pool,
224 						 GList **exceptions,
225 						 void *url_set,
226 						 std::string_view visible_part,
227 						 goffset href_offset,
228 						 struct rspamd_url *url)
229 {
230 	struct rspamd_url *displayed_url = nullptr;
231 	struct rspamd_url *turl;
232 	struct rspamd_process_exception *ex;
233 	guint saved_flags = 0;
234 	gsize dlen;
235 
236 	if (visible_part.empty()) {
237 		/* No dispalyed url, just some text within <a> tag */
238 		return;
239 	}
240 
241 	url->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1);
242 	rspamd_strlcpy(url->visible_part,
243 			visible_part.data(),
244 			visible_part.size() + 1);
245 	dlen = visible_part.size();
246 
247 	/* Strip unicode spaces from the start and the end */
248 	url->visible_part = const_cast<char *>(
249 			rspamd_string_unicode_trim_inplace(url->visible_part,
250 			&dlen));
251 	auto maybe_url = html_url_is_phished(pool, url,
252 			{url->visible_part, dlen});
253 
254 	if (maybe_url) {
255 		url->flags |= saved_flags;
256 		displayed_url = maybe_url.value();
257 	}
258 
259 	if (exceptions && displayed_url != nullptr) {
260 		ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
261 		ex->pos = href_offset;
262 		ex->len = dlen;
263 		ex->type = RSPAMD_EXCEPTION_URL;
264 		ex->ptr = url;
265 
266 		*exceptions = g_list_prepend(*exceptions, ex);
267 	}
268 
269 	if (displayed_url && url_set) {
270 		turl = rspamd_url_set_add_or_return((khash_t (rspamd_url_hash) *)url_set, displayed_url);
271 
272 		if (turl != nullptr) {
273 			/* Here, we assume the following:
274 			 * if we have a URL in the text part which
275 			 * is the same as displayed URL in the
276 			 * HTML part, we assume that it is also
277 			 * hint only.
278 			 */
279 			if (turl->flags &
280 				RSPAMD_URL_FLAG_FROM_TEXT) {
281 				turl->flags |= displayed_url->flags;
282 				turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
283 			}
284 
285 			turl->count++;
286 		}
287 		else {
288 			/* Already inserted by `rspamd_url_set_add_or_return` */
289 		}
290 	}
291 
292 	rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
293 }
294 
295 auto
html_process_url(rspamd_mempool_t * pool,std::string_view & input)296 html_process_url(rspamd_mempool_t *pool, std::string_view &input)
297 	-> std::optional<struct rspamd_url *>
298 {
299 	struct rspamd_url *url;
300 	guint saved_flags = 0;
301 	gint rc;
302 	const gchar *s, *prefix = "http://";
303 	gchar *d;
304 	gsize dlen;
305 	gboolean has_bad_chars = FALSE, no_prefix = FALSE;
306 	static const gchar hexdigests[] = "0123456789abcdef";
307 
308 	auto sz = input.length();
309 	const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz);
310 	input = {trimmed, sz};
311 
312 	const auto *start = input.data();
313 	s = start;
314 	dlen = 0;
315 
316 	for (auto i = 0; i < sz; i++) {
317 		if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
318 			dlen += 3;
319 		}
320 		else {
321 			dlen++;
322 		}
323 	}
324 
325 	if (rspamd_substring_search(start, sz, "://", 3) == -1) {
326 		if (sz >= sizeof("mailto:") &&
327 			(memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
328 			 memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
329 			 memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
330 			/* Exclusion, has valid but 'strange' prefix */
331 		}
332 		else {
333 			for (auto i = 0; i < sz; i++) {
334 				if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
335 					if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') {
336 						prefix = "http:";
337 						dlen += sizeof("http:") - 1;
338 						no_prefix = TRUE;
339 					}
340 					else if (s[i] == '@') {
341 						/* Likely email prefix */
342 						prefix = "mailto://";
343 						dlen += sizeof("mailto://") - 1;
344 						no_prefix = TRUE;
345 					}
346 					else if (s[i] == ':' && i != 0) {
347 						/* Special case */
348 						no_prefix = FALSE;
349 					}
350 					else {
351 						if (i == 0) {
352 							/* No valid data */
353 							return std::nullopt;
354 						}
355 						else {
356 							no_prefix = TRUE;
357 							dlen += strlen(prefix);
358 						}
359 					}
360 
361 					break;
362 				}
363 			}
364 		}
365 	}
366 
367 	auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1);
368 	d = decoded;
369 
370 	if (no_prefix) {
371 		gsize plen = strlen(prefix);
372 		memcpy(d, prefix, plen);
373 		d += plen;
374 	}
375 
376 	/*
377 	 * We also need to remove all internal newlines, spaces
378 	 * and encode unsafe characters
379 	 */
380 	for (auto i = 0; i < sz; i++) {
381 		if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
382 			continue;
383 		}
384 		else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
385 			/* URL encode */
386 			*d++ = '%';
387 			*d++ = hexdigests[(s[i] >> 4) & 0xf];
388 			*d++ = hexdigests[s[i] & 0xf];
389 			has_bad_chars = TRUE;
390 		}
391 		else {
392 			*d++ = s[i];
393 		}
394 	}
395 
396 	*d = '\0';
397 	dlen = d - decoded;
398 
399 	url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
400 	rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
401 	rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
402 
403 	/* Filter some completely damaged urls */
404 	if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
405 		!((url->protocol & PROTOCOL_UNKNOWN))) {
406 		url->flags |= saved_flags;
407 
408 		if (has_bad_chars) {
409 			url->flags |= RSPAMD_URL_FLAG_OBSCURED;
410 		}
411 
412 		if (no_prefix) {
413 			url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
414 
415 			if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
416 				/* Ignore urls with both no schema and no tld */
417 				return std::nullopt;
418 			}
419 		}
420 
421 		decoded = url->string;
422 
423 		input = {decoded, url->urllen};
424 
425 		/* Spaces in href usually mean an attempt to obfuscate URL */
426 		/* See https://github.com/vstakhov/rspamd/issues/593 */
427 #if 0
428 		if (has_spaces) {
429 			url->flags |= RSPAMD_URL_FLAG_OBSCURED;
430 		}
431 #endif
432 
433 		return url;
434 	}
435 
436 	return std::nullopt;
437 }
438 
439 }