1 /*
2  * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
3  * Version 2, December 2004
4  *
5  * Copyright (C) 2012-2013 Sebastien Tricaud <sebastien@honeynet.org>
6  *
7  * Everyone is permitted to copy and distribute verbatim or modified
8  * copies of this license document, and changing it is allowed as long
9  * as the name is changed.
10  *
11  * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
12  * TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
13  *
14  *  0. You just DO WHAT THE FUCK YOU WANT TO.
15  */
16 
17 #define _GNU_SOURCE
18 #include <faup/faup.h>
19 #include <faup/decode.h>
20 #include <faup/features.h>
21 #include <faup/tld-tree.h>
22 
23 #ifdef FAUP_LUA_MODULES
24 #include <faup/modules.h>
25 #endif
26 
27 #ifdef WIN32
28 #include <faup/compat.h>
29 #endif
30 
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include <ctype.h>
35 
/*
 * Integer exponentiation: returns base multiplied by itself n times.
 *
 * powui(x, 0) is 1 for every x (including 0). The arithmetic is done in
 * uint32_t, so a result that does not fit wraps around modulo 2^32.
 */
uint32_t powui(uint32_t base, uint32_t n)
{
	uint32_t product = 1;

	/* One multiplication per remaining exponent step. */
	for (; n != 0; --n) {
		product *= base;
	}

	return product;
}
46 
/*
 * Tell whether the n bytes at str spell a dotted-quad IPv4 address.
 *
 * Accepted form: optional leading whitespace, exactly four groups of one to
 * three decimal digits separated by single dots, each group <= 255, then
 * optional trailing whitespace. Leading zeros inside a group are allowed
 * ("001.002.003.004"). The buffer does not need to be NUL-terminated; only
 * the first n bytes are read.
 *
 * Anything longer than 15 bytes (whitespace included) is rejected up front,
 * 15 being the length of "255.255.255.255".
 *
 * Returns true on a match, false otherwise.
 */
bool is_ipv4(const char* str, const size_t n)
{
	// TODO: vectorize this
	uint32_t ndots = 0;
	uint32_t octet = 0;   /* value of the group currently being read */
	uint32_t ndigits = 0; /* digits seen so far in the current group */
	size_t i = 0;

	if (n > 15) {
		return false;
	}

	/* <ctype.h> functions need an unsigned char value: a negative plain char is undefined behaviour. */
	while ((i < n) && isspace((unsigned char) str[i])) {
		i++;
	}

	for (; i < n; i++) {
		const char c = str[i];

		if (c == '.') {
			/* A dot must close a non-empty group: rejects ".1.2.3" and "1..2.3". */
			if ((ndigits == 0) || (octet > 255)) {
				return false;
			}
			ndots++;
			octet = 0;
			ndigits = 0;
		}
		else if ((c >= '0') && (c <= '9')) {
			if (ndigits == 3) {
				return false;
			}
			octet = octet * 10 + (uint32_t) (c - '0');
			ndigits++;
		}
		else {
			/* First non-address byte: what follows may only be whitespace. */
			break;
		}
	}

	/* The last group must be non-empty as well: rejects "1.2.3." and empty input. */
	if ((ndigits == 0) || (octet > 255)) {
		return false;
	}

	// Check trailing characters
	for (; i < n; i++) {
		if (!isspace((unsigned char) str[i])) {
			return false;
		}
	}

	return ndots == 3;
}
105 
/*
 * Tell whether the n bytes at str look like a bracketed IPv6 host.
 *
 * Only the enclosing square brackets are checked: the first byte must be '['
 * and the last byte ']'. What lies between them is not inspected. Inputs
 * shorter than three bytes are rejected without reading str.
 *
 * Returns true on a match, false otherwise.
 */
bool is_ipv6(const char* str, const size_t n)
{
	/* Short-circuit evaluation keeps str untouched when n < 3. */
	return (n >= 3) && (str[0] == '[') && (str[n-1] == ']');
}
118 
/*
 * Decode a URL: locate its features and compute the size of each of them.
 *
 * faup_features_find() records where each feature starts; this function then
 * derives the sizes of scheme, credential, host, port, resource path, query
 * string and fragment from the position of the next feature present (or from
 * the end of the URL). From the host it also derives tld, domain, subdomain
 * and domain_without_tld, and it sets fh->faup.url_type.
 *
 * fh       Handler. fh->options is read; fh->faup (decoded, url_type,
 *          org_str, org_str_len, features) is written.
 * url      URL bytes to decode.
 * url_len  Number of bytes in url. It is reduced by
 *          options->number_of_chars_to_remove when that count is strictly
 *          smaller than url_len; otherwise a warning is printed (for a
 *          non-empty URL) and nothing is removed.
 *
 * Returns NULL when fh or url is NULL, or when faup_features_errors_lookup()
 * reports an error. Otherwise returns the string that was actually decoded:
 * url itself, or - when built with FAUP_LUA_MODULES and a module rewrote the
 * URL - the rewritten string.
 *
 * NOTE(review): only the wrapper returned by faup_modules_decode_url_start()
 * is freed here, not its url member, which is handed back to the caller and
 * kept in fh->faup.org_str. Presumably the caller owns it - confirm.
 */
const char *faup_decode(faup_handler_t *fh, const char *url, size_t url_len)
{
	uint32_t total_size = 0;
	int next_valid_token_pos = 0;
	const char *retval_url = url;

#ifdef FAUP_LUA_MODULES
	faup_modules_transformed_url_t *url_transformed_by_modules = NULL;
#endif

	faup_features_t *url_features = NULL;

	if (!fh || !url) {
		return NULL;
	}

	// Optionally ignore the last number_of_chars_to_remove bytes of the url
	if (fh->options->number_of_chars_to_remove >= url_len) {
		if (url_len > 0) {
			fprintf(stderr, "Warning: Cannot remove more characters than the url string! Not removing any character on this url: %s\n", url);
		}
	} else {
		url_len -= fh->options->number_of_chars_to_remove;
	}

	fh->faup.decoded = true;
	fh->faup.url_type = FAUP_URL_HAS_NO_TLD;

#ifdef FAUP_LUA_MODULES
	if (fh->options->exec_modules != FAUP_MODULES_NOEXEC) {
		url_transformed_by_modules = faup_modules_decode_url_start(fh, url, url_len);
		if (url_transformed_by_modules) {
			fh->faup.org_str = url_transformed_by_modules->url; // FIXME: Change to 'url' when the output has changed to reflect new way of doing with lua stuff
			fh->faup.org_str_len = url_len;
			faup_features_find(fh, url_transformed_by_modules->url, url_transformed_by_modules->url_len);
			// From here on, every offset refers to the transformed url
			url_len = url_transformed_by_modules->url_len;
			retval_url = url_transformed_by_modules->url;
		}
	}
	if (!url_transformed_by_modules) {
		fh->faup.org_str = url;
		fh->faup.org_str_len = url_len;
		faup_features_find(fh, url, url_len);
	}
#else
	// Nothing has been transformed, so we simply use our original url
	fh->faup.org_str = url;
	fh->faup.org_str_len = url_len;
	faup_features_find(fh, url, url_len);
#endif // FAUP_LUA_MODULES

	url_features = &fh->faup.features;

	//FIXME: faup_features_errors_lookup _always_ return 0 => ?! This if statement is useless !
	if (!faup_features_errors_lookup(url_features)) {
		// The scheme runs up to the start of the hierarchical part
		if ((faup_features_exist(url_features->scheme)) && (faup_features_exist(url_features->hierarchical))) {
			total_size = url_features->hierarchical.pos - url_features->scheme.pos;
			url_features->scheme.size = total_size;
		}

		// The credential stops one character before the host
		if (faup_features_exist(url_features->credential)) {
			total_size = url_features->host.pos - url_features->credential.pos - 1;
			url_features->credential.size = total_size;
		}

		if (faup_features_exist(url_features->host)) {
			// The host ends where the first following feature begins
			if (faup_features_exist(url_features->port)) {
				// -1: skip the separator that precedes the port
				next_valid_token_pos = url_features->port.pos - 1;
			} else if (faup_features_exist(url_features->resource_path)) {
				next_valid_token_pos = url_features->resource_path.pos;
			} else if (faup_features_exist(url_features->query_string)) {
				next_valid_token_pos = url_features->query_string.pos;
			} else if (faup_features_exist(url_features->fragment)) {
				next_valid_token_pos = url_features->fragment.pos;
			} else {
				/* We have no next token */
				/* FIXME: We shall return after, no need to go further */
				next_valid_token_pos = url_len;
			}

			if (next_valid_token_pos > url_features->host.pos) {
				const char *host = NULL;

				total_size = next_valid_token_pos - url_features->host.pos;
				url_features->host.size = total_size;
				/* Check if we are dealing with an IPv(4|6) */
				// Feature offsets were computed on retval_url (the transformed
				// url when a module rewrote it), so the host must be read from
				// that same buffer and not from the caller's original url.
				host = retval_url + url_features->host.pos;

				bool ipv4_host = is_ipv4(host, total_size);
				bool ipv6_host = is_ipv6(host, total_size);
				if (!ipv4_host && !ipv6_host) {
					uint32_t tld_pos;
					uintptr_t tld_len;
					/* Extract the TLD now */
					const char *tld = (const char*) memrchr(host, '.', url_features->host.size);
					if (tld) {
						// Step over the dot: tld now points at the last label
						tld++;

						tld_pos = (uint32_t) (((uintptr_t)tld)-((uintptr_t)host));
						tld_len = url_features->host.size - tld_pos;

						if (tld_len>0) {
							const char* domain = NULL;
							/* We sometime have no resource_path after but a trailing slash ('www.honeynet.org/') */
							if ((tld[tld_len-1] == '/') || (tld[tld_len-1] == '?')) {
								tld_len--;
							}

							// All the features are detected, we can do some extra operations now
							if (fh->options->tld_greater_extraction) {
								faup_tld_tree_extracted_t tld_extracted = faup_tld_tree_extract(fh, fh->options->tld_tree);
								url_features->tld.pos = tld_extracted.pos;
								url_features->tld.size = tld_extracted.size;

								// A non-negative position is treated as a hit in the TLD tree;
								// otherwise fall back to the last label found above
								if (tld_extracted.pos >= 0) {
									fh->faup.url_type = FAUP_URL_HAS_MOZILLA_TLD;
									tld_pos = tld_extracted.pos;
									tld_len = tld_extracted.size;
								} else {
									fh->faup.url_type = FAUP_URL_HAS_UNKNOWN_TLD;
									url_features->tld.pos = tld_pos + url_features->host.pos;
									url_features->tld.size = tld_len;
								}

							} else {
								fh->faup.url_type = FAUP_URL_HAS_UNKNOWN_TLD;
								url_features->tld.pos = tld_pos + url_features->host.pos;
								url_features->tld.size = tld_len;
							}

							// Extract the domain (google.com): search the last dot located before the TLD.
							// tld_len may come from the TLD tree, whose bounds are not checked here:
							// make sure the unsigned length handed to memrchr cannot wrap around.
							if (((uintptr_t) url_features->host.size) > tld_len) {
								domain = (const char*) memrchr(host, '.', url_features->host.size - tld_len - 1);
							}
							if (domain) {
								uint32_t domain_pos = (uint32_t) (((uintptr_t)domain)-((uintptr_t)host));
								if (tld_pos > domain_pos) {
									// +1 skips the dot; host.pos makes the offset relative to the url
									domain_pos += url_features->host.pos + 1;
									url_features->domain.pos = domain_pos;
									// Grab the TLD with us
									url_features->domain.size = next_valid_token_pos - domain_pos;

									// The subdomain is what remains from the host
									if (url_features->domain.pos > 1) {
										url_features->subdomain.pos = url_features->host.pos;
										url_features->subdomain.size = url_features->domain.pos - url_features->host.pos - 1;
									}
								}
							}
						}
					}
					else {
						// If no TLD, the domain is same as the host
						url_features->domain = url_features->host;
					}
				}
				else {
					// For an IP address, the domain is the host itself
					url_features->domain = url_features->host;
					if (ipv4_host) {
						fh->faup.url_type = FAUP_URL_IPV4;
					}
					if (ipv6_host) {
						fh->faup.url_type = FAUP_URL_IPV6;
					}
				}
			}
		}

		if (faup_features_exist(url_features->port)) {
			if (faup_features_exist(url_features->resource_path)) {
				next_valid_token_pos = url_features->resource_path.pos;
			} else if (faup_features_exist(url_features->query_string)) {
				next_valid_token_pos = url_features->query_string.pos;
			} else if (faup_features_exist(url_features->fragment)) {
				next_valid_token_pos = url_features->fragment.pos;
			} else {
				/* We have no next token */
				/* FIXME: We shall return after, no need to go further */
				next_valid_token_pos = url_len;
			}
			if (next_valid_token_pos > url_features->port.pos) {
				total_size = next_valid_token_pos - url_features->port.pos;
				url_features->port.size = total_size;
			}
		}

		if (faup_features_exist(url_features->resource_path)) {
			if (faup_features_exist(url_features->query_string)) {
				next_valid_token_pos = url_features->query_string.pos;
			} else if (faup_features_exist(url_features->fragment)) {
				next_valid_token_pos = url_features->fragment.pos;
			} else {
				/* We have no next token */
				/* FIXME: We shall return after, no need to go further */
				next_valid_token_pos = url_len;
			}
			if (next_valid_token_pos > url_features->resource_path.pos) {
				total_size = next_valid_token_pos - url_features->resource_path.pos;
				url_features->resource_path.size = total_size;
			}
		}

		if (faup_features_exist(url_features->query_string)) {
			if (faup_features_exist(url_features->fragment)) {
				next_valid_token_pos = url_features->fragment.pos;
			} else {
				/* We have no next token */
				/* FIXME: We shall return after, no need to go further */
				next_valid_token_pos = url_len;
			}
			if (next_valid_token_pos > url_features->query_string.pos) {
				total_size = next_valid_token_pos - url_features->query_string.pos;
				url_features->query_string.size = total_size;
			}
		}

		// The fragment always runs to the end of the url
		if (faup_features_exist(url_features->fragment)) {
			total_size = url_len - url_features->fragment.pos;
			url_features->fragment.size = total_size;
		}

		// If no domain was found, and no subdomain, then our domain == host (see issue 24)
		if (!faup_features_exist(url_features->domain) && (!faup_features_exist(url_features->subdomain))) {
			url_features->domain.pos = url_features->host.pos;
			url_features->domain.size = url_features->host.size;
		}

		// URL has been analyzed so we can determine 'domain_without_tld'
		if( faup_features_exist(url_features->domain) ) {
			url_features->domain_without_tld.pos  = url_features->domain.pos;
			url_features->domain_without_tld.size = url_features->domain.size;

			if( faup_features_exist(url_features->tld) ) {
				url_features->domain_without_tld.size -= (url_features->tld.size +1); //+1 for the dot before the tld
			}
		}

		//faup_features_debug(url, url_features);
#ifdef FAUP_LUA_MODULES
		// Only the wrapper is released; its url string is returned below
		free(url_transformed_by_modules);
#endif // FAUP_LUA_MODULES
		return retval_url;
	}

	// FIXME: we never go here because of the 'return 0' just here and in error_lookup() !

#ifdef FAUP_LUA_MODULES
	// Release the wrapper on the failure path too, as the success path does
	free(url_transformed_by_modules);
#endif // FAUP_LUA_MODULES

	/* FIXME: Such a message should not belong to the library */
	fprintf(stderr, "Cannot parse the url: '%s'\n", url);
	return NULL;
}
371