1 /*
2 * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
3 * Version 2, December 2004
4 *
5 * Copyright (C) 2012-2013 Sebastien Tricaud <sebastien@honeynet.org>
6 *
7 * Everyone is permitted to copy and distribute verbatim or modified
8 * copies of this license document, and changing it is allowed as long
9 * as the name is changed.
10 *
11 * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
12 * TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
13 *
14 * 0. You just DO WHAT THE FUCK YOU WANT TO.
15 */
16
17 #define _GNU_SOURCE
18 #include <faup/faup.h>
19 #include <faup/decode.h>
20 #include <faup/features.h>
21 #include <faup/tld-tree.h>
22
23 #ifdef FAUP_LUA_MODULES
24 #include <faup/modules.h>
25 #endif
26
27 #ifdef WIN32
28 #include <faup/compat.h>
29 #endif
30
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include <ctype.h>
35
/*
 * Integer power: returns base raised to the n-th power.
 *
 * The result is computed by repeated multiplication in uint32_t, so it
 * wraps modulo 2^32 on overflow. powui(x, 0) is 1 for every x.
 */
uint32_t powui(uint32_t base, uint32_t n)
{
    uint32_t product = 1;
    uint32_t remaining;

    /* Multiply once per unit of the exponent, counting down to zero. */
    for (remaining = n; remaining != 0; remaining--) {
        product *= base;
    }

    return product;
}
46
/*
 * Tell whether the first n bytes of str hold a dotted-decimal IPv4 address.
 *
 * Accepted form: optional leading whitespace, exactly four groups of one to
 * three decimal digits separated by single dots, each group <= 255, then
 * optional trailing whitespace. Anything longer than 15 bytes (whitespace
 * included) is rejected up front.
 *
 * str does not need to be NUL-terminated; only str[0..n-1] is read.
 */
bool is_ipv4(const char *str, const size_t n)
{
    uint32_t ndots = 0;   /* number of '.' separators seen so far */
    uint32_t octet = 0;   /* value of the group currently being read */
    uint32_t ndigits = 0; /* digits read in the current group */
    size_t i = 0;

    /* "255.255.255.255" is the longest possible address. */
    if (n > 15) {
        return false;
    }

    /* <ctype.h> functions require a value representable as unsigned char;
     * passing a negative plain char is undefined behaviour. */
    while ((i < n) && isspace((unsigned char)str[i])) {
        i++;
    }

    for (; i < n; i++) {
        const char c = str[i];

        if (c == '.') {
            /* A dot must close a non-empty group: this rejects a leading
             * dot as well as two consecutive dots. */
            if (ndigits == 0) {
                return false;
            }
            if (octet > 255) {
                return false;
            }
            ndots++;
            octet = 0;
            ndigits = 0;
        } else if ((c >= '0') && (c <= '9')) {
            /* At most three digits per group. */
            if (ndigits == 3) {
                return false;
            }
            octet = octet * 10 + (uint32_t)(c - '0');
            ndigits++;
        } else {
            /* First non-address character: stop here, the remainder is
             * validated as trailing whitespace below. */
            break;
        }
    }

    /* The last group must be non-empty too (rejects a trailing dot or a
     * dot followed by a non-digit) and must be in range. */
    if ((ndigits == 0) || (octet > 255)) {
        return false;
    }

    /* Only whitespace may follow the address. */
    for (; i < n; i++) {
        if (!isspace((unsigned char)str[i])) {
            return false;
        }
    }

    return ndots == 3;
}
105
/*
 * Tell whether the first n bytes of str look like a bracketed IPv6 literal.
 *
 * This is only a shape check: the text must be at least three bytes long,
 * start with '[' and end with ']'. What lies between the brackets is not
 * inspected.
 */
bool is_ipv6(const char *str, const size_t n)
{
    /* Short-circuit evaluation keeps str untouched when n is too small. */
    return (n >= 3) && (str[0] == '[') && (str[n - 1] == ']');
}
118
/*
 * Decode a URL: run the feature finder on it, then derive the size of every
 * feature that was located (scheme, credential, host, TLD, domain,
 * subdomain, port, resource path, query string, fragment).
 *
 * fh       handler whose options are read and whose faup.* fields
 *          (decoded, url_type, org_str, org_str_len, features) are filled in.
 * url      buffer holding the URL.
 * url_len  number of bytes of url to consider, before
 *          options->number_of_chars_to_remove is subtracted.
 *
 * Returns the string that was analyzed: url itself, or the transformed URL
 * when FAUP_LUA_MODULES is enabled and a module rewrote it. Returns NULL
 * when fh or url is NULL, or when faup_features_errors_lookup() reports an
 * error.
 */
const char *faup_decode(faup_handler_t *fh, const char *url, size_t url_len)
{
    uint32_t total_size = 0;
    int next_valid_token_pos = 0;
    const char *retval_url = url;

#ifdef FAUP_LUA_MODULES
    faup_modules_transformed_url_t *url_transformed_by_modules = NULL;
#endif

    faup_features_t *url_features = NULL;

    if (!fh || !url) {
        return NULL;
    }

    if (fh->options->number_of_chars_to_remove >= url_len) {
        if (url_len > 0) {
            fprintf(stderr, "Warning: Cannot remove more characters than the url string! Not removing any character on this url: %s\n", url);
        }
    } else {
        url_len -= fh->options->number_of_chars_to_remove;
    }

    fh->faup.decoded = true;
    fh->faup.url_type = FAUP_URL_HAS_NO_TLD;

#ifdef FAUP_LUA_MODULES
    if (fh->options->exec_modules != FAUP_MODULES_NOEXEC) {
        url_transformed_by_modules = faup_modules_decode_url_start(fh, url, url_len);
        if (url_transformed_by_modules) {
            fh->faup.org_str = url_transformed_by_modules->url; // FIXME: Change to 'url' when the output has changed to reflect new way of doing with lua stuff
            fh->faup.org_str_len = url_len;
            faup_features_find(fh, url_transformed_by_modules->url, url_transformed_by_modules->url_len);
            /* From here on, lengths and the returned pointer refer to the transformed URL. */
            url_len = url_transformed_by_modules->url_len;
            retval_url = url_transformed_by_modules->url;
        }
    }
    if (!url_transformed_by_modules) {
        fh->faup.org_str = url;
        fh->faup.org_str_len = url_len;
        faup_features_find(fh, url, url_len);
    }
#else
    // Nothing has been transformed, so we simply use our original url
    fh->faup.org_str = url;
    fh->faup.org_str_len = url_len;
    faup_features_find(fh, url, url_len);
#endif // FAUP_LUA_MODULES

    url_features = &fh->faup.features;

    //FIXME: faup_features_errors_lookup _always_ return 0 => ?! This if statement is useless !
    if (!faup_features_errors_lookup(url_features)) {
        /* Scheme: runs up to the start of the hierarchical part. */
        if ((faup_features_exist(url_features->scheme)) && (faup_features_exist(url_features->hierarchical))) {
            total_size = url_features->hierarchical.pos - url_features->scheme.pos;
            url_features->scheme.size = total_size;
        }

        /* Credential: runs up to one byte before the host. */
        if (faup_features_exist(url_features->credential)) {
            total_size = url_features->host.pos - url_features->credential.pos - 1;
            url_features->credential.size = total_size;
        }

        if (faup_features_exist(url_features->host)) {
            /* The host ends where the next feature that exists begins. */
            if (faup_features_exist(url_features->port)) {
                next_valid_token_pos = url_features->port.pos - 1;
            } else if (faup_features_exist(url_features->resource_path)) {
                next_valid_token_pos = url_features->resource_path.pos;
            } else if (faup_features_exist(url_features->query_string)) {
                next_valid_token_pos = url_features->query_string.pos;
            } else if (faup_features_exist(url_features->fragment)) {
                next_valid_token_pos = url_features->fragment.pos;
            } else {
                /* We have no next token */
                /* FIXME: We shall return after, no need to go further */
                next_valid_token_pos = url_len;
            }

            if (next_valid_token_pos > url_features->host.pos) {
                const char *host = NULL;

                total_size = next_valid_token_pos - url_features->host.pos;
                url_features->host.size = total_size;

                /* Feature positions index the string that was actually
                 * analyzed (retval_url). That is not the caller's buffer
                 * when a module rewrote the URL, so do not use 'url' here. */
                host = retval_url + url_features->host.pos;

                /* Check if we are dealing with an IPv(4|6) */
                bool ipv4_host = is_ipv4(host, total_size);
                bool ipv6_host = is_ipv6(host, total_size);
                if (!ipv4_host && !ipv6_host) {
                    uint32_t tld_pos;
                    uintptr_t tld_len;
                    /* Extract the TLD now: it starts after the last dot of the host */
                    const char *tld = (const char *) memrchr(host, '.', url_features->host.size);
                    if (tld) {
                        tld++;

                        /* Offset of the TLD inside the host, and its length */
                        tld_pos = (uint32_t) (((uintptr_t)tld)-((uintptr_t)host));
                        tld_len = url_features->host.size - tld_pos;

                        if (tld_len > 0) {
                            const char *domain = NULL;
                            /* We sometime have no resource_path after but a trailing slash ('www.honeynet.org/') */
                            if ((tld[tld_len-1] == '/') || (tld[tld_len-1] == '?')) {
                                tld_len--;
                            }

                            // All the features are detected, we can do some extra operations now
                            if (fh->options->tld_greater_extraction) {
                                faup_tld_tree_extracted_t tld_extracted = faup_tld_tree_extract(fh, fh->options->tld_tree);
                                url_features->tld.pos = tld_extracted.pos;
                                url_features->tld.size = tld_extracted.size;

                                // Since we do not have the -t option, check if the TLD wasn't > 1
                                if (tld_extracted.pos >= 0) {
                                    fh->faup.url_type = FAUP_URL_HAS_MOZILLA_TLD;
                                    tld_pos = tld_extracted.pos;
                                    tld_len = tld_extracted.size;
                                } else {
                                    /* The tree found nothing: fall back to the last-dot TLD */
                                    fh->faup.url_type = FAUP_URL_HAS_UNKNOWN_TLD;
                                    url_features->tld.pos = tld_pos + url_features->host.pos;
                                    url_features->tld.size = tld_len;
                                }

                            } else {
                                fh->faup.url_type = FAUP_URL_HAS_UNKNOWN_TLD;
                                url_features->tld.pos = tld_pos + url_features->host.pos;
                                url_features->tld.size = tld_len;
                            }

                            // Extract the domain (google.com): look for the last dot before the TLD.
                            // The length expression is unsigned, so it would wrap to a huge
                            // value if the TLD (which may come from the TLD tree) were not
                            // shorter than the host; skip the search in that case.
                            if (url_features->host.size > tld_len) {
                                domain = (const char *) memrchr(host, '.', url_features->host.size - tld_len - 1);
                            }
                            if (domain) {
                                uint32_t domain_pos = (uint32_t) (((uintptr_t)domain)-((uintptr_t)host));
                                if (tld_pos > domain_pos) {
                                    /* Step past the dot and convert to a position in the URL */
                                    domain_pos += url_features->host.pos + 1;
                                    url_features->domain.pos = domain_pos;
                                    // Grab the TLD with us
                                    url_features->domain.size = next_valid_token_pos - domain_pos;

                                    // The subdomain is what remains from the host
                                    if (url_features->domain.pos > 1) {
                                        url_features->subdomain.pos = url_features->host.pos;
                                        url_features->subdomain.size = url_features->domain.pos - url_features->host.pos - 1;
                                    }
                                }
                            }
                        }
                    } else {
                        // If no TLD, the domain is same as the host
                        url_features->domain = url_features->host;
                    }
                } else {
                    // An IP address has no TLD: the domain is the whole host
                    url_features->domain = url_features->host;
                    if (ipv4_host) {
                        fh->faup.url_type = FAUP_URL_IPV4;
                    }
                    if (ipv6_host) {
                        fh->faup.url_type = FAUP_URL_IPV6;
                    }
                }
            }
        }

        if (faup_features_exist(url_features->port)) {
            if (faup_features_exist(url_features->resource_path)) {
                next_valid_token_pos = url_features->resource_path.pos;
            } else if (faup_features_exist(url_features->query_string)) {
                next_valid_token_pos = url_features->query_string.pos;
            } else if (faup_features_exist(url_features->fragment)) {
                next_valid_token_pos = url_features->fragment.pos;
            } else {
                /* We have no next token */
                /* FIXME: We shall return after, no need to go further */
                next_valid_token_pos = url_len;
            }
            if (next_valid_token_pos > url_features->port.pos) {
                total_size = next_valid_token_pos - url_features->port.pos;
                url_features->port.size = total_size;
            }
        }

        if (faup_features_exist(url_features->resource_path)) {
            if (faup_features_exist(url_features->query_string)) {
                next_valid_token_pos = url_features->query_string.pos;
            } else if (faup_features_exist(url_features->fragment)) {
                next_valid_token_pos = url_features->fragment.pos;
            } else {
                /* We have no next token */
                /* FIXME: We shall return after, no need to go further */
                next_valid_token_pos = url_len;
            }
            if (next_valid_token_pos > url_features->resource_path.pos) {
                total_size = next_valid_token_pos - url_features->resource_path.pos;
                url_features->resource_path.size = total_size;
            }
        }

        if (faup_features_exist(url_features->query_string)) {
            if (faup_features_exist(url_features->fragment)) {
                next_valid_token_pos = url_features->fragment.pos;
            } else {
                /* We have no next token */
                /* FIXME: We shall return after, no need to go further */
                next_valid_token_pos = url_len;
            }
            if (next_valid_token_pos > url_features->query_string.pos) {
                total_size = next_valid_token_pos - url_features->query_string.pos;
                url_features->query_string.size = total_size;
            }
        }

        /* The fragment is the last feature: it runs to the end of the URL. */
        if (faup_features_exist(url_features->fragment)) {
            total_size = url_len - url_features->fragment.pos;
            url_features->fragment.size = total_size;
        }

        // If no domain was found, and no subdomain, then our domain == host (see issue 24)
        if (!faup_features_exist(url_features->domain) && (!faup_features_exist(url_features->subdomain))) {
            url_features->domain.pos = url_features->host.pos;
            url_features->domain.size = url_features->host.size;
        }

        // URL has been analyzed so we can determine 'domain_without_tld'
        if (faup_features_exist(url_features->domain)) {
            url_features->domain_without_tld.pos = url_features->domain.pos;
            url_features->domain_without_tld.size = url_features->domain.size;

            if (faup_features_exist(url_features->tld)) {
                url_features->domain_without_tld.size -= (url_features->tld.size + 1); //+1 for the dot before the tld
            }
        }

        //faup_features_debug(url, url_features);
#ifdef FAUP_LUA_MODULES
        /* Only the wrapper struct is released; the string it pointed to is
         * what gets returned. free(NULL) is a no-op. */
        free(url_transformed_by_modules);
#endif // FAUP_LUA_MODULES
        return retval_url;
    }

    // FIXME: we never go here because of the 'return 0' just here and in error_lookup() !

#ifdef FAUP_LUA_MODULES
    /* Release the wrapper struct on the failure path as well. */
    free(url_transformed_by_modules);
#endif // FAUP_LUA_MODULES

    /* FIXME: Such a message should not belong to the library */
    fprintf(stderr, "Cannot parse the url: '%s'\n", url);
    return NULL;
}
371