1 /*
2 * File: url.c
3 *
4 * Copyright (C) 2001-2009 Jorge Arellano Cid <jcid@dillo.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
10 */
11
12 /*
13 * Parse and normalize all URLs inside Dillo.
14 * - <scheme> <authority> <path> <query> and <fragment> point to 'buffer'.
15 * - 'url_string' is built upon demand (transparent to the caller).
16 * - 'hostname' and 'port' are also being handled on demand.
17 */
18
19 /*
20 * Regular Expression as given in RFC3986 for URL parsing.
21 *
22 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
23 *  12            3  4          5       6  7        8 9
24 *
25 * scheme = $2
26 * authority = $4
27 * path = $5
28 * query = $7
29 * fragment = $9
30 *
31 *
32 * RFC-2396 BNF:
33 *
34 * absoluteURI = scheme ":" (hier_part | opaque_part)
35 * hier_part = (net_path | abs_path) ["?" query]
36 * net_path = "//" authority[abs_path]
37 * abs_path = "/" path_segments
38 *
39 * Notes:
40 * - "undefined" means "preceding separator does not appear".
41 * - path is never "undefined" though it may be "empty".
42 */
43
44 #include <stdlib.h>
45 #include <string.h>
46 #include <ctype.h>
47
48 #include "url.h"
49 #include "msg.h"
50
/* Uppercase hexadecimal digits, indexed by nibble value (0-15). */
static const char *HEX = "0123456789ABCDEF";

/*
 * URL-field compare methods.
 * Either argument may be NULL (an "undefined" field): two NULL fields
 * compare equal, and a NULL field sorts before any non-NULL one.
 * The whole expansion is parenthesized so that the macros can be combined
 * safely with other operators (e.g. "URL_STR_FIELD_CMP(a,b) == 0").
 * Note: each argument is evaluated more than once.
 */
#define URL_STR_FIELD_CMP(s1,s2) \
   (((s1) && (s2)) ? strcmp((s1),(s2)) : \
    (!(s1) && !(s2)) ? 0 : (s1) ? 1 : -1)
#define URL_STR_FIELD_I_CMP(s1,s2) \
   (((s1) && (s2)) ? dStrAsciiCasecmp((s1),(s2)) : \
    (!(s1) && !(s2)) ? 0 : (s1) ? 1 : -1)
58
59 /*
60 * Return the url as a string.
61 * (initializing 'url_string' field if necessary)
62 */
a_Url_str(const DilloUrl * u)63 char *a_Url_str(const DilloUrl *u)
64 {
65 /* Internal url handling IS transparent to the caller */
66 DilloUrl *url = (DilloUrl *) u;
67
68 dReturn_val_if_fail (url != NULL, NULL);
69
70 if (!url->url_string) {
71 url->url_string = dStr_sized_new(60);
72 dStr_sprintf(
73 url->url_string, "%s%s%s%s%s%s%s%s%s%s",
74 url->scheme ? url->scheme : "",
75 url->scheme ? ":" : "",
76 url->authority ? "//" : "",
77 url->authority ? url->authority : "",
78 // (url->path && url->path[0] != '/' && url->authority) ? "/" : "",
79 (url->authority && (!url->path || *url->path != '/')) ? "/" : "",
80 url->path ? url->path : "",
81 url->query ? "?" : "",
82 url->query ? url->query : "",
83 url->fragment ? "#" : "",
84 url->fragment ? url->fragment : "");
85 }
86
87 return url->url_string->str;
88 }
89
90 /*
91 * Return the hostname as a string.
92 * (initializing 'hostname' and 'port' fields if necessary)
93 * Note: a similar approach can be taken for user:password auth.
94 */
a_Url_hostname(const DilloUrl * u)95 const char *a_Url_hostname(const DilloUrl *u)
96 {
97 char *p;
98 /* Internal url handling IS transparent to the caller */
99 DilloUrl *url = (DilloUrl *) u;
100
101 if (!url->hostname && url->authority) {
102 if (url->authority[0] == '[' && (p = strchr(url->authority, ']'))) {
103 /* numeric ipv6 address, strip the brackets */
104 url->hostname = dStrndup(url->authority + 1,
105 (uint_t)(p - url->authority - 1));
106 if ((p = strchr(p, ':'))) {
107 url->port = strtol(p + 1, NULL, 10);
108 }
109 } else {
110 /* numeric ipv4 or hostname */
111 if ((p = strchr(url->authority, ':'))) {
112 url->port = strtol(p + 1, NULL, 10);
113 url->hostname = dStrndup(url->authority,
114 (uint_t)(p - url->authority));
115 } else {
116 url->hostname = url->authority;
117 }
118 }
119 }
120
121 return url->hostname;
122 }
123
124 /*
125 * Create a DilloUrl object and initialize it.
126 * (buffer, scheme, authority, path, query and fragment).
127 */
Url_object_new(const char * uri_str)128 static DilloUrl *Url_object_new(const char *uri_str)
129 {
130 DilloUrl *url;
131 char *s, *p;
132
133 dReturn_val_if_fail (uri_str != NULL, NULL);
134
135 url = dNew0(DilloUrl, 1);
136
137 /* remove leading & trailing space from buffer */
138 url->buffer = dStrstrip(dStrdup(uri_str));
139
140 s = (char *) url->buffer;
141 p = strpbrk(s, ":/?#");
142 if (p && p[0] == ':' && p > s) { /* scheme */
143 *p = 0;
144 url->scheme = s;
145 s = ++p;
146 }
147 /* p = strpbrk(s, "/"); */
148 if (p == s && p[0] == '/' && p[1] == '/') { /* authority */
149 s = p + 2;
150 p = strpbrk(s, "/?#");
151 if (p) {
152 memmove(s - 2, s, (size_t)MAX(p - s, 1));
153 url->authority = s - 2;
154 p[-2] = 0;
155 s = p;
156 } else if (*s) {
157 url->authority = s;
158 return url;
159 }
160 }
161
162 p = strpbrk(s, "?#");
163 if (p) { /* path */
164 url->path = (p > s) ? s : NULL;
165 s = p;
166 } else if (*s) {
167 url->path = s;
168 return url;
169 }
170
171 p = strpbrk(s, "?#");
172 if (p && p[0] == '?') { /* query */
173 *p = 0;
174 s = p + 1;
175 url->query = s;
176 p = strpbrk(s, "#");
177 url->flags |= URL_Get;
178 }
179 if (p && p[0] == '#') { /* fragment */
180 *p = 0;
181 s = p + 1;
182 url->fragment = s;
183 }
184
185 return url;
186 }
187
188 /*
189 * Free a DilloUrl
190 * Do nothing if the argument is NULL
191 */
a_Url_free(DilloUrl * url)192 void a_Url_free(DilloUrl *url)
193 {
194 if (url) {
195 if (url->url_string)
196 dStr_free(url->url_string, TRUE);
197 if (url->hostname != url->authority)
198 dFree((char *)url->hostname);
199 dFree((char *)url->buffer);
200 dStr_free(url->data, 1);
201 dFree((char *)url->alt);
202 dFree(url);
203 }
204 }
205
206 /*
207 * Resolve the URL as RFC3986 suggests.
208 */
Url_resolve_relative(const char * RelStr,DilloUrl * BaseUrlPar,const char * BaseStr)209 static Dstr *Url_resolve_relative(const char *RelStr,
210 DilloUrl *BaseUrlPar,
211 const char *BaseStr)
212 {
213 char *p, *s, *e;
214 int i;
215 Dstr *SolvedUrl, *Path;
216 DilloUrl *RelUrl, *BaseUrl = NULL;
217
218 /* parse relative URL */
219 RelUrl = Url_object_new(RelStr);
220
221 if (BaseUrlPar) {
222 BaseUrl = BaseUrlPar;
223 } else if (RelUrl->scheme == NULL) {
224 /* only required when there's no <scheme> in RelStr */
225 BaseUrl = Url_object_new(BaseStr);
226 }
227
228 SolvedUrl = dStr_sized_new(64);
229 Path = dStr_sized_new(64);
230
231 /* path empty && scheme and authority undefined */
232 if (!RelUrl->path && !RelUrl->scheme && !RelUrl->authority) {
233 dStr_append(SolvedUrl, BaseStr);
234 if ((p = strchr(SolvedUrl->str, '#')))
235 dStr_truncate(SolvedUrl, p - SolvedUrl->str);
236 if (!BaseUrl->path)
237 dStr_append_c(SolvedUrl, '/');
238
239 if (RelUrl->query) { /* query */
240 if (BaseUrl->query)
241 dStr_truncate(SolvedUrl, BaseUrl->query - BaseUrl->buffer - 1);
242 dStr_append_c(SolvedUrl, '?');
243 dStr_append(SolvedUrl, RelUrl->query);
244 }
245 if (RelUrl->fragment) { /* fragment */
246 dStr_append_c(SolvedUrl, '#');
247 dStr_append(SolvedUrl, RelUrl->fragment);
248 }
249 goto done;
250
251 } else if (RelUrl->scheme) { /* scheme */
252 dStr_append(SolvedUrl, RelStr);
253 goto done;
254
255 } else if (RelUrl->authority) { /* authority */
256 // Set the Path buffer and goto "STEP 7";
257 if (RelUrl->path)
258 dStr_append(Path, RelUrl->path);
259
260 } else {
261 if (RelUrl->path && RelUrl->path[0] == '/') { /* absolute path */
262 ; /* Ignore BaseUrl path */
263 } else if (BaseUrl->path) { /* relative path */
264 dStr_append(Path, BaseUrl->path);
265 for (i = Path->len; --i >= 0 && Path->str[i] != '/'; ) ;
266 if (i >= 0 && Path->str[i] == '/')
267 dStr_truncate(Path, ++i);
268 }
269 if (RelUrl->path)
270 dStr_append(Path, RelUrl->path);
271
272 // erase "./"
273 while ((p=strstr(Path->str, "./")) &&
274 (p == Path->str || p[-1] == '/'))
275 dStr_erase(Path, p - Path->str, 2);
276 // erase last "."
277 if (Path->len && Path->str[Path->len - 1] == '.' &&
278 (Path->len == 1 || Path->str[Path->len - 2] == '/'))
279 dStr_truncate(Path, Path->len - 1);
280
281 // erase "<segment>/../" and "<segment>/.."
282 s = p = Path->str;
283 while ( (p = strstr(p, "/..")) != NULL ) {
284 if (p[3] == '/' || !p[3]) { // "/../" | "/.."
285 for (e = p + 3 ; p > s && p[-1] != '/'; --p) ;
286 dStr_erase(Path, p - Path->str, e - p + (p > s && *e != 0));
287 p -= (p > Path->str);
288 } else
289 p += 3;
290 }
291 }
292
293 /* STEP 7
294 */
295
296 /* scheme */
297 if (BaseUrl->scheme) {
298 dStr_append(SolvedUrl, BaseUrl->scheme);
299 dStr_append_c(SolvedUrl, ':');
300 }
301
302 /* authority */
303 if (RelUrl->authority) {
304 dStr_append(SolvedUrl, "//");
305 dStr_append(SolvedUrl, RelUrl->authority);
306 } else if (BaseUrl->authority) {
307 dStr_append(SolvedUrl, "//");
308 dStr_append(SolvedUrl, BaseUrl->authority);
309 }
310
311 /* path */
312 if ((RelUrl->authority || BaseUrl->authority) &&
313 ((Path->len == 0 && (RelUrl->query || RelUrl->fragment)) ||
314 (Path->len && Path->str[0] != '/')))
315 dStr_append_c(SolvedUrl, '/'); /* hack? */
316 dStr_append(SolvedUrl, Path->str);
317
318 /* query */
319 if (RelUrl->query) {
320 dStr_append_c(SolvedUrl, '?');
321 dStr_append(SolvedUrl, RelUrl->query);
322 }
323
324 /* fragment */
325 if (RelUrl->fragment) {
326 dStr_append_c(SolvedUrl, '#');
327 dStr_append(SolvedUrl, RelUrl->fragment);
328 }
329
330 done:
331 dStr_free(Path, TRUE);
332 a_Url_free(RelUrl);
333 if (BaseUrl != BaseUrlPar)
334 a_Url_free(BaseUrl);
335 return SolvedUrl;
336 }
337
338 /*
339 * Transform (and resolve) an URL string into the respective DilloURL.
340 * If URL = "http://dillo.sf.net:8080/index.html?long#part2"
341 * then the resulting DilloURL should be:
342 * DilloURL = {
343 * url_string = "http://dillo.sf.net:8080/index.html?long#part2"
344 * scheme = "http"
345 * authority = "dillo.sf.net:8080:
346 * path = "/index.html"
347 * query = "long"
348 * fragment = "part2"
349 * hostname = "dillo.sf.net"
350 * port = 8080
351 * flags = URL_Get
352 * data = Dstr * ("")
353 * alt = NULL
354 * ismap_url_len = 0
355 * }
356 *
357 * Return NULL if URL is badly formed.
358 */
a_Url_new(const char * url_str,const char * base_url)359 DilloUrl* a_Url_new(const char *url_str, const char *base_url)
360 {
361 DilloUrl *url;
362 char *urlstr = (char *)url_str; /* auxiliar variable, don't free */
363 char *p, *str1 = NULL, *str2 = NULL;
364 Dstr *SolvedUrl;
365 int i, n_ic, n_ic_spc;
366
367 dReturn_val_if_fail (url_str != NULL, NULL);
368
369 /* Count illegal characters (0x00-0x1F, 0x7F-0xFF and space) */
370 n_ic = n_ic_spc = 0;
371 for (p = (char*)url_str; *p; p++) {
372 n_ic_spc += (*p == ' ') ? 1 : 0;
373 n_ic += (*p != ' ' && *p > 0x1F && *p < 0x7F) ? 0 : 1;
374 }
375 if (n_ic) {
376 /* Encode illegal characters (they could also be stripped).
377 * There's no standard for illegal chars; we chose to encode. */
378 p = str1 = dNew(char, strlen(url_str) + 2*n_ic + 1);
379 for (i = 0; url_str[i]; ++i)
380 if (url_str[i] > 0x1F && url_str[i] < 0x7F && url_str[i] != ' ')
381 *p++ = url_str[i];
382 else {
383 *p++ = '%';
384 *p++ = HEX[(url_str[i] >> 4) & 15];
385 *p++ = HEX[url_str[i] & 15];
386 }
387 *p = 0;
388 urlstr = str1;
389 }
390
391 /* let's use a heuristic to set http: as default */
392 if (!base_url) {
393 base_url = "http:";
394 if (urlstr[0] != '/') {
395 p = strpbrk(urlstr, "/#?:");
396 if (!p || *p != ':')
397 urlstr = str2 = dStrconcat("//", urlstr, NULL);
398 } else if (urlstr[1] != '/')
399 urlstr = str2 = dStrconcat("/", urlstr, NULL);
400 }
401
402 /* Resolve the URL */
403 SolvedUrl = Url_resolve_relative(urlstr, NULL, base_url);
404 _MSG("SolvedUrl = %s\n", SolvedUrl->str);
405
406 /* Fill url data */
407 url = Url_object_new(SolvedUrl->str);
408 url->data = dStr_new("");
409 url->url_string = SolvedUrl;
410 url->illegal_chars = n_ic;
411 url->illegal_chars_spc = n_ic_spc;
412
413 dFree(str1);
414 dFree(str2);
415 return url;
416 }
417
418
419 /*
420 * Duplicate a Url structure
421 */
a_Url_dup(const DilloUrl * ori)422 DilloUrl* a_Url_dup(const DilloUrl *ori)
423 {
424 DilloUrl *url;
425
426 url = Url_object_new(URL_STR_(ori));
427 dReturn_val_if_fail (url != NULL, NULL);
428
429 url->url_string = dStr_new(URL_STR(ori));
430 url->port = ori->port;
431 url->flags = ori->flags;
432 url->alt = dStrdup(ori->alt);
433 url->ismap_url_len = ori->ismap_url_len;
434 url->illegal_chars = ori->illegal_chars;
435 url->illegal_chars_spc = ori->illegal_chars_spc;
436 url->data = dStr_sized_new(URL_DATA(ori)->len);
437 dStr_append_l(url->data, URL_DATA(ori)->str, URL_DATA(ori)->len);
438 return url;
439 }
440
441 /*
442 * Compare two Url's to check if they're the same, or which one is bigger.
443 *
444 * The fields which are compared here are:
445 * <scheme>, <authority>, <path>, <query> and <data>
446 * Other fields are left for the caller to check
447 *
448 * Return value: 0 if equal, > 0 if A > B, < 0 if A < B.
449 *
450 * Note: this function defines a sorting order different from strcmp!
451 */
a_Url_cmp(const DilloUrl * A,const DilloUrl * B)452 int a_Url_cmp(const DilloUrl *A, const DilloUrl *B)
453 {
454 int st;
455
456 dReturn_val_if_fail(A && B, 1);
457
458 if (A == B ||
459 ((st = URL_STR_FIELD_I_CMP(A->authority, B->authority)) == 0 &&
460 (st = strcmp(A->path ? A->path + (*A->path == '/') : "",
461 B->path ? B->path + (*B->path == '/') : "")) == 0 &&
462 //(st = URL_STR_FIELD_CMP(A->path, B->path)) == 0 &&
463 (st = URL_STR_FIELD_CMP(A->query, B->query)) == 0 &&
464 (st = dStr_cmp(A->data, B->data)) == 0 &&
465 (st = URL_STR_FIELD_I_CMP(A->scheme, B->scheme)) == 0))
466 return 0;
467 return st;
468 }
469
470 /*
471 * Set DilloUrl flags
472 */
a_Url_set_flags(DilloUrl * u,int flags)473 void a_Url_set_flags(DilloUrl *u, int flags)
474 {
475 if (u)
476 u->flags = flags;
477 }
478
479 /*
480 * Set DilloUrl data (like POST info, etc.)
481 */
a_Url_set_data(DilloUrl * u,Dstr ** data)482 void a_Url_set_data(DilloUrl *u, Dstr **data)
483 {
484 if (u) {
485 dStr_free(u->data, 1);
486 u->data = *data;
487 *data = NULL;
488 }
489 }
490
491 /*
492 * Set DilloUrl alt (alternate text to the URL. Used by image maps)
493 */
a_Url_set_alt(DilloUrl * u,const char * alt)494 void a_Url_set_alt(DilloUrl *u, const char *alt)
495 {
496 if (u) {
497 dFree((char *)u->alt);
498 u->alt = dStrdup(alt);
499 }
500 }
501
502 /*
503 * Set DilloUrl ismap coordinates
504 * (this is optimized for not hogging the CPU)
505 */
a_Url_set_ismap_coords(DilloUrl * u,char * coord_str)506 void a_Url_set_ismap_coords(DilloUrl *u, char *coord_str)
507 {
508 dReturn_if_fail (u && coord_str);
509
510 if (!u->ismap_url_len) {
511 /* Save base-url length (without coords) */
512 u->ismap_url_len = URL_STR_(u) ? u->url_string->len : 0;
513 a_Url_set_flags(u, URL_FLAGS(u) | URL_Ismap);
514 }
515 if (u->url_string) {
516 dStr_truncate(u->url_string, u->ismap_url_len);
517 dStr_append(u->url_string, coord_str);
518 u->query = u->url_string->str + u->ismap_url_len + 1;
519 }
520 }
521
/*
 * Given an hex octet (e.g., e3, 2F, 20), return the corresponding
 * character if the octet is valid, and -1 otherwise.
 *
 * Only the first two characters of 's' are looked at, and both must be
 * hexadecimal digits. They are checked explicitly because strtol() alone
 * would also take a leading blank or sign (" f", "+f", "-0") as valid.
 * A NULL 's', or one shorter than two characters, yields -1.
 */
static int Url_decode_hex_octet(const char *s)
{
   char hex[3];

   /* a NUL in s[0] fails the first test, so s[1] is never read past it */
   if (!s || !isxdigit((unsigned char)s[0]) || !isxdigit((unsigned char)s[1]))
      return -1;

   hex[0] = s[0];
   hex[1] = s[1];
   hex[2] = 0;
   return (int)strtol(hex, NULL, 16);
}
539
540 /*
541 * Parse possible hexadecimal octets in the URI path.
542 * Returns a new allocated string.
543 */
a_Url_decode_hex_str(const char * str)544 char *a_Url_decode_hex_str(const char *str)
545 {
546 char *new_str, *dest;
547 int i, val;
548
549 if (!str)
550 return NULL;
551
552 /* most cases won't have hex octets */
553 if (!strchr(str, '%'))
554 return dStrdup(str);
555
556 dest = new_str = dNew(char, strlen(str) + 1);
557
558 for (i = 0; str[i]; i++) {
559 *dest++ = (str[i] == '%' && (val = Url_decode_hex_octet(str+i+1)) >= 0) ?
560 i+=2, val : str[i];
561 }
562 *dest++ = 0;
563
564 new_str = dRealloc(new_str, sizeof(char) * (dest - new_str));
565 return new_str;
566 }
567
568 /*
569 * Urlencode 'str'
570 * -RL :: According to the RFC 1738, only alphanumerics, the special
571 * characters "$-_.+!*'(),", and reserved characters ";/?:@=&" used
572 * for their *reserved purposes* may be used unencoded within a URL.
573 * We'll escape everything but alphanumeric and "-_.*" (as lynx). --Jcid
574 *
575 * Note: the content type "application/x-www-form-urlencoded" is used:
576 * i.e., ' ' -> '+' and '\n' -> CR LF (see HTML 4.01, Sec. 17.13.4)
577 */
a_Url_encode_hex_str(const char * str)578 char *a_Url_encode_hex_str(const char *str)
579 {
580 static const char *const verbatim = "-_.*";
581 char *newstr, *c;
582
583 if (!str)
584 return NULL;
585
586 newstr = dNew(char, 6*strlen(str)+1);
587
588 for (c = newstr; *str; str++)
589 if ((dIsalnum(*str) && isascii(*str)) || strchr(verbatim, *str))
590 *c++ = *str;
591 else if (*str == ' ')
592 *c++ = '+';
593 else if (*str == '\n') {
594 *c++ = '%';
595 *c++ = '0';
596 *c++ = 'D';
597 *c++ = '%';
598 *c++ = '0';
599 *c++ = 'A';
600 } else {
601 *c++ = '%';
602 *c++ = HEX[(*str >> 4) & 15];
603 *c++ = HEX[*str & 15];
604 }
605 *c = 0;
606
607 return newstr;
608 }
609
610
/*
 * RFC-3986 suggests this stripping when "importing" URLs from other media.
 * Removed here: a leading "URL:", the enclosing < >, and embedded
 * whitespace; illegal chars (00-1F and 7F-FF) are dropped as well.
 *
 * The result is a new string; it is NULL when the duplication of 'str'
 * yields NULL.
 */
char *a_Url_string_strip_delimiters(const char *str)
{
   char *result, *rd, *wr;

   result = dStrdup(str);
   if (result == NULL)
      return NULL;

   /* the copy is compacted in place: 'wr' never gets ahead of 'rd' */
   rd = wr = result;

   if (strncmp(result, "URL:", 4) == 0)
      rd += 4;
   if (*rd == '<')
      rd++;

   for ( ; *rd; rd++) {
      const char c = *rd;

      if (c == ' ' || c <= 0x1F || c >= 0x7F)
         continue;
      *wr++ = c;
   }

   /* a '>' left at the very end is the closing delimiter */
   if (wr > result && wr[-1] == '>')
      wr--;
   *wr = 0;

   return result;
}
637
638 /*
639 * Is the provided hostname an IP address?
640 */
Url_host_is_ip(const char * host)641 static bool_t Url_host_is_ip(const char *host)
642 {
643 uint_t len;
644
645 if (!host || !*host)
646 return FALSE;
647
648 len = strlen(host);
649
650 if (len == strspn(host, "0123456789.")) {
651 _MSG("an IPv4 address\n");
652 return TRUE;
653 }
654 if (strchr(host, ':') &&
655 (len == strspn(host, "0123456789abcdefABCDEF:."))) {
656 /* The precise format is shown in section 3.2.2 of rfc 3986 */
657 MSG("an IPv6 address\n");
658 return TRUE;
659 }
660 return FALSE;
661 }
662
663 /*
664 * How many internal dots are in the public portion of this hostname?
665 * e.g., for "www.dillo.org", it is one because everything under "dillo.org",
666 * as a .org domain, is part of one organization.
667 *
668 * Of course this is only a simple and imperfect approximation of
669 * organizational boundaries.
670 */
Url_host_public_internal_dots(const char * host)671 static uint_t Url_host_public_internal_dots(const char *host)
672 {
673 uint_t ret = 1;
674
675 if (host) {
676 int start, after, tld_len;
677
678 /* We may be able to trust the format of the host string more than
679 * I am here. Trailing dots and no dots are real possibilities, though.
680 */
681 after = strlen(host);
682 if (after > 0 && host[after - 1] == '.')
683 after--;
684 start = after;
685 while (start > 0 && host[start - 1] != '.')
686 start--;
687 tld_len = after - start;
688
689 if (tld_len > 0) {
690 /* These TLDs were chosen by examining the current publicsuffix list
691 * in October 2014 and picking out those where it was simplest for
692 * them to describe the situation by beginning with a "*.[tld]" rule
693 * or every rule was "[something].[tld]".
694 *
695 * TODO: Consider the old publicsuffix code again. This TLD list has
696 * shrunk and shrunk over the years, and has become a poorer and
697 * poorer approximation of administrative boundaries.
698 */
699 const char *const tlds[] = {"bd","bn","ck","cy","er","fj","fk",
700 "gu","il","jm","ke","kh","kw","mm","mz",
701 "ni","np","pg","ye","za","zm","zw"};
702 uint_t i, tld_num = sizeof(tlds) / sizeof(tlds[0]);
703
704 for (i = 0; i < tld_num; i++) {
705 if (strlen(tlds[i]) == (uint_t) tld_len &&
706 !dStrnAsciiCasecmp(tlds[i], host + start, tld_len)) {
707 _MSG("TLD code matched %s\n", tlds[i]);
708 ret++;
709 break;
710 }
711 }
712 }
713 }
714 return ret;
715 }
716
717 /*
718 * Given a URL host string, return the portion that is public, i.e., the
719 * domain that is in a registry outside the organization.
720 * For 'www.dillo.org', that would be 'dillo.org'.
721 */
Url_host_find_public_suffix(const char * host)722 static const char *Url_host_find_public_suffix(const char *host)
723 {
724 const char *s;
725 uint_t dots;
726
727 if (!host || !*host || Url_host_is_ip(host))
728 return host;
729
730 s = host;
731
732 while (s[1])
733 s++;
734
735 if (s > host && *s == '.') {
736 /* don't want to deal with trailing dot */
737 s--;
738 }
739
740 dots = Url_host_public_internal_dots(host);
741
742 /* With a proper host string, we should not be pointing to a dot now. */
743
744 while (s > host) {
745 if (s[-1] == '.') {
746 if (dots == 0)
747 break;
748 else
749 dots--;
750 }
751 s--;
752 }
753
754 _MSG("public suffix of %s is %s\n", host, s);
755 return s;
756 }
757
a_Url_same_organization(const DilloUrl * u1,const DilloUrl * u2)758 bool_t a_Url_same_organization(const DilloUrl *u1, const DilloUrl *u2)
759 {
760 if (!u1 || !u2)
761 return FALSE;
762
763 return dStrAsciiCasecmp(Url_host_find_public_suffix(URL_HOST(u1)),
764 Url_host_find_public_suffix(URL_HOST(u2)))
765 ? FALSE : TRUE;
766 }
767