1 /*
2  * File: url.c
3  *
4  * Copyright (C) 2001-2009 Jorge Arellano Cid <jcid@dillo.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  */
11 
12 /*
13  * Parse and normalize all URL's inside Dillo.
14  *  - <scheme> <authority> <path> <query> and <fragment> point to 'buffer'.
15  *  - 'url_string' is built upon demand (transparent to the caller).
16  *  - 'hostname' and 'port' are also being handled on demand.
17  */
18 
19 /*
20  * Regular Expression as given in RFC3986 for URL parsing.
21  *
22  *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
23  *   12            3  4          5       6  7        8 9
24  *
25  *  scheme    = $2
26  *  authority = $4
27  *  path      = $5
28  *  query     = $7
29  *  fragment  = $9
30  *
31  *
32  *  RFC-2396 BNF:
33  *
34  *  absoluteURI = scheme ":" (hier_part | opaque_part)
35  *  hier_part   = (net_path | abs_path) ["?" query]
36  *  net_path    = "//" authority[abs_path]
37  *  abs_path    = "/" path_segments
38  *
39  *  Notes:
 *    - "undefined" means "preceding separator does not appear".
41  *    - path is never "undefined" though it may be "empty".
42  */
43 
44 #include <stdlib.h>
45 #include <string.h>
46 #include <ctype.h>
47 
48 #include "url.h"
49 #include "msg.h"
50 
/* Uppercase hexadecimal digits, indexed by nibble value (0-15) */
static const char *const HEX = "0123456789ABCDEF";

/*
 * URL-field compare methods.
 * Both arguments may be NULL: two NULLs compare equal, and a NULL field
 * sorts before a non-NULL one. Otherwise the strings are compared
 * (case-sensitively, or ASCII case-insensitively for the _I_ variant).
 * Note: arguments are evaluated more than once.
 * The whole expansion is parenthesized so the macros can be used safely
 * inside larger expressions (e.g. "MACRO(a,b) == 0").
 */
#define URL_STR_FIELD_CMP(s1,s2) \
   ((s1) && (s2) ? strcmp(s1,s2) : !(s1) && !(s2) ? 0 : (s1) ? 1 : -1)
#define URL_STR_FIELD_I_CMP(s1,s2) \
   ((s1) && (s2) ? dStrAsciiCasecmp(s1,s2) : \
    !(s1) && !(s2) ? 0 : (s1) ? 1 : -1)
58 
59 /*
60  * Return the url as a string.
61  * (initializing 'url_string' field if necessary)
62  */
a_Url_str(const DilloUrl * u)63 char *a_Url_str(const DilloUrl *u)
64 {
65    /* Internal url handling IS transparent to the caller */
66    DilloUrl *url = (DilloUrl *) u;
67 
68    dReturn_val_if_fail (url != NULL, NULL);
69 
70    if (!url->url_string) {
71       url->url_string = dStr_sized_new(60);
72       dStr_sprintf(
73          url->url_string, "%s%s%s%s%s%s%s%s%s%s",
74          url->scheme    ? url->scheme : "",
75          url->scheme    ? ":" : "",
76          url->authority ? "//" : "",
77          url->authority ? url->authority : "",
78          // (url->path && url->path[0] != '/' && url->authority) ? "/" : "",
79          (url->authority && (!url->path || *url->path != '/')) ? "/" : "",
80          url->path      ? url->path : "",
81          url->query     ? "?" : "",
82          url->query     ? url->query : "",
83          url->fragment  ? "#" : "",
84          url->fragment  ? url->fragment : "");
85    }
86 
87    return url->url_string->str;
88 }
89 
90 /*
91  * Return the hostname as a string.
92  * (initializing 'hostname' and 'port' fields if necessary)
93  * Note: a similar approach can be taken for user:password auth.
94  */
a_Url_hostname(const DilloUrl * u)95 const char *a_Url_hostname(const DilloUrl *u)
96 {
97    char *p;
98    /* Internal url handling IS transparent to the caller */
99    DilloUrl *url = (DilloUrl *) u;
100 
101    if (!url->hostname && url->authority) {
102       if (url->authority[0] == '[' && (p = strchr(url->authority, ']'))) {
103          /* numeric ipv6 address, strip the brackets */
104          url->hostname = dStrndup(url->authority + 1,
105                                   (uint_t)(p - url->authority - 1));
106          if ((p = strchr(p, ':'))) {
107             url->port = strtol(p + 1, NULL, 10);
108          }
109       } else {
110          /* numeric ipv4 or hostname */
111          if ((p = strchr(url->authority, ':'))) {
112             url->port = strtol(p + 1, NULL, 10);
113             url->hostname = dStrndup(url->authority,
114                                      (uint_t)(p - url->authority));
115          } else {
116             url->hostname = url->authority;
117          }
118       }
119    }
120 
121    return url->hostname;
122 }
123 
124 /*
125  *  Create a DilloUrl object and initialize it.
126  *  (buffer, scheme, authority, path, query and fragment).
127  */
Url_object_new(const char * uri_str)128 static DilloUrl *Url_object_new(const char *uri_str)
129 {
130    DilloUrl *url;
131    char *s, *p;
132 
133    dReturn_val_if_fail (uri_str != NULL, NULL);
134 
135    url = dNew0(DilloUrl, 1);
136 
137    /* remove leading & trailing space from buffer */
138    url->buffer = dStrstrip(dStrdup(uri_str));
139 
140    s = (char *) url->buffer;
141    p = strpbrk(s, ":/?#");
142    if (p && p[0] == ':' && p > s) {                /* scheme */
143       *p = 0;
144       url->scheme = s;
145       s = ++p;
146    }
147    /* p = strpbrk(s, "/"); */
148    if (p == s && p[0] == '/' && p[1] == '/') {     /* authority */
149       s = p + 2;
150       p = strpbrk(s, "/?#");
151       if (p) {
152          memmove(s - 2, s, (size_t)MAX(p - s, 1));
153          url->authority = s - 2;
154          p[-2] = 0;
155          s = p;
156       } else if (*s) {
157          url->authority = s;
158          return url;
159       }
160    }
161 
162    p = strpbrk(s, "?#");
163    if (p) {                                        /* path */
164       url->path = (p > s) ? s : NULL;
165       s = p;
166    } else if (*s) {
167       url->path = s;
168       return url;
169    }
170 
171    p = strpbrk(s, "?#");
172    if (p && p[0] == '?') {                         /* query */
173       *p = 0;
174       s = p + 1;
175       url->query = s;
176       p = strpbrk(s, "#");
177       url->flags |= URL_Get;
178    }
179    if (p && p[0] == '#') {                         /* fragment */
180       *p = 0;
181       s = p + 1;
182       url->fragment = s;
183    }
184 
185    return url;
186 }
187 
188 /*
189  *  Free a DilloUrl
190  *  Do nothing if the argument is NULL
191  */
a_Url_free(DilloUrl * url)192 void a_Url_free(DilloUrl *url)
193 {
194    if (url) {
195       if (url->url_string)
196          dStr_free(url->url_string, TRUE);
197       if (url->hostname != url->authority)
198          dFree((char *)url->hostname);
199       dFree((char *)url->buffer);
200       dStr_free(url->data, 1);
201       dFree((char *)url->alt);
202       dFree(url);
203    }
204 }
205 
206 /*
207  * Resolve the URL as RFC3986 suggests.
208  */
Url_resolve_relative(const char * RelStr,DilloUrl * BaseUrlPar,const char * BaseStr)209 static Dstr *Url_resolve_relative(const char *RelStr,
210                                   DilloUrl *BaseUrlPar,
211                                   const char *BaseStr)
212 {
213    char *p, *s, *e;
214    int i;
215    Dstr *SolvedUrl, *Path;
216    DilloUrl *RelUrl, *BaseUrl = NULL;
217 
218    /* parse relative URL */
219    RelUrl = Url_object_new(RelStr);
220 
221    if (BaseUrlPar) {
222       BaseUrl = BaseUrlPar;
223    } else if (RelUrl->scheme == NULL) {
224       /* only required when there's no <scheme> in RelStr */
225       BaseUrl = Url_object_new(BaseStr);
226    }
227 
228    SolvedUrl = dStr_sized_new(64);
229    Path = dStr_sized_new(64);
230 
231    /* path empty && scheme and authority undefined */
232    if (!RelUrl->path && !RelUrl->scheme && !RelUrl->authority) {
233       dStr_append(SolvedUrl, BaseStr);
234       if ((p = strchr(SolvedUrl->str, '#')))
235          dStr_truncate(SolvedUrl, p - SolvedUrl->str);
236       if (!BaseUrl->path)
237          dStr_append_c(SolvedUrl, '/');
238 
239       if (RelUrl->query) {                        /* query */
240          if (BaseUrl->query)
241             dStr_truncate(SolvedUrl, BaseUrl->query - BaseUrl->buffer - 1);
242          dStr_append_c(SolvedUrl, '?');
243          dStr_append(SolvedUrl, RelUrl->query);
244       }
245       if (RelUrl->fragment) {                    /* fragment */
246          dStr_append_c(SolvedUrl, '#');
247          dStr_append(SolvedUrl, RelUrl->fragment);
248       }
249       goto done;
250 
251    } else if (RelUrl->scheme) {                  /* scheme */
252       dStr_append(SolvedUrl, RelStr);
253       goto done;
254 
255    } else if (RelUrl->authority) {               /* authority */
256       // Set the Path buffer and goto "STEP 7";
257       if (RelUrl->path)
258          dStr_append(Path, RelUrl->path);
259 
260    } else {
261       if (RelUrl->path && RelUrl->path[0] == '/') {   /* absolute path */
262          ; /* Ignore BaseUrl path */
263       } else if (BaseUrl->path) {                     /* relative path */
264          dStr_append(Path, BaseUrl->path);
265          for (i = Path->len; --i >= 0 && Path->str[i] != '/'; ) ;
266          if (i >= 0 && Path->str[i] == '/')
267             dStr_truncate(Path, ++i);
268       }
269       if (RelUrl->path)
270          dStr_append(Path, RelUrl->path);
271 
272       // erase "./"
273       while ((p=strstr(Path->str, "./")) &&
274              (p == Path->str || p[-1] == '/'))
275          dStr_erase(Path, p - Path->str, 2);
276       // erase last "."
277       if (Path->len && Path->str[Path->len - 1] == '.' &&
278           (Path->len == 1 || Path->str[Path->len - 2] == '/'))
279          dStr_truncate(Path, Path->len - 1);
280 
281       // erase "<segment>/../" and "<segment>/.."
282       s = p = Path->str;
283       while ( (p = strstr(p, "/..")) != NULL ) {
284          if (p[3] == '/' || !p[3]) { //  "/../" | "/.."
285             for (e = p + 3 ; p > s && p[-1] != '/'; --p) ;
286             dStr_erase(Path, p - Path->str, e - p + (p > s && *e != 0));
287             p -= (p > Path->str);
288          } else
289             p += 3;
290       }
291    }
292 
293    /* STEP 7
294     */
295 
296    /* scheme */
297    if (BaseUrl->scheme) {
298       dStr_append(SolvedUrl, BaseUrl->scheme);
299       dStr_append_c(SolvedUrl, ':');
300    }
301 
302    /* authority */
303    if (RelUrl->authority) {
304       dStr_append(SolvedUrl, "//");
305       dStr_append(SolvedUrl, RelUrl->authority);
306    } else if (BaseUrl->authority) {
307       dStr_append(SolvedUrl, "//");
308       dStr_append(SolvedUrl, BaseUrl->authority);
309    }
310 
311    /* path */
312    if ((RelUrl->authority || BaseUrl->authority) &&
313        ((Path->len == 0 && (RelUrl->query || RelUrl->fragment)) ||
314         (Path->len && Path->str[0] != '/')))
315       dStr_append_c(SolvedUrl, '/'); /* hack? */
316    dStr_append(SolvedUrl, Path->str);
317 
318    /* query */
319    if (RelUrl->query) {
320       dStr_append_c(SolvedUrl, '?');
321       dStr_append(SolvedUrl, RelUrl->query);
322    }
323 
324    /* fragment */
325    if (RelUrl->fragment) {
326       dStr_append_c(SolvedUrl, '#');
327       dStr_append(SolvedUrl, RelUrl->fragment);
328    }
329 
330 done:
331    dStr_free(Path, TRUE);
332    a_Url_free(RelUrl);
333    if (BaseUrl != BaseUrlPar)
334       a_Url_free(BaseUrl);
335    return SolvedUrl;
336 }
337 
338 /*
339  *  Transform (and resolve) an URL string into the respective DilloURL.
340  *  If URL  =  "http://dillo.sf.net:8080/index.html?long#part2"
341  *  then the resulting DilloURL should be:
342  *  DilloURL = {
343  *     url_string         = "http://dillo.sf.net:8080/index.html?long#part2"
344  *     scheme             = "http"
345  *     authority          = "dillo.sf.net:8080:
346  *     path               = "/index.html"
347  *     query              = "long"
348  *     fragment           = "part2"
349  *     hostname           = "dillo.sf.net"
350  *     port               = 8080
351  *     flags              = URL_Get
352  *     data               = Dstr * ("")
353  *     alt                = NULL
354  *     ismap_url_len      = 0
355  *  }
356  *
357  *  Return NULL if URL is badly formed.
358  */
a_Url_new(const char * url_str,const char * base_url)359 DilloUrl* a_Url_new(const char *url_str, const char *base_url)
360 {
361    DilloUrl *url;
362    char *urlstr = (char *)url_str;  /* auxiliar variable, don't free */
363    char *p, *str1 = NULL, *str2 = NULL;
364    Dstr *SolvedUrl;
365    int i, n_ic, n_ic_spc;
366 
367    dReturn_val_if_fail (url_str != NULL, NULL);
368 
369    /* Count illegal characters (0x00-0x1F, 0x7F-0xFF and space) */
370    n_ic = n_ic_spc = 0;
371    for (p = (char*)url_str; *p; p++) {
372       n_ic_spc += (*p == ' ') ? 1 : 0;
373       n_ic += (*p != ' ' && *p > 0x1F && *p < 0x7F) ? 0 : 1;
374    }
375    if (n_ic) {
376       /* Encode illegal characters (they could also be stripped).
377        * There's no standard for illegal chars; we chose to encode. */
378       p = str1 = dNew(char, strlen(url_str) + 2*n_ic + 1);
379       for (i = 0; url_str[i]; ++i)
380          if (url_str[i] > 0x1F && url_str[i] < 0x7F && url_str[i] != ' ')
381             *p++ = url_str[i];
382          else  {
383            *p++ = '%';
384            *p++ = HEX[(url_str[i] >> 4) & 15];
385            *p++ = HEX[url_str[i] & 15];
386          }
387       *p = 0;
388       urlstr = str1;
389    }
390 
391    /* let's use a heuristic to set http: as default */
392    if (!base_url) {
393       base_url = "http:";
394       if (urlstr[0] != '/') {
395          p = strpbrk(urlstr, "/#?:");
396          if (!p || *p != ':')
397             urlstr = str2 = dStrconcat("//", urlstr, NULL);
398       } else if (urlstr[1] != '/')
399          urlstr = str2 = dStrconcat("/", urlstr, NULL);
400    }
401 
402    /* Resolve the URL */
403    SolvedUrl = Url_resolve_relative(urlstr, NULL, base_url);
404    _MSG("SolvedUrl = %s\n", SolvedUrl->str);
405 
406    /* Fill url data */
407    url = Url_object_new(SolvedUrl->str);
408    url->data = dStr_new("");
409    url->url_string = SolvedUrl;
410    url->illegal_chars = n_ic;
411    url->illegal_chars_spc = n_ic_spc;
412 
413    dFree(str1);
414    dFree(str2);
415    return url;
416 }
417 
418 
419 /*
420  *  Duplicate a Url structure
421  */
a_Url_dup(const DilloUrl * ori)422 DilloUrl* a_Url_dup(const DilloUrl *ori)
423 {
424    DilloUrl *url;
425 
426    url = Url_object_new(URL_STR_(ori));
427    dReturn_val_if_fail (url != NULL, NULL);
428 
429    url->url_string           = dStr_new(URL_STR(ori));
430    url->port                 = ori->port;
431    url->flags                = ori->flags;
432    url->alt                  = dStrdup(ori->alt);
433    url->ismap_url_len        = ori->ismap_url_len;
434    url->illegal_chars        = ori->illegal_chars;
435    url->illegal_chars_spc    = ori->illegal_chars_spc;
436    url->data                 = dStr_sized_new(URL_DATA(ori)->len);
437    dStr_append_l(url->data, URL_DATA(ori)->str, URL_DATA(ori)->len);
438    return url;
439 }
440 
441 /*
442  *  Compare two Url's to check if they're the same, or which one is bigger.
443  *
444  *  The fields which are compared here are:
445  *  <scheme>, <authority>, <path>, <query> and <data>
446  *  Other fields are left for the caller to check
447  *
448  *  Return value: 0 if equal, > 0 if A > B, < 0 if A < B.
449  *
450  *  Note: this function defines a sorting order different from strcmp!
451  */
a_Url_cmp(const DilloUrl * A,const DilloUrl * B)452 int a_Url_cmp(const DilloUrl *A, const DilloUrl *B)
453 {
454    int st;
455 
456    dReturn_val_if_fail(A && B, 1);
457 
458    if (A == B ||
459        ((st = URL_STR_FIELD_I_CMP(A->authority, B->authority)) == 0 &&
460         (st = strcmp(A->path ? A->path + (*A->path == '/') : "",
461                      B->path ? B->path + (*B->path == '/') : "")) == 0 &&
462         //(st = URL_STR_FIELD_CMP(A->path, B->path)) == 0 &&
463         (st = URL_STR_FIELD_CMP(A->query, B->query)) == 0 &&
464         (st = dStr_cmp(A->data, B->data)) == 0 &&
465         (st = URL_STR_FIELD_I_CMP(A->scheme, B->scheme)) == 0))
466       return 0;
467    return st;
468 }
469 
470 /*
471  * Set DilloUrl flags
472  */
a_Url_set_flags(DilloUrl * u,int flags)473 void a_Url_set_flags(DilloUrl *u, int flags)
474 {
475    if (u)
476       u->flags = flags;
477 }
478 
479 /*
480  * Set DilloUrl data (like POST info, etc.)
481  */
a_Url_set_data(DilloUrl * u,Dstr ** data)482 void a_Url_set_data(DilloUrl *u, Dstr **data)
483 {
484    if (u) {
485       dStr_free(u->data, 1);
486       u->data = *data;
487       *data = NULL;
488    }
489 }
490 
491 /*
492  * Set DilloUrl alt (alternate text to the URL. Used by image maps)
493  */
a_Url_set_alt(DilloUrl * u,const char * alt)494 void a_Url_set_alt(DilloUrl *u, const char *alt)
495 {
496    if (u) {
497       dFree((char *)u->alt);
498       u->alt = dStrdup(alt);
499    }
500 }
501 
502 /*
503  * Set DilloUrl ismap coordinates
504  * (this is optimized for not hogging the CPU)
505  */
a_Url_set_ismap_coords(DilloUrl * u,char * coord_str)506 void a_Url_set_ismap_coords(DilloUrl *u, char *coord_str)
507 {
508    dReturn_if_fail (u && coord_str);
509 
510    if (!u->ismap_url_len) {
511       /* Save base-url length (without coords) */
512       u->ismap_url_len  = URL_STR_(u) ? u->url_string->len : 0;
513       a_Url_set_flags(u, URL_FLAGS(u) | URL_Ismap);
514    }
515    if (u->url_string) {
516       dStr_truncate(u->url_string, u->ismap_url_len);
517       dStr_append(u->url_string, coord_str);
518       u->query = u->url_string->str + u->ismap_url_len + 1;
519    }
520 }
521 
/*
 * Given an hex octet (e.g., e3, 2F, 20), return the corresponding
 * character value (0-255) if the octet is valid, and -1 otherwise.
 *
 * Only the first two characters of 's' are looked at, and both must be
 * hexadecimal digits. They are checked explicitly because strtol() on
 * its own also accepts leading whitespace and a sign, which would let
 * sequences such as "+A" or " 5" through as valid octets.
 * A NULL or too short 's' yields -1.
 */
static int Url_decode_hex_octet(const char *s)
{
   char hex[3];

   /* isxdigit(0) is false, so s[1] is never read past the terminator */
   if (!s || !isxdigit((unsigned char)s[0]) || !isxdigit((unsigned char)s[1]))
      return -1;

   hex[0] = s[0];
   hex[1] = s[1];
   hex[2] = 0;
   return (int) strtol(hex, NULL, 16);
}
539 
540 /*
541  * Parse possible hexadecimal octets in the URI path.
542  * Returns a new allocated string.
543  */
a_Url_decode_hex_str(const char * str)544 char *a_Url_decode_hex_str(const char *str)
545 {
546    char *new_str, *dest;
547    int i, val;
548 
549    if (!str)
550       return NULL;
551 
552    /* most cases won't have hex octets */
553    if (!strchr(str, '%'))
554       return dStrdup(str);
555 
556    dest = new_str = dNew(char, strlen(str) + 1);
557 
558    for (i = 0; str[i]; i++) {
559       *dest++ = (str[i] == '%' && (val = Url_decode_hex_octet(str+i+1)) >= 0) ?
560                 i+=2, val : str[i];
561    }
562    *dest++ = 0;
563 
564    new_str = dRealloc(new_str, sizeof(char) * (dest - new_str));
565    return new_str;
566 }
567 
568 /*
569  * Urlencode 'str'
570  * -RL :: According to the RFC 1738, only alphanumerics, the special
571  *        characters "$-_.+!*'(),", and reserved characters ";/?:@=&" used
572  *        for their *reserved purposes* may be used unencoded within a URL.
573  * We'll escape everything but alphanumeric and "-_.*" (as lynx).  --Jcid
574  *
575  * Note: the content type "application/x-www-form-urlencoded" is used:
576  *       i.e., ' ' -> '+' and '\n' -> CR LF (see HTML 4.01, Sec. 17.13.4)
577  */
a_Url_encode_hex_str(const char * str)578 char *a_Url_encode_hex_str(const char *str)
579 {
580    static const char *const verbatim = "-_.*";
581    char *newstr, *c;
582 
583    if (!str)
584       return NULL;
585 
586    newstr = dNew(char, 6*strlen(str)+1);
587 
588    for (c = newstr; *str; str++)
589       if ((dIsalnum(*str) && isascii(*str)) || strchr(verbatim, *str))
590          *c++ = *str;
591       else if (*str == ' ')
592          *c++ = '+';
593       else if (*str == '\n') {
594          *c++ = '%';
595          *c++ = '0';
596          *c++ = 'D';
597          *c++ = '%';
598          *c++ = '0';
599          *c++ = 'A';
600       } else {
601          *c++ = '%';
602          *c++ = HEX[(*str >> 4) & 15];
603          *c++ = HEX[*str & 15];
604       }
605    *c = 0;
606 
607   return newstr;
608 }
609 
610 
/*
 * RFC-3986 suggests this stripping when "importing" URLs from other media.
 * Removed: a leading "URL:", a '<' right after it, one trailing '>', and
 * any embedded whitespace.
 * (Illegal chars, 00-1F and 7F-FF, are dropped as well)
 * Returns a newly allocated string, or NULL if the copy of 'str' is NULL.
 */
char *a_Url_string_strip_delimiters(const char *str)
{
   char *result = dStrdup(str);
   char *rd, *wr;

   if (result == NULL)
      return NULL;

   /* skip the leading decorations */
   rd = result;
   if (strncmp(rd, "URL:", 4) == 0)
      rd += 4;
   if (*rd == '<')
      rd++;

   /* compact the wanted characters in place */
   wr = result;
   for ( ; *rd; rd++) {
      if (*rd == ' ' || *rd <= 0x1F || *rd >= 0x7F)
         continue;
      *wr++ = *rd;
   }

   /* drop a closing '>' */
   if (wr > result && wr[-1] == '>')
      --wr;
   *wr = 0;

   return result;
}
637 
638 /*
639  * Is the provided hostname an IP address?
640  */
Url_host_is_ip(const char * host)641 static bool_t Url_host_is_ip(const char *host)
642 {
643    uint_t len;
644 
645    if (!host || !*host)
646       return FALSE;
647 
648    len = strlen(host);
649 
650    if (len == strspn(host, "0123456789.")) {
651       _MSG("an IPv4 address\n");
652       return TRUE;
653    }
654    if (strchr(host, ':') &&
655        (len == strspn(host, "0123456789abcdefABCDEF:."))) {
656       /* The precise format is shown in section 3.2.2 of rfc 3986 */
657       MSG("an IPv6 address\n");
658       return TRUE;
659    }
660    return FALSE;
661 }
662 
663 /*
664  * How many internal dots are in the public portion of this hostname?
665  * e.g., for "www.dillo.org", it is one because everything under "dillo.org",
666  * as a .org domain, is part of one organization.
667  *
668  * Of course this is only a simple and imperfect approximation of
669  * organizational boundaries.
670  */
Url_host_public_internal_dots(const char * host)671 static uint_t Url_host_public_internal_dots(const char *host)
672 {
673    uint_t ret = 1;
674 
675    if (host) {
676       int start, after, tld_len;
677 
678       /* We may be able to trust the format of the host string more than
679        * I am here. Trailing dots and no dots are real possibilities, though.
680        */
681       after = strlen(host);
682       if (after > 0 && host[after - 1] == '.')
683          after--;
684       start = after;
685       while (start > 0 && host[start - 1] != '.')
686          start--;
687       tld_len = after - start;
688 
689       if (tld_len > 0) {
690          /* These TLDs were chosen by examining the current publicsuffix list
691           * in October 2014 and picking out those where it was simplest for
692           * them to describe the situation by beginning with a "*.[tld]" rule
693           * or every rule was "[something].[tld]".
694           *
695           * TODO: Consider the old publicsuffix code again. This TLD list has
696           * shrunk and shrunk over the years, and has become a poorer and
697           * poorer approximation of administrative boundaries.
698           */
699          const char *const tlds[] = {"bd","bn","ck","cy","er","fj","fk",
700                                      "gu","il","jm","ke","kh","kw","mm","mz",
701                                      "ni","np","pg","ye","za","zm","zw"};
702          uint_t i, tld_num = sizeof(tlds) / sizeof(tlds[0]);
703 
704          for (i = 0; i < tld_num; i++) {
705             if (strlen(tlds[i]) == (uint_t) tld_len &&
706                 !dStrnAsciiCasecmp(tlds[i], host + start, tld_len)) {
707                _MSG("TLD code matched %s\n", tlds[i]);
708                ret++;
709                break;
710             }
711          }
712       }
713    }
714    return ret;
715 }
716 
717 /*
718  * Given a URL host string, return the portion that is public, i.e., the
719  * domain that is in a registry outside the organization.
720  * For 'www.dillo.org', that would be 'dillo.org'.
721  */
Url_host_find_public_suffix(const char * host)722 static const char *Url_host_find_public_suffix(const char *host)
723 {
724    const char *s;
725    uint_t dots;
726 
727    if (!host || !*host || Url_host_is_ip(host))
728       return host;
729 
730    s = host;
731 
732    while (s[1])
733       s++;
734 
735    if (s > host && *s == '.') {
736       /* don't want to deal with trailing dot */
737       s--;
738    }
739 
740    dots = Url_host_public_internal_dots(host);
741 
742    /* With a proper host string, we should not be pointing to a dot now. */
743 
744    while (s > host) {
745       if (s[-1] == '.') {
746          if (dots == 0)
747             break;
748          else
749             dots--;
750       }
751       s--;
752    }
753 
754    _MSG("public suffix of %s is %s\n", host, s);
755    return s;
756 }
757 
a_Url_same_organization(const DilloUrl * u1,const DilloUrl * u2)758 bool_t a_Url_same_organization(const DilloUrl *u1, const DilloUrl *u2)
759 {
760    if (!u1 || !u2)
761       return FALSE;
762 
763    return dStrAsciiCasecmp(Url_host_find_public_suffix(URL_HOST(u1)),
764                            Url_host_find_public_suffix(URL_HOST(u2)))
765           ? FALSE : TRUE;
766 }
767