1 /*
2 * Copyright 2006 John M Bell <jmb202@ecs.soton.ac.uk>
3 * Copyright 2009 John Tytgat <joty@netsurf-browser.org>
4 *
5 * This file is part of NetSurf, http://www.netsurf-browser.org/
6 *
7 * NetSurf is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; version 2 of the License.
10 *
11 * NetSurf is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 /**
21 * \file
22 * Unified URL information database implementation
23 *
24 * URLs are stored in a tree-based structure as follows:
25 *
26 * The host component is extracted from each URL and, if a FQDN, split on
27 * every '.'. The tree is constructed by inserting each FQDN segment in
28 * reverse order. Duplicate nodes are merged.
29 *
30 * If the host part of an URL is an IP address, then this is added to the
31 * tree verbatim (as if it were a TLD).
32 *
33 * This provides something looking like:
34 *
35 *                            root (a sentinel)
36 *                              |
37 *      -------------------------------------------------
38 *      |       |       |       |       |       |       |
39 *     com     edu     gov  127.0.0.1  net     org     uk     TLDs
40 *      |       |       |               |       |       |
41 *    google   ...     ...             ...     ...     co     2LDs
42 *      |                                               |
43 *     www                                             bbc    Hosts/Subdomains
44 *                                                      |
45 *                                                     www    ...
46 *
47 * Each of the nodes in this tree is a struct host_part. This stores the
48 * FQDN segment (or IP address) with which the node is concerned. Each node
49 * may contain further information about paths on a host (struct path_data)
50 * or SSL certificate processing on a host-wide basis
51 * (host_part::permit_invalid_certs).
52 *
53 * Path data is concerned with storing various metadata about the path in
54 * question. This includes global history data, HTTP authentication details
55 * and any associated HTTP cookies. This is stored as a tree of path segments
56 * hanging off the relevant host_part node.
57 *
58 * Therefore, to find the last visited time of the URL
59 * http://www.example.com/path/to/resource.html, the FQDN tree would be
60 * traversed in the order root -> "com" -> "example" -> "www". The "www"
61 * node would have attached to it a tree of struct path_data:
62 *
63 *        (sentinel)
64 *            |
65 *           path
66 *            |
67 *            to
68 *            |
69 *      resource.html
70 *
71 * This represents the absolute path "/path/to/resource.html". The leaf node
72 * "resource.html" contains the last visited time of the resource.
73 *
74 * The mechanism described above is, however, not particularly conducive to
75 * fast searching of the database for a given URL (or URLs beginning with a
76 * given prefix). Therefore, an ancillary data structure is used to enable
77 * fast searching. This structure simply reflects the contents of the
78 * database, with entries being added/removed at the same time as for the
79 * core database. In order to ensure that degenerate cases are kept to a
80 * minimum, we use an AA tree. This is an approximation of a Red-Black tree
81 * with similar performance characteristics, but with a significantly
82 * simpler implementation. Entries in this tree comprise pointers to the
83 * leaf nodes of the host tree described above.
84 *
85 * REALLY IMPORTANT NOTE: urldb expects all URLs to be normalised. Use of
86 * non-normalised URLs with urldb will result in undefined behaviour and
87 * potential crashes.
88 */
89
90 #include <assert.h>
91 #include <stdbool.h>
92 #include <stdio.h>
93 #include <stdlib.h>
94 #include <string.h>
95 #include <strings.h>
96 #include <time.h>
97 #ifdef WITH_NSPSL
98 #include <nspsl.h>
99 #endif
100
101 #include "utils/inet.h"
102 #include "utils/nsoption.h"
103 #include "utils/log.h"
104 #include "utils/corestrings.h"
105 #include "utils/url.h"
106 #include "utils/utils.h"
107 #include "utils/bloom.h"
108 #include "utils/time.h"
109 #include "utils/nsurl.h"
110 #include "utils/ascii.h"
111 #include "utils/http.h"
112 #include "netsurf/bitmap.h"
113 #include "desktop/cookie_manager.h"
114 #include "desktop/gui_internal.h"
115
116 #include "content/content.h"
117 #include "content/urldb.h"
118
119 #ifdef WITH_AMISSL
120 /* AmiSSL needs everything to be using bsdsocket directly to avoid conflicts */
121 #include <proto/bsdsocket.h>
122 #endif
123
124 /**
125 * cookie entry.
126 *
127 * \warning This *must* be kept in sync with the public interface in
128 * netsurf/cookie_db.h
129 */
130 struct cookie_internal_data {
131 struct cookie_internal_data *prev; /**< Previous in list */
132 struct cookie_internal_data *next; /**< Next in list */
133
134 char *name; /**< Cookie name */
135 char *value; /**< Cookie value */
136 bool value_was_quoted; /**< Value was quoted in Set-Cookie: */
137 char *comment; /**< Cookie comment */
138 bool domain_from_set; /**< Domain came from Set-Cookie: header */
139 char *domain; /**< Domain */
140 bool path_from_set; /**< Path came from Set-Cookie: header */
141 char *path; /**< Path */
142 time_t expires; /**< Expiry timestamp, or -1 for session */
143 time_t last_used; /**< Last used time */
144 bool secure; /**< Only send for HTTPS requests */
145 bool http_only; /**< Only expose to HTTP(S) requests */
146 enum cookie_version version; /**< Specification compliance */
147 bool no_destroy; /**< Never destroy this cookie,
148 * unless it's expired */
149
150 };
151
152
153 /**
154 * A protection space
155 *
156 * This is defined as a tuple canonical_root_url and realm. This
157 * structure lives as linked list element in a leaf host_part struct
158 * so we need additional scheme and port to have a canonical_root_url.
159 */
160 struct prot_space_data {
161 /**
162 * URL scheme of canonical hostname of this protection space.
163 */
164 lwc_string *scheme;
165 /**
166 * Port number of canonical hostname of this protection
167 * space. When 0, it means the default port for given scheme,
168 * i.e. 80 (http), 443 (https).
169 */
170 unsigned int port;
171 /** Protection realm */
172 char *realm;
173
174 /**
175 * Authentication details for this protection space in form
176 * username:password
177 */
178 char *auth;
179 /** Next sibling */
180 struct prot_space_data *next;
181 };
182
183
184 /**
185 * meta data about a url
186 *
187 * \warning must be kept in sync with url_data structure in netsurf/url_db.h
188 */
189 struct url_internal_data {
190 char *title; /**< Resource title */
191 unsigned int visits; /**< Visit count */
192 time_t last_visit; /**< Last visit time */
193 content_type type; /**< Type of resource */
194 };
195
196
197 /**
198 * data entry for url
199 */
200 struct path_data {
201 nsurl *url; /**< Full URL */
202 lwc_string *scheme; /**< URL scheme for data */
203 unsigned int port; /**< Port number for data. When 0, it means
204 * the default port for given scheme, i.e.
205 * 80 (http), 443 (https). */
206 char *segment; /**< Path segment for this node */
207 unsigned int frag_cnt; /**< Number of entries in path_data::fragment */
208 char **fragment; /**< Array of fragments */
209 bool persistent; /**< This entry should persist */
210
211 struct url_internal_data urld; /**< URL data for resource */
212
213 /**
214 * Protection space to which this resource belongs too. Can be
215 * NULL when it does not belong to a protection space or when
216 * it is not known. No ownership (is with struct host_part::prot_space).
217 */
218 const struct prot_space_data *prot_space;
219 /** Cookies associated with resource */
220 struct cookie_internal_data *cookies;
221 /** Last cookie in list */
222 struct cookie_internal_data *cookies_end;
223
224 struct path_data *next; /**< Next sibling */
225 struct path_data *prev; /**< Previous sibling */
226 struct path_data *parent; /**< Parent path segment */
227 struct path_data *children; /**< Child path segments */
228 struct path_data *last; /**< Last child */
229 };
230
/**
 * HSTS data recorded for a host.
 */
struct hsts_data {
	/** Expiry time */
	time_t expires;
	/** Whether to include subdomains */
	bool include_sub_domains;
};
235
236 struct host_part {
237 /**
238 * Known paths on this host. This _must_ be first so that
239 * struct host_part *h = (struct host_part *)mypath; works
240 */
241 struct path_data paths;
242 /**
243 * Allow access to SSL protected resources on this host
244 * without verifying certificate authenticity
245 */
246 bool permit_invalid_certs;
247 /* HSTS data */
248 struct hsts_data hsts;
249
250 /**
251 * Part of host string
252 */
253 char *part;
254
255 /**
256 * Linked list of all known proctection spaces known for this
257 * host and all its schems and ports.
258 */
259 struct prot_space_data *prot_space;
260
261 struct host_part *next; /**< Next sibling */
262 struct host_part *prev; /**< Previous sibling */
263 struct host_part *parent; /**< Parent host part */
264 struct host_part *children; /**< Child host parts */
265 };
266
267
/**
 * Search index node.
 */
struct search_node {
	/** Host tree entry */
	const struct host_part *data;

	/** Node level */
	unsigned int level;

	/** Left subtree */
	struct search_node *left;
	/** Right subtree */
	struct search_node *right;
};
279
280 /** Root database handle */
281 static struct host_part db_root;
282
283 /** Search trees - one per letter + 1 for IPs + 1 for Everything Else */
284 #define NUM_SEARCH_TREES 28
285 #define ST_IP 0
286 #define ST_EE 1
287 #define ST_DN 2
288 static struct search_node empty = { 0, 0, &empty, &empty };
289 static struct search_node *search_trees[NUM_SEARCH_TREES] = {
290 &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
291 &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
292 &empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
293 &empty, &empty, &empty, &empty
294 };
295
296 /** Minimum cookie database file version */
297 #define MIN_COOKIE_FILE_VERSION 100
298 /** Current cookie database file version */
299 #define COOKIE_FILE_VERSION 102
300 /** loaded cookie file version */
301 static int loaded_cookie_file_version;
302
303 /** Minimum URL database file version */
304 #define MIN_URL_FILE_VERSION 106
305 /** Current URL database file version */
306 #define URL_FILE_VERSION 107
307
308 /**
309 * filter for url presence in database
310 *
311 * Bloom filter used for short-circuting the false case of "is this
312 * URL in the database?". BLOOM_SIZE controls how large the filter is
313 * in bytes. Primitive experimentation shows that for a filter of X
314 * bytes filled with X items, searching for X items not in the filter
315 * has a 5% false-positive rate. We set it to 32kB, which should be
316 * enough for all but the largest databases, while not being
317 * shockingly wasteful on memory.
318 */
319 static struct bloom_filter *url_bloom;
320 /**
321 * Size of url filter
322 */
323 #define BLOOM_SIZE (1024 * 32)
324
325
326 /**
327 * write a time_t to a file portably
328 *
329 * \param fp File to write to
330 * \param val the unix time value to output
331 * \return NSERROR_OK on success
332 */
urldb_write_timet(FILE * fp,time_t val)333 static nserror urldb_write_timet(FILE *fp, time_t val)
334 {
335 int use;
336 char op[32];
337
338 use = nsc_sntimet(op, 32, &val);
339 if (use == 0) {
340 fprintf(fp, "%i\n", (int)val);
341 } else {
342 fprintf(fp, "%.*s\n", use, op);
343 }
344 return NSERROR_OK;
345 }
346
347 /**
348 * Write paths associated with a host
349 *
350 * \param parent Root of (sub)tree to write
351 * \param host Current host name
352 * \param fp File to write to
353 * \param path Current path string
354 * \param path_alloc Allocated size of path
355 * \param path_used Used size of path
356 * \param expiry Expiry time of URLs
357 */
358 static void
urldb_write_paths(const struct path_data * parent,const char * host,FILE * fp,char ** path,int * path_alloc,int * path_used,time_t expiry)359 urldb_write_paths(const struct path_data *parent,
360 const char *host,
361 FILE *fp,
362 char **path,
363 int *path_alloc,
364 int *path_used,
365 time_t expiry)
366 {
367 const struct path_data *p = parent;
368 int i;
369
370 do {
371 int seglen = p->segment != NULL ? strlen(p->segment) : 0;
372 int len = *path_used + seglen + 1;
373
374 if (*path_alloc < len) {
375 char *temp;
376 temp = realloc(*path,
377 (len > 64) ? len : *path_alloc + 64);
378 if (!temp) {
379 return;
380 }
381 *path = temp;
382 *path_alloc = (len > 64) ? len : *path_alloc + 64;
383 }
384
385 if (p->segment != NULL) {
386 memcpy(*path + *path_used - 1, p->segment, seglen);
387 }
388
389 if (p->children != NULL) {
390 (*path)[*path_used + seglen - 1] = '/';
391 (*path)[*path_used + seglen] = '\0';
392 } else {
393 (*path)[*path_used + seglen - 1] = '\0';
394 len -= 1;
395 }
396
397 *path_used = len;
398
399 if (p->children != NULL) {
400 /* Drill down into children */
401 p = p->children;
402 } else {
403 /* leaf node */
404 if (p->persistent ||
405 ((p->urld.last_visit > expiry) &&
406 (p->urld.visits > 0))) {
407 fprintf(fp, "%s\n", lwc_string_data(p->scheme));
408
409 if (p->port) {
410 fprintf(fp,"%d\n", p->port);
411 } else {
412 fprintf(fp, "\n");
413 }
414
415 fprintf(fp, "%s\n", *path);
416
417 /** \todo handle fragments? */
418
419 /* number of visits */
420 fprintf(fp, "%i\n", p->urld.visits);
421
422 /* time entry was last used */
423 urldb_write_timet(fp, p->urld.last_visit);
424
425 /* entry type */
426 fprintf(fp, "%i\n", (int)p->urld.type);
427
428 fprintf(fp, "\n");
429
430 if (p->urld.title) {
431 uint8_t *s = (uint8_t *) p->urld.title;
432
433 for (i = 0; s[i] != '\0'; i++)
434 if (s[i] < 32)
435 s[i] = ' ';
436 for (--i; ((i > 0) && (s[i] == ' '));
437 i--)
438 s[i] = '\0';
439 fprintf(fp, "%s\n", p->urld.title);
440 } else {
441 fprintf(fp, "\n");
442 }
443 }
444
445 /* Now, find next node to process. */
446 while (p != parent) {
447 int seglen = p->segment != NULL
448 ? strlen(p->segment) : 0;
449
450 /* Remove our segment from the path */
451 *path_used -= seglen;
452 (*path)[*path_used - 1] = '\0';
453
454 if (p->next != NULL) {
455 /* Have a sibling, process that */
456 p = p->next;
457 break;
458 }
459
460 /* Going up, so remove '/' */
461 *path_used -= 1;
462 (*path)[*path_used - 1] = '\0';
463
464 /* Ascend tree */
465 p = p->parent;
466 }
467 }
468 } while (p != parent);
469 }
470
471
472 /**
473 * Count number of URLs associated with a host
474 *
475 * \param root Root of path data tree
476 * \param expiry Expiry time for URLs
477 * \param count Pointer to count
478 */
479 static void
urldb_count_urls(const struct path_data * root,time_t expiry,unsigned int * count)480 urldb_count_urls(const struct path_data *root,
481 time_t expiry,
482 unsigned int *count)
483 {
484 const struct path_data *p = root;
485
486 do {
487 if (p->children != NULL) {
488 /* Drill down into children */
489 p = p->children;
490 } else {
491 /* No more children, increment count if required */
492 if (p->persistent ||
493 ((p->urld.last_visit > expiry) &&
494 (p->urld.visits > 0))) {
495 (*count)++;
496 }
497
498 /* Now, find next node to process. */
499 while (p != root) {
500 if (p->next != NULL) {
501 /* Have a sibling, process that */
502 p = p->next;
503 break;
504 }
505
506 /* Ascend tree */
507 p = p->parent;
508 }
509 }
510 } while (p != root);
511 }
512
513
514 /**
515 * Save a search (sub)tree
516 *
517 * \param parent root node of search tree to save.
518 * \param fp File to write to
519 */
urldb_save_search_tree(struct search_node * parent,FILE * fp)520 static void urldb_save_search_tree(struct search_node *parent, FILE *fp)
521 {
522 char host[256];
523 const struct host_part *h;
524 unsigned int path_count = 0;
525 char *path, *p, *end;
526 int path_alloc = 64, path_used = 1;
527 time_t expiry, hsts_expiry = 0;
528 int hsts_include_subdomains = 0;
529
530 expiry = time(NULL) - ((60 * 60 * 24) * nsoption_int(expire_url));
531
532 if (parent == &empty)
533 return;
534
535 urldb_save_search_tree(parent->left, fp);
536
537 path = malloc(path_alloc);
538 if (!path)
539 return;
540
541 path[0] = '\0';
542
543 for (h = parent->data, p = host, end = host + sizeof host;
544 h && h != &db_root && p < end; h = h->parent) {
545 int written = snprintf(p, end - p, "%s%s", h->part,
546 (h->parent && h->parent->parent) ? "." : "");
547 if (written < 0) {
548 free(path);
549 return;
550 }
551 p += written;
552 }
553
554 h = parent->data;
555 if (h && h->hsts.expires > expiry) {
556 hsts_expiry = h->hsts.expires;
557 hsts_include_subdomains = h->hsts.include_sub_domains;
558 }
559
560 urldb_count_urls(&parent->data->paths, expiry, &path_count);
561
562 if (path_count > 0) {
563 fprintf(fp, "%s %i ", host, hsts_include_subdomains);
564 urldb_write_timet(fp, hsts_expiry);
565 fprintf(fp, "%i\n", path_count);
566
567 urldb_write_paths(&parent->data->paths, host, fp,
568 &path, &path_alloc, &path_used, expiry);
569 } else if (hsts_expiry) {
570 fprintf(fp, "%s %i ", host, hsts_include_subdomains);
571 urldb_write_timet(fp, hsts_expiry);
572 fprintf(fp, "0\n");
573 }
574
575 free(path);
576
577 urldb_save_search_tree(parent->right, fp);
578 }
579
580
581 /**
582 * Path data iterator (internal)
583 *
584 * \param parent Root of subtree to iterate over
585 * \param url_callback Callback function
586 * \param cookie_callback Callback function
587 * \return true to continue, false otherwise
588 */
589 static bool
urldb_iterate_entries_path(const struct path_data * parent,bool (* url_callback)(nsurl * url,const struct url_data * data),bool (* cookie_callback)(const struct cookie_data * data))590 urldb_iterate_entries_path(const struct path_data *parent,
591 bool (*url_callback)(nsurl *url, const struct url_data *data),
592 bool (*cookie_callback)(const struct cookie_data *data))
593 {
594 const struct path_data *p = parent;
595 const struct cookie_data *c;
596
597 do {
598 if (p->children != NULL) {
599 /* Drill down into children */
600 p = p->children;
601 } else {
602 /* All leaf nodes in the path tree should have an URL or
603 * cookies attached to them. If this is not the case, it
604 * indicates that there's a bug in the file loader/URL
605 * insertion code. Therefore, assert this here. */
606 assert(url_callback || cookie_callback);
607
608 /** \todo handle fragments? */
609 if (url_callback) {
610 const struct url_internal_data *u = &p->urld;
611
612 assert(p->url);
613
614 if (!url_callback(p->url,
615 (const struct url_data *) u))
616 return false;
617 } else {
618 c = (const struct cookie_data *)p->cookies;
619 for (; c != NULL; c = c->next) {
620 if (!cookie_callback(c))
621 return false;
622 }
623 }
624
625 /* Now, find next node to process. */
626 while (p != parent) {
627 if (p->next != NULL) {
628 /* Have a sibling, process that */
629 p = p->next;
630 break;
631 }
632
633 /* Ascend tree */
634 p = p->parent;
635 }
636 }
637 } while (p != parent);
638
639 return true;
640 }
641
642
643 /**
644 * Check whether a host string is an IP address.
645 *
646 * This call detects IPv4 addresses (all of dotted-quad or subsets,
647 * decimal or hexadecimal notations) and IPv6 addresses (including
648 * those containing embedded IPv4 addresses.)
649 *
650 * \param host a hostname terminated by '\0'
651 * \return true if the hostname is an IP address, false otherwise
652 */
urldb__host_is_ip_address(const char * host)653 static bool urldb__host_is_ip_address(const char *host)
654 {
655 struct in_addr ipv4;
656 size_t host_len = strlen(host);
657 const char *sane_host;
658 const char *slash;
659 #ifndef NO_IPV6
660 struct in6_addr ipv6;
661 char ipv6_addr[64];
662 unsigned int ipv6_addr_len;
663 #endif
664 /**
665 * @todo FIXME Some parts of urldb.c make confusions between hosts
666 * and "prefixes", we can sometimes be erroneously passed more than
667 * just a host. Sometimes we may be passed trailing slashes, or even
668 * whole path segments. A specific criminal in this class is
669 * urldb_iterate_partial, which takes a prefix to search for, but
670 * passes that prefix to functions that expect only hosts.
671 *
672 * For the time being, we will accept such calls; we check if there
673 * is a / in the host parameter, and if there is, we take a copy and
674 * replace the / with a \0. This is not a permanent solution; we
675 * should search through NetSurf and find all the callers that are
676 * in error and fix them. When doing this task, it might be wise
677 * to replace the hideousness below with code that doesn't have to do
678 * this, and add assert(strchr(host, '/') == NULL); somewhere.
679 * -- rjek - 2010-11-04
680 */
681
682 slash = strchr(host, '/');
683 if (slash == NULL) {
684 sane_host = host;
685 } else {
686 char *c = strdup(host);
687 c[slash - host] = '\0';
688 sane_host = c;
689 host_len = slash - host;
690 NSLOG(netsurf, INFO, "WARNING: called with non-host '%s'",
691 host);
692 }
693
694 if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len)
695 goto out_false;
696
697 if (inet_aton(sane_host, &ipv4) != 0) {
698 /* This can only be a sane IPv4 address if it contains 3 dots.
699 * Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c",
700 * and "a.b.c.d" as valid IPv4 address strings where we only
701 * support the full, dotted-quad, form.
702 */
703 int num_dots = 0;
704 size_t index;
705
706 for (index = 0; index < host_len; index++) {
707 if (sane_host[index] == '.')
708 num_dots++;
709 }
710
711 if (num_dots == 3)
712 goto out_true;
713 else
714 goto out_false;
715 }
716
717 #ifndef NO_IPV6
718 if ((host_len < 6) ||
719 (sane_host[0] != '[') ||
720 (sane_host[host_len - 1] != ']')) {
721 goto out_false;
722 }
723
724 ipv6_addr_len = host_len - 2;
725 if (ipv6_addr_len >= sizeof(ipv6_addr)) {
726 ipv6_addr_len = sizeof(ipv6_addr) - 1;
727 }
728 strncpy(ipv6_addr, sane_host + 1, ipv6_addr_len);
729 ipv6_addr[ipv6_addr_len] = '\0';
730
731 if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1)
732 goto out_true;
733 #endif
734
735 out_false:
736 if (slash != NULL) free((void *)sane_host);
737 return false;
738
739 out_true:
740 if (slash != NULL) free((void *)sane_host);
741 return true;
742 }
743
744
745 /**
746 * Compare host_part with prefix
747 *
748 * \param a host part
749 * \param b prefix
750 * \return 0 if match, non-zero, otherwise
751 */
urldb_search_match_prefix(const struct host_part * a,const char * b)752 static int urldb_search_match_prefix(const struct host_part *a, const char *b)
753 {
754 const char *end, *dot;
755 int plen, ret;
756
757 assert(a && a != &db_root && b);
758
759 if (urldb__host_is_ip_address(b)) {
760 /* IP address */
761 return strncasecmp(a->part, b, strlen(b));
762 }
763
764 end = b + strlen(b) + 1;
765
766 while (b < end && a && a != &db_root) {
767 dot = strchr(b, '.');
768 if (!dot) {
769 /* last segment */
770 dot = end - 1;
771 }
772
773 /* Compare strings (length limited) */
774 if ((ret = strncasecmp(a->part, b, dot - b)) != 0)
775 /* didn't match => return difference */
776 return ret;
777
778 /* The strings matched */
779 if (dot < end - 1) {
780 /* Consider segment lengths only in the case
781 * where the prefix contains segments */
782 plen = strlen(a->part);
783 if (plen > dot - b) {
784 /* len(a) > len(b) */
785 return 1;
786 } else if (plen < dot - b) {
787 /* len(a) < len(b) */
788 return -1;
789 }
790 }
791
792 b = dot + 1;
793 a = a->parent;
794 }
795
796 /* If we get here then either:
797 * a) The path lengths differ
798 * or b) The hosts are identical
799 */
800 if (a && a != &db_root && b >= end) {
801 /* len(a) > len(b) => prefix matches */
802 return 0;
803 } else if ((!a || a == &db_root) && b < end) {
804 /* len(a) < len(b) => prefix does not match */
805 return -1;
806 }
807
808 /* Identical */
809 return 0;
810 }
811
812
813 /**
814 * Partial host iterator (internal)
815 *
816 * \param root Root of (sub)tree to traverse
817 * \param prefix Prefix to match
818 * \param callback Callback function
819 * \return true to continue, false otherwise
820 */
821 static bool
urldb_iterate_partial_host(struct search_node * root,const char * prefix,bool (* callback)(nsurl * url,const struct url_data * data))822 urldb_iterate_partial_host(struct search_node *root,
823 const char *prefix,
824 bool (*callback)(nsurl *url, const struct url_data *data))
825 {
826 int c;
827
828 assert(root && prefix && callback);
829
830 if (root == &empty)
831 return true;
832
833 c = urldb_search_match_prefix(root->data, prefix);
834
835 if (c > 0) {
836 /* No match => look in left subtree */
837 return urldb_iterate_partial_host(root->left,
838 prefix,
839 callback);
840 } else if (c < 0) {
841 /* No match => look in right subtree */
842 return urldb_iterate_partial_host(root->right,
843 prefix,
844 callback);
845 } else {
846 /* Match => iterate over l/r subtrees & process this node */
847 if (!urldb_iterate_partial_host(root->left,
848 prefix,
849 callback)) {
850 return false;
851 }
852
853 if (root->data->paths.children) {
854 /* and extract all paths attached to this host */
855 if (!urldb_iterate_entries_path(&root->data->paths,
856 callback,
857 NULL)) {
858 return false;
859 }
860 }
861
862 if (!urldb_iterate_partial_host(root->right,
863 prefix,
864 callback)) {
865 return false;
866 }
867 }
868
869 return true;
870 }
871
872
873 /**
874 * Partial path iterator (internal)
875 *
876 * Given: http://www.example.org/a/b/c/d//e
877 * and assuming a path tree:
878 * ^
879 * / \
880 * a1 b1
881 * / \
882 * a2 b2
883 * /|\
884 * a b c
885 * 3 3 |
886 * d
887 * |
888 * e
889 * / \
890 * f g
891 *
892 * Prefix will be: p will be:
893 *
894 * a/b/c/d//e a1
895 * b/c/d//e a2
896 * b/c/d//e b3
897 * c/d//e a3
898 * c/d//e b3
899 * c/d//e c
900 * d//e d
901 * /e e (skip /)
902 * e e
903 *
904 * I.E. perform a breadth-first search of the tree.
905 *
906 * \param parent Root of (sub)tree to traverse
907 * \param prefix Prefix to match
908 * \param callback Callback function
909 * \return true to continue, false otherwise
910 */
911 static bool
urldb_iterate_partial_path(const struct path_data * parent,const char * prefix,bool (* callback)(nsurl * url,const struct url_data * data))912 urldb_iterate_partial_path(const struct path_data *parent,
913 const char *prefix,
914 bool (*callback)(nsurl *url, const struct url_data *data))
915 {
916 const struct path_data *p = parent->children;
917 const char *slash, *end = prefix + strlen(prefix);
918
919 do {
920 slash = strchr(prefix, '/');
921 if (!slash) {
922 slash = end;
923 }
924
925 if (slash == prefix && *prefix == '/') {
926 /* Ignore "//" */
927 prefix++;
928 continue;
929 }
930
931 if (strncasecmp(p->segment, prefix, slash - prefix) == 0) {
932 /* prefix matches so far */
933 if (slash == end) {
934 /* we've run out of prefix, so all
935 * paths below this one match */
936 if (!urldb_iterate_entries_path(p,
937 callback,
938 NULL)) {
939 return false;
940 }
941
942 /* Progress to next sibling */
943 p = p->next;
944 } else {
945 /* Skip over this segment */
946 prefix = slash + 1;
947
948 p = p->children;
949 }
950 } else {
951 /* Doesn't match this segment, try next sibling */
952 p = p->next;
953 }
954 } while (p != NULL);
955
956 return true;
957 }
958
959
960 /**
961 * Host data iterator (internal)
962 *
963 * \param parent Root of subtree to iterate over
964 * \param url_callback Callback function
965 * \param cookie_callback Callback function
966 * \return true to continue, false otherwise
967 */
968 static bool
urldb_iterate_entries_host(struct search_node * parent,bool (* url_callback)(nsurl * url,const struct url_data * data),bool (* cookie_callback)(const struct cookie_data * data))969 urldb_iterate_entries_host(struct search_node *parent,
970 bool (*url_callback)(nsurl *url, const struct url_data *data),
971 bool (*cookie_callback)(const struct cookie_data *data))
972 {
973 if (parent == &empty) {
974 return true;
975 }
976
977 if (!urldb_iterate_entries_host(parent->left,
978 url_callback,
979 cookie_callback)) {
980 return false;
981 }
982
983 if ((parent->data->paths.children) ||
984 ((cookie_callback) &&
985 (parent->data->paths.cookies))) {
986 /* We have paths (or domain cookies), so iterate them */
987 if (!urldb_iterate_entries_path(&parent->data->paths,
988 url_callback,
989 cookie_callback)) {
990 return false;
991 }
992 }
993
994 if (!urldb_iterate_entries_host(parent->right,
995 url_callback,
996 cookie_callback)) {
997 return false;
998 }
999
1000 return true;
1001 }
1002
1003
1004 /**
1005 * Add a host node to the tree
1006 *
1007 * \param part Host segment to add (or whole IP address) (copied)
1008 * \param parent Parent node to add to
1009 * \return Pointer to added node, or NULL on memory exhaustion
1010 */
1011 static struct host_part *
urldb_add_host_node(const char * part,struct host_part * parent)1012 urldb_add_host_node(const char *part, struct host_part *parent)
1013 {
1014 struct host_part *d;
1015
1016 assert(part && parent);
1017
1018 d = calloc(1, sizeof(struct host_part));
1019 if (!d) {
1020 return NULL;
1021 }
1022
1023 d->part = strdup(part);
1024 if (!d->part) {
1025 free(d);
1026 return NULL;
1027 }
1028
1029 d->next = parent->children;
1030 if (parent->children) {
1031 parent->children->prev = d;
1032 }
1033 d->parent = parent;
1034 parent->children = d;
1035
1036 return d;
1037 }
1038
1039
/**
 * Fragment comparator callback for qsort
 *
 * Each argument points at an array element which is itself a pointer to a
 * string; the strings are compared ignoring case.
 *
 * \param a first value
 * \param b second value
 * \return 0 for equal else positive or negative value on comparison
 */
static int urldb_add_path_fragment_cmp(const void *a, const void *b)
{
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcasecmp(*lhs, *rhs);
}
1051
1052
1053 /**
1054 * Add a fragment to a path segment
1055 *
1056 * \param segment Path segment to add to
1057 * \param fragment Fragment to add (copied), or NULL
1058 * \return segment or NULL on memory exhaustion
1059 */
1060 static struct path_data *
urldb_add_path_fragment(struct path_data * segment,lwc_string * fragment)1061 urldb_add_path_fragment(struct path_data *segment, lwc_string *fragment)
1062 {
1063 char **temp;
1064
1065 assert(segment);
1066
1067 /* If no fragment, this function is a NOP
1068 * This may seem strange, but it makes the rest
1069 * of the code cleaner */
1070 if (!fragment)
1071 return segment;
1072
1073 temp = realloc(segment->fragment,
1074 (segment->frag_cnt + 1) * sizeof(char *));
1075 if (!temp)
1076 return NULL;
1077
1078 segment->fragment = temp;
1079 segment->fragment[segment->frag_cnt] =
1080 strdup(lwc_string_data(fragment));
1081 if (!segment->fragment[segment->frag_cnt]) {
1082 /* Don't free temp - it's now our buffer */
1083 return NULL;
1084 }
1085
1086 segment->frag_cnt++;
1087
1088 /* We want fragments in alphabetical order, so sort them
1089 * It may prove better to insert in alphabetical order instead */
1090 qsort(segment->fragment,
1091 segment->frag_cnt,
1092 sizeof (char *),
1093 urldb_add_path_fragment_cmp);
1094
1095 return segment;
1096 }
1097
1098
1099 /**
1100 * Add a path node to the tree
1101 *
1102 * \param scheme URL scheme associated with path (copied)
1103 * \param port Port number on host associated with path
1104 * \param segment Path segment to add (copied)
1105 * \param fragment URL fragment (copied), or NULL
1106 * \param parent Parent node to add to
1107 * \return Pointer to added node, or NULL on memory exhaustion
1108 */
1109 static struct path_data *
urldb_add_path_node(lwc_string * scheme,unsigned int port,const char * segment,lwc_string * fragment,struct path_data * parent)1110 urldb_add_path_node(lwc_string *scheme,
1111 unsigned int port,
1112 const char *segment,
1113 lwc_string *fragment,
1114 struct path_data *parent)
1115 {
1116 struct path_data *d, *e;
1117
1118 assert(scheme && segment && parent);
1119
1120 d = calloc(1, sizeof(struct path_data));
1121 if (!d)
1122 return NULL;
1123
1124 d->scheme = lwc_string_ref(scheme);
1125
1126 d->port = port;
1127
1128 d->segment = strdup(segment);
1129 if (!d->segment) {
1130 lwc_string_unref(d->scheme);
1131 free(d);
1132 return NULL;
1133 }
1134
1135 if (fragment) {
1136 if (!urldb_add_path_fragment(d, fragment)) {
1137 free(d->segment);
1138 lwc_string_unref(d->scheme);
1139 free(d);
1140 return NULL;
1141 }
1142 }
1143
1144 for (e = parent->children; e; e = e->next) {
1145 if (strcmp(e->segment, d->segment) > 0)
1146 break;
1147 }
1148
1149 if (e) {
1150 d->prev = e->prev;
1151 d->next = e;
1152 if (e->prev)
1153 e->prev->next = d;
1154 else
1155 parent->children = d;
1156 e->prev = d;
1157 } else if (!parent->children) {
1158 d->prev = d->next = NULL;
1159 parent->children = parent->last = d;
1160 } else {
1161 d->next = NULL;
1162 d->prev = parent->last;
1163 parent->last->next = d;
1164 parent->last = d;
1165 }
1166 d->parent = parent;
1167
1168 return d;
1169 }
1170
1171
1172 /**
1173 * Get the search tree for a particular host
1174 *
1175 * \param host the host to lookup
1176 * \return the corresponding search tree
1177 */
urldb_get_search_tree_direct(const char * host)1178 static struct search_node **urldb_get_search_tree_direct(const char *host)
1179 {
1180 assert(host);
1181
1182 if (urldb__host_is_ip_address(host)) {
1183 return &search_trees[ST_IP];
1184 } else if (ascii_is_alpha(*host)) {
1185 return &search_trees[ST_DN + ascii_to_lower(*host) - 'a'];
1186 }
1187 return &search_trees[ST_EE];
1188 }
1189
1190
/**
 * Fetch the root of the search tree responsible for a host
 *
 * \param host the host to lookup
 * \return the root node of the corresponding search tree
 */
static struct search_node *urldb_get_search_tree(const char *host)
{
	struct search_node **slot = urldb_get_search_tree_direct(host);

	return *slot;
}
1201
1202
1203 /**
1204 * Compare host part with a string
1205 *
1206 * \param a host part
1207 * \param b string to compare
1208 * \return 0 if match, non-zero, otherwise
1209 */
urldb_search_match_string(const struct host_part * a,const char * b)1210 static int urldb_search_match_string(const struct host_part *a, const char *b)
1211 {
1212 const char *end, *dot;
1213 int plen, ret;
1214
1215 assert(a && a != &db_root && b);
1216
1217 if (urldb__host_is_ip_address(b)) {
1218 /* IP address */
1219 return strcasecmp(a->part, b);
1220 }
1221
1222 end = b + strlen(b) + 1;
1223
1224 while (b < end && a && a != &db_root) {
1225 dot = strchr(b, '.');
1226 if (!dot) {
1227 /* last segment */
1228 dot = end - 1;
1229 }
1230
1231 /* Compare strings (length limited) */
1232 if ((ret = strncasecmp(a->part, b, dot - b)) != 0)
1233 /* didn't match => return difference */
1234 return ret;
1235
1236 /* The strings matched, now check that the lengths do, too */
1237 plen = strlen(a->part);
1238
1239 if (plen > dot - b) {
1240 /* len(a) > len(b) */
1241 return 1;
1242 } else if (plen < dot - b) {
1243 /* len(a) < len(b) */
1244 return -1;
1245 }
1246
1247 b = dot + 1;
1248 a = a->parent;
1249 }
1250
1251 /* If we get here then either:
1252 * a) The path lengths differ
1253 * or b) The hosts are identical
1254 */
1255 if (a && a != &db_root && b >= end) {
1256 /* len(a) > len(b) */
1257 return 1;
1258 } else if ((!a || a == &db_root) && b < end) {
1259 /* len(a) < len(b) */
1260 return -1;
1261 }
1262
1263 /* Identical */
1264 return 0;
1265 }
1266
1267
1268 /**
1269 * Find a node in a search tree
1270 *
1271 * \param root Tree to look in
1272 * \param host Host to find
1273 * \return Pointer to host tree node, or NULL if not found
1274 */
1275 static const struct host_part *
urldb_search_find(struct search_node * root,const char * host)1276 urldb_search_find(struct search_node *root, const char *host)
1277 {
1278 int c;
1279
1280 assert(root && host);
1281
1282 if (root == &empty) {
1283 return NULL;
1284 }
1285
1286 c = urldb_search_match_string(root->data, host);
1287
1288 if (c > 0) {
1289 return urldb_search_find(root->left, host);
1290 } else if (c < 0) {
1291 return urldb_search_find(root->right, host);
1292 }
1293
1294 return root->data;
1295 }
1296
1297
1298 /**
1299 * Match a path string
1300 *
1301 * \param parent Path (sub)tree to look in
1302 * \param path The path to search for
1303 * \param scheme The URL scheme associated with the path
1304 * \param port The port associated with the path
1305 * \return Pointer to path data or NULL if not found.
1306 */
1307 static struct path_data *
urldb_match_path(const struct path_data * parent,const char * path,lwc_string * scheme,unsigned short port)1308 urldb_match_path(const struct path_data *parent,
1309 const char *path,
1310 lwc_string *scheme,
1311 unsigned short port)
1312 {
1313 const struct path_data *p;
1314 const char *slash;
1315 bool match;
1316
1317 assert(parent != NULL);
1318 assert(parent->segment == NULL);
1319
1320 if (path[0] != '/') {
1321 NSLOG(netsurf, INFO, "path is %s", path);
1322 }
1323
1324 assert(path[0] == '/');
1325
1326 /* Start with children, as parent has no segment */
1327 p = parent->children;
1328
1329 while (p != NULL) {
1330 slash = strchr(path + 1, '/');
1331 if (!slash) {
1332 slash = path + strlen(path);
1333 }
1334
1335 if (strncmp(p->segment, path + 1, slash - path - 1) == 0 &&
1336 lwc_string_isequal(p->scheme, scheme, &match) == lwc_error_ok &&
1337 match == true &&
1338 p->port == port) {
1339 if (*slash == '\0') {
1340 /* Complete match */
1341 return (struct path_data *) p;
1342 }
1343
1344 /* Match so far, go down tree */
1345 p = p->children;
1346
1347 path = slash;
1348 } else {
1349 /* No match, try next sibling */
1350 p = p->next;
1351 }
1352 }
1353
1354 return NULL;
1355 }
1356
1357
1358 /**
1359 * Find an URL in the database
1360 *
1361 * \param url Absolute URL to find
1362 * \return Pointer to path data, or NULL if not found
1363 */
urldb_find_url(nsurl * url)1364 static struct path_data *urldb_find_url(nsurl *url)
1365 {
1366 const struct host_part *h;
1367 struct path_data *p;
1368 struct search_node *tree;
1369 char *plq;
1370 const char *host_str;
1371 lwc_string *scheme, *host, *port;
1372 size_t len = 0;
1373 unsigned int port_int;
1374 bool match;
1375
1376 assert(url);
1377
1378 if (url_bloom != NULL) {
1379 if (bloom_search_hash(url_bloom, nsurl_hash(url)) == false) {
1380 return NULL;
1381 }
1382 }
1383
1384 scheme = nsurl_get_component(url, NSURL_SCHEME);
1385 if (scheme == NULL)
1386 return NULL;
1387
1388 if (lwc_string_isequal(scheme, corestring_lwc_mailto, &match) ==
1389 lwc_error_ok && match == true) {
1390 lwc_string_unref(scheme);
1391 return NULL;
1392 }
1393
1394 host = nsurl_get_component(url, NSURL_HOST);
1395 if (host != NULL) {
1396 host_str = lwc_string_data(host);
1397 lwc_string_unref(host);
1398
1399 } else if (lwc_string_isequal(scheme, corestring_lwc_file, &match) ==
1400 lwc_error_ok && match == true) {
1401 host_str = "localhost";
1402
1403 } else {
1404 lwc_string_unref(scheme);
1405 return NULL;
1406 }
1407
1408 tree = urldb_get_search_tree(host_str);
1409 h = urldb_search_find(tree, host_str);
1410 if (!h) {
1411 lwc_string_unref(scheme);
1412 return NULL;
1413 }
1414
1415 /* generate plq (path, leaf, query) */
1416 if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &plq, &len) != NSERROR_OK) {
1417 lwc_string_unref(scheme);
1418 return NULL;
1419 }
1420
1421 /* Get port */
1422 port = nsurl_get_component(url, NSURL_PORT);
1423 if (port != NULL) {
1424 port_int = atoi(lwc_string_data(port));
1425 lwc_string_unref(port);
1426 } else {
1427 port_int = 0;
1428 }
1429
1430 p = urldb_match_path(&h->paths, plq, scheme, port_int);
1431
1432 free(plq);
1433 lwc_string_unref(scheme);
1434
1435 return p;
1436 }
1437
1438
1439 /**
1440 * Dump URL database paths to stderr
1441 *
1442 * \param parent Parent node of tree to dump
1443 */
urldb_dump_paths(struct path_data * parent)1444 static void urldb_dump_paths(struct path_data *parent)
1445 {
1446 const struct path_data *p = parent;
1447 unsigned int i;
1448
1449 do {
1450 if (p->segment != NULL) {
1451 NSLOG(netsurf, INFO, "\t%s : %u",
1452 lwc_string_data(p->scheme), p->port);
1453
1454 NSLOG(netsurf, INFO, "\t\t'%s'", p->segment);
1455
1456 for (i = 0; i != p->frag_cnt; i++) {
1457 NSLOG(netsurf, INFO, "\t\t\t#%s",
1458 p->fragment[i]);
1459 }
1460 }
1461
1462 if (p->children != NULL) {
1463 p = p->children;
1464 } else {
1465 while (p != parent) {
1466 if (p->next != NULL) {
1467 p = p->next;
1468 break;
1469 }
1470
1471 p = p->parent;
1472 }
1473 }
1474 } while (p != parent);
1475 }
1476
1477
1478 /**
1479 * Dump URL database hosts to stderr
1480 *
1481 * \param parent Parent node of tree to dump
1482 */
urldb_dump_hosts(struct host_part * parent)1483 static void urldb_dump_hosts(struct host_part *parent)
1484 {
1485 struct host_part *h;
1486
1487 if (parent->part) {
1488 NSLOG(netsurf, INFO, "%s", parent->part);
1489
1490 NSLOG(netsurf, INFO, "\t%s invalid SSL certs",
1491 parent->permit_invalid_certs ? "Permits" : "Denies");
1492 }
1493
1494 /* Dump path data */
1495 urldb_dump_paths(&parent->paths);
1496
1497 /* and recurse */
1498 for (h = parent->children; h; h = h->next) {
1499 urldb_dump_hosts(h);
1500 }
1501 }
1502
1503
1504 /**
1505 * Dump search tree
1506 *
1507 * \param parent Parent node of tree to dump
1508 * \param depth Tree depth
1509 */
urldb_dump_search(struct search_node * parent,int depth)1510 static void urldb_dump_search(struct search_node *parent, int depth)
1511 {
1512 const struct host_part *h;
1513 int i; /* index into string */
1514 char s[1024];
1515 int r;
1516 int sl = sizeof(s) - 2;
1517
1518 if (parent == &empty)
1519 return;
1520
1521 urldb_dump_search(parent->left, depth + 1);
1522
1523 for (i = 0; i != depth; i++) {
1524 s[i] = ' ';
1525 }
1526
1527 for (h = parent->data; h; h = h->parent) {
1528 if (h->part) {
1529 r = snprintf(&s[i], sl - i, "%s", h->part);
1530 if ((i + r) > sl) {
1531 break;
1532 }
1533 i += r;
1534 }
1535
1536 if (h->parent && h->parent->parent) {
1537 s[i]='.';
1538 i++;
1539 }
1540 }
1541 s[i]= 0;
1542
1543 NSLOG(netsurf, INFO, "%s", s);
1544
1545 urldb_dump_search(parent->right, depth + 1);
1546 }
1547
1548
1549 /**
1550 * Compare a pair of host parts
1551 *
1552 * \param a first host part
1553 * \param b second host part
1554 * \return 0 if match, non-zero, otherwise
1555 */
1556 static int
urldb_search_match_host(const struct host_part * a,const struct host_part * b)1557 urldb_search_match_host(const struct host_part *a, const struct host_part *b)
1558 {
1559 int ret;
1560
1561 assert(a && b);
1562
1563 /* traverse up tree to root, comparing parts as we go. */
1564 for (; a && a != &db_root && b && b != &db_root;
1565 a = a->parent, b = b->parent) {
1566 if ((ret = strcasecmp(a->part, b->part)) != 0) {
1567 /* They differ => return the difference here */
1568 return ret;
1569 }
1570 }
1571
1572 /* If we get here then either:
1573 * a) The path lengths differ
1574 * or b) The hosts are identical
1575 */
1576 if (a && a != &db_root && (!b || b == &db_root)) {
1577 /* len(a) > len(b) */
1578 return 1;
1579 } else if ((!a || a == &db_root) && b && b != &db_root) {
1580 /* len(a) < len(b) */
1581 return -1;
1582 }
1583
1584 /* identical */
1585 return 0;
1586 }
1587
1588
1589 /**
1590 * Rotate a subtree right
1591 *
1592 * \param root Root of subtree to rotate
1593 * \return new root of subtree
1594 */
urldb_search_skew(struct search_node * root)1595 static struct search_node *urldb_search_skew(struct search_node *root)
1596 {
1597 assert(root);
1598
1599 if (root->left->level == root->level) {
1600 struct search_node *temp;
1601
1602 temp = root->left;
1603 root->left = temp->right;
1604 temp->right = root;
1605 root = temp;
1606 }
1607
1608 return root;
1609 }
1610
1611
1612 /**
1613 * Rotate a node left, increasing the parent's level
1614 *
1615 * \param root Root of subtree to rotate
1616 * \return New root of subtree
1617 */
urldb_search_split(struct search_node * root)1618 static struct search_node *urldb_search_split(struct search_node *root)
1619 {
1620 assert(root);
1621
1622 if (root->right->right->level == root->level) {
1623 struct search_node *temp;
1624
1625 temp = root->right;
1626 root->right = temp->left;
1627 temp->left = root;
1628 root = temp;
1629
1630 root->level++;
1631 }
1632
1633 return root;
1634 }
1635
1636
1637 /**
1638 * Insert node into search tree
1639 *
1640 * \param root Root of (sub)tree to insert into
1641 * \param n Node to insert
1642 * \return Pointer to updated root
1643 */
1644 static struct search_node *
urldb_search_insert_internal(struct search_node * root,struct search_node * n)1645 urldb_search_insert_internal(struct search_node *root, struct search_node *n)
1646 {
1647 assert(root && n);
1648
1649 if (root == &empty) {
1650 root = n;
1651 } else {
1652 int c = urldb_search_match_host(root->data, n->data);
1653
1654 if (c > 0) {
1655 root->left = urldb_search_insert_internal(
1656 root->left, n);
1657 } else if (c < 0) {
1658 root->right = urldb_search_insert_internal(
1659 root->right, n);
1660 } else {
1661 /* exact match */
1662 free(n);
1663 return root;
1664 }
1665
1666 root = urldb_search_skew(root);
1667 root = urldb_search_split(root);
1668 }
1669
1670 return root;
1671 }
1672
1673
1674 /**
1675 * Insert a node into the search tree
1676 *
1677 * \param root Root of tree to insert into
1678 * \param data User data to insert
1679 * \return Pointer to updated root, or NULL if failed
1680 */
1681 static struct search_node *
urldb_search_insert(struct search_node * root,const struct host_part * data)1682 urldb_search_insert(struct search_node *root, const struct host_part *data)
1683 {
1684 struct search_node *n;
1685
1686 assert(root && data);
1687
1688 n = malloc(sizeof(struct search_node));
1689 if (!n)
1690 return NULL;
1691
1692 n->level = 1;
1693 n->data = data;
1694 n->left = n->right = ∅
1695
1696 root = urldb_search_insert_internal(root, n);
1697
1698 return root;
1699 }
1700
1701
1702 /**
1703 * Parse a cookie avpair
1704 *
1705 * \param c Cookie struct to populate
1706 * \param n Name component
1707 * \param v Value component
1708 * \param was_quoted Whether \a v was quoted in the input
1709 * \return true on success, false on memory exhaustion
1710 */
1711 static bool
urldb_parse_avpair(struct cookie_internal_data * c,char * n,char * v,bool was_quoted)1712 urldb_parse_avpair(struct cookie_internal_data *c,
1713 char *n,
1714 char *v,
1715 bool was_quoted)
1716 {
1717 int vlen;
1718
1719 assert(c && n && v);
1720
1721 /* Strip whitespace from start of name */
1722 for (; *n; n++) {
1723 if (*n != ' ' && *n != '\t')
1724 break;
1725 }
1726
1727 /* Strip whitespace from end of name */
1728 for (vlen = strlen(n); vlen; vlen--) {
1729 if (n[vlen] == ' ' || n[vlen] == '\t')
1730 n[vlen] = '\0';
1731 else
1732 break;
1733 }
1734
1735 /* Strip whitespace from start of value */
1736 for (; *v; v++) {
1737 if (*v != ' ' && *v != '\t')
1738 break;
1739 }
1740
1741 /* Strip whitespace from end of value */
1742 for (vlen = strlen(v); vlen; vlen--) {
1743 if (v[vlen] == ' ' || v[vlen] == '\t')
1744 v[vlen] = '\0';
1745 else
1746 break;
1747 }
1748
1749 if (!c->comment && strcasecmp(n, "Comment") == 0) {
1750 c->comment = strdup(v);
1751 if (!c->comment)
1752 return false;
1753 } else if (!c->domain && strcasecmp(n, "Domain") == 0) {
1754 if (v[0] == '.') {
1755 /* Domain must start with a dot */
1756 c->domain_from_set = true;
1757 c->domain = strdup(v);
1758 if (!c->domain)
1759 return false;
1760 }
1761 } else if (strcasecmp(n, "Max-Age") == 0) {
1762 int temp = atoi(v);
1763 if (temp == 0)
1764 /* Special case - 0 means delete */
1765 c->expires = 0;
1766 else
1767 c->expires = time(NULL) + temp;
1768 } else if (!c->path && strcasecmp(n, "Path") == 0) {
1769 c->path_from_set = true;
1770 c->path = strdup(v);
1771 if (!c->path)
1772 return false;
1773 } else if (strcasecmp(n, "Version") == 0) {
1774 c->version = atoi(v);
1775 } else if (strcasecmp(n, "Expires") == 0) {
1776 char *datenoday;
1777 time_t expires;
1778 nserror res;
1779
1780 /* Strip dayname from date (these are hugely variable
1781 * and liable to break the parser. They also serve no
1782 * useful purpose) */
1783 for (datenoday = v;
1784 *datenoday && !ascii_is_digit(*datenoday);
1785 datenoday++) {
1786 /* do nothing */
1787 }
1788
1789 res = nsc_strntimet(datenoday, strlen(datenoday), &expires);
1790 if (res != NSERROR_OK) {
1791 /* assume we have an unrepresentable date =>
1792 * force it to the maximum possible value of a
1793 * 32bit time_t (this may break in 2038. We'll
1794 * deal with that once we come to it) */
1795 expires = (time_t)0x7fffffff;
1796 }
1797 c->expires = expires;
1798 } else if (strcasecmp(n, "Secure") == 0) {
1799 c->secure = true;
1800 } else if (strcasecmp(n, "HttpOnly") == 0) {
1801 c->http_only = true;
1802 } else if (!c->name) {
1803 c->name = strdup(n);
1804 c->value = strdup(v);
1805 c->value_was_quoted = was_quoted;
1806 if (!c->name || !c->value) {
1807 return false;
1808 }
1809 }
1810
1811 return true;
1812 }
1813
1814
1815 /**
1816 * Free a cookie
1817 *
1818 * \param c The cookie to free
1819 */
urldb_free_cookie(struct cookie_internal_data * c)1820 static void urldb_free_cookie(struct cookie_internal_data *c)
1821 {
1822 assert(c);
1823
1824 free(c->comment);
1825 free(c->domain);
1826 free(c->path);
1827 free(c->name);
1828 free(c->value);
1829 free(c);
1830 }
1831
1832
1833 /**
1834 * Parse a cookie
1835 *
1836 * \param url URL being fetched
1837 * \param cookie Pointer to cookie string (updated on exit)
1838 * \return Pointer to cookie structure (on heap, caller frees) or NULL
1839 */
1840 static struct cookie_internal_data *
urldb_parse_cookie(nsurl * url,const char ** cookie)1841 urldb_parse_cookie(nsurl *url, const char **cookie)
1842 {
1843 struct cookie_internal_data *c;
1844 const char *cur;
1845 char name[1024], value[4096];
1846 char *n = name, *v = value;
1847 bool in_value = false;
1848 bool had_value_data = false;
1849 bool value_verbatim = false;
1850 bool quoted = false;
1851 bool was_quoted = false;
1852
1853 assert(url && cookie && *cookie);
1854
1855 c = calloc(1, sizeof(struct cookie_internal_data));
1856 if (c == NULL)
1857 return NULL;
1858
1859 c->expires = -1;
1860
1861 name[0] = '\0';
1862 value[0] = '\0';
1863
1864 for (cur = *cookie; *cur; cur++) {
1865 if (*cur == '\r' && *(cur + 1) == '\n') {
1866 /* End of header */
1867 if (quoted) {
1868 /* Unmatched quote encountered */
1869
1870 /* Match Firefox 2.0.0.11 */
1871 value[0] = '\0';
1872
1873 }
1874
1875 break;
1876 } else if (*cur == '\r') {
1877 /* Spurious linefeed */
1878 continue;
1879 } else if (*cur == '\n') {
1880 /* Spurious newline */
1881 continue;
1882 }
1883
1884 if (in_value && !had_value_data) {
1885 if (*cur == ' ' || *cur == '\t') {
1886 /* Strip leading whitespace from value */
1887 continue;
1888 } else {
1889 had_value_data = true;
1890
1891 /* Value is taken verbatim if first non-space
1892 * character is not a " */
1893 if (*cur != '"') {
1894 value_verbatim = true;
1895 }
1896 }
1897 }
1898
1899 if (in_value && !value_verbatim && (*cur == '"')) {
1900 /* Only non-verbatim values may be quoted */
1901 if (cur == *cookie || *(cur - 1) != '\\') {
1902 /* Only unescaped quotes count */
1903 was_quoted = quoted;
1904 quoted = !quoted;
1905
1906 continue;
1907 }
1908 }
1909
1910 if (!quoted && !in_value && *cur == '=') {
1911 /* First equals => attr-value separator */
1912 in_value = true;
1913 continue;
1914 }
1915
1916 if (!quoted && (was_quoted || *cur == ';')) {
1917 /* Semicolon or after quoted value
1918 * => end of current avpair */
1919
1920 /* NUL-terminate tokens */
1921 *n = '\0';
1922 *v = '\0';
1923
1924 if (!urldb_parse_avpair(c, name, value, was_quoted)) {
1925 /* Memory exhausted */
1926 urldb_free_cookie(c);
1927 return NULL;
1928 }
1929
1930 /* And reset to start */
1931 n = name;
1932 v = value;
1933 in_value = false;
1934 had_value_data = false;
1935 value_verbatim = false;
1936 was_quoted = false;
1937
1938 /* Now, if the current input is anything other than a
1939 * semicolon, we must be sure to reprocess it */
1940 if (*cur != ';') {
1941 cur--;
1942 }
1943
1944 continue;
1945 }
1946
1947 /* And now handle commas. These are a pain as they may mean
1948 * any of the following:
1949 *
1950 * + End of cookie
1951 * + Day separator in Expires avpair
1952 * + (Invalid) comma in unquoted value
1953 *
1954 * Therefore, in order to handle all 3 cases (2 and 3 are
1955 * identical, the difference being that 2 is in the spec and
1956 * 3 isn't), we need to determine where the comma actually
1957 * lies. We use the following heuristic:
1958 *
1959 * Given a comma at the current input position, find the
1960 * immediately following semicolon (or end of input if none
1961 * found). Then, consider the input characters between
1962 * these two positions. If any of these characters is an
1963 * '=', we must assume that the comma signified the end of
1964 * the current cookie.
1965 *
1966 * This holds as the first avpair of any cookie must be
1967 * NAME=VALUE, so the '=' is guaranteed to appear in the
1968 * case where the comma marks the end of a cookie.
1969 *
1970 * This will fail, however, in the case where '=' appears in
1971 * the value of the current avpair after the comma or the
1972 * subsequent cookie does not start with NAME=VALUE. Neither
1973 * of these is particularly likely and if they do occur, the
1974 * website is more broken than we can be bothered to handle.
1975 */
1976 if (!quoted && *cur == ',') {
1977 /* Find semi-colon, if any */
1978 const char *p;
1979 const char *semi = strchr(cur + 1, ';');
1980 if (!semi)
1981 semi = cur + strlen(cur) - 2 /* CRLF */;
1982
1983 /* Look for equals sign between comma and semi */
1984 for (p = cur + 1; p < semi; p++)
1985 if (*p == '=')
1986 break;
1987
1988 if (p == semi) {
1989 /* none found => comma internal to value */
1990 /* do nothing */
1991 } else {
1992 /* found one => comma marks end of cookie */
1993 cur++;
1994 break;
1995 }
1996 }
1997
1998 /* Accumulate into buffers, always leaving space for a NUL */
1999 /** \todo is silently truncating overlong names/values wise? */
2000 if (!in_value) {
2001 if (n < name + (sizeof(name) - 1))
2002 *n++ = *cur;
2003 } else {
2004 if (v < value + (sizeof(value) - 1))
2005 *v++ = *cur;
2006 }
2007 }
2008
2009 /* Parse final avpair */
2010 *n = '\0';
2011 *v = '\0';
2012
2013 if (!urldb_parse_avpair(c, name, value, was_quoted)) {
2014 /* Memory exhausted */
2015 urldb_free_cookie(c);
2016 return NULL;
2017 }
2018
2019 /* Now fix-up default values */
2020 if (c->domain == NULL) {
2021 lwc_string *host = nsurl_get_component(url, NSURL_HOST);
2022 if (host == NULL) {
2023 urldb_free_cookie(c);
2024 return NULL;
2025 }
2026 c->domain = strdup(lwc_string_data(host));
2027 lwc_string_unref(host);
2028 }
2029
2030 if (c->path == NULL) {
2031 const char *path_data;
2032 char *path, *slash;
2033 lwc_string *path_lwc;
2034
2035 path_lwc = nsurl_get_component(url, NSURL_PATH);
2036 if (path_lwc == NULL) {
2037 urldb_free_cookie(c);
2038 return NULL;
2039 }
2040 path_data = lwc_string_data(path_lwc);
2041
2042 /* Strip leafname and trailing slash (4.3.1) */
2043 slash = strrchr(path_data, '/');
2044 if (slash != NULL) {
2045 /* Special case: retain first slash in path */
2046 if (slash == path_data)
2047 slash++;
2048
2049 slash = strndup(path_data, slash - path_data);
2050 if (slash == NULL) {
2051 lwc_string_unref(path_lwc);
2052 urldb_free_cookie(c);
2053 return NULL;
2054 }
2055
2056 path = slash;
2057 lwc_string_unref(path_lwc);
2058 } else {
2059 path = strdup(lwc_string_data(path_lwc));
2060 lwc_string_unref(path_lwc);
2061 if (path == NULL) {
2062 urldb_free_cookie(c);
2063 return NULL;
2064 }
2065 }
2066
2067 c->path = path;
2068 }
2069
2070 /* Write back current position */
2071 *cookie = cur;
2072
2073 return c;
2074 }
2075
2076
2077 /**
2078 * Add a path to the database, creating any intermediate entries
2079 *
2080 * \param scheme URL scheme associated with path
2081 * \param port Port number on host associated with path
2082 * \param host Host tree node to attach to
2083 * \param path_query Absolute path plus query to add (freed)
2084 * \param fragment URL fragment, or NULL
2085 * \param url URL (fragment ignored)
2086 * \return Pointer to leaf node, or NULL on memory exhaustion
2087 */
2088 static struct path_data *
urldb_add_path(lwc_string * scheme,unsigned int port,const struct host_part * host,char * path_query,lwc_string * fragment,nsurl * url)2089 urldb_add_path(lwc_string *scheme,
2090 unsigned int port,
2091 const struct host_part *host,
2092 char *path_query,
2093 lwc_string *fragment,
2094 nsurl *url)
2095 {
2096 struct path_data *d, *e;
2097 char *buf = path_query;
2098 char *segment, *slash;
2099 bool match;
2100
2101 assert(scheme && host && url);
2102
2103 d = (struct path_data *) &host->paths;
2104
2105 /* skip leading '/' */
2106 segment = buf;
2107 if (*segment == '/')
2108 segment++;
2109
2110 /* Process path segments */
2111 do {
2112 slash = strchr(segment, '/');
2113 if (!slash) {
2114 /* last segment */
2115 /* look for existing entry */
2116 for (e = d->children; e; e = e->next)
2117 if (strcmp(segment, e->segment) == 0 &&
2118 lwc_string_isequal(scheme,
2119 e->scheme, &match) ==
2120 lwc_error_ok &&
2121 match == true &&
2122 e->port == port)
2123 break;
2124
2125 d = e ? urldb_add_path_fragment(e, fragment) :
2126 urldb_add_path_node(scheme, port,
2127 segment, fragment, d);
2128 break;
2129 }
2130
2131 *slash = '\0';
2132
2133 /* look for existing entry */
2134 for (e = d->children; e; e = e->next)
2135 if (strcmp(segment, e->segment) == 0 &&
2136 lwc_string_isequal(scheme, e->scheme,
2137 &match) == lwc_error_ok &&
2138 match == true &&
2139 e->port == port)
2140 break;
2141
2142 d = e ? e : urldb_add_path_node(scheme, port, segment, NULL, d);
2143 if (!d)
2144 break;
2145
2146 segment = slash + 1;
2147 } while (1);
2148
2149 free(path_query);
2150
2151 if (d && !d->url) {
2152 /* Insert defragmented URL */
2153 if (nsurl_defragment(url, &d->url) != NSERROR_OK)
2154 return NULL;
2155 }
2156
2157 return d;
2158 }
2159
2160
2161 /**
2162 * Add a host to the database, creating any intermediate entries
2163 *
2164 * \param host Hostname to add
2165 * \return Pointer to leaf node, or NULL on memory exhaustion
2166 */
urldb_add_host(const char * host)2167 static struct host_part *urldb_add_host(const char *host)
2168 {
2169 struct host_part *d = (struct host_part *) &db_root, *e;
2170 struct search_node *s;
2171 char buf[256]; /* 256 bytes is sufficient - domain names are
2172 * limited to 255 chars. */
2173 char *part;
2174
2175 assert(host);
2176
2177 if (urldb__host_is_ip_address(host)) {
2178 /* Host is an IP, so simply add as TLD */
2179
2180 /* Check for existing entry */
2181 for (e = d->children; e; e = e->next)
2182 if (strcasecmp(host, e->part) == 0)
2183 /* found => return it */
2184 return e;
2185
2186 d = urldb_add_host_node(host, d);
2187
2188 s = urldb_search_insert(search_trees[ST_IP], d);
2189 if (!s) {
2190 /* failed */
2191 d = NULL;
2192 } else {
2193 search_trees[ST_IP] = s;
2194 }
2195
2196 return d;
2197 }
2198
2199 /* Copy host string, so we can corrupt it */
2200 strncpy(buf, host, sizeof buf);
2201 buf[sizeof buf - 1] = '\0';
2202
2203 /* Process FQDN segments backwards */
2204 do {
2205 part = strrchr(buf, '.');
2206 if (!part) {
2207 /* last segment */
2208 /* Check for existing entry */
2209 for (e = d->children; e; e = e->next)
2210 if (strcasecmp(buf, e->part) == 0)
2211 break;
2212
2213 if (e) {
2214 d = e;
2215 } else {
2216 d = urldb_add_host_node(buf, d);
2217 }
2218
2219 /* And insert into search tree */
2220 if (d) {
2221 struct search_node **r;
2222
2223 r = urldb_get_search_tree_direct(buf);
2224 s = urldb_search_insert(*r, d);
2225 if (!s) {
2226 /* failed */
2227 d = NULL;
2228 } else {
2229 *r = s;
2230 }
2231 }
2232 break;
2233 }
2234
2235 /* Check for existing entry */
2236 for (e = d->children; e; e = e->next)
2237 if (strcasecmp(part + 1, e->part) == 0)
2238 break;
2239
2240 d = e ? e : urldb_add_host_node(part + 1, d);
2241 if (!d)
2242 break;
2243
2244 *part = '\0';
2245 } while (1);
2246
2247 return d;
2248 }
2249
2250
2251 /**
2252 * Insert a cookie into the database
2253 *
2254 * \param c The cookie to insert
2255 * \param scheme URL scheme associated with cookie path
2256 * \param url URL (sans fragment) associated with cookie
2257 * \return true on success, false on memory exhaustion (c will be freed)
2258 */
2259 static bool
urldb_insert_cookie(struct cookie_internal_data * c,lwc_string * scheme,nsurl * url)2260 urldb_insert_cookie(struct cookie_internal_data *c,
2261 lwc_string *scheme,
2262 nsurl *url)
2263 {
2264 struct cookie_internal_data *d;
2265 const struct host_part *h;
2266 struct path_data *p;
2267 time_t now = time(NULL);
2268
2269 assert(c);
2270
2271 if (c->domain[0] == '.') {
2272 h = urldb_search_find(
2273 urldb_get_search_tree(&(c->domain[1])),
2274 c->domain + 1);
2275 if (!h) {
2276 h = urldb_add_host(c->domain + 1);
2277 if (!h) {
2278 urldb_free_cookie(c);
2279 return false;
2280 }
2281 }
2282
2283 p = (struct path_data *) &h->paths;
2284 } else {
2285 /* Need to have a URL and scheme, if it's not a domain cookie */
2286 assert(url != NULL);
2287 assert(scheme != NULL);
2288
2289 h = urldb_search_find(
2290 urldb_get_search_tree(c->domain),
2291 c->domain);
2292
2293 if (!h) {
2294 h = urldb_add_host(c->domain);
2295 if (!h) {
2296 urldb_free_cookie(c);
2297 return false;
2298 }
2299 }
2300
2301 /* find path */
2302 p = urldb_add_path(scheme, 0, h,
2303 strdup(c->path), NULL, url);
2304 if (!p) {
2305 urldb_free_cookie(c);
2306 return false;
2307 }
2308 }
2309
2310 /* add cookie */
2311 for (d = p->cookies; d; d = d->next) {
2312 if (!strcmp(d->domain, c->domain) &&
2313 !strcmp(d->path, c->path) &&
2314 !strcmp(d->name, c->name))
2315 break;
2316 }
2317
2318 if (d) {
2319 if (c->expires != -1 && c->expires < now) {
2320 /* remove cookie */
2321 if (d->next)
2322 d->next->prev = d->prev;
2323 else
2324 p->cookies_end = d->prev;
2325 if (d->prev)
2326 d->prev->next = d->next;
2327 else
2328 p->cookies = d->next;
2329
2330 cookie_manager_remove((struct cookie_data *)d);
2331
2332 urldb_free_cookie(d);
2333 urldb_free_cookie(c);
2334 } else {
2335 /* replace d with c */
2336 c->prev = d->prev;
2337 c->next = d->next;
2338 if (c->next)
2339 c->next->prev = c;
2340 else
2341 p->cookies_end = c;
2342 if (c->prev)
2343 c->prev->next = c;
2344 else
2345 p->cookies = c;
2346
2347 cookie_manager_remove((struct cookie_data *)d);
2348 urldb_free_cookie(d);
2349
2350 cookie_manager_add((struct cookie_data *)c);
2351 }
2352 } else {
2353 c->prev = p->cookies_end;
2354 c->next = NULL;
2355 if (p->cookies_end)
2356 p->cookies_end->next = c;
2357 else
2358 p->cookies = c;
2359 p->cookies_end = c;
2360
2361 cookie_manager_add((struct cookie_data *)c);
2362 }
2363
2364 return true;
2365 }
2366
2367
2368 /**
2369 * Concatenate a cookie into the provided buffer
2370 *
2371 * \param c Cookie to concatenate
2372 * \param version The version of the cookie string to output
2373 * \param used Pointer to amount of buffer used (updated)
2374 * \param alloc Pointer to allocated size of buffer (updated)
2375 * \param buf Pointer to Pointer to buffer (updated)
2376 * \return true on success, false on memory exhaustion
2377 */
2378 static bool
urldb_concat_cookie(struct cookie_internal_data * c,int version,int * used,int * alloc,char ** buf)2379 urldb_concat_cookie(struct cookie_internal_data *c,
2380 int version,
2381 int *used,
2382 int *alloc,
2383 char **buf)
2384 {
2385 /* Combined (A)BNF for the Cookie: request header:
2386 *
2387 * CHAR = <any US-ASCII character (octets 0 - 127)>
2388 * CTL = <any US-ASCII control character
2389 * (octets 0 - 31) and DEL (127)>
2390 * CR = <US-ASCII CR, carriage return (13)>
2391 * LF = <US-ASCII LF, linefeed (10)>
2392 * SP = <US-ASCII SP, space (32)>
2393 * HT = <US-ASCII HT, horizontal-tab (9)>
2394 * <"> = <US-ASCII double-quote mark (34)>
2395 *
2396 * CRLF = CR LF
2397 *
2398 * LWS = [CRLF] 1*( SP | HT )
2399 *
2400 * TEXT = <any OCTET except CTLs,
2401 * but including LWS>
2402 *
2403 * token = 1*<any CHAR except CTLs or separators>
2404 * separators = "(" | ")" | "<" | ">" | "@"
2405 * | "," | ";" | ":" | "\" | <">
2406 * | "/" | "[" | "]" | "?" | "="
2407 * | "{" | "}" | SP | HT
2408 *
2409 * quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
2410 * qdtext = <any TEXT except <">>
2411 * quoted-pair = "\" CHAR
2412 *
2413 * attr = token
2414 * value = word
2415 * word = token | quoted-string
2416 *
2417 * cookie = "Cookie:" cookie-version
2418 * 1*((";" | ",") cookie-value)
2419 * cookie-value = NAME "=" VALUE [";" path] [";" domain]
2420 * cookie-version = "$Version" "=" value
2421 * NAME = attr
2422 * VALUE = value
2423 * path = "$Path" "=" value
2424 * domain = "$Domain" "=" value
2425 *
2426 * A note on quoted-string handling:
2427 * The cookie data stored in the db is verbatim (i.e. sans enclosing
2428 * <">, if any, and with all quoted-pairs intact) thus all that we
2429 * need to do here is ensure that value strings which were quoted
2430 * in Set-Cookie or which include any of the separators are quoted
2431 * before use.
2432 *
2433 * A note on cookie-value separation:
2434 * We use semicolons for all separators, including between
2435 * cookie-values. This simplifies things and is backwards compatible.
2436 */
2437 const char * const separators = "()<>@,;:\\\"/[]?={} \t";
2438
2439 int max_len;
2440
2441 assert(c && used && alloc && buf && *buf);
2442
2443 /* "; " cookie-value
2444 * We allow for the possibility that values are quoted
2445 */
2446 max_len = 2 + strlen(c->name) + 1 + strlen(c->value) + 2 +
2447 (c->path_from_set ?
2448 8 + strlen(c->path) + 2 : 0) +
2449 (c->domain_from_set ?
2450 10 + strlen(c->domain) + 2 : 0);
2451
2452 if (*used + max_len >= *alloc) {
2453 char *temp = realloc(*buf, *alloc + 4096);
2454 if (!temp) {
2455 return false;
2456 }
2457 *buf = temp;
2458 *alloc += 4096;
2459 }
2460
2461 if (version == COOKIE_NETSCAPE) {
2462 /* Original Netscape cookie */
2463 sprintf(*buf + *used - 1, "; %s=", c->name);
2464 *used += 2 + strlen(c->name) + 1;
2465
2466 /* The Netscape spec doesn't mention quoting of cookie values.
2467 * RFC 2109 $10.1.3 indicates that values must not be quoted.
2468 *
2469 * However, other browsers preserve quoting, so we should, too
2470 */
2471 if (c->value_was_quoted) {
2472 sprintf(*buf + *used - 1, "\"%s\"", c->value);
2473 *used += 1 + strlen(c->value) + 1;
2474 } else {
2475 /** \todo should we %XX-encode [;HT,SP] ? */
2476 /** \todo Should we strip escaping backslashes? */
2477 sprintf(*buf + *used - 1, "%s", c->value);
2478 *used += strlen(c->value);
2479 }
2480
2481 /* We don't send path/domain information -- that's what the
2482 * Netscape spec suggests we should do, anyway. */
2483 } else {
2484 /* RFC2109 or RFC2965 cookie */
2485 sprintf(*buf + *used - 1, "; %s=", c->name);
2486 *used += 2 + strlen(c->name) + 1;
2487
2488 /* Value needs quoting if it contains any separator or if
2489 * it needs preserving from the Set-Cookie header */
2490 if (c->value_was_quoted ||
2491 strpbrk(c->value, separators) != NULL) {
2492 sprintf(*buf + *used - 1, "\"%s\"", c->value);
2493 *used += 1 + strlen(c->value) + 1;
2494 } else {
2495 sprintf(*buf + *used - 1, "%s", c->value);
2496 *used += strlen(c->value);
2497 }
2498
2499 if (c->path_from_set) {
2500 /* Path, quoted if necessary */
2501 sprintf(*buf + *used - 1, "; $Path=");
2502 *used += 8;
2503
2504 if (strpbrk(c->path, separators) != NULL) {
2505 sprintf(*buf + *used - 1, "\"%s\"", c->path);
2506 *used += 1 + strlen(c->path) + 1;
2507 } else {
2508 sprintf(*buf + *used - 1, "%s", c->path);
2509 *used += strlen(c->path);
2510 }
2511 }
2512
2513 if (c->domain_from_set) {
2514 /* Domain, quoted if necessary */
2515 sprintf(*buf + *used - 1, "; $Domain=");
2516 *used += 10;
2517
2518 if (strpbrk(c->domain, separators) != NULL) {
2519 sprintf(*buf + *used - 1, "\"%s\"", c->domain);
2520 *used += 1 + strlen(c->domain) + 1;
2521 } else {
2522 sprintf(*buf + *used - 1, "%s", c->domain);
2523 *used += strlen(c->domain);
2524 }
2525 }
2526 }
2527
2528 return true;
2529 }
2530
2531
2532 /**
2533 * deletes paths from a cookie.
2534 *
2535 * \param domain the cookie domain
2536 * \param path the cookie path
2537 * \param name The cookie name
2538 * \param parent The url data of the cookie
2539 */
2540 static void
urldb_delete_cookie_paths(const char * domain,const char * path,const char * name,struct path_data * parent)2541 urldb_delete_cookie_paths(const char *domain,
2542 const char *path,
2543 const char *name,
2544 struct path_data *parent)
2545 {
2546 struct cookie_internal_data *c;
2547 struct path_data *p = parent;
2548
2549 assert(parent);
2550
2551 do {
2552 for (c = p->cookies; c; c = c->next) {
2553 if (strcmp(c->domain, domain) == 0 &&
2554 strcmp(c->path, path) == 0 &&
2555 strcmp(c->name, name) == 0) {
2556 if (c->prev) {
2557 c->prev->next = c->next;
2558 } else {
2559 p->cookies = c->next;
2560 }
2561
2562 if (c->next) {
2563 c->next->prev = c->prev;
2564 } else {
2565 p->cookies_end = c->prev;
2566 }
2567
2568 urldb_free_cookie(c);
2569
2570 return;
2571 }
2572 }
2573
2574 if (p->children) {
2575 p = p->children;
2576 } else {
2577 while (p != parent) {
2578 if (p->next != NULL) {
2579 p = p->next;
2580 break;
2581 }
2582
2583 p = p->parent;
2584 }
2585 }
2586 } while (p != parent);
2587 }
2588
2589
2590 /**
2591 * Deletes cookie hosts and their assoicated paths
2592 *
2593 * \param domain the cookie domain
2594 * \param path the cookie path
2595 * \param name The cookie name
2596 * \param parent The url data of the cookie
2597 */
2598 static void
urldb_delete_cookie_hosts(const char * domain,const char * path,const char * name,struct host_part * parent)2599 urldb_delete_cookie_hosts(const char *domain,
2600 const char *path,
2601 const char *name,
2602 struct host_part *parent)
2603 {
2604 struct host_part *h;
2605 assert(parent);
2606
2607 urldb_delete_cookie_paths(domain, path, name, &parent->paths);
2608
2609 for (h = parent->children; h; h = h->next) {
2610 urldb_delete_cookie_hosts(domain, path, name, h);
2611 }
2612 }
2613
2614
2615 /**
2616 * Save a path subtree's cookies
2617 *
2618 * \param fp File pointer to write to
2619 * \param parent Parent path
2620 */
urldb_save_cookie_paths(FILE * fp,struct path_data * parent)2621 static void urldb_save_cookie_paths(FILE *fp, struct path_data *parent)
2622 {
2623 struct path_data *p = parent;
2624 time_t now = time(NULL);
2625
2626 assert(fp && parent);
2627
2628 do {
2629 if (p->cookies != NULL) {
2630 struct cookie_internal_data *c;
2631
2632 for (c = p->cookies; c != NULL; c = c->next) {
2633 if (c->expires == -1 || c->expires < now) {
2634 /* Skip expired & session cookies */
2635 continue;
2636 }
2637
2638 fprintf(fp,
2639 "%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t"
2640 "%s\t%s\t%d\t%s\t%s\t%s\n",
2641 c->version, c->domain,
2642 c->domain_from_set, c->path,
2643 c->path_from_set, c->secure,
2644 c->http_only,
2645 (int)c->expires, (int)c->last_used,
2646 c->no_destroy, c->name, c->value,
2647 c->value_was_quoted,
2648 p->scheme ? lwc_string_data(p->scheme) :
2649 "unused",
2650 p->url ? nsurl_access(p->url) :
2651 "unused",
2652 c->comment ? c->comment : "");
2653 }
2654 }
2655
2656 if (p->children != NULL) {
2657 p = p->children;
2658 } else {
2659 while (p != parent) {
2660 if (p->next != NULL) {
2661 p = p->next;
2662 break;
2663 }
2664
2665 p = p->parent;
2666 }
2667 }
2668 } while (p != parent);
2669 }
2670
2671
2672 /**
2673 * Save a host subtree's cookies
2674 *
2675 * \param fp File pointer to write to
2676 * \param parent Parent host
2677 */
urldb_save_cookie_hosts(FILE * fp,struct host_part * parent)2678 static void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent)
2679 {
2680 struct host_part *h;
2681 assert(fp && parent);
2682
2683 urldb_save_cookie_paths(fp, &parent->paths);
2684
2685 for (h = parent->children; h; h = h->next)
2686 urldb_save_cookie_hosts(fp, h);
2687 }
2688
2689
2690 /**
2691 * Destroy a cookie node
2692 *
2693 * \param c Cookie to destroy
2694 */
urldb_destroy_cookie(struct cookie_internal_data * c)2695 static void urldb_destroy_cookie(struct cookie_internal_data *c)
2696 {
2697 free(c->name);
2698 free(c->value);
2699 free(c->comment);
2700 free(c->domain);
2701 free(c->path);
2702
2703 free(c);
2704 }
2705
2706
2707 /**
2708 * Destroy the contents of a path node
2709 *
2710 * \param node Node to destroy contents of (does not destroy node)
2711 */
urldb_destroy_path_node_content(struct path_data * node)2712 static void urldb_destroy_path_node_content(struct path_data *node)
2713 {
2714 struct cookie_internal_data *a, *b;
2715 unsigned int i;
2716
2717 if (node->url != NULL) {
2718 nsurl_unref(node->url);
2719 }
2720
2721 if (node->scheme != NULL) {
2722 lwc_string_unref(node->scheme);
2723 }
2724
2725 free(node->segment);
2726 for (i = 0; i < node->frag_cnt; i++)
2727 free(node->fragment[i]);
2728 free(node->fragment);
2729
2730 free(node->urld.title);
2731
2732 for (a = node->cookies; a; a = b) {
2733 b = a->next;
2734 urldb_destroy_cookie(a);
2735 }
2736 }
2737
2738
2739 /**
2740 * Destroy protection space data
2741 *
2742 * \param space Protection space to destroy
2743 */
urldb_destroy_prot_space(struct prot_space_data * space)2744 static void urldb_destroy_prot_space(struct prot_space_data *space)
2745 {
2746 lwc_string_unref(space->scheme);
2747 free(space->realm);
2748 free(space->auth);
2749
2750 free(space);
2751 }
2752
2753
2754 /**
2755 * Destroy a path tree
2756 *
2757 * \param root Root node of tree to destroy
2758 */
urldb_destroy_path_tree(struct path_data * root)2759 static void urldb_destroy_path_tree(struct path_data *root)
2760 {
2761 struct path_data *p = root;
2762
2763 do {
2764 if (p->children != NULL) {
2765 p = p->children;
2766 } else {
2767 struct path_data *q = p;
2768
2769 while (p != root) {
2770 if (p->next != NULL) {
2771 p = p->next;
2772 break;
2773 }
2774
2775 p = p->parent;
2776
2777 urldb_destroy_path_node_content(q);
2778 free(q);
2779
2780 q = p;
2781 }
2782
2783 urldb_destroy_path_node_content(q);
2784 free(q);
2785 }
2786 } while (p != root);
2787 }
2788
2789
2790 /**
2791 * Destroy a host tree
2792 *
2793 * \param root Root node of tree to destroy
2794 */
urldb_destroy_host_tree(struct host_part * root)2795 static void urldb_destroy_host_tree(struct host_part *root)
2796 {
2797 struct host_part *a, *b;
2798 struct path_data *p, *q;
2799 struct prot_space_data *s, *t;
2800
2801 /* Destroy children */
2802 for (a = root->children; a; a = b) {
2803 b = a->next;
2804 urldb_destroy_host_tree(a);
2805 }
2806
2807 /* Now clean up paths */
2808 for (p = root->paths.children; p; p = q) {
2809 q = p->next;
2810 urldb_destroy_path_tree(p);
2811 }
2812
2813 /* Root path */
2814 urldb_destroy_path_node_content(&root->paths);
2815
2816 /* Proctection space data */
2817 for (s = root->prot_space; s; s = t) {
2818 t = s->next;
2819 urldb_destroy_prot_space(s);
2820 }
2821
2822 /* And ourselves */
2823 free(root->part);
2824 free(root);
2825 }
2826
2827
2828 /**
2829 * Destroy a search tree
2830 *
2831 * \param root Root node of tree to destroy
2832 */
urldb_destroy_search_tree(struct search_node * root)2833 static void urldb_destroy_search_tree(struct search_node *root)
2834 {
2835 /* Destroy children */
2836 if (root->left != &empty)
2837 urldb_destroy_search_tree(root->left);
2838 if (root->right != &empty)
2839 urldb_destroy_search_tree(root->right);
2840
2841 /* And destroy ourselves */
2842 free(root);
2843 }
2844
2845
2846 /*************** External interface ***************/
2847
2848
2849 /* exported interface documented in content/urldb.h */
urldb_destroy(void)2850 void urldb_destroy(void)
2851 {
2852 struct host_part *a, *b;
2853 int i;
2854
2855 /* Clean up search trees */
2856 for (i = 0; i < NUM_SEARCH_TREES; i++) {
2857 if (search_trees[i] != &empty) {
2858 urldb_destroy_search_tree(search_trees[i]);
2859 search_trees[i] = ∅
2860 }
2861 }
2862
2863 /* And database */
2864 for (a = db_root.children; a; a = b) {
2865 b = a->next;
2866 urldb_destroy_host_tree(a);
2867 }
2868 memset(&db_root, 0, sizeof(db_root));
2869
2870 /* And the bloom filter */
2871 if (url_bloom != NULL) {
2872 bloom_destroy(url_bloom);
2873 url_bloom = NULL;
2874 }
2875 }
2876
2877
2878 /* exported interface documented in netsurf/url_db.h */
urldb_load(const char * filename)2879 nserror urldb_load(const char *filename)
2880 {
2881 #define MAXIMUM_URL_LENGTH 4096
2882 char s[MAXIMUM_URL_LENGTH];
2883 char host[256];
2884 struct host_part *h;
2885 int urls;
2886 int i;
2887 int version;
2888 int length;
2889 FILE *fp;
2890
2891 assert(filename);
2892
2893 NSLOG(netsurf, INFO, "Loading URL file %s", filename);
2894
2895 if (url_bloom == NULL)
2896 url_bloom = bloom_create(BLOOM_SIZE);
2897
2898 fp = fopen(filename, "r");
2899 if (!fp) {
2900 NSLOG(netsurf, INFO, "Failed to open file '%s' for reading",
2901 filename);
2902 return NSERROR_NOT_FOUND;
2903 }
2904
2905 if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) {
2906 fclose(fp);
2907 return NSERROR_NEED_DATA;
2908 }
2909
2910 version = atoi(s);
2911 if (version < MIN_URL_FILE_VERSION) {
2912 NSLOG(netsurf, INFO, "Unsupported URL file version.");
2913 fclose(fp);
2914 return NSERROR_INVALID;
2915 }
2916 if (version > URL_FILE_VERSION) {
2917 NSLOG(netsurf, INFO, "Unknown URL file version.");
2918 fclose(fp);
2919 return NSERROR_INVALID;
2920 }
2921
2922 while (fgets(host, sizeof host, fp)) {
2923 time_t hsts_expiry = 0;
2924 int hsts_include_sub_domains = 0;
2925
2926 /* get the hostname */
2927 length = strlen(host) - 1;
2928 host[length] = '\0';
2929
2930 /* skip data that has ended up with a host of '' */
2931 if (length == 0) {
2932 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2933 break;
2934 urls = atoi(s);
2935 /* Eight fields/url */
2936 for (i = 0; i < (8 * urls); i++) {
2937 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2938 break;
2939 }
2940 continue;
2941 }
2942
2943 if (version >= 107) {
2944 char *p = host;
2945 while (*p && *p != ' ') p++;
2946 while (*p && *p == ' ') { *p = '\0'; p++; }
2947 hsts_include_sub_domains = (*p == '1');
2948 while (*p && *p != ' ') p++;
2949 while (*p && *p == ' ') p++;
2950 nsc_snptimet(p, strlen(p), &hsts_expiry);
2951 }
2952
2953 h = urldb_add_host(host);
2954 if (!h) {
2955 NSLOG(netsurf, INFO, "Failed adding host: '%s'", host);
2956 fclose(fp);
2957 return NSERROR_NOMEM;
2958 }
2959 h->hsts.expires = hsts_expiry;
2960 h->hsts.include_sub_domains = hsts_include_sub_domains;
2961
2962 /* read number of URLs */
2963 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2964 break;
2965 urls = atoi(s);
2966
2967 /* no URLs => try next host */
2968 if (urls == 0) {
2969 NSLOG(netsurf, INFO, "No URLs for '%s'", host);
2970 continue;
2971 }
2972
2973 /* load the non-corrupt data */
2974 for (i = 0; i < urls; i++) {
2975 struct path_data *p = NULL;
2976 char scheme[64], ports[10];
2977 char url[64 + 3 + 256 + 6 + 4096 + 1 + 1];
2978 unsigned int port;
2979 bool is_file = false;
2980 nsurl *nsurl;
2981 lwc_string *scheme_lwc, *fragment_lwc;
2982 char *path_query;
2983 size_t len;
2984
2985 if (!fgets(scheme, sizeof scheme, fp))
2986 break;
2987 length = strlen(scheme) - 1;
2988 scheme[length] = '\0';
2989
2990 if (!fgets(ports, sizeof ports, fp))
2991 break;
2992 length = strlen(ports) - 1;
2993 ports[length] = '\0';
2994 port = atoi(ports);
2995
2996 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2997 break;
2998 length = strlen(s) - 1;
2999 s[length] = '\0';
3000
3001 if (!strcasecmp(host, "localhost") &&
3002 !strcasecmp(scheme, "file"))
3003 is_file = true;
3004
3005 snprintf(url, sizeof url, "%s://%s%s%s%s",
3006 scheme,
3007 /* file URLs have no host */
3008 (is_file ? "" : host),
3009 (port ? ":" : ""),
3010 (port ? ports : ""),
3011 s);
3012
3013 /* TODO: store URLs in pre-parsed state, and make
3014 * a nsurl_load to generate the nsurl more
3015 * swiftly.
3016 * Need a nsurl_save too.
3017 */
3018 if (nsurl_create(url, &nsurl) != NSERROR_OK) {
3019 NSLOG(netsurf, INFO, "Failed inserting '%s'",
3020 url);
3021 fclose(fp);
3022 return NSERROR_NOMEM;
3023 }
3024
3025 if (url_bloom != NULL) {
3026 uint32_t hash = nsurl_hash(nsurl);
3027 bloom_insert_hash(url_bloom, hash);
3028 }
3029
3030 /* Copy and merge path/query strings */
3031 if (nsurl_get(nsurl, NSURL_PATH | NSURL_QUERY,
3032 &path_query, &len) != NSERROR_OK) {
3033 NSLOG(netsurf, INFO, "Failed inserting '%s'",
3034 url);
3035 fclose(fp);
3036 return NSERROR_NOMEM;
3037 }
3038
3039 scheme_lwc = nsurl_get_component(nsurl, NSURL_SCHEME);
3040 fragment_lwc = nsurl_get_component(nsurl,
3041 NSURL_FRAGMENT);
3042 p = urldb_add_path(scheme_lwc, port, h, path_query,
3043 fragment_lwc, nsurl);
3044 if (!p) {
3045 NSLOG(netsurf, INFO, "Failed inserting '%s'",
3046 url);
3047 fclose(fp);
3048 return NSERROR_NOMEM;
3049 }
3050 nsurl_unref(nsurl);
3051 lwc_string_unref(scheme_lwc);
3052 if (fragment_lwc != NULL)
3053 lwc_string_unref(fragment_lwc);
3054
3055 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3056 break;
3057 if (p)
3058 p->urld.visits = (unsigned int)atoi(s);
3059
3060 /* entry last use time */
3061 if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) {
3062 break;
3063 }
3064 if (p) {
3065 nsc_snptimet(s, strlen(s) - 1, &p->urld.last_visit);
3066 }
3067
3068 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3069 break;
3070 if (p)
3071 p->urld.type = (content_type)atoi(s);
3072
3073 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3074 break;
3075
3076
3077 if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3078 break;
3079 length = strlen(s) - 1;
3080 if (p && length > 0) {
3081 s[length] = '\0';
3082 p->urld.title = malloc(length + 1);
3083 if (p->urld.title)
3084 memcpy(p->urld.title, s, length + 1);
3085 }
3086 }
3087 }
3088
3089 fclose(fp);
3090 NSLOG(netsurf, INFO, "Successfully loaded URL file");
3091 #undef MAXIMUM_URL_LENGTH
3092
3093 return NSERROR_OK;
3094 }
3095
3096 /* exported interface documented in netsurf/url_db.h */
urldb_save(const char * filename)3097 nserror urldb_save(const char *filename)
3098 {
3099 FILE *fp;
3100 int i;
3101
3102 assert(filename);
3103
3104 fp = fopen(filename, "w");
3105 if (!fp) {
3106 NSLOG(netsurf, INFO, "Failed to open file '%s' for writing",
3107 filename);
3108 return NSERROR_SAVE_FAILED;
3109 }
3110
3111 /* file format version number */
3112 fprintf(fp, "%d\n", URL_FILE_VERSION);
3113
3114 for (i = 0; i != NUM_SEARCH_TREES; i++) {
3115 urldb_save_search_tree(search_trees[i], fp);
3116 }
3117
3118 fclose(fp);
3119
3120 return NSERROR_OK;
3121 }
3122
3123
3124 /* exported interface documented in content/urldb.h */
urldb_set_url_persistence(nsurl * url,bool persist)3125 nserror urldb_set_url_persistence(nsurl *url, bool persist)
3126 {
3127 struct path_data *p;
3128
3129 assert(url);
3130
3131 p = urldb_find_url(url);
3132 if (!p) {
3133 return NSERROR_NOT_FOUND;
3134 }
3135
3136 p->persistent = persist;
3137
3138 return NSERROR_OK;
3139 }
3140
3141
3142 /* exported interface documented in content/urldb.h */
urldb_add_url(nsurl * url)3143 bool urldb_add_url(nsurl *url)
3144 {
3145 struct host_part *h;
3146 struct path_data *p;
3147 lwc_string *scheme;
3148 lwc_string *port;
3149 lwc_string *host;
3150 lwc_string *fragment;
3151 const char *host_str;
3152 char *path_query = NULL;
3153 size_t len;
3154 bool match;
3155 unsigned int port_int;
3156
3157 assert(url);
3158
3159 if (url_bloom == NULL)
3160 url_bloom = bloom_create(BLOOM_SIZE);
3161
3162 if (url_bloom != NULL) {
3163 uint32_t hash = nsurl_hash(url);
3164 bloom_insert_hash(url_bloom, hash);
3165 }
3166
3167 /* Copy and merge path/query strings */
3168 if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &path_query, &len) !=
3169 NSERROR_OK) {
3170 return false;
3171 }
3172 assert(path_query != NULL);
3173
3174 scheme = nsurl_get_component(url, NSURL_SCHEME);
3175 if (scheme == NULL) {
3176 free(path_query);
3177 return false;
3178 }
3179
3180 host = nsurl_get_component(url, NSURL_HOST);
3181 if (host != NULL) {
3182 host_str = lwc_string_data(host);
3183 lwc_string_unref(host);
3184
3185 } else if (lwc_string_isequal(scheme, corestring_lwc_file, &match) ==
3186 lwc_error_ok && match == true) {
3187 host_str = "localhost";
3188
3189 } else {
3190 lwc_string_unref(scheme);
3191 free(path_query);
3192 return false;
3193 }
3194
3195 fragment = nsurl_get_component(url, NSURL_FRAGMENT);
3196
3197 port = nsurl_get_component(url, NSURL_PORT);
3198 if (port != NULL) {
3199 port_int = atoi(lwc_string_data(port));
3200 lwc_string_unref(port);
3201 } else {
3202 port_int = 0;
3203 }
3204
3205 /* Get host entry */
3206 h = urldb_add_host(host_str);
3207
3208 /* Get path entry */
3209 if (h != NULL) {
3210 p = urldb_add_path(scheme,
3211 port_int,
3212 h,
3213 path_query,
3214 fragment,
3215 url);
3216 } else {
3217 p = NULL;
3218 }
3219
3220 lwc_string_unref(scheme);
3221 if (fragment != NULL)
3222 lwc_string_unref(fragment);
3223
3224 return (p != NULL);
3225 }
3226
3227
3228 /* exported interface documented in content/urldb.h */
urldb_set_url_title(nsurl * url,const char * title)3229 nserror urldb_set_url_title(nsurl *url, const char *title)
3230 {
3231 struct path_data *p;
3232 char *temp;
3233
3234 assert(url);
3235
3236 p = urldb_find_url(url);
3237 if (p == NULL) {
3238 return NSERROR_NOT_FOUND;
3239 }
3240
3241 /* copy the parameter if necessary */
3242 if (title != NULL) {
3243 temp = strdup(title);
3244 if (temp == NULL) {
3245 return NSERROR_NOMEM;
3246 }
3247 } else {
3248 temp = NULL;
3249 }
3250
3251 free(p->urld.title);
3252 p->urld.title = temp;
3253
3254 return NSERROR_OK;
3255 }
3256
3257
3258 /* exported interface documented in content/urldb.h */
urldb_set_url_content_type(nsurl * url,content_type type)3259 nserror urldb_set_url_content_type(nsurl *url, content_type type)
3260 {
3261 struct path_data *p;
3262
3263 assert(url);
3264
3265 p = urldb_find_url(url);
3266 if (!p) {
3267 return NSERROR_NOT_FOUND;
3268 }
3269
3270 p->urld.type = type;
3271
3272 return NSERROR_OK;
3273 }
3274
3275
3276 /* exported interface documented in content/urldb.h */
urldb_update_url_visit_data(nsurl * url)3277 nserror urldb_update_url_visit_data(nsurl *url)
3278 {
3279 struct path_data *p;
3280
3281 assert(url);
3282
3283 p = urldb_find_url(url);
3284 if (!p) {
3285 return NSERROR_NOT_FOUND;
3286 }
3287
3288 p->urld.last_visit = time(NULL);
3289 p->urld.visits++;
3290
3291 return NSERROR_OK;
3292 }
3293
3294
3295 /* exported interface documented in content/urldb.h */
urldb_reset_url_visit_data(nsurl * url)3296 void urldb_reset_url_visit_data(nsurl *url)
3297 {
3298 struct path_data *p;
3299
3300 assert(url);
3301
3302 p = urldb_find_url(url);
3303 if (!p)
3304 return;
3305
3306 p->urld.last_visit = (time_t)0;
3307 p->urld.visits = 0;
3308 }
3309
3310
3311 /* exported interface documented in netsurf/url_db.h */
urldb_get_url_data(nsurl * url)3312 const struct url_data *urldb_get_url_data(nsurl *url)
3313 {
3314 struct path_data *p;
3315 struct url_internal_data *u;
3316
3317 assert(url);
3318
3319 p = urldb_find_url(url);
3320 if (!p)
3321 return NULL;
3322
3323 u = &p->urld;
3324
3325 return (const struct url_data *) u;
3326 }
3327
3328
3329 /* exported interface documented in content/urldb.h */
urldb_get_url(nsurl * url)3330 nsurl *urldb_get_url(nsurl *url)
3331 {
3332 struct path_data *p;
3333
3334 assert(url);
3335
3336 p = urldb_find_url(url);
3337 if (!p)
3338 return NULL;
3339
3340 return p->url;
3341 }
3342
3343
3344 /* exported interface documented in netsurf/url_db.h */
urldb_set_auth_details(nsurl * url,const char * realm,const char * auth)3345 void urldb_set_auth_details(nsurl *url, const char *realm, const char *auth)
3346 {
3347 struct path_data *p, *pi;
3348 struct host_part *h;
3349 struct prot_space_data *space, *space_alloc;
3350 char *realm_alloc, *auth_alloc;
3351 bool match;
3352
3353 assert(url && realm && auth);
3354
3355 /* add url, in case it's missing */
3356 urldb_add_url(url);
3357
3358 p = urldb_find_url(url);
3359
3360 if (!p)
3361 return;
3362
3363 /* Search for host_part */
3364 for (pi = p; pi->parent != NULL; pi = pi->parent)
3365 ;
3366 h = (struct host_part *)pi;
3367
3368 /* Search if given URL belongs to a protection space we already know of. */
3369 for (space = h->prot_space; space; space = space->next) {
3370 if (!strcmp(space->realm, realm) &&
3371 lwc_string_isequal(space->scheme, p->scheme,
3372 &match) == lwc_error_ok &&
3373 match == true &&
3374 space->port == p->port)
3375 break;
3376 }
3377
3378 if (space != NULL) {
3379 /* Overrule existing auth. */
3380 free(space->auth);
3381 space->auth = strdup(auth);
3382 } else {
3383 /* Create a new protection space. */
3384 space = space_alloc = malloc(sizeof(struct prot_space_data));
3385 realm_alloc = strdup(realm);
3386 auth_alloc = strdup(auth);
3387
3388 if (!space_alloc || !realm_alloc || !auth_alloc) {
3389 free(space_alloc);
3390 free(realm_alloc);
3391 free(auth_alloc);
3392 return;
3393 }
3394
3395 space->scheme = lwc_string_ref(p->scheme);
3396 space->port = p->port;
3397 space->realm = realm_alloc;
3398 space->auth = auth_alloc;
3399 space->next = h->prot_space;
3400 h->prot_space = space;
3401 }
3402
3403 p->prot_space = space;
3404 }
3405
3406
3407 /* exported interface documented in netsurf/url_db.h */
urldb_get_auth_details(nsurl * url,const char * realm)3408 const char *urldb_get_auth_details(nsurl *url, const char *realm)
3409 {
3410 struct path_data *p, *p_cur, *p_top;
3411
3412 assert(url);
3413
3414 /* add to the db, so our lookup will work */
3415 urldb_add_url(url);
3416
3417 p = urldb_find_url(url);
3418 if (!p)
3419 return NULL;
3420
3421 /* Check for any auth details attached to the path_data node or any of
3422 * its parents.
3423 */
3424 for (p_cur = p; p_cur != NULL; p_top = p_cur, p_cur = p_cur->parent) {
3425 if (p_cur->prot_space) {
3426 return p_cur->prot_space->auth;
3427 }
3428 }
3429
3430 /* Only when we have a realm (and canonical root of given URL), we can
3431 * uniquely locate the protection space.
3432 */
3433 if (realm != NULL) {
3434 const struct host_part *h = (const struct host_part *)p_top;
3435 const struct prot_space_data *space;
3436 bool match;
3437
3438 /* Search for a possible matching protection space. */
3439 for (space = h->prot_space; space != NULL;
3440 space = space->next) {
3441 if (!strcmp(space->realm, realm) &&
3442 lwc_string_isequal(space->scheme,
3443 p->scheme, &match) ==
3444 lwc_error_ok &&
3445 match == true &&
3446 space->port == p->port) {
3447 p->prot_space = space;
3448 return p->prot_space->auth;
3449 }
3450 }
3451 }
3452
3453 return NULL;
3454 }
3455
3456
3457 /* exported interface documented in netsurf/url_db.h */
urldb_set_cert_permissions(nsurl * url,bool permit)3458 void urldb_set_cert_permissions(nsurl *url, bool permit)
3459 {
3460 struct path_data *p;
3461 struct host_part *h;
3462
3463 assert(url);
3464
3465 /* add url, in case it's missing */
3466 urldb_add_url(url);
3467
3468 p = urldb_find_url(url);
3469 if (!p)
3470 return;
3471
3472 for (; p && p->parent; p = p->parent)
3473 /* do nothing */;
3474 assert(p);
3475
3476 h = (struct host_part *)p;
3477
3478 h->permit_invalid_certs = permit;
3479 }
3480
3481
3482 /* exported interface documented in content/urldb.h */
urldb_get_cert_permissions(nsurl * url)3483 bool urldb_get_cert_permissions(nsurl *url)
3484 {
3485 struct path_data *p;
3486 const struct host_part *h;
3487
3488 assert(url);
3489
3490 p = urldb_find_url(url);
3491 if (!p)
3492 return false;
3493
3494 for (; p && p->parent; p = p->parent)
3495 /* do nothing */;
3496 assert(p);
3497
3498 h = (const struct host_part *)p;
3499
3500 return h->permit_invalid_certs;
3501 }
3502
3503
3504 /* exported interface documented in content/urldb.h */
urldb_set_hsts_policy(struct nsurl * url,const char * header)3505 bool urldb_set_hsts_policy(struct nsurl *url, const char *header)
3506 {
3507 struct path_data *p;
3508 struct host_part *h;
3509 lwc_string *host;
3510 time_t now = time(NULL);
3511 http_strict_transport_security *sts;
3512 uint32_t max_age = 0;
3513 nserror error;
3514
3515 assert(url);
3516
3517 host = nsurl_get_component(url, NSURL_HOST);
3518 if (host != NULL) {
3519 if (urldb__host_is_ip_address(lwc_string_data(host))) {
3520 /* Host is IP: ignore */
3521 lwc_string_unref(host);
3522 return true;
3523 } else if (lwc_string_length(host) == 0) {
3524 /* Host is blank: ignore */
3525 lwc_string_unref(host);
3526 return true;
3527 }
3528
3529 lwc_string_unref(host);
3530 } else {
3531 /* No host part: ignore */
3532 return true;
3533 }
3534
3535 /* add url, in case it's missing */
3536 urldb_add_url(url);
3537
3538 p = urldb_find_url(url);
3539 if (!p)
3540 return false;
3541
3542 for (; p && p->parent; p = p->parent)
3543 /* do nothing */;
3544 assert(p);
3545
3546 h = (struct host_part *)p;
3547 if (h->permit_invalid_certs) {
3548 /* Transport is tainted: ignore */
3549 return true;
3550 }
3551
3552 error = http_parse_strict_transport_security(header, &sts);
3553 if (error != NSERROR_OK) {
3554 /* Parse failed: ignore */
3555 return true;
3556 }
3557
3558 h->hsts.include_sub_domains =
3559 http_strict_transport_security_include_subdomains(sts);
3560
3561 max_age = http_strict_transport_security_max_age(sts);
3562 if (max_age == 0) {
3563 h->hsts.expires = 0;
3564 h->hsts.include_sub_domains = false;
3565 } else if ((time_t) (now + max_age) > h->hsts.expires) {
3566 h->hsts.expires = now + max_age;
3567 }
3568
3569 http_strict_transport_security_destroy(sts);
3570
3571 return true;
3572 }
3573
3574
3575 /* exported interface documented in content/urldb.h */
urldb_get_hsts_enabled(struct nsurl * url)3576 bool urldb_get_hsts_enabled(struct nsurl *url)
3577 {
3578 struct path_data *p;
3579 const struct host_part *h;
3580 lwc_string *host;
3581 time_t now = time(NULL);
3582
3583 assert(url);
3584
3585 host = nsurl_get_component(url, NSURL_HOST);
3586 if (host != NULL) {
3587 if (urldb__host_is_ip_address(lwc_string_data(host))) {
3588 /* Host is IP: not enabled */
3589 lwc_string_unref(host);
3590 return false;
3591 } else if (lwc_string_length(host) == 0) {
3592 /* Host is blank: not enabled */
3593 lwc_string_unref(host);
3594 return false;
3595 }
3596
3597 lwc_string_unref(host);
3598 } else {
3599 /* No host part: not enabled */
3600 return false;
3601 }
3602
3603 /* The URL must exist in the db in order to find HSTS policy, since
3604 * we search up the tree from the URL node, and policy from further
3605 * up may also apply. */
3606 urldb_add_url(url);
3607
3608 p = urldb_find_url(url);
3609 if (!p)
3610 return false;
3611
3612 for (; p && p->parent; p = p->parent)
3613 /* do nothing */;
3614 assert(p);
3615
3616 h = (const struct host_part *)p;
3617
3618 /* Consult record for this host */
3619 if (h->hsts.expires > now) {
3620 /* Not expired */
3621 return true;
3622 }
3623
3624 /* Consult parent domains */
3625 for (h = h->parent; h && h != &db_root; h = h->parent) {
3626 if (h->hsts.expires > now && h->hsts.include_sub_domains) {
3627 /* Not expired and subdomains included */
3628 return true;
3629 }
3630 }
3631
3632 return false;
3633 }
3634
3635
3636 /* exported interface documented in netsurf/url_db.h */
3637 void
urldb_iterate_partial(const char * prefix,bool (* callback)(nsurl * url,const struct url_data * data))3638 urldb_iterate_partial(const char *prefix,
3639 bool (*callback)(nsurl *url, const struct url_data *data))
3640 {
3641 char host[256];
3642 char buf[260]; /* max domain + "www." */
3643 const char *slash, *scheme_sep;
3644 struct search_node *tree;
3645 const struct host_part *h;
3646
3647 assert(prefix && callback);
3648
3649 /* strip scheme */
3650 scheme_sep = strstr(prefix, "://");
3651 if (scheme_sep)
3652 prefix = scheme_sep + 3;
3653
3654 slash = strchr(prefix, '/');
3655 tree = urldb_get_search_tree(prefix);
3656
3657 if (slash) {
3658 /* if there's a slash in the input, then we can
3659 * assume that we're looking for a path */
3660 snprintf(host, sizeof host, "%.*s",
3661 (int) (slash - prefix), prefix);
3662
3663 h = urldb_search_find(tree, host);
3664 if (!h) {
3665 int len = slash - prefix;
3666
3667 if (len <= 3 || strncasecmp(host, "www.", 4) != 0) {
3668 snprintf(buf, sizeof buf, "www.%s", host);
3669 h = urldb_search_find(
3670 search_trees[ST_DN + 'w' - 'a'],
3671 buf);
3672 if (!h)
3673 return;
3674 } else
3675 return;
3676 }
3677
3678 if (h->paths.children) {
3679 /* Have paths, iterate them */
3680 urldb_iterate_partial_path(&h->paths, slash + 1,
3681 callback);
3682 }
3683
3684 } else {
3685 int len = strlen(prefix);
3686
3687 /* looking for hosts */
3688 if (!urldb_iterate_partial_host(tree, prefix, callback))
3689 return;
3690
3691 if (len <= 3 || strncasecmp(prefix, "www.", 4) != 0) {
3692 /* now look for www.prefix */
3693 snprintf(buf, sizeof buf, "www.%s", prefix);
3694 if(!urldb_iterate_partial_host(
3695 search_trees[ST_DN + 'w' - 'a'],
3696 buf, callback))
3697 return;
3698 }
3699 }
3700 }
3701
3702
3703 /* exported interface documented in netsurf/url_db.h */
3704 void
urldb_iterate_entries(bool (* callback)(nsurl * url,const struct url_data * data))3705 urldb_iterate_entries(bool (*callback)(nsurl *url, const struct url_data *data))
3706 {
3707 int i;
3708
3709 assert(callback);
3710
3711 for (i = 0; i < NUM_SEARCH_TREES; i++) {
3712 if (!urldb_iterate_entries_host(search_trees[i],
3713 callback,
3714 NULL)) {
3715 break;
3716 }
3717 }
3718 }
3719
3720
3721 /* exported interface documented in content/urldb.h */
urldb_iterate_cookies(bool (* callback)(const struct cookie_data * data))3722 void urldb_iterate_cookies(bool (*callback)(const struct cookie_data *data))
3723 {
3724 int i;
3725
3726 assert(callback);
3727
3728 for (i = 0; i < NUM_SEARCH_TREES; i++) {
3729 if (!urldb_iterate_entries_host(search_trees[i],
3730 NULL, callback))
3731 break;
3732 }
3733 }
3734
3735
3736 /* exported interface documented in content/urldb.h */
urldb_set_cookie(const char * header,nsurl * url,nsurl * referer)3737 bool urldb_set_cookie(const char *header, nsurl *url, nsurl *referer)
3738 {
3739 const char *cur = header, *end;
3740 lwc_string *path, *host, *scheme;
3741 nsurl *urlt;
3742 bool match;
3743
3744 assert(url && header);
3745
3746 /* Get defragmented URL, as 'urlt' */
3747 if (nsurl_defragment(url, &urlt) != NSERROR_OK)
3748 return NULL;
3749
3750 scheme = nsurl_get_component(url, NSURL_SCHEME);
3751 if (scheme == NULL) {
3752 nsurl_unref(urlt);
3753 return false;
3754 }
3755
3756 path = nsurl_get_component(url, NSURL_PATH);
3757 if (path == NULL) {
3758 lwc_string_unref(scheme);
3759 nsurl_unref(urlt);
3760 return false;
3761 }
3762
3763 host = nsurl_get_component(url, NSURL_HOST);
3764 if (host == NULL) {
3765 lwc_string_unref(path);
3766 lwc_string_unref(scheme);
3767 nsurl_unref(urlt);
3768 return false;
3769 }
3770
3771 if (referer) {
3772 lwc_string *rhost;
3773
3774 /* Ensure that url's host name domain matches
3775 * referer's (4.3.5) */
3776 rhost = nsurl_get_component(referer, NSURL_HOST);
3777 if (rhost == NULL) {
3778 goto error;
3779 }
3780
3781 /* Domain match host names */
3782 if (lwc_string_isequal(host, rhost, &match) == lwc_error_ok &&
3783 match == false) {
3784 const char *hptr;
3785 const char *rptr;
3786 const char *dot;
3787 const char *host_data = lwc_string_data(host);
3788 const char *rhost_data = lwc_string_data(rhost);
3789
3790 /* Ensure neither host nor rhost are IP addresses */
3791 if (urldb__host_is_ip_address(host_data) ||
3792 urldb__host_is_ip_address(rhost_data)) {
3793 /* IP address, so no partial match */
3794 lwc_string_unref(rhost);
3795 goto error;
3796 }
3797
3798 /* Not exact match, so try the following:
3799 *
3800 * 1) Find the longest common suffix of host and rhost
3801 * (may be all of host/rhost)
3802 * 2) Discard characters from the start of the suffix
3803 * until the suffix starts with a dot
3804 * (prevents foobar.com matching bar.com)
3805 * 3) Ensure the suffix is non-empty and contains
3806 * embedded dots (to avoid permitting .com as a
3807 * suffix)
3808 *
3809 * Note that the above in no way resembles the
3810 * domain matching algorithm found in RFC2109.
3811 * It does, however, model the real world rather
3812 * more accurately.
3813 */
3814
3815 /** \todo In future, we should consult a TLD service
3816 * instead of just looking for embedded dots.
3817 */
3818
3819 hptr = host_data + lwc_string_length(host) - 1;
3820 rptr = rhost_data + lwc_string_length(rhost) - 1;
3821
3822 /* 1 */
3823 while (hptr >= host_data && rptr >= rhost_data) {
3824 if (*hptr != *rptr)
3825 break;
3826 hptr--;
3827 rptr--;
3828 }
3829 /* Ensure we end up pointing at the start of the
3830 * common suffix. The above loop will exit pointing
3831 * to the byte before the start of the suffix. */
3832 hptr++;
3833
3834 /* 2 */
3835 while (*hptr != '\0' && *hptr != '.')
3836 hptr++;
3837
3838 /* 3 */
3839 if (*hptr == '\0' ||
3840 (dot = strchr(hptr + 1, '.')) == NULL ||
3841 *(dot + 1) == '\0') {
3842 lwc_string_unref(rhost);
3843 goto error;
3844 }
3845 }
3846
3847 lwc_string_unref(rhost);
3848 }
3849
3850 end = cur + strlen(cur) - 2 /* Trailing CRLF */;
3851
3852 do {
3853 struct cookie_internal_data *c;
3854 char *dot;
3855 size_t len;
3856 #ifdef WITH_NSPSL
3857 const char *suffix;
3858 #endif
3859
3860 c = urldb_parse_cookie(url, &cur);
3861 if (!c) {
3862 /* failed => stop parsing */
3863 goto error;
3864 }
3865
3866 /* validate cookie */
3867
3868 /* 4.2.2:i Cookie must have NAME and VALUE */
3869 if (!c->name || !c->value) {
3870 urldb_free_cookie(c);
3871 goto error;
3872 }
3873
3874 /* 4.3.2:i Cookie path must be a prefix of URL path */
3875 len = strlen(c->path);
3876 if (len > lwc_string_length(path) ||
3877 strncmp(c->path, lwc_string_data(path),
3878 len) != 0) {
3879 urldb_free_cookie(c);
3880 goto error;
3881 }
3882
3883 #ifdef WITH_NSPSL
3884 /* check domain is not a public suffix */
3885 dot = c->domain;
3886 if (*dot == '.') {
3887 dot++;
3888 }
3889 suffix = nspsl_getpublicsuffix(dot);
3890 if (suffix == NULL) {
3891 NSLOG(netsurf, INFO,
3892 "domain %s was a public suffix domain", dot);
3893 urldb_free_cookie(c);
3894 goto error;
3895 }
3896 #else
3897 /* 4.3.2:ii Cookie domain must contain embedded dots */
3898 dot = strchr(c->domain + 1, '.');
3899 if (!dot || *(dot + 1) == '\0') {
3900 /* no embedded dots */
3901 urldb_free_cookie(c);
3902 goto error;
3903 }
3904 #endif
3905
3906 /* Domain match fetch host with cookie domain */
3907 if (strcasecmp(lwc_string_data(host), c->domain) != 0) {
3908 int hlen, dlen;
3909 char *domain = c->domain;
3910
3911 /* c->domain must be a domain cookie here because:
3912 * c->domain is either:
3913 * + specified in the header as a domain cookie
3914 * (non-domain cookies in the header are ignored
3915 * by urldb_parse_cookie / urldb_parse_avpair)
3916 * + defaulted to the URL's host part
3917 * (by urldb_parse_cookie if no valid domain was
3918 * specified in the header)
3919 *
3920 * The latter will pass the strcasecmp above, which
3921 * leaves the former (i.e. a domain cookie)
3922 */
3923 assert(c->domain[0] == '.');
3924
3925 /* 4.3.2:iii */
3926 if (urldb__host_is_ip_address(lwc_string_data(host))) {
3927 /* IP address, so no partial match */
3928 urldb_free_cookie(c);
3929 goto error;
3930 }
3931
3932 hlen = lwc_string_length(host);
3933 dlen = strlen(c->domain);
3934
3935 if (hlen <= dlen && hlen != dlen - 1) {
3936 /* Partial match not possible */
3937 urldb_free_cookie(c);
3938 goto error;
3939 }
3940
3941 if (hlen == dlen - 1) {
3942 /* Relax matching to allow
3943 * host a.com to match .a.com */
3944 domain++;
3945 dlen--;
3946 }
3947
3948 if (strcasecmp(lwc_string_data(host) + (hlen - dlen),
3949 domain)) {
3950 urldb_free_cookie(c);
3951 goto error;
3952 }
3953
3954 /* 4.3.2:iv Ensure H contains no dots
3955 *
3956 * If you believe the spec, H should contain no
3957 * dots in _any_ cookie. Unfortunately, however,
3958 * reality differs in that many sites send domain
3959 * cookies of the form .foo.com from hosts such
3960 * as bar.bat.foo.com and then expect domain
3961 * matching to work. Thus we have to do what they
3962 * expect, regardless of any potential security
3963 * implications.
3964 *
3965 * This is what code conforming to the spec would
3966 * look like:
3967 *
3968 * for (int i = 0; i < (hlen - dlen); i++) {
3969 * if (host[i] == '.') {
3970 * urldb_free_cookie(c);
3971 * goto error;
3972 * }
3973 * }
3974 */
3975 }
3976
3977 /* Now insert into database */
3978 if (!urldb_insert_cookie(c, scheme, urlt))
3979 goto error;
3980 } while (cur < end);
3981
3982 lwc_string_unref(host);
3983 lwc_string_unref(path);
3984 lwc_string_unref(scheme);
3985 nsurl_unref(urlt);
3986
3987 return true;
3988
3989 error:
3990 lwc_string_unref(host);
3991 lwc_string_unref(path);
3992 lwc_string_unref(scheme);
3993 nsurl_unref(urlt);
3994
3995 return false;
3996 }
3997
3998
3999 /* exported interface documented in content/urldb.h */
urldb_get_cookie(nsurl * url,bool include_http_only)4000 char *urldb_get_cookie(nsurl *url, bool include_http_only)
4001 {
4002 const struct path_data *p, *q;
4003 const struct host_part *h;
4004 lwc_string *path_lwc;
4005 struct cookie_internal_data *c;
4006 int count = 0, version = COOKIE_RFC2965;
4007 struct cookie_internal_data **matched_cookies;
4008 int matched_cookies_size = 20;
4009 int ret_alloc = 4096, ret_used = 1;
4010 const char *path;
4011 char *ret;
4012 lwc_string *scheme;
4013 time_t now;
4014 int i;
4015 bool match;
4016
4017 assert(url != NULL);
4018
4019 /* The URL must exist in the db in order to find relevant cookies, since
4020 * we search up the tree from the URL node, and cookies from further
4021 * up also apply. */
4022 urldb_add_url(url);
4023
4024 p = urldb_find_url(url);
4025 if (!p)
4026 return NULL;
4027
4028 scheme = p->scheme;
4029
4030 matched_cookies = malloc(matched_cookies_size *
4031 sizeof(struct cookie_internal_data *));
4032 if (!matched_cookies)
4033 return NULL;
4034
4035 #define GROW_MATCHED_COOKIES \
4036 do { \
4037 if (count == matched_cookies_size) { \
4038 struct cookie_internal_data **temp; \
4039 temp = realloc(matched_cookies, \
4040 (matched_cookies_size + 20) * \
4041 sizeof(struct cookie_internal_data *)); \
4042 \
4043 if (temp == NULL) { \
4044 free(ret); \
4045 free(matched_cookies); \
4046 return NULL; \
4047 } \
4048 \
4049 matched_cookies = temp; \
4050 matched_cookies_size += 20; \
4051 } \
4052 } while(0)
4053
4054 ret = malloc(ret_alloc);
4055 if (!ret) {
4056 free(matched_cookies);
4057 return NULL;
4058 }
4059
4060 ret[0] = '\0';
4061
4062 path_lwc = nsurl_get_component(url, NSURL_PATH);
4063 if (path_lwc == NULL) {
4064 free(ret);
4065 free(matched_cookies);
4066 return NULL;
4067 }
4068 path = lwc_string_data(path_lwc);
4069 lwc_string_unref(path_lwc);
4070
4071 now = time(NULL);
4072
4073 if (*(p->segment) != '\0') {
4074 /* Match exact path, unless directory, when prefix matching
4075 * will handle this case for us. */
4076 for (q = p->parent->children; q; q = q->next) {
4077 if (strcmp(q->segment, p->segment))
4078 continue;
4079
4080 /* Consider all cookies associated with
4081 * this exact path */
4082 for (c = q->cookies; c; c = c->next) {
4083 if (c->expires != -1 && c->expires < now)
4084 /* cookie has expired => ignore */
4085 continue;
4086
4087 if (c->secure && lwc_string_isequal(
4088 q->scheme,
4089 corestring_lwc_https,
4090 &match) &&
4091 match == false)
4092 /* secure cookie for insecure host.
4093 * ignore */
4094 continue;
4095
4096 if (c->http_only && !include_http_only)
4097 /* Ignore HttpOnly */
4098 continue;
4099
4100 matched_cookies[count++] = c;
4101
4102 GROW_MATCHED_COOKIES;
4103
4104 if (c->version < (unsigned int)version)
4105 version = c->version;
4106
4107 c->last_used = now;
4108
4109 cookie_manager_add((struct cookie_data *)c);
4110 }
4111 }
4112 }
4113
4114 /* Now consider cookies whose paths prefix-match ours */
4115 for (p = p->parent; p; p = p->parent) {
4116 /* Find directory's path entry(ies) */
4117 /* There are potentially multiple due to differing schemes */
4118 for (q = p->children; q; q = q->next) {
4119 if (*(q->segment) != '\0')
4120 continue;
4121
4122 for (c = q->cookies; c; c = c->next) {
4123 if (c->expires != -1 && c->expires < now)
4124 /* cookie has expired => ignore */
4125 continue;
4126
4127 if (c->secure && lwc_string_isequal(
4128 q->scheme,
4129 corestring_lwc_https,
4130 &match) &&
4131 match == false)
4132 /* Secure cookie for insecure server
4133 * => ignore */
4134 continue;
4135
4136 matched_cookies[count++] = c;
4137
4138 GROW_MATCHED_COOKIES;
4139
4140 if (c->version < (unsigned int) version)
4141 version = c->version;
4142
4143 c->last_used = now;
4144
4145 cookie_manager_add((struct cookie_data *)c);
4146 }
4147 }
4148
4149 if (!p->parent) {
4150 /* No parent, so bail here. This can't go in
4151 * the loop exit condition as we also want to
4152 * process the top-level node.
4153 *
4154 * If p->parent is NULL then p->cookies are
4155 * the domain cookies and thus we don't even
4156 * try matching against them.
4157 */
4158 break;
4159 }
4160
4161 /* Consider p itself - may be the result of Path=/foo */
4162 for (c = p->cookies; c; c = c->next) {
4163 if (c->expires != -1 && c->expires < now)
4164 /* cookie has expired => ignore */
4165 continue;
4166
4167 /* Ensure cookie path is a prefix of the resource */
4168 if (strncmp(c->path, path, strlen(c->path)) != 0)
4169 /* paths don't match => ignore */
4170 continue;
4171
4172 if (c->secure && lwc_string_isequal(p->scheme,
4173 corestring_lwc_https,
4174 &match) &&
4175 match == false)
4176 /* Secure cookie for insecure server
4177 * => ignore */
4178 continue;
4179
4180 matched_cookies[count++] = c;
4181
4182 GROW_MATCHED_COOKIES;
4183
4184 if (c->version < (unsigned int) version)
4185 version = c->version;
4186
4187 c->last_used = now;
4188
4189 cookie_manager_add((struct cookie_data *)c);
4190 }
4191
4192 }
4193
4194 /* Finally consider domain cookies for hosts which domain match ours */
4195 for (h = (const struct host_part *)p; h && h != &db_root;
4196 h = h->parent) {
4197 for (c = h->paths.cookies; c; c = c->next) {
4198 if (c->expires != -1 && c->expires < now)
4199 /* cookie has expired => ignore */
4200 continue;
4201
4202 /* Ensure cookie path is a prefix of the resource */
4203 if (strncmp(c->path, path, strlen(c->path)) != 0)
4204 /* paths don't match => ignore */
4205 continue;
4206
4207 if (c->secure && lwc_string_isequal(scheme,
4208 corestring_lwc_https,
4209 &match) &&
4210 match == false)
4211 /* secure cookie for insecure host. ignore */
4212 continue;
4213
4214 matched_cookies[count++] = c;
4215
4216 GROW_MATCHED_COOKIES;
4217
4218 if (c->version < (unsigned int)version)
4219 version = c->version;
4220
4221 c->last_used = now;
4222
4223 cookie_manager_add((struct cookie_data *)c);
4224 }
4225 }
4226
4227 if (count == 0) {
4228 /* No cookies found */
4229 free(ret);
4230 free(matched_cookies);
4231 return NULL;
4232 }
4233
4234 /* and build output string */
4235 if (version > COOKIE_NETSCAPE) {
4236 sprintf(ret, "$Version=%d", version);
4237 ret_used = strlen(ret) + 1;
4238 }
4239
4240 for (i = 0; i < count; i++) {
4241 if (!urldb_concat_cookie(matched_cookies[i], version,
4242 &ret_used, &ret_alloc, &ret)) {
4243 free(ret);
4244 free(matched_cookies);
4245 return NULL;
4246 }
4247 }
4248
4249 if (version == COOKIE_NETSCAPE) {
4250 /* Old-style cookies => no version & skip "; " */
4251 memmove(ret, ret + 2, ret_used - 2);
4252 ret_used -= 2;
4253 }
4254
4255 /* Now, shrink the output buffer to the required size */
4256 {
4257 char *temp = realloc(ret, ret_used);
4258 if (!temp) {
4259 free(ret);
4260 free(matched_cookies);
4261 return NULL;
4262 }
4263
4264 ret = temp;
4265 }
4266
4267 free(matched_cookies);
4268
4269 return ret;
4270
4271 #undef GROW_MATCHED_COOKIES
4272 }
4273
4274
4275 /* exported interface documented in content/urldb.h */
urldb_delete_cookie(const char * domain,const char * path,const char * name)4276 void urldb_delete_cookie(const char *domain, const char *path,
4277 const char *name)
4278 {
4279 urldb_delete_cookie_hosts(domain, path, name, &db_root);
4280 }
4281
4282
4283 /* exported interface documented in content/urldb.h */
urldb_load_cookies(const char * filename)4284 void urldb_load_cookies(const char *filename)
4285 {
4286 FILE *fp;
4287 char s[16*1024];
4288
4289 assert(filename);
4290
4291 fp = fopen(filename, "r");
4292 if (!fp)
4293 return;
4294
4295 #define FIND_T { \
4296 for (; *p && *p != '\t'; p++) \
4297 ; /* do nothing */ \
4298 if (p >= end) { \
4299 NSLOG(netsurf, INFO, "Overran input"); \
4300 continue; \
4301 } \
4302 *p++ = '\0'; \
4303 }
4304
4305 #define SKIP_T { \
4306 for (; *p && *p == '\t'; p++) \
4307 ; /* do nothing */ \
4308 if (p >= end) { \
4309 NSLOG(netsurf, INFO, "Overran input"); \
4310 continue; \
4311 } \
4312 }
4313
4314 while (fgets(s, sizeof s, fp)) {
4315 char *p = s, *end = 0,
4316 *domain, *path, *name, *value, *scheme, *url,
4317 *comment;
4318 int version, domain_specified, path_specified,
4319 secure, http_only, no_destroy, value_quoted;
4320 time_t expires, last_used;
4321 struct cookie_internal_data *c;
4322
4323 if(s[0] == 0 || s[0] == '#')
4324 /* Skip blank lines or comments */
4325 continue;
4326
4327 s[strlen(s) - 1] = '\0'; /* lose terminating newline */
4328 end = s + strlen(s);
4329
4330 /* Look for file version first
4331 * (all input is ignored until this is read)
4332 */
4333 if (strncasecmp(s, "Version:", 8) == 0) {
4334 FIND_T; SKIP_T; loaded_cookie_file_version = atoi(p);
4335
4336 if (loaded_cookie_file_version <
4337 MIN_COOKIE_FILE_VERSION) {
4338 NSLOG(netsurf, INFO,
4339 "Unsupported Cookie file version");
4340 break;
4341 }
4342
4343 continue;
4344 } else if (loaded_cookie_file_version == 0) {
4345 /* Haven't yet seen version; skip this input */
4346 continue;
4347 }
4348
4349 /* One cookie/line */
4350
4351 /* Parse input */
4352 FIND_T; version = atoi(s);
4353 SKIP_T; domain = p; FIND_T;
4354 SKIP_T; domain_specified = atoi(p); FIND_T;
4355 SKIP_T; path = p; FIND_T;
4356 SKIP_T; path_specified = atoi(p); FIND_T;
4357 SKIP_T; secure = atoi(p); FIND_T;
4358 if (loaded_cookie_file_version > 101) {
4359 /* Introduced in version 1.02 */
4360 SKIP_T; http_only = atoi(p); FIND_T;
4361 } else {
4362 http_only = 0;
4363 }
4364 SKIP_T; expires = (time_t)atoi(p); FIND_T;
4365 SKIP_T; last_used = (time_t)atoi(p); FIND_T;
4366 SKIP_T; no_destroy = atoi(p); FIND_T;
4367 SKIP_T; name = p; FIND_T;
4368 SKIP_T; value = p; FIND_T;
4369 if (loaded_cookie_file_version > 100) {
4370 /* Introduced in version 1.01 */
4371 SKIP_T; value_quoted = atoi(p); FIND_T;
4372 } else {
4373 value_quoted = 0;
4374 }
4375 SKIP_T; scheme = p; FIND_T;
4376 SKIP_T; url = p; FIND_T;
4377
4378 /* Comment may have no content, so don't
4379 * use macros as they'll break */
4380 for (; *p && *p == '\t'; p++)
4381 ; /* do nothing */
4382 comment = p;
4383
4384 assert(p <= end);
4385
4386 /* Now create cookie */
4387 c = malloc(sizeof(struct cookie_internal_data));
4388 if (!c)
4389 break;
4390
4391 c->name = strdup(name);
4392 c->value = strdup(value);
4393 c->value_was_quoted = value_quoted;
4394 c->comment = strdup(comment);
4395 c->domain_from_set = domain_specified;
4396 c->domain = strdup(domain);
4397 c->path_from_set = path_specified;
4398 c->path = strdup(path);
4399 c->expires = expires;
4400 c->last_used = last_used;
4401 c->secure = secure;
4402 c->http_only = http_only;
4403 c->version = version;
4404 c->no_destroy = no_destroy;
4405
4406 if (!(c->name && c->value && c->comment &&
4407 c->domain && c->path)) {
4408 urldb_free_cookie(c);
4409 break;
4410 }
4411
4412 if (c->domain[0] != '.') {
4413 lwc_string *scheme_lwc = NULL;
4414 nsurl *url_nsurl = NULL;
4415
4416 assert(scheme[0] != 'u');
4417
4418 if (nsurl_create(url, &url_nsurl) != NSERROR_OK) {
4419 urldb_free_cookie(c);
4420 break;
4421 }
4422 scheme_lwc = nsurl_get_component(url_nsurl,
4423 NSURL_SCHEME);
4424
4425 /* And insert it into database */
4426 if (!urldb_insert_cookie(c, scheme_lwc, url_nsurl)) {
4427 /* Cookie freed for us */
4428 nsurl_unref(url_nsurl);
4429 lwc_string_unref(scheme_lwc);
4430 break;
4431 }
4432 nsurl_unref(url_nsurl);
4433 lwc_string_unref(scheme_lwc);
4434
4435 } else {
4436 if (!urldb_insert_cookie(c, NULL, NULL)) {
4437 /* Cookie freed for us */
4438 break;
4439 }
4440 }
4441 }
4442
4443 #undef SKIP_T
4444 #undef FIND_T
4445
4446 fclose(fp);
4447 }
4448
4449
4450 /* exported interface documented in content/urldb.h */
urldb_save_cookies(const char * filename)4451 void urldb_save_cookies(const char *filename)
4452 {
4453 FILE *fp;
4454 int cookie_file_version = max(loaded_cookie_file_version,
4455 COOKIE_FILE_VERSION);
4456
4457 assert(filename);
4458
4459 fp = fopen(filename, "w");
4460 if (!fp)
4461 return;
4462
4463 fprintf(fp, "# NetSurf cookies file.\n"
4464 "#\n"
4465 "# Lines starting with a '#' are comments, "
4466 "blank lines are ignored.\n"
4467 "#\n"
4468 "# All lines prior to \"Version:\t%d\" are discarded.\n"
4469 "#\n"
4470 "# Version\tDomain\tDomain from Set-Cookie\tPath\t"
4471 "Path from Set-Cookie\tSecure\tHTTP-Only\tExpires\tLast used\t"
4472 "No destroy\tName\tValue\tValue was quoted\tScheme\t"
4473 "URL\tComment\n",
4474 cookie_file_version);
4475 fprintf(fp, "Version:\t%d\n", cookie_file_version);
4476
4477 urldb_save_cookie_hosts(fp, &db_root);
4478
4479 fclose(fp);
4480 }
4481
4482
4483 /* exported interface documented in netsurf/url_db.h */
urldb_dump(void)4484 void urldb_dump(void)
4485 {
4486 int i;
4487
4488 urldb_dump_hosts(&db_root);
4489
4490 for (i = 0; i != NUM_SEARCH_TREES; i++) {
4491 urldb_dump_search(search_trees[i], 0);
4492 }
4493 }
4494
4495
4496
4497
4498