1 /*
2  * Copyright 2006 John M Bell <jmb202@ecs.soton.ac.uk>
3  * Copyright 2009 John Tytgat <joty@netsurf-browser.org>
4  *
5  * This file is part of NetSurf, http://www.netsurf-browser.org/
6  *
7  * NetSurf is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; version 2 of the License.
10  *
11  * NetSurf is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 /**
21  * \file
22  * Unified URL information database implementation
23  *
24  * URLs are stored in a tree-based structure as follows:
25  *
26  * The host component is extracted from each URL and, if a FQDN, split on
 * every '.'. The tree is constructed by inserting each FQDN segment in
28  * reverse order. Duplicate nodes are merged.
29  *
30  * If the host part of an URL is an IP address, then this is added to the
31  * tree verbatim (as if it were a TLD).
32  *
33  * This provides something looking like:
34  *
35  *			      root (a sentinel)
36  *				|
37  *	-------------------------------------------------
38  *	|	|	|	|	|	|	|
39  *     com     edu     gov  127.0.0.1  net     org     uk	TLDs
40  *	|	|	|		|	|	|
41  *    google   ...     ...             ...     ...     co	2LDs
42  *	|						|
43  *     www					       bbc  Hosts/Subdomains
44  *							|
45  *						       www	...
46  *
47  * Each of the nodes in this tree is a struct host_part. This stores the
48  * FQDN segment (or IP address) with which the node is concerned. Each node
49  * may contain further information about paths on a host (struct path_data)
50  * or SSL certificate processing on a host-wide basis
51  * (host_part::permit_invalid_certs).
52  *
53  * Path data is concerned with storing various metadata about the path in
54  * question. This includes global history data, HTTP authentication details
55  * and any associated HTTP cookies. This is stored as a tree of path segments
56  * hanging off the relevant host_part node.
57  *
58  * Therefore, to find the last visited time of the URL
59  * http://www.example.com/path/to/resource.html, the FQDN tree would be
60  * traversed in the order root -> "com" -> "example" -> "www". The "www"
61  * node would have attached to it a tree of struct path_data:
62  *
63  *			    (sentinel)
64  *				|
65  *			       path
66  *				|
67  *			       to
68  *				|
69  *			   resource.html
70  *
71  * This represents the absolute path "/path/to/resource.html". The leaf node
72  * "resource.html" contains the last visited time of the resource.
73  *
74  * The mechanism described above is, however, not particularly conducive to
75  * fast searching of the database for a given URL (or URLs beginning with a
 * given prefix). Therefore, an ancillary data structure is used to enable
77  * fast searching. This structure simply reflects the contents of the
78  * database, with entries being added/removed at the same time as for the
79  * core database. In order to ensure that degenerate cases are kept to a
80  * minimum, we use an AAtree. This is an approximation of a Red-Black tree
81  * with similar performance characteristics, but with a significantly
82  * simpler implementation. Entries in this tree comprise pointers to the
83  * leaf nodes of the host tree described above.
84  *
85  * REALLY IMPORTANT NOTE: urldb expects all URLs to be normalised. Use of
86  * non-normalised URLs with urldb will result in undefined behaviour and
87  * potential crashes.
88  */
89 
90 #include <assert.h>
91 #include <stdbool.h>
92 #include <stdio.h>
93 #include <stdlib.h>
94 #include <string.h>
95 #include <strings.h>
96 #include <time.h>
97 #ifdef WITH_NSPSL
98 #include <nspsl.h>
99 #endif
100 
101 #include "utils/inet.h"
102 #include "utils/nsoption.h"
103 #include "utils/log.h"
104 #include "utils/corestrings.h"
105 #include "utils/url.h"
106 #include "utils/utils.h"
107 #include "utils/bloom.h"
108 #include "utils/time.h"
109 #include "utils/nsurl.h"
110 #include "utils/ascii.h"
111 #include "utils/http.h"
112 #include "netsurf/bitmap.h"
113 #include "desktop/cookie_manager.h"
114 #include "desktop/gui_internal.h"
115 
116 #include "content/content.h"
117 #include "content/urldb.h"
118 
119 #ifdef WITH_AMISSL
120 /* AmiSSL needs everything to be using bsdsocket directly to avoid conflicts */
121 #include <proto/bsdsocket.h>
122 #endif
123 
/**
 * Internal representation of a single cookie.
 *
 * Cookies are chained into a doubly linked list through the prev and
 * next members.
 *
 * \warning This *must* be kept in sync with the public interface in
 *   netsurf/cookie_db.h
 */
struct cookie_internal_data {
	struct cookie_internal_data *prev;	/**< Previous in list */
	struct cookie_internal_data *next;	/**< Next in list */

	char *name;		/**< Cookie name */
	char *value;		/**< Cookie value */
	bool value_was_quoted;	/**< Value was quoted in Set-Cookie: */
	char *comment;		/**< Cookie comment */
	bool domain_from_set;	/**< Domain came from Set-Cookie: header */
	char *domain;		/**< Domain */
	bool path_from_set;	/**< Path came from Set-Cookie: header */
	char *path;		/**< Path */
	time_t expires;		/**< Expiry timestamp, or -1 for session */
	time_t last_used;	/**< Last used time */
	bool secure;		/**< Only send for HTTPS requests */
	bool http_only;		/**< Only expose to HTTP(S) requests */
	enum cookie_version version;	/**< Specification compliance */
	bool no_destroy;	/**< Never destroy this cookie,
				 * unless it's expired */

};
151 
152 
/**
 * A protection space
 *
 * A protection space is defined as the tuple (canonical_root_url, realm).
 * This structure lives as a linked list element in a leaf host_part
 * struct, so the scheme and port are stored here in addition to the
 * realm in order to form the canonical_root_url.
 */
struct prot_space_data {
	/**
	 * URL scheme of canonical hostname of this protection space.
	 */
	lwc_string *scheme;
	/**
	 * Port number of canonical hostname of this protection
	 * space. When 0, it means the default port for the given scheme,
	 * i.e. 80 (http), 443 (https).
	 */
	unsigned int port;
	/** Protection realm */
	char *realm;

	/**
	 * Authentication details for this protection space in the form
	 * username:password
	 */
	char *auth;
	/** Next sibling */
	struct prot_space_data *next;
};
182 
183 
/**
 * Metadata about a URL
 *
 * Pointers to this structure are cast to the public struct url_data
 * when handed to iteration callbacks, so the two layouts must agree.
 *
 * \warning must be kept in sync with url_data structure in netsurf/url_db.h
 */
struct url_internal_data {
	char *title;		/**< Resource title */
	unsigned int visits;	/**< Visit count */
	time_t last_visit;	/**< Last visit time */
	content_type type;	/**< Type of resource */
};
195 
196 
/**
 * Data entry for a URL
 *
 * One node in the per-host tree of path segments. Siblings are linked
 * through next/prev and the tree through parent/children/last.
 */
struct path_data {
	nsurl *url;		/**< Full URL */
	lwc_string *scheme;	/**< URL scheme for data */
	unsigned int port;	/**< Port number for data. When 0, it means
				 * the default port for given scheme, i.e.
				 * 80 (http), 443 (https). */
	char *segment;		/**< Path segment for this node */
	unsigned int frag_cnt;	/**< Number of entries in path_data::fragment */
	char **fragment;	/**< Array of fragments */
	bool persistent;	/**< This entry should persist */

	struct url_internal_data urld;	/**< URL data for resource */

	/**
	 * Protection space to which this resource belongs. Can be
	 * NULL when it does not belong to a protection space or when
	 * it is not known. Not owned by this node (ownership is with
	 * struct host_part::prot_space).
	 */
	const struct prot_space_data *prot_space;
	/** Cookies associated with resource */
	struct cookie_internal_data *cookies;
	/** Last cookie in list */
	struct cookie_internal_data *cookies_end;

	struct path_data *next;	/**< Next sibling */
	struct path_data *prev;	/**< Previous sibling */
	struct path_data *parent; /**< Parent path segment */
	struct path_data *children; /**< Child path segments */
	struct path_data *last; /**< Last child */
};
230 
/**
 * HSTS data recorded for a host (see host_part::hsts)
 */
struct hsts_data {
	time_t expires; /**< Expiry time */
	bool include_sub_domains; /**< Whether to include subdomains */
};
235 
/**
 * A node in the host tree
 *
 * Each node stores one segment of a host name (or a whole IP address)
 * together with the path tree and host-wide data attached to it.
 */
struct host_part {
	/**
	 * Known paths on this host. This _must_ be first so that
	 * struct host_part *h = (struct host_part *)mypath; works
	 */
	struct path_data paths;
	/**
	 * Allow access to SSL protected resources on this host
	 * without verifying certificate authenticity
	 */
	bool permit_invalid_certs;
	/* HSTS data */
	struct hsts_data hsts;

	/**
	 * Part of host string
	 */
	char *part;

	/**
	 * Linked list of all protection spaces known for this
	 * host and all its schemes and ports.
	 */
	struct prot_space_data *prot_space;

	struct host_part *next;	/**< Next sibling */
	struct host_part *prev;	/**< Previous sibling */
	struct host_part *parent; /**< Parent host part */
	struct host_part *children; /**< Child host parts */
};
266 
267 
/**
 * Search index node
 *
 * Node of the balanced search tree that indexes entries of the host
 * tree. The file-scope node "empty" acts as the leaf sentinel.
 */
struct search_node {
	const struct host_part *data;	/**< Host tree entry */

	unsigned int level;		/**< Node level */

	struct search_node *left;	/**< Left subtree */
	struct search_node *right;	/**< Right subtree */
};
279 
/** Root database handle (sentinel root of the host tree) */
static struct host_part db_root;

/**
 * Search trees - one per letter + 1 for IPs + 1 for Everything Else
 *
 * ST_IP and ST_EE index the IP address and "everything else" trees.
 * NOTE(review): ST_DN is presumably the index of the first of the 26
 * per-letter trees - confirm against the code that selects a tree.
 */
#define NUM_SEARCH_TREES 28
#define ST_IP 0
#define ST_EE 1
#define ST_DN 2
/**
 * Sentinel leaf node: its left and right links point back at itself.
 * Traversals stop when they reach &empty.
 */
static struct search_node empty = { 0, 0, &empty, &empty };
/** Roots of the search trees; every tree starts out as the sentinel */
static struct search_node *search_trees[NUM_SEARCH_TREES] = {
	&empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
	&empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
	&empty, &empty, &empty, &empty, &empty, &empty, &empty, &empty,
	&empty, &empty, &empty, &empty
};
295 
/** Minimum cookie database file version */
#define MIN_COOKIE_FILE_VERSION 100
/** Current cookie database file version */
#define COOKIE_FILE_VERSION 102
/**
 * Loaded cookie file version.
 * NOTE(review): presumably set by the cookie file loader - confirm.
 */
static int loaded_cookie_file_version;

/** Minimum URL database file version */
#define MIN_URL_FILE_VERSION 106
/** Current URL database file version */
#define URL_FILE_VERSION 107
307 
/**
 * Filter for URL presence in the database
 *
 * Bloom filter used for short-circuiting the false case of "is this
 * URL in the database?".  BLOOM_SIZE controls how large the filter is
 * in bytes.  Primitive experimentation shows that for a filter of X
 * bytes filled with X items, searching for X items not in the filter
 * has a 5% false-positive rate.  We set it to 32kB, which should be
 * enough for all but the largest databases, while not being
 * shockingly wasteful on memory.
 */
static struct bloom_filter *url_bloom;
/**
 * Size of the URL filter, in bytes
 */
#define BLOOM_SIZE (1024 * 32)
324 
325 
326 /**
327  * write a time_t to a file portably
328  *
329  * \param fp File to write to
330  * \param val the unix time value to output
331  * \return NSERROR_OK on success
332  */
urldb_write_timet(FILE * fp,time_t val)333 static nserror urldb_write_timet(FILE *fp, time_t val)
334 {
335 	int use;
336 	char op[32];
337 
338 	use = nsc_sntimet(op, 32, &val);
339 	if (use == 0) {
340 		fprintf(fp, "%i\n", (int)val);
341 	} else {
342 		fprintf(fp, "%.*s\n", use, op);
343 	}
344 	return NSERROR_OK;
345 }
346 
347 /**
348  * Write paths associated with a host
349  *
350  * \param parent Root of (sub)tree to write
351  * \param host Current host name
352  * \param fp File to write to
353  * \param path Current path string
354  * \param path_alloc Allocated size of path
355  * \param path_used Used size of path
356  * \param expiry Expiry time of URLs
357  */
358 static void
urldb_write_paths(const struct path_data * parent,const char * host,FILE * fp,char ** path,int * path_alloc,int * path_used,time_t expiry)359 urldb_write_paths(const struct path_data *parent,
360 		  const char *host,
361 		  FILE *fp,
362 		  char **path,
363 		  int *path_alloc,
364 		  int *path_used,
365 		  time_t expiry)
366 {
367 	const struct path_data *p = parent;
368 	int i;
369 
370 	do {
371 		int seglen = p->segment != NULL ? strlen(p->segment) : 0;
372 		int len = *path_used + seglen + 1;
373 
374 		if (*path_alloc < len) {
375 			char *temp;
376 			temp = realloc(*path,
377 				       (len > 64) ? len : *path_alloc + 64);
378 			if (!temp) {
379 				return;
380 			}
381 			*path = temp;
382 			*path_alloc = (len > 64) ? len : *path_alloc + 64;
383 		}
384 
385 		if (p->segment != NULL) {
386 			memcpy(*path + *path_used - 1, p->segment, seglen);
387 		}
388 
389 		if (p->children != NULL) {
390 			(*path)[*path_used + seglen - 1] = '/';
391 			(*path)[*path_used + seglen] = '\0';
392 		} else {
393 			(*path)[*path_used + seglen - 1] = '\0';
394 			len -= 1;
395 		}
396 
397 		*path_used = len;
398 
399 		if (p->children != NULL) {
400 			/* Drill down into children */
401 			p = p->children;
402 		} else {
403 			/* leaf node */
404 			if (p->persistent ||
405 			    ((p->urld.last_visit > expiry) &&
406 			     (p->urld.visits > 0))) {
407 				fprintf(fp, "%s\n", lwc_string_data(p->scheme));
408 
409 				if (p->port) {
410 					fprintf(fp,"%d\n", p->port);
411 				} else {
412 					fprintf(fp, "\n");
413 				}
414 
415 				fprintf(fp, "%s\n", *path);
416 
417 				/** \todo handle fragments? */
418 
419 				/* number of visits */
420 				fprintf(fp, "%i\n", p->urld.visits);
421 
422 				/* time entry was last used */
423 				urldb_write_timet(fp, p->urld.last_visit);
424 
425 				/* entry type */
426 				fprintf(fp, "%i\n", (int)p->urld.type);
427 
428 				fprintf(fp, "\n");
429 
430 				if (p->urld.title) {
431 					uint8_t *s = (uint8_t *) p->urld.title;
432 
433 					for (i = 0; s[i] != '\0'; i++)
434 						if (s[i] < 32)
435 							s[i] = ' ';
436 					for (--i; ((i > 0) && (s[i] == ' '));
437 					     i--)
438 						s[i] = '\0';
439 					fprintf(fp, "%s\n", p->urld.title);
440 				} else {
441 					fprintf(fp, "\n");
442 				}
443 			}
444 
445 			/* Now, find next node to process. */
446 			while (p != parent) {
447 				int seglen = p->segment != NULL
448 					? strlen(p->segment) : 0;
449 
450 				/* Remove our segment from the path */
451 				*path_used -= seglen;
452 				(*path)[*path_used - 1] = '\0';
453 
454 				if (p->next != NULL) {
455 					/* Have a sibling, process that */
456 					p = p->next;
457 					break;
458 				}
459 
460 				/* Going up, so remove '/' */
461 				*path_used -= 1;
462 				(*path)[*path_used - 1] = '\0';
463 
464 				/* Ascend tree */
465 				p = p->parent;
466 			}
467 		}
468 	} while (p != parent);
469 }
470 
471 
472 /**
473  * Count number of URLs associated with a host
474  *
475  * \param root Root of path data tree
476  * \param expiry Expiry time for URLs
477  * \param count Pointer to count
478  */
479 static void
urldb_count_urls(const struct path_data * root,time_t expiry,unsigned int * count)480 urldb_count_urls(const struct path_data *root,
481 		 time_t expiry,
482 		 unsigned int *count)
483 {
484 	const struct path_data *p = root;
485 
486 	do {
487 		if (p->children != NULL) {
488 			/* Drill down into children */
489 			p = p->children;
490 		} else {
491 			/* No more children, increment count if required */
492 			if (p->persistent ||
493 			    ((p->urld.last_visit > expiry) &&
494 			     (p->urld.visits > 0))) {
495 				(*count)++;
496 			}
497 
498 			/* Now, find next node to process. */
499 			while (p != root) {
500 				if (p->next != NULL) {
501 					/* Have a sibling, process that */
502 					p = p->next;
503 					break;
504 				}
505 
506 				/* Ascend tree */
507 				p = p->parent;
508 			}
509 		}
510 	} while (p != root);
511 }
512 
513 
514 /**
515  * Save a search (sub)tree
516  *
517  * \param parent root node of search tree to save.
518  * \param fp File to write to
519  */
urldb_save_search_tree(struct search_node * parent,FILE * fp)520 static void urldb_save_search_tree(struct search_node *parent, FILE *fp)
521 {
522 	char host[256];
523 	const struct host_part *h;
524 	unsigned int path_count = 0;
525 	char *path, *p, *end;
526 	int path_alloc = 64, path_used = 1;
527 	time_t expiry, hsts_expiry = 0;
528 	int hsts_include_subdomains = 0;
529 
530 	expiry = time(NULL) - ((60 * 60 * 24) * nsoption_int(expire_url));
531 
532 	if (parent == &empty)
533 		return;
534 
535 	urldb_save_search_tree(parent->left, fp);
536 
537 	path = malloc(path_alloc);
538 	if (!path)
539 		return;
540 
541 	path[0] = '\0';
542 
543 	for (h = parent->data, p = host, end = host + sizeof host;
544 	     h && h != &db_root && p < end; h = h->parent) {
545 		int written = snprintf(p, end - p, "%s%s", h->part,
546 				       (h->parent && h->parent->parent) ? "." : "");
547 		if (written < 0) {
548 			free(path);
549 			return;
550 		}
551 		p += written;
552 	}
553 
554 	h = parent->data;
555 	if (h && h->hsts.expires > expiry) {
556 		hsts_expiry = h->hsts.expires;
557 		hsts_include_subdomains = h->hsts.include_sub_domains;
558 	}
559 
560 	urldb_count_urls(&parent->data->paths, expiry, &path_count);
561 
562 	if (path_count > 0) {
563 		fprintf(fp, "%s %i ", host, hsts_include_subdomains);
564 		urldb_write_timet(fp, hsts_expiry);
565 		fprintf(fp, "%i\n", path_count);
566 
567 		urldb_write_paths(&parent->data->paths, host, fp,
568 				  &path, &path_alloc, &path_used, expiry);
569 	} else if (hsts_expiry) {
570 		fprintf(fp, "%s %i ", host, hsts_include_subdomains);
571 		urldb_write_timet(fp, hsts_expiry);
572 		fprintf(fp, "0\n");
573 	}
574 
575 	free(path);
576 
577 	urldb_save_search_tree(parent->right, fp);
578 }
579 
580 
581 /**
582  * Path data iterator (internal)
583  *
584  * \param parent Root of subtree to iterate over
585  * \param url_callback Callback function
586  * \param cookie_callback Callback function
587  * \return true to continue, false otherwise
588  */
589 static bool
urldb_iterate_entries_path(const struct path_data * parent,bool (* url_callback)(nsurl * url,const struct url_data * data),bool (* cookie_callback)(const struct cookie_data * data))590 urldb_iterate_entries_path(const struct path_data *parent,
591 		bool (*url_callback)(nsurl *url, const struct url_data *data),
592 		bool (*cookie_callback)(const struct cookie_data *data))
593 {
594 	const struct path_data *p = parent;
595 	const struct cookie_data *c;
596 
597 	do {
598 		if (p->children != NULL) {
599 			/* Drill down into children */
600 			p = p->children;
601 		} else {
602 			/* All leaf nodes in the path tree should have an URL or
603 			 * cookies attached to them. If this is not the case, it
604 			 * indicates that there's a bug in the file loader/URL
605 			 * insertion code. Therefore, assert this here. */
606 			assert(url_callback || cookie_callback);
607 
608 			/** \todo handle fragments? */
609 			if (url_callback) {
610 				const struct url_internal_data *u = &p->urld;
611 
612 				assert(p->url);
613 
614 				if (!url_callback(p->url,
615 						  (const struct url_data *) u))
616 					return false;
617 			} else {
618 				c = (const struct cookie_data *)p->cookies;
619 				for (; c != NULL; c = c->next) {
620 					if (!cookie_callback(c))
621 						return false;
622 				}
623 			}
624 
625 			/* Now, find next node to process. */
626 			while (p != parent) {
627 				if (p->next != NULL) {
628 					/* Have a sibling, process that */
629 					p = p->next;
630 					break;
631 				}
632 
633 				/* Ascend tree */
634 				p = p->parent;
635 			}
636 		}
637 	} while (p != parent);
638 
639 	return true;
640 }
641 
642 
643 /**
644  * Check whether a host string is an IP address.
645  *
646  * This call detects IPv4 addresses (all of dotted-quad or subsets,
647  * decimal or hexadecimal notations) and IPv6 addresses (including
648  * those containing embedded IPv4 addresses.)
649  *
650  * \param host a hostname terminated by '\0'
651  * \return true if the hostname is an IP address, false otherwise
652  */
urldb__host_is_ip_address(const char * host)653 static bool urldb__host_is_ip_address(const char *host)
654 {
655 	struct in_addr ipv4;
656 	size_t host_len = strlen(host);
657 	const char *sane_host;
658 	const char *slash;
659 #ifndef NO_IPV6
660 	struct in6_addr ipv6;
661 	char ipv6_addr[64];
662 	unsigned int ipv6_addr_len;
663 #endif
664 	/**
665 	 * @todo FIXME Some parts of urldb.c make confusions between hosts
666 	 * and "prefixes", we can sometimes be erroneously passed more than
667 	 * just a host.  Sometimes we may be passed trailing slashes, or even
668 	 * whole path segments.  A specific criminal in this class is
669 	 * urldb_iterate_partial, which takes a prefix to search for, but
670 	 * passes that prefix to functions that expect only hosts.
671 	 *
672 	 * For the time being, we will accept such calls; we check if there
673 	 * is a / in the host parameter, and if there is, we take a copy and
674 	 * replace the / with a \0.  This is not a permanent solution; we
675 	 * should search through NetSurf and find all the callers that are
676 	 * in error and fix them.  When doing this task, it might be wise
677 	 * to replace the hideousness below with code that doesn't have to do
678 	 * this, and add assert(strchr(host, '/') == NULL); somewhere.
679 	 * -- rjek - 2010-11-04
680 	 */
681 
682 	slash = strchr(host, '/');
683 	if (slash == NULL) {
684 		sane_host = host;
685 	} else {
686 		char *c = strdup(host);
687 		c[slash - host] = '\0';
688 		sane_host = c;
689 		host_len = slash - host;
690 		NSLOG(netsurf, INFO, "WARNING: called with non-host '%s'",
691 		      host);
692 	}
693 
694 	if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len)
695 		goto out_false;
696 
697 	if (inet_aton(sane_host, &ipv4) != 0) {
698 		/* This can only be a sane IPv4 address if it contains 3 dots.
699 		 * Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c",
700 		 * and "a.b.c.d" as valid IPv4 address strings where we only
701 		 * support the full, dotted-quad, form.
702 		 */
703 		int num_dots = 0;
704 		size_t index;
705 
706 		for (index = 0; index < host_len; index++) {
707 			if (sane_host[index] == '.')
708 				num_dots++;
709 		}
710 
711 		if (num_dots == 3)
712 			goto out_true;
713 		else
714 			goto out_false;
715 	}
716 
717 #ifndef NO_IPV6
718 	if ((host_len < 6) ||
719 	    (sane_host[0] != '[') ||
720 	    (sane_host[host_len - 1] != ']')) {
721 		goto out_false;
722 	}
723 
724 	ipv6_addr_len = host_len - 2;
725 	if (ipv6_addr_len >= sizeof(ipv6_addr)) {
726 		ipv6_addr_len = sizeof(ipv6_addr) - 1;
727 	}
728 	strncpy(ipv6_addr, sane_host + 1, ipv6_addr_len);
729 	ipv6_addr[ipv6_addr_len] = '\0';
730 
731 	if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1)
732 		goto out_true;
733 #endif
734 
735 out_false:
736 	if (slash != NULL) free((void *)sane_host);
737 	return false;
738 
739 out_true:
740 	if (slash != NULL) free((void *)sane_host);
741 	return true;
742 }
743 
744 
745 /**
746  * Compare host_part with prefix
747  *
748  * \param a host part
749  * \param b prefix
750  * \return 0 if match, non-zero, otherwise
751  */
urldb_search_match_prefix(const struct host_part * a,const char * b)752 static int urldb_search_match_prefix(const struct host_part *a, const char *b)
753 {
754 	const char *end, *dot;
755 	int plen, ret;
756 
757 	assert(a && a != &db_root && b);
758 
759 	if (urldb__host_is_ip_address(b)) {
760 		/* IP address */
761 		return strncasecmp(a->part, b, strlen(b));
762 	}
763 
764 	end = b + strlen(b) + 1;
765 
766 	while (b < end && a && a != &db_root) {
767 		dot = strchr(b, '.');
768 		if (!dot) {
769 			/* last segment */
770 			dot = end - 1;
771 		}
772 
773 		/* Compare strings (length limited) */
774 		if ((ret = strncasecmp(a->part, b, dot - b)) != 0)
775 			/* didn't match => return difference */
776 			return ret;
777 
778 		/* The strings matched */
779 		if (dot < end - 1) {
780 			/* Consider segment lengths only in the case
781 			 * where the prefix contains segments */
782 			plen = strlen(a->part);
783 			if (plen > dot - b) {
784 				/* len(a) > len(b) */
785 				return 1;
786 			} else if (plen < dot - b) {
787 				/* len(a) < len(b) */
788 				return -1;
789 			}
790 		}
791 
792 		b = dot + 1;
793 		a = a->parent;
794 	}
795 
796 	/* If we get here then either:
797 	 *    a) The path lengths differ
798 	 * or b) The hosts are identical
799 	 */
800 	if (a && a != &db_root && b >= end) {
801 		/* len(a) > len(b) => prefix matches */
802 		return 0;
803 	} else if ((!a || a == &db_root) && b < end) {
804 		/* len(a) < len(b) => prefix does not match */
805 		return -1;
806 	}
807 
808 	/* Identical */
809 	return 0;
810 }
811 
812 
813 /**
814  * Partial host iterator (internal)
815  *
816  * \param root Root of (sub)tree to traverse
817  * \param prefix Prefix to match
818  * \param callback Callback function
819  * \return true to continue, false otherwise
820  */
821 static bool
urldb_iterate_partial_host(struct search_node * root,const char * prefix,bool (* callback)(nsurl * url,const struct url_data * data))822 urldb_iterate_partial_host(struct search_node *root,
823 		const char *prefix,
824 		bool (*callback)(nsurl *url, const struct url_data *data))
825 {
826 	int c;
827 
828 	assert(root && prefix && callback);
829 
830 	if (root == &empty)
831 		return true;
832 
833 	c = urldb_search_match_prefix(root->data, prefix);
834 
835 	if (c > 0) {
836 		/* No match => look in left subtree */
837 		return urldb_iterate_partial_host(root->left,
838 						  prefix,
839 						  callback);
840 	} else if (c < 0) {
841 		/* No match => look in right subtree */
842 		return urldb_iterate_partial_host(root->right,
843 						  prefix,
844 						  callback);
845 	} else {
846 		/* Match => iterate over l/r subtrees & process this node */
847 		if (!urldb_iterate_partial_host(root->left,
848 						prefix,
849 						callback)) {
850 			return false;
851 		}
852 
853 		if (root->data->paths.children) {
854 			/* and extract all paths attached to this host */
855 			if (!urldb_iterate_entries_path(&root->data->paths,
856 							callback,
857 							NULL)) {
858 				return false;
859 			}
860 		}
861 
862 		if (!urldb_iterate_partial_host(root->right,
863 						prefix,
864 						callback)) {
865 			return false;
866 		}
867 	}
868 
869 	return true;
870 }
871 
872 
873 /**
874  * Partial path iterator (internal)
875  *
876  * Given: http://www.example.org/a/b/c/d//e
877  * and assuming a path tree:
878  *     ^
879  *    /	\
880  *   a1 b1
881  *  / \
882  * a2 b2
883  *    /|\
884  *   a b c
885  *   3 3 |
886  *       d
887  *       |
888  *       e
889  *      / \
890  *      f g
891  *
892  * Prefix will be:	p will be:
893  *
894  * a/b/c/d//e		a1
895  *   b/c/d//e		a2
896  *   b/c/d//e		b3
897  *     c/d//e		a3
898  *     c/d//e		b3
899  *     c/d//e		c
900  *       d//e		d
901  *         /e		e		(skip /)
902  *          e		e
903  *
904  * I.E. perform a breadth-first search of the tree.
905  *
906  * \param parent Root of (sub)tree to traverse
907  * \param prefix Prefix to match
908  * \param callback Callback function
909  * \return true to continue, false otherwise
910  */
911 static bool
urldb_iterate_partial_path(const struct path_data * parent,const char * prefix,bool (* callback)(nsurl * url,const struct url_data * data))912 urldb_iterate_partial_path(const struct path_data *parent,
913 		const char *prefix,
914 		bool (*callback)(nsurl *url, const struct url_data *data))
915 {
916 	const struct path_data *p = parent->children;
917 	const char *slash, *end = prefix + strlen(prefix);
918 
919 	do {
920 		slash = strchr(prefix, '/');
921 		if (!slash) {
922 			slash = end;
923 		}
924 
925 		if (slash == prefix && *prefix == '/') {
926 			/* Ignore "//" */
927 			prefix++;
928 			continue;
929 		}
930 
931 		if (strncasecmp(p->segment, prefix, slash - prefix) == 0) {
932 			/* prefix matches so far */
933 			if (slash == end) {
934 				/* we've run out of prefix, so all
935 				 * paths below this one match */
936 				if (!urldb_iterate_entries_path(p,
937 								callback,
938 								NULL)) {
939 					return false;
940 				}
941 
942 				/* Progress to next sibling */
943 				p = p->next;
944 			} else {
945 				/* Skip over this segment */
946 				prefix = slash + 1;
947 
948 				p = p->children;
949 			}
950 		} else {
951 			/* Doesn't match this segment, try next sibling */
952 			p = p->next;
953 		}
954 	} while (p != NULL);
955 
956 	return true;
957 }
958 
959 
960 /**
961  * Host data iterator (internal)
962  *
963  * \param parent Root of subtree to iterate over
964  * \param url_callback Callback function
965  * \param cookie_callback Callback function
966  * \return true to continue, false otherwise
967  */
968 static bool
urldb_iterate_entries_host(struct search_node * parent,bool (* url_callback)(nsurl * url,const struct url_data * data),bool (* cookie_callback)(const struct cookie_data * data))969 urldb_iterate_entries_host(struct search_node *parent,
970 		bool (*url_callback)(nsurl *url, const struct url_data *data),
971 		bool (*cookie_callback)(const struct cookie_data *data))
972 {
973 	if (parent == &empty) {
974 		return true;
975 	}
976 
977 	if (!urldb_iterate_entries_host(parent->left,
978 					url_callback,
979 					cookie_callback)) {
980 		return false;
981 	}
982 
983 	if ((parent->data->paths.children) ||
984 	    ((cookie_callback) &&
985 	     (parent->data->paths.cookies))) {
986 		/* We have paths (or domain cookies), so iterate them */
987 		if (!urldb_iterate_entries_path(&parent->data->paths,
988 						url_callback,
989 						cookie_callback)) {
990 			return false;
991 		}
992 	}
993 
994 	if (!urldb_iterate_entries_host(parent->right,
995 					url_callback,
996 					cookie_callback)) {
997 		return false;
998 	}
999 
1000 	return true;
1001 }
1002 
1003 
1004 /**
1005  * Add a host node to the tree
1006  *
1007  * \param part Host segment to add (or whole IP address) (copied)
1008  * \param parent Parent node to add to
1009  * \return Pointer to added node, or NULL on memory exhaustion
1010  */
1011 static struct host_part *
urldb_add_host_node(const char * part,struct host_part * parent)1012 urldb_add_host_node(const char *part, struct host_part *parent)
1013 {
1014 	struct host_part *d;
1015 
1016 	assert(part && parent);
1017 
1018 	d = calloc(1, sizeof(struct host_part));
1019 	if (!d) {
1020 		return NULL;
1021 	}
1022 
1023 	d->part = strdup(part);
1024 	if (!d->part) {
1025 		free(d);
1026 		return NULL;
1027 	}
1028 
1029 	d->next = parent->children;
1030 	if (parent->children) {
1031 		parent->children->prev = d;
1032 	}
1033 	d->parent = parent;
1034 	parent->children = d;
1035 
1036 	return d;
1037 }
1038 
1039 
/**
 * Fragment comparator callback for qsort
 *
 * Both arguments point at elements of an array of C strings; the strings
 * are compared without regard to case.
 *
 * \param a first value
 * \param b second value
 * \return 0 for equal else positive or negative value on comparison
 */
static int urldb_add_path_fragment_cmp(const void *a, const void *b)
{
	const char *const *lhs = a;
	const char *const *rhs = b;

	return strcasecmp(*lhs, *rhs);
}
1051 
1052 
1053 /**
1054  * Add a fragment to a path segment
1055  *
1056  * \param segment Path segment to add to
1057  * \param fragment Fragment to add (copied), or NULL
1058  * \return segment or NULL on memory exhaustion
1059  */
1060 static struct path_data *
urldb_add_path_fragment(struct path_data * segment,lwc_string * fragment)1061 urldb_add_path_fragment(struct path_data *segment, lwc_string *fragment)
1062 {
1063 	char **temp;
1064 
1065 	assert(segment);
1066 
1067 	/* If no fragment, this function is a NOP
1068 	 * This may seem strange, but it makes the rest
1069 	 * of the code cleaner */
1070 	if (!fragment)
1071 		return segment;
1072 
1073 	temp = realloc(segment->fragment,
1074 		       (segment->frag_cnt + 1) * sizeof(char *));
1075 	if (!temp)
1076 		return NULL;
1077 
1078 	segment->fragment = temp;
1079 	segment->fragment[segment->frag_cnt] =
1080 		strdup(lwc_string_data(fragment));
1081 	if (!segment->fragment[segment->frag_cnt]) {
1082 		/* Don't free temp - it's now our buffer */
1083 		return NULL;
1084 	}
1085 
1086 	segment->frag_cnt++;
1087 
1088 	/* We want fragments in alphabetical order, so sort them
1089 	 * It may prove better to insert in alphabetical order instead */
1090 	qsort(segment->fragment,
1091 	      segment->frag_cnt,
1092 	      sizeof (char *),
1093 	      urldb_add_path_fragment_cmp);
1094 
1095 	return segment;
1096 }
1097 
1098 
1099 /**
1100  * Add a path node to the tree
1101  *
1102  * \param scheme URL scheme associated with path (copied)
1103  * \param port Port number on host associated with path
1104  * \param segment Path segment to add (copied)
1105  * \param fragment URL fragment (copied), or NULL
1106  * \param parent Parent node to add to
1107  * \return Pointer to added node, or NULL on memory exhaustion
1108  */
1109 static struct path_data *
urldb_add_path_node(lwc_string * scheme,unsigned int port,const char * segment,lwc_string * fragment,struct path_data * parent)1110 urldb_add_path_node(lwc_string *scheme,
1111 		    unsigned int port,
1112 		    const char *segment,
1113 		    lwc_string *fragment,
1114 		    struct path_data *parent)
1115 {
1116 	struct path_data *d, *e;
1117 
1118 	assert(scheme && segment && parent);
1119 
1120 	d = calloc(1, sizeof(struct path_data));
1121 	if (!d)
1122 		return NULL;
1123 
1124 	d->scheme = lwc_string_ref(scheme);
1125 
1126 	d->port = port;
1127 
1128 	d->segment = strdup(segment);
1129 	if (!d->segment) {
1130 		lwc_string_unref(d->scheme);
1131 		free(d);
1132 		return NULL;
1133 	}
1134 
1135 	if (fragment) {
1136 		if (!urldb_add_path_fragment(d, fragment)) {
1137 			free(d->segment);
1138 			lwc_string_unref(d->scheme);
1139 			free(d);
1140 			return NULL;
1141 		}
1142 	}
1143 
1144 	for (e = parent->children; e; e = e->next) {
1145 		if (strcmp(e->segment, d->segment) > 0)
1146 			break;
1147 	}
1148 
1149 	if (e) {
1150 		d->prev = e->prev;
1151 		d->next = e;
1152 		if (e->prev)
1153 			e->prev->next = d;
1154 		else
1155 			parent->children = d;
1156 		e->prev = d;
1157 	} else if (!parent->children) {
1158 		d->prev = d->next = NULL;
1159 		parent->children = parent->last = d;
1160 	} else {
1161 		d->next = NULL;
1162 		d->prev = parent->last;
1163 		parent->last->next = d;
1164 		parent->last = d;
1165 	}
1166 	d->parent = parent;
1167 
1168 	return d;
1169 }
1170 
1171 
1172 /**
1173  * Get the search tree for a particular host
1174  *
1175  * \param host the host to lookup
1176  * \return the corresponding search tree
1177  */
urldb_get_search_tree_direct(const char * host)1178 static struct search_node **urldb_get_search_tree_direct(const char *host)
1179 {
1180 	assert(host);
1181 
1182 	if (urldb__host_is_ip_address(host)) {
1183 		return &search_trees[ST_IP];
1184 	} else if (ascii_is_alpha(*host)) {
1185 		return &search_trees[ST_DN + ascii_to_lower(*host) - 'a'];
1186 	}
1187 	return &search_trees[ST_EE];
1188 }
1189 
1190 
/**
 * Get the root of the search tree used for a particular host
 *
 * \param host the host to look up
 * \return the root currently stored in the host's search tree slot
 */
static struct search_node *urldb_get_search_tree(const char *host)
{
	struct search_node **slot = urldb_get_search_tree_direct(host);

	return *slot;
}
1201 
1202 
1203 /**
1204  * Compare host part with a string
1205  *
1206  * \param a host part
1207  * \param b string to compare
1208  * \return 0 if match, non-zero, otherwise
1209  */
urldb_search_match_string(const struct host_part * a,const char * b)1210 static int urldb_search_match_string(const struct host_part *a, const char *b)
1211 {
1212 	const char *end, *dot;
1213 	int plen, ret;
1214 
1215 	assert(a && a != &db_root && b);
1216 
1217 	if (urldb__host_is_ip_address(b)) {
1218 		/* IP address */
1219 		return strcasecmp(a->part, b);
1220 	}
1221 
1222 	end = b + strlen(b) + 1;
1223 
1224 	while (b < end && a && a != &db_root) {
1225 		dot = strchr(b, '.');
1226 		if (!dot) {
1227 			/* last segment */
1228 			dot = end - 1;
1229 		}
1230 
1231 		/* Compare strings (length limited) */
1232 		if ((ret = strncasecmp(a->part, b, dot - b)) != 0)
1233 			/* didn't match => return difference */
1234 			return ret;
1235 
1236 		/* The strings matched, now check that the lengths do, too */
1237 		plen = strlen(a->part);
1238 
1239 		if (plen > dot - b) {
1240 			/* len(a) > len(b) */
1241 			return 1;
1242 		} else if (plen < dot - b) {
1243 			/* len(a) < len(b) */
1244 			return -1;
1245 		}
1246 
1247 		b = dot + 1;
1248 		a = a->parent;
1249 	}
1250 
1251 	/* If we get here then either:
1252 	 *    a) The path lengths differ
1253 	 * or b) The hosts are identical
1254 	 */
1255 	if (a && a != &db_root && b >= end) {
1256 		/* len(a) > len(b) */
1257 		return 1;
1258 	} else if ((!a || a == &db_root) && b < end) {
1259 		/* len(a) < len(b) */
1260 		return -1;
1261 	}
1262 
1263 	/* Identical */
1264 	return 0;
1265 }
1266 
1267 
1268 /**
1269  * Find a node in a search tree
1270  *
1271  * \param root Tree to look in
1272  * \param host Host to find
1273  * \return Pointer to host tree node, or NULL if not found
1274  */
1275 static const struct host_part *
urldb_search_find(struct search_node * root,const char * host)1276 urldb_search_find(struct search_node *root, const char *host)
1277 {
1278 	int c;
1279 
1280 	assert(root && host);
1281 
1282 	if (root == &empty) {
1283 		return NULL;
1284 	}
1285 
1286 	c = urldb_search_match_string(root->data, host);
1287 
1288 	if (c > 0) {
1289 		return urldb_search_find(root->left, host);
1290 	} else if (c < 0) {
1291 		return urldb_search_find(root->right, host);
1292 	}
1293 
1294 	return root->data;
1295 }
1296 
1297 
1298 /**
1299  * Match a path string
1300  *
1301  * \param parent Path (sub)tree to look in
1302  * \param path The path to search for
1303  * \param scheme The URL scheme associated with the path
1304  * \param port The port associated with the path
1305  * \return Pointer to path data or NULL if not found.
1306  */
1307 static struct path_data *
urldb_match_path(const struct path_data * parent,const char * path,lwc_string * scheme,unsigned short port)1308 urldb_match_path(const struct path_data *parent,
1309 		 const char *path,
1310 		 lwc_string *scheme,
1311 		 unsigned short port)
1312 {
1313 	const struct path_data *p;
1314 	const char *slash;
1315 	bool match;
1316 
1317 	assert(parent != NULL);
1318 	assert(parent->segment == NULL);
1319 
1320 	if (path[0] != '/') {
1321 		NSLOG(netsurf, INFO, "path is %s", path);
1322 	}
1323 
1324 	assert(path[0] == '/');
1325 
1326 	/* Start with children, as parent has no segment */
1327 	p = parent->children;
1328 
1329 	while (p != NULL) {
1330 		slash = strchr(path + 1, '/');
1331 		if (!slash) {
1332 			slash = path + strlen(path);
1333 		}
1334 
1335 		if (strncmp(p->segment, path + 1, slash - path - 1) == 0 &&
1336 		    lwc_string_isequal(p->scheme, scheme, &match) == lwc_error_ok &&
1337 		    match == true &&
1338 		    p->port == port) {
1339 			if (*slash == '\0') {
1340 				/* Complete match */
1341 				return (struct path_data *) p;
1342 			}
1343 
1344 			/* Match so far, go down tree */
1345 			p = p->children;
1346 
1347 			path = slash;
1348 		} else {
1349 			/* No match, try next sibling */
1350 			p = p->next;
1351 		}
1352 	}
1353 
1354 	return NULL;
1355 }
1356 
1357 
1358 /**
1359  * Find an URL in the database
1360  *
1361  * \param url Absolute URL to find
1362  * \return Pointer to path data, or NULL if not found
1363  */
urldb_find_url(nsurl * url)1364 static struct path_data *urldb_find_url(nsurl *url)
1365 {
1366 	const struct host_part *h;
1367 	struct path_data *p;
1368 	struct search_node *tree;
1369 	char *plq;
1370 	const char *host_str;
1371 	lwc_string *scheme, *host, *port;
1372 	size_t len = 0;
1373 	unsigned int port_int;
1374 	bool match;
1375 
1376 	assert(url);
1377 
1378 	if (url_bloom != NULL) {
1379 		if (bloom_search_hash(url_bloom, nsurl_hash(url)) == false) {
1380 			return NULL;
1381 		}
1382 	}
1383 
1384 	scheme = nsurl_get_component(url, NSURL_SCHEME);
1385 	if (scheme == NULL)
1386 		return NULL;
1387 
1388 	if (lwc_string_isequal(scheme, corestring_lwc_mailto, &match) ==
1389 	    lwc_error_ok && match == true) {
1390 		lwc_string_unref(scheme);
1391 		return NULL;
1392 	}
1393 
1394 	host = nsurl_get_component(url, NSURL_HOST);
1395 	if (host != NULL) {
1396 		host_str = lwc_string_data(host);
1397 		lwc_string_unref(host);
1398 
1399 	} else if (lwc_string_isequal(scheme, corestring_lwc_file, &match) ==
1400 		   lwc_error_ok && match == true) {
1401 		host_str = "localhost";
1402 
1403 	} else {
1404 		lwc_string_unref(scheme);
1405 		return NULL;
1406 	}
1407 
1408 	tree = urldb_get_search_tree(host_str);
1409 	h = urldb_search_find(tree, host_str);
1410 	if (!h) {
1411 		lwc_string_unref(scheme);
1412 		return NULL;
1413 	}
1414 
1415 	/* generate plq (path, leaf, query) */
1416 	if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &plq, &len) != NSERROR_OK) {
1417 		lwc_string_unref(scheme);
1418 		return NULL;
1419 	}
1420 
1421 	/* Get port */
1422 	port = nsurl_get_component(url, NSURL_PORT);
1423 	if (port != NULL) {
1424 		port_int = atoi(lwc_string_data(port));
1425 		lwc_string_unref(port);
1426 	} else {
1427 		port_int = 0;
1428 	}
1429 
1430 	p = urldb_match_path(&h->paths, plq, scheme, port_int);
1431 
1432 	free(plq);
1433 	lwc_string_unref(scheme);
1434 
1435 	return p;
1436 }
1437 
1438 
1439 /**
1440  * Dump URL database paths to stderr
1441  *
1442  * \param parent Parent node of tree to dump
1443  */
urldb_dump_paths(struct path_data * parent)1444 static void urldb_dump_paths(struct path_data *parent)
1445 {
1446 	const struct path_data *p = parent;
1447 	unsigned int i;
1448 
1449 	do {
1450 		if (p->segment != NULL) {
1451 			NSLOG(netsurf, INFO, "\t%s : %u",
1452 			      lwc_string_data(p->scheme), p->port);
1453 
1454 			NSLOG(netsurf, INFO, "\t\t'%s'", p->segment);
1455 
1456 			for (i = 0; i != p->frag_cnt; i++) {
1457 				NSLOG(netsurf, INFO, "\t\t\t#%s",
1458 				      p->fragment[i]);
1459 			}
1460 		}
1461 
1462 		if (p->children != NULL) {
1463 			p = p->children;
1464 		} else {
1465 			while (p != parent) {
1466 				if (p->next != NULL) {
1467 					p = p->next;
1468 					break;
1469 				}
1470 
1471 				p = p->parent;
1472 			}
1473 		}
1474 	} while (p != parent);
1475 }
1476 
1477 
1478 /**
1479  * Dump URL database hosts to stderr
1480  *
1481  * \param parent Parent node of tree to dump
1482  */
urldb_dump_hosts(struct host_part * parent)1483 static void urldb_dump_hosts(struct host_part *parent)
1484 {
1485 	struct host_part *h;
1486 
1487 	if (parent->part) {
1488 		NSLOG(netsurf, INFO, "%s", parent->part);
1489 
1490 		NSLOG(netsurf, INFO, "\t%s invalid SSL certs",
1491 		      parent->permit_invalid_certs ? "Permits" : "Denies");
1492 	}
1493 
1494 	/* Dump path data */
1495 	urldb_dump_paths(&parent->paths);
1496 
1497 	/* and recurse */
1498 	for (h = parent->children; h; h = h->next) {
1499 		urldb_dump_hosts(h);
1500 	}
1501 }
1502 
1503 
1504 /**
1505  * Dump search tree
1506  *
1507  * \param parent Parent node of tree to dump
1508  * \param depth Tree depth
1509  */
urldb_dump_search(struct search_node * parent,int depth)1510 static void urldb_dump_search(struct search_node *parent, int depth)
1511 {
1512 	const struct host_part *h;
1513 	int i; /* index into string */
1514 	char s[1024];
1515 	int r;
1516 	int sl = sizeof(s) - 2;
1517 
1518 	if (parent == &empty)
1519 		return;
1520 
1521 	urldb_dump_search(parent->left, depth + 1);
1522 
1523 	for (i = 0; i != depth; i++) {
1524 		s[i] = ' ';
1525 	}
1526 
1527 	for (h = parent->data; h; h = h->parent) {
1528 		if (h->part) {
1529 			r = snprintf(&s[i], sl - i, "%s", h->part);
1530 			if ((i + r) > sl) {
1531 				break;
1532 			}
1533 			i += r;
1534 		}
1535 
1536 		if (h->parent && h->parent->parent) {
1537 			s[i]='.';
1538 			i++;
1539 		}
1540 	}
1541 	s[i]= 0;
1542 
1543 	NSLOG(netsurf, INFO, "%s", s);
1544 
1545 	urldb_dump_search(parent->right, depth + 1);
1546 }
1547 
1548 
1549 /**
1550  * Compare a pair of host parts
1551  *
1552  * \param a first host part
1553  * \param b second host part
1554  * \return 0 if match, non-zero, otherwise
1555  */
1556 static int
urldb_search_match_host(const struct host_part * a,const struct host_part * b)1557 urldb_search_match_host(const struct host_part *a, const struct host_part *b)
1558 {
1559 	int ret;
1560 
1561 	assert(a && b);
1562 
1563 	/* traverse up tree to root, comparing parts as we go. */
1564 	for (; a && a != &db_root && b && b != &db_root;
1565 	     a = a->parent, b = b->parent) {
1566 		if ((ret = strcasecmp(a->part, b->part)) != 0) {
1567 			/* They differ => return the difference here */
1568 			return ret;
1569 		}
1570 	}
1571 
1572 	/* If we get here then either:
1573 	 *    a) The path lengths differ
1574 	 * or b) The hosts are identical
1575 	 */
1576 	if (a && a != &db_root && (!b || b == &db_root)) {
1577 		/* len(a) > len(b) */
1578 		return 1;
1579 	} else if ((!a || a == &db_root) && b && b != &db_root) {
1580 		/* len(a) < len(b) */
1581 		return -1;
1582 	}
1583 
1584 	/* identical */
1585 	return 0;
1586 }
1587 
1588 
1589 /**
1590  * Rotate a subtree right
1591  *
1592  * \param root Root of subtree to rotate
1593  * \return new root of subtree
1594  */
urldb_search_skew(struct search_node * root)1595 static struct search_node *urldb_search_skew(struct search_node *root)
1596 {
1597 	assert(root);
1598 
1599 	if (root->left->level == root->level) {
1600 		struct search_node *temp;
1601 
1602 		temp = root->left;
1603 		root->left = temp->right;
1604 		temp->right = root;
1605 		root = temp;
1606 	}
1607 
1608 	return root;
1609 }
1610 
1611 
1612 /**
1613  * Rotate a node left, increasing the parent's level
1614  *
1615  * \param root Root of subtree to rotate
1616  * \return New root of subtree
1617  */
urldb_search_split(struct search_node * root)1618 static struct search_node *urldb_search_split(struct search_node *root)
1619 {
1620 	assert(root);
1621 
1622 	if (root->right->right->level == root->level) {
1623 		struct search_node *temp;
1624 
1625 		temp = root->right;
1626 		root->right = temp->left;
1627 		temp->left = root;
1628 		root = temp;
1629 
1630 		root->level++;
1631 	}
1632 
1633 	return root;
1634 }
1635 
1636 
1637 /**
1638  * Insert node into search tree
1639  *
1640  * \param root Root of (sub)tree to insert into
1641  * \param n Node to insert
1642  * \return Pointer to updated root
1643  */
1644 static struct search_node *
urldb_search_insert_internal(struct search_node * root,struct search_node * n)1645 urldb_search_insert_internal(struct search_node *root, struct search_node *n)
1646 {
1647 	assert(root && n);
1648 
1649 	if (root == &empty) {
1650 		root = n;
1651 	} else {
1652 		int c = urldb_search_match_host(root->data, n->data);
1653 
1654 		if (c > 0) {
1655 			root->left = urldb_search_insert_internal(
1656 				root->left, n);
1657 		} else if (c < 0) {
1658 			root->right = urldb_search_insert_internal(
1659 				root->right, n);
1660 		} else {
1661 			/* exact match */
1662 			free(n);
1663 			return root;
1664 		}
1665 
1666 		root = urldb_search_skew(root);
1667 		root = urldb_search_split(root);
1668 	}
1669 
1670 	return root;
1671 }
1672 
1673 
1674 /**
1675  * Insert a node into the search tree
1676  *
1677  * \param root Root of tree to insert into
1678  * \param data User data to insert
1679  * \return Pointer to updated root, or NULL if failed
1680  */
1681 static struct search_node *
urldb_search_insert(struct search_node * root,const struct host_part * data)1682 urldb_search_insert(struct search_node *root, const struct host_part *data)
1683 {
1684 	struct search_node *n;
1685 
1686 	assert(root && data);
1687 
1688 	n = malloc(sizeof(struct search_node));
1689 	if (!n)
1690 		return NULL;
1691 
1692 	n->level = 1;
1693 	n->data = data;
1694 	n->left = n->right = &empty;
1695 
1696 	root = urldb_search_insert_internal(root, n);
1697 
1698 	return root;
1699 }
1700 
1701 
1702 /**
1703  * Parse a cookie avpair
1704  *
1705  * \param c Cookie struct to populate
1706  * \param n Name component
1707  * \param v Value component
1708  * \param was_quoted Whether \a v was quoted in the input
1709  * \return true on success, false on memory exhaustion
1710  */
1711 static bool
urldb_parse_avpair(struct cookie_internal_data * c,char * n,char * v,bool was_quoted)1712 urldb_parse_avpair(struct cookie_internal_data *c,
1713 		   char *n,
1714 		   char *v,
1715 		   bool was_quoted)
1716 {
1717 	int vlen;
1718 
1719 	assert(c && n && v);
1720 
1721 	/* Strip whitespace from start of name */
1722 	for (; *n; n++) {
1723 		if (*n != ' ' && *n != '\t')
1724 			break;
1725 	}
1726 
1727 	/* Strip whitespace from end of name */
1728 	for (vlen = strlen(n); vlen; vlen--) {
1729 		if (n[vlen] == ' ' || n[vlen] == '\t')
1730 			n[vlen] = '\0';
1731 		else
1732 			break;
1733 	}
1734 
1735 	/* Strip whitespace from start of value */
1736 	for (; *v; v++) {
1737 		if (*v != ' ' && *v != '\t')
1738 			break;
1739 	}
1740 
1741 	/* Strip whitespace from end of value */
1742 	for (vlen = strlen(v); vlen; vlen--) {
1743 		if (v[vlen] == ' ' || v[vlen] == '\t')
1744 			v[vlen] = '\0';
1745 		else
1746 			break;
1747 	}
1748 
1749 	if (!c->comment && strcasecmp(n, "Comment") == 0) {
1750 		c->comment = strdup(v);
1751 		if (!c->comment)
1752 			return false;
1753 	} else if (!c->domain && strcasecmp(n, "Domain") == 0) {
1754 		if (v[0] == '.') {
1755 			/* Domain must start with a dot */
1756 			c->domain_from_set = true;
1757 			c->domain = strdup(v);
1758 			if (!c->domain)
1759 				return false;
1760 		}
1761 	} else if (strcasecmp(n, "Max-Age") == 0) {
1762 		int temp = atoi(v);
1763 		if (temp == 0)
1764 			/* Special case - 0 means delete */
1765 			c->expires = 0;
1766 		else
1767 			c->expires = time(NULL) + temp;
1768 	} else if (!c->path && strcasecmp(n, "Path") == 0) {
1769 		c->path_from_set = true;
1770 		c->path = strdup(v);
1771 		if (!c->path)
1772 			return false;
1773 	} else if (strcasecmp(n, "Version") == 0) {
1774 		c->version = atoi(v);
1775 	} else if (strcasecmp(n, "Expires") == 0) {
1776 		char *datenoday;
1777 		time_t expires;
1778 		nserror res;
1779 
1780 		/* Strip dayname from date (these are hugely variable
1781 		 * and liable to break the parser.  They also serve no
1782 		 * useful purpose) */
1783 		for (datenoday = v;
1784 		     *datenoday && !ascii_is_digit(*datenoday);
1785 		     datenoday++) {
1786 			/* do nothing */
1787 		}
1788 
1789 		res = nsc_strntimet(datenoday, strlen(datenoday), &expires);
1790 		if (res != NSERROR_OK) {
1791 			/* assume we have an unrepresentable date =>
1792 			 * force it to the maximum possible value of a
1793 			 * 32bit time_t (this may break in 2038. We'll
1794 			 * deal with that once we come to it) */
1795 			expires = (time_t)0x7fffffff;
1796 		}
1797 		c->expires = expires;
1798 	} else if (strcasecmp(n, "Secure") == 0) {
1799 		c->secure = true;
1800 	} else if (strcasecmp(n, "HttpOnly") == 0) {
1801 		c->http_only = true;
1802 	} else if (!c->name) {
1803 		c->name = strdup(n);
1804 		c->value = strdup(v);
1805 		c->value_was_quoted = was_quoted;
1806 		if (!c->name || !c->value) {
1807 			return false;
1808 		}
1809 	}
1810 
1811 	return true;
1812 }
1813 
1814 
1815 /**
1816  * Free a cookie
1817  *
1818  * \param c The cookie to free
1819  */
urldb_free_cookie(struct cookie_internal_data * c)1820 static void urldb_free_cookie(struct cookie_internal_data *c)
1821 {
1822 	assert(c);
1823 
1824 	free(c->comment);
1825 	free(c->domain);
1826 	free(c->path);
1827 	free(c->name);
1828 	free(c->value);
1829 	free(c);
1830 }
1831 
1832 
1833 /**
1834  * Parse a cookie
1835  *
1836  * \param url URL being fetched
1837  * \param cookie Pointer to cookie string (updated on exit)
1838  * \return Pointer to cookie structure (on heap, caller frees) or NULL
1839  */
1840 static struct cookie_internal_data *
urldb_parse_cookie(nsurl * url,const char ** cookie)1841 urldb_parse_cookie(nsurl *url, const char **cookie)
1842 {
1843 	struct cookie_internal_data *c;
1844 	const char *cur;
1845 	char name[1024], value[4096];
1846 	char *n = name, *v = value;
1847 	bool in_value = false;
1848 	bool had_value_data = false;
1849 	bool value_verbatim = false;
1850 	bool quoted = false;
1851 	bool was_quoted = false;
1852 
1853 	assert(url && cookie && *cookie);
1854 
1855 	c = calloc(1, sizeof(struct cookie_internal_data));
1856 	if (c == NULL)
1857 		return NULL;
1858 
1859 	c->expires = -1;
1860 
1861 	name[0] = '\0';
1862 	value[0] = '\0';
1863 
1864 	for (cur = *cookie; *cur; cur++) {
1865 		if (*cur == '\r' && *(cur + 1) == '\n') {
1866 			/* End of header */
1867 			if (quoted) {
1868 				/* Unmatched quote encountered */
1869 
1870 				/* Match Firefox 2.0.0.11 */
1871 				value[0] = '\0';
1872 
1873 			}
1874 
1875 			break;
1876 		} else if (*cur == '\r') {
1877 			/* Spurious linefeed */
1878 			continue;
1879 		} else if (*cur == '\n') {
1880 			/* Spurious newline */
1881 			continue;
1882 		}
1883 
1884 		if (in_value && !had_value_data) {
1885 			if (*cur == ' ' || *cur == '\t') {
1886 				/* Strip leading whitespace from value */
1887 				continue;
1888 			} else {
1889 				had_value_data = true;
1890 
1891 				/* Value is taken verbatim if first non-space
1892 				 * character is not a " */
1893 				if (*cur != '"') {
1894 					value_verbatim = true;
1895 				}
1896 			}
1897 		}
1898 
1899 		if (in_value && !value_verbatim && (*cur == '"')) {
1900 			/* Only non-verbatim values may be quoted */
1901 			if (cur == *cookie || *(cur - 1) != '\\') {
1902 				/* Only unescaped quotes count */
1903 				was_quoted = quoted;
1904 				quoted = !quoted;
1905 
1906 				continue;
1907 			}
1908 		}
1909 
1910 		if (!quoted && !in_value && *cur == '=') {
1911 			/* First equals => attr-value separator */
1912 			in_value = true;
1913 			continue;
1914 		}
1915 
1916 		if (!quoted && (was_quoted || *cur == ';')) {
1917 			/* Semicolon or after quoted value
1918 			 * => end of current avpair */
1919 
1920 			/* NUL-terminate tokens */
1921 			*n = '\0';
1922 			*v = '\0';
1923 
1924 			if (!urldb_parse_avpair(c, name, value, was_quoted)) {
1925 				/* Memory exhausted */
1926 				urldb_free_cookie(c);
1927 				return NULL;
1928 			}
1929 
1930 			/* And reset to start */
1931 			n = name;
1932 			v = value;
1933 			in_value = false;
1934 			had_value_data = false;
1935 			value_verbatim = false;
1936 			was_quoted = false;
1937 
1938 			/* Now, if the current input is anything other than a
1939 			 * semicolon, we must be sure to reprocess it */
1940 			if (*cur != ';') {
1941 				cur--;
1942 			}
1943 
1944 			continue;
1945 		}
1946 
1947 		/* And now handle commas. These are a pain as they may mean
1948 		 * any of the following:
1949 		 *
1950 		 * + End of cookie
1951 		 * + Day separator in Expires avpair
1952 		 * + (Invalid) comma in unquoted value
1953 		 *
1954 		 * Therefore, in order to handle all 3 cases (2 and 3 are
1955 		 * identical, the difference being that 2 is in the spec and
1956 		 * 3 isn't), we need to determine where the comma actually
1957 		 * lies. We use the following heuristic:
1958 		 *
1959 		 *   Given a comma at the current input position, find the
1960 		 *   immediately following semicolon (or end of input if none
1961 		 *   found). Then, consider the input characters between
1962 		 *   these two positions. If any of these characters is an
1963 		 *   '=', we must assume that the comma signified the end of
1964 		 *   the current cookie.
1965 		 *
1966 		 * This holds as the first avpair of any cookie must be
1967 		 * NAME=VALUE, so the '=' is guaranteed to appear in the
1968 		 * case where the comma marks the end of a cookie.
1969 		 *
1970 		 * This will fail, however, in the case where '=' appears in
1971 		 * the value of the current avpair after the comma or the
1972 		 * subsequent cookie does not start with NAME=VALUE. Neither
1973 		 * of these is particularly likely and if they do occur, the
1974 		 * website is more broken than we can be bothered to handle.
1975 		 */
1976 		if (!quoted && *cur == ',') {
1977 			/* Find semi-colon, if any */
1978 			const char *p;
1979 			const char *semi = strchr(cur + 1, ';');
1980 			if (!semi)
1981 				semi = cur + strlen(cur) - 2 /* CRLF */;
1982 
1983 			/* Look for equals sign between comma and semi */
1984 			for (p = cur + 1; p < semi; p++)
1985 				if (*p == '=')
1986 					break;
1987 
1988 			if (p == semi) {
1989 				/* none found => comma internal to value */
1990 				/* do nothing */
1991 			} else {
1992 				/* found one => comma marks end of cookie */
1993 				cur++;
1994 				break;
1995 			}
1996 		}
1997 
1998 		/* Accumulate into buffers, always leaving space for a NUL */
1999 		/** \todo is silently truncating overlong names/values wise? */
2000 		if (!in_value) {
2001 			if (n < name + (sizeof(name) - 1))
2002 				*n++ = *cur;
2003 		} else {
2004 			if (v < value + (sizeof(value) - 1))
2005 				*v++ = *cur;
2006 		}
2007 	}
2008 
2009 	/* Parse final avpair */
2010 	*n = '\0';
2011 	*v = '\0';
2012 
2013 	if (!urldb_parse_avpair(c, name, value, was_quoted)) {
2014 		/* Memory exhausted */
2015 		urldb_free_cookie(c);
2016 		return NULL;
2017 	}
2018 
2019 	/* Now fix-up default values */
2020 	if (c->domain == NULL) {
2021 		lwc_string *host = nsurl_get_component(url, NSURL_HOST);
2022 		if (host == NULL) {
2023 			urldb_free_cookie(c);
2024 			return NULL;
2025 		}
2026 		c->domain = strdup(lwc_string_data(host));
2027 		lwc_string_unref(host);
2028 	}
2029 
2030 	if (c->path == NULL) {
2031 		const char *path_data;
2032 		char *path, *slash;
2033 		lwc_string *path_lwc;
2034 
2035 		path_lwc = nsurl_get_component(url, NSURL_PATH);
2036 		if (path_lwc == NULL) {
2037 			urldb_free_cookie(c);
2038 			return NULL;
2039 		}
2040 		path_data = lwc_string_data(path_lwc);
2041 
2042 		/* Strip leafname and trailing slash (4.3.1) */
2043 		slash = strrchr(path_data, '/');
2044 		if (slash != NULL) {
2045 			/* Special case: retain first slash in path */
2046 			if (slash == path_data)
2047 				slash++;
2048 
2049 			slash = strndup(path_data, slash - path_data);
2050 			if (slash == NULL) {
2051 				lwc_string_unref(path_lwc);
2052 				urldb_free_cookie(c);
2053 				return NULL;
2054 			}
2055 
2056 			path = slash;
2057 			lwc_string_unref(path_lwc);
2058 		} else {
2059 			path = strdup(lwc_string_data(path_lwc));
2060 			lwc_string_unref(path_lwc);
2061 			if (path == NULL) {
2062 				urldb_free_cookie(c);
2063 				return NULL;
2064 			}
2065 		}
2066 
2067 		c->path = path;
2068 	}
2069 
2070 	/* Write back current position */
2071 	*cookie = cur;
2072 
2073 	return c;
2074 }
2075 
2076 
2077 /**
2078  * Add a path to the database, creating any intermediate entries
2079  *
2080  * \param scheme URL scheme associated with path
2081  * \param port Port number on host associated with path
2082  * \param host Host tree node to attach to
2083  * \param path_query Absolute path plus query to add (freed)
2084  * \param fragment URL fragment, or NULL
2085  * \param url URL (fragment ignored)
2086  * \return Pointer to leaf node, or NULL on memory exhaustion
2087  */
2088 static struct path_data *
urldb_add_path(lwc_string * scheme,unsigned int port,const struct host_part * host,char * path_query,lwc_string * fragment,nsurl * url)2089 urldb_add_path(lwc_string *scheme,
2090 	       unsigned int port,
2091 	       const struct host_part *host,
2092 	       char *path_query,
2093 	       lwc_string *fragment,
2094 	       nsurl *url)
2095 {
2096 	struct path_data *d, *e;
2097 	char *buf = path_query;
2098 	char *segment, *slash;
2099 	bool match;
2100 
2101 	assert(scheme && host && url);
2102 
2103 	d = (struct path_data *) &host->paths;
2104 
2105 	/* skip leading '/' */
2106 	segment = buf;
2107 	if (*segment == '/')
2108 		segment++;
2109 
2110 	/* Process path segments */
2111 	do {
2112 		slash = strchr(segment, '/');
2113 		if (!slash) {
2114 			/* last segment */
2115 			/* look for existing entry */
2116 			for (e = d->children; e; e = e->next)
2117 				if (strcmp(segment, e->segment) == 0 &&
2118 				    lwc_string_isequal(scheme,
2119 						       e->scheme, &match) ==
2120 				    lwc_error_ok &&
2121 				    match == true &&
2122 				    e->port == port)
2123 					break;
2124 
2125 			d = e ? urldb_add_path_fragment(e, fragment) :
2126 				urldb_add_path_node(scheme, port,
2127 						    segment, fragment, d);
2128 			break;
2129 		}
2130 
2131 		*slash = '\0';
2132 
2133 		/* look for existing entry */
2134 		for (e = d->children; e; e = e->next)
2135 			if (strcmp(segment, e->segment) == 0 &&
2136 			    lwc_string_isequal(scheme, e->scheme,
2137 					       &match) == lwc_error_ok &&
2138 			    match == true &&
2139 			    e->port == port)
2140 				break;
2141 
2142 		d = e ? e : urldb_add_path_node(scheme, port, segment, NULL, d);
2143 		if (!d)
2144 			break;
2145 
2146 		segment = slash + 1;
2147 	} while (1);
2148 
2149 	free(path_query);
2150 
2151 	if (d && !d->url) {
2152 		/* Insert defragmented URL */
2153 		if (nsurl_defragment(url, &d->url) != NSERROR_OK)
2154 			return NULL;
2155 	}
2156 
2157 	return d;
2158 }
2159 
2160 
2161 /**
2162  * Add a host to the database, creating any intermediate entries
2163  *
2164  * \param host Hostname to add
2165  * \return Pointer to leaf node, or NULL on memory exhaustion
2166  */
urldb_add_host(const char * host)2167 static struct host_part *urldb_add_host(const char *host)
2168 {
2169 	struct host_part *d = (struct host_part *) &db_root, *e;
2170 	struct search_node *s;
2171 	char buf[256]; /* 256 bytes is sufficient - domain names are
2172 			* limited to 255 chars. */
2173 	char *part;
2174 
2175 	assert(host);
2176 
2177 	if (urldb__host_is_ip_address(host)) {
2178 		/* Host is an IP, so simply add as TLD */
2179 
2180 		/* Check for existing entry */
2181 		for (e = d->children; e; e = e->next)
2182 			if (strcasecmp(host, e->part) == 0)
2183 				/* found => return it */
2184 				return e;
2185 
2186 		d = urldb_add_host_node(host, d);
2187 
2188 		s = urldb_search_insert(search_trees[ST_IP], d);
2189 		if (!s) {
2190 			/* failed */
2191 			d = NULL;
2192 		} else {
2193 			search_trees[ST_IP] = s;
2194 		}
2195 
2196 		return d;
2197 	}
2198 
2199 	/* Copy host string, so we can corrupt it */
2200 	strncpy(buf, host, sizeof buf);
2201 	buf[sizeof buf - 1] = '\0';
2202 
2203 	/* Process FQDN segments backwards */
2204 	do {
2205 		part = strrchr(buf, '.');
2206 		if (!part) {
2207 			/* last segment */
2208 			/* Check for existing entry */
2209 			for (e = d->children; e; e = e->next)
2210 				if (strcasecmp(buf, e->part) == 0)
2211 					break;
2212 
2213 			if (e) {
2214 				d = e;
2215 			} else {
2216 				d = urldb_add_host_node(buf, d);
2217 			}
2218 
2219 			/* And insert into search tree */
2220 			if (d) {
2221 				struct search_node **r;
2222 
2223 				r = urldb_get_search_tree_direct(buf);
2224 				s = urldb_search_insert(*r, d);
2225 				if (!s) {
2226 					/* failed */
2227 					d = NULL;
2228 				} else {
2229 					*r = s;
2230 				}
2231 			}
2232 			break;
2233 		}
2234 
2235 		/* Check for existing entry */
2236 		for (e = d->children; e; e = e->next)
2237 			if (strcasecmp(part + 1, e->part) == 0)
2238 				break;
2239 
2240 		d = e ? e : urldb_add_host_node(part + 1, d);
2241 		if (!d)
2242 			break;
2243 
2244 		*part = '\0';
2245 	} while (1);
2246 
2247 	return d;
2248 }
2249 
2250 
2251 /**
2252  * Insert a cookie into the database
2253  *
2254  * \param c The cookie to insert
2255  * \param scheme URL scheme associated with cookie path
2256  * \param url URL (sans fragment) associated with cookie
2257  * \return true on success, false on memory exhaustion (c will be freed)
2258  */
2259 static bool
urldb_insert_cookie(struct cookie_internal_data * c,lwc_string * scheme,nsurl * url)2260 urldb_insert_cookie(struct cookie_internal_data *c,
2261 		    lwc_string *scheme,
2262 		    nsurl *url)
2263 {
2264 	struct cookie_internal_data *d;
2265 	const struct host_part *h;
2266 	struct path_data *p;
2267 	time_t now = time(NULL);
2268 
2269 	assert(c);
2270 
2271 	if (c->domain[0] == '.') {
2272 		h = urldb_search_find(
2273 			urldb_get_search_tree(&(c->domain[1])),
2274 			c->domain + 1);
2275 		if (!h) {
2276 			h = urldb_add_host(c->domain + 1);
2277 			if (!h) {
2278 				urldb_free_cookie(c);
2279 				return false;
2280 			}
2281 		}
2282 
2283 		p = (struct path_data *) &h->paths;
2284 	} else {
2285 		/* Need to have a URL and scheme, if it's not a domain cookie */
2286 		assert(url != NULL);
2287 		assert(scheme != NULL);
2288 
2289 		h = urldb_search_find(
2290 			urldb_get_search_tree(c->domain),
2291 			c->domain);
2292 
2293 		if (!h) {
2294 			h = urldb_add_host(c->domain);
2295 			if (!h) {
2296 				urldb_free_cookie(c);
2297 				return false;
2298 			}
2299 		}
2300 
2301 		/* find path */
2302 		p = urldb_add_path(scheme, 0, h,
2303 				   strdup(c->path), NULL, url);
2304 		if (!p) {
2305 			urldb_free_cookie(c);
2306 			return false;
2307 		}
2308 	}
2309 
2310 	/* add cookie */
2311 	for (d = p->cookies; d; d = d->next) {
2312 		if (!strcmp(d->domain, c->domain) &&
2313 		    !strcmp(d->path, c->path) &&
2314 		    !strcmp(d->name, c->name))
2315 			break;
2316 	}
2317 
2318 	if (d) {
2319 		if (c->expires != -1 && c->expires < now) {
2320 			/* remove cookie */
2321 			if (d->next)
2322 				d->next->prev = d->prev;
2323 			else
2324 				p->cookies_end = d->prev;
2325 			if (d->prev)
2326 				d->prev->next = d->next;
2327 			else
2328 				p->cookies = d->next;
2329 
2330 			cookie_manager_remove((struct cookie_data *)d);
2331 
2332 			urldb_free_cookie(d);
2333 			urldb_free_cookie(c);
2334 		} else {
2335 			/* replace d with c */
2336 			c->prev = d->prev;
2337 			c->next = d->next;
2338 			if (c->next)
2339 				c->next->prev = c;
2340 			else
2341 				p->cookies_end = c;
2342 			if (c->prev)
2343 				c->prev->next = c;
2344 			else
2345 				p->cookies = c;
2346 
2347 			cookie_manager_remove((struct cookie_data *)d);
2348 			urldb_free_cookie(d);
2349 
2350 			cookie_manager_add((struct cookie_data *)c);
2351 		}
2352 	} else {
2353 		c->prev = p->cookies_end;
2354 		c->next = NULL;
2355 		if (p->cookies_end)
2356 			p->cookies_end->next = c;
2357 		else
2358 			p->cookies = c;
2359 		p->cookies_end = c;
2360 
2361 		cookie_manager_add((struct cookie_data *)c);
2362 	}
2363 
2364 	return true;
2365 }
2366 
2367 
2368 /**
2369  * Concatenate a cookie into the provided buffer
2370  *
2371  * \param c Cookie to concatenate
2372  * \param version The version of the cookie string to output
2373  * \param used Pointer to amount of buffer used (updated)
2374  * \param alloc Pointer to allocated size of buffer (updated)
2375  * \param buf Pointer to Pointer to buffer (updated)
2376  * \return true on success, false on memory exhaustion
2377  */
2378 static bool
urldb_concat_cookie(struct cookie_internal_data * c,int version,int * used,int * alloc,char ** buf)2379 urldb_concat_cookie(struct cookie_internal_data *c,
2380 		    int version,
2381 		    int *used,
2382 		    int *alloc,
2383 		    char **buf)
2384 {
2385 	/* Combined (A)BNF for the Cookie: request header:
2386 	 *
2387 	 * CHAR           = <any US-ASCII character (octets 0 - 127)>
2388 	 * CTL            = <any US-ASCII control character
2389 	 *                  (octets 0 - 31) and DEL (127)>
2390 	 * CR             = <US-ASCII CR, carriage return (13)>
2391 	 * LF             = <US-ASCII LF, linefeed (10)>
2392 	 * SP             = <US-ASCII SP, space (32)>
2393 	 * HT             = <US-ASCII HT, horizontal-tab (9)>
2394 	 * <">            = <US-ASCII double-quote mark (34)>
2395 	 *
2396 	 * CRLF           = CR LF
2397 	 *
2398 	 * LWS            = [CRLF] 1*( SP | HT )
2399 	 *
2400 	 * TEXT           = <any OCTET except CTLs,
2401 	 *                  but including LWS>
2402 	 *
2403 	 * token          = 1*<any CHAR except CTLs or separators>
2404 	 * separators     = "(" | ")" | "<" | ">" | "@"
2405 	 *                | "," | ";" | ":" | "\" | <">
2406 	 *                | "/" | "[" | "]" | "?" | "="
2407 	 *                | "{" | "}" | SP | HT
2408 	 *
2409 	 * quoted-string  = ( <"> *(qdtext | quoted-pair ) <"> )
2410 	 * qdtext         = <any TEXT except <">>
2411 	 * quoted-pair    = "\" CHAR
2412 	 *
2413 	 * attr            =       token
2414 	 * value           =       word
2415 	 * word            =       token | quoted-string
2416 	 *
2417 	 * cookie          =       "Cookie:" cookie-version
2418 	 *                         1*((";" | ",") cookie-value)
2419 	 * cookie-value    =       NAME "=" VALUE [";" path] [";" domain]
2420 	 * cookie-version  =       "$Version" "=" value
2421 	 * NAME            =       attr
2422 	 * VALUE           =       value
2423 	 * path            =       "$Path" "=" value
2424 	 * domain          =       "$Domain" "=" value
2425 	 *
2426 	 * A note on quoted-string handling:
2427 	 *   The cookie data stored in the db is verbatim (i.e. sans enclosing
2428 	 *   <">, if any, and with all quoted-pairs intact) thus all that we
2429 	 *   need to do here is ensure that value strings which were quoted
2430 	 *   in Set-Cookie or which include any of the separators are quoted
2431 	 *   before use.
2432 	 *
2433 	 * A note on cookie-value separation:
2434 	 *   We use semicolons for all separators, including between
2435 	 *   cookie-values. This simplifies things and is backwards compatible.
2436 	 */
2437 	const char * const separators = "()<>@,;:\\\"/[]?={} \t";
2438 
2439 	int max_len;
2440 
2441 	assert(c && used && alloc && buf && *buf);
2442 
2443 	/* "; " cookie-value
2444 	 * We allow for the possibility that values are quoted
2445 	 */
2446 	max_len = 2 + strlen(c->name) + 1 + strlen(c->value) + 2 +
2447 		(c->path_from_set ?
2448 		 8 + strlen(c->path) + 2 : 0) +
2449 		(c->domain_from_set ?
2450 		 10 + strlen(c->domain) + 2 : 0);
2451 
2452 	if (*used + max_len >= *alloc) {
2453 		char *temp = realloc(*buf, *alloc + 4096);
2454 		if (!temp) {
2455 			return false;
2456 		}
2457 		*buf = temp;
2458 		*alloc += 4096;
2459 	}
2460 
2461 	if (version == COOKIE_NETSCAPE) {
2462 		/* Original Netscape cookie */
2463 		sprintf(*buf + *used - 1, "; %s=", c->name);
2464 		*used += 2 + strlen(c->name) + 1;
2465 
2466 		/* The Netscape spec doesn't mention quoting of cookie values.
2467 		 * RFC 2109 $10.1.3 indicates that values must not be quoted.
2468 		 *
2469 		 * However, other browsers preserve quoting, so we should, too
2470 		 */
2471 		if (c->value_was_quoted) {
2472 			sprintf(*buf + *used - 1, "\"%s\"", c->value);
2473 			*used += 1 + strlen(c->value) + 1;
2474 		} else {
2475 			/** \todo should we %XX-encode [;HT,SP] ? */
2476 			/** \todo Should we strip escaping backslashes? */
2477 			sprintf(*buf + *used - 1, "%s", c->value);
2478 			*used += strlen(c->value);
2479 		}
2480 
2481 		/* We don't send path/domain information -- that's what the
2482 		 * Netscape spec suggests we should do, anyway. */
2483 	} else {
2484 		/* RFC2109 or RFC2965 cookie */
2485 		sprintf(*buf + *used - 1, "; %s=", c->name);
2486 		*used += 2 + strlen(c->name) + 1;
2487 
2488 		/* Value needs quoting if it contains any separator or if
2489 		 * it needs preserving from the Set-Cookie header */
2490 		if (c->value_was_quoted ||
2491 		    strpbrk(c->value, separators) != NULL) {
2492 			sprintf(*buf + *used - 1, "\"%s\"", c->value);
2493 			*used += 1 + strlen(c->value) + 1;
2494 		} else {
2495 			sprintf(*buf + *used - 1, "%s", c->value);
2496 			*used += strlen(c->value);
2497 		}
2498 
2499 		if (c->path_from_set) {
2500 			/* Path, quoted if necessary */
2501 			sprintf(*buf + *used - 1, "; $Path=");
2502 			*used += 8;
2503 
2504 			if (strpbrk(c->path, separators) != NULL) {
2505 				sprintf(*buf + *used - 1, "\"%s\"", c->path);
2506 				*used += 1 + strlen(c->path) + 1;
2507 			} else {
2508 				sprintf(*buf + *used - 1, "%s", c->path);
2509 				*used += strlen(c->path);
2510 			}
2511 		}
2512 
2513 		if (c->domain_from_set) {
2514 			/* Domain, quoted if necessary */
2515 			sprintf(*buf + *used - 1, "; $Domain=");
2516 			*used += 10;
2517 
2518 			if (strpbrk(c->domain, separators) != NULL) {
2519 				sprintf(*buf + *used - 1, "\"%s\"", c->domain);
2520 				*used += 1 + strlen(c->domain) + 1;
2521 			} else {
2522 				sprintf(*buf + *used - 1, "%s", c->domain);
2523 				*used += strlen(c->domain);
2524 			}
2525 		}
2526 	}
2527 
2528 	return true;
2529 }
2530 
2531 
2532 /**
2533  * deletes paths from a cookie.
2534  *
2535  * \param domain the cookie domain
2536  * \param path the cookie path
2537  * \param name The cookie name
2538  * \param parent The url data of the cookie
2539  */
2540 static void
urldb_delete_cookie_paths(const char * domain,const char * path,const char * name,struct path_data * parent)2541 urldb_delete_cookie_paths(const char *domain,
2542 			  const char *path,
2543 			  const char *name,
2544 			  struct path_data *parent)
2545 {
2546 	struct cookie_internal_data *c;
2547 	struct path_data *p = parent;
2548 
2549 	assert(parent);
2550 
2551 	do {
2552 		for (c = p->cookies; c; c = c->next) {
2553 			if (strcmp(c->domain, domain) == 0 &&
2554 			    strcmp(c->path, path) == 0 &&
2555 			    strcmp(c->name, name) == 0) {
2556 				if (c->prev) {
2557 					c->prev->next = c->next;
2558 				} else {
2559 					p->cookies = c->next;
2560 				}
2561 
2562 				if (c->next) {
2563 					c->next->prev = c->prev;
2564 				} else {
2565 					p->cookies_end = c->prev;
2566 				}
2567 
2568 				urldb_free_cookie(c);
2569 
2570 				return;
2571 			}
2572 		}
2573 
2574 		if (p->children) {
2575 			p = p->children;
2576 		} else {
2577 			while (p != parent) {
2578 				if (p->next != NULL) {
2579 					p = p->next;
2580 					break;
2581 				}
2582 
2583 				p = p->parent;
2584 			}
2585 		}
2586 	} while (p != parent);
2587 }
2588 
2589 
2590 /**
2591  * Deletes cookie hosts and their assoicated paths
2592  *
2593  * \param domain the cookie domain
2594  * \param path the cookie path
2595  * \param name The cookie name
2596  * \param parent The url data of the cookie
2597  */
2598 static void
urldb_delete_cookie_hosts(const char * domain,const char * path,const char * name,struct host_part * parent)2599 urldb_delete_cookie_hosts(const char *domain,
2600 			  const char *path,
2601 			  const char *name,
2602 			  struct host_part *parent)
2603 {
2604 	struct host_part *h;
2605 	assert(parent);
2606 
2607 	urldb_delete_cookie_paths(domain, path, name, &parent->paths);
2608 
2609 	for (h = parent->children; h; h = h->next) {
2610 		urldb_delete_cookie_hosts(domain, path, name, h);
2611 	}
2612 }
2613 
2614 
2615 /**
2616  * Save a path subtree's cookies
2617  *
2618  * \param fp File pointer to write to
2619  * \param parent Parent path
2620  */
urldb_save_cookie_paths(FILE * fp,struct path_data * parent)2621 static void urldb_save_cookie_paths(FILE *fp, struct path_data *parent)
2622 {
2623 	struct path_data *p = parent;
2624 	time_t now = time(NULL);
2625 
2626 	assert(fp && parent);
2627 
2628 	do {
2629 		if (p->cookies != NULL) {
2630 			struct cookie_internal_data *c;
2631 
2632 			for (c = p->cookies; c != NULL; c = c->next) {
2633 				if (c->expires == -1 || c->expires < now) {
2634 					/* Skip expired & session cookies */
2635 					continue;
2636 				}
2637 
2638 				fprintf(fp,
2639 					"%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t"
2640 					"%s\t%s\t%d\t%s\t%s\t%s\n",
2641 					c->version, c->domain,
2642 					c->domain_from_set, c->path,
2643 					c->path_from_set, c->secure,
2644 					c->http_only,
2645 					(int)c->expires, (int)c->last_used,
2646 					c->no_destroy, c->name, c->value,
2647 					c->value_was_quoted,
2648 					p->scheme ? lwc_string_data(p->scheme) :
2649 					"unused",
2650 					p->url ? nsurl_access(p->url) :
2651 					"unused",
2652 					c->comment ? c->comment : "");
2653 			}
2654 		}
2655 
2656 		if (p->children != NULL) {
2657 			p = p->children;
2658 		} else {
2659 			while (p != parent) {
2660 				if (p->next != NULL) {
2661 					p = p->next;
2662 					break;
2663 				}
2664 
2665 				p = p->parent;
2666 			}
2667 		}
2668 	} while (p != parent);
2669 }
2670 
2671 
2672 /**
2673  * Save a host subtree's cookies
2674  *
2675  * \param fp File pointer to write to
2676  * \param parent Parent host
2677  */
urldb_save_cookie_hosts(FILE * fp,struct host_part * parent)2678 static void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent)
2679 {
2680 	struct host_part *h;
2681 	assert(fp && parent);
2682 
2683 	urldb_save_cookie_paths(fp, &parent->paths);
2684 
2685 	for (h = parent->children; h; h = h->next)
2686 		urldb_save_cookie_hosts(fp, h);
2687 }
2688 
2689 
2690 /**
2691  * Destroy a cookie node
2692  *
2693  * \param c Cookie to destroy
2694  */
urldb_destroy_cookie(struct cookie_internal_data * c)2695 static void urldb_destroy_cookie(struct cookie_internal_data *c)
2696 {
2697 	free(c->name);
2698 	free(c->value);
2699 	free(c->comment);
2700 	free(c->domain);
2701 	free(c->path);
2702 
2703 	free(c);
2704 }
2705 
2706 
2707 /**
2708  * Destroy the contents of a path node
2709  *
2710  * \param node Node to destroy contents of (does not destroy node)
2711  */
urldb_destroy_path_node_content(struct path_data * node)2712 static void urldb_destroy_path_node_content(struct path_data *node)
2713 {
2714 	struct cookie_internal_data *a, *b;
2715 	unsigned int i;
2716 
2717 	if (node->url != NULL) {
2718 		nsurl_unref(node->url);
2719 	}
2720 
2721 	if (node->scheme != NULL) {
2722 		lwc_string_unref(node->scheme);
2723 	}
2724 
2725 	free(node->segment);
2726 	for (i = 0; i < node->frag_cnt; i++)
2727 		free(node->fragment[i]);
2728 	free(node->fragment);
2729 
2730 	free(node->urld.title);
2731 
2732 	for (a = node->cookies; a; a = b) {
2733 		b = a->next;
2734 		urldb_destroy_cookie(a);
2735 	}
2736 }
2737 
2738 
2739 /**
2740  * Destroy protection space data
2741  *
2742  * \param space Protection space to destroy
2743  */
urldb_destroy_prot_space(struct prot_space_data * space)2744 static void urldb_destroy_prot_space(struct prot_space_data *space)
2745 {
2746 	lwc_string_unref(space->scheme);
2747 	free(space->realm);
2748 	free(space->auth);
2749 
2750 	free(space);
2751 }
2752 
2753 
/**
 * Destroy a path tree
 *
 * The tree is walked without recursion: the walk descends to a node with
 * no children and then frees nodes as it moves on to the next sibling or
 * back up through the parent pointers. Every node reached, including
 * \a root itself, has its contents destroyed and is then freed. Siblings
 * of \a root are not visited.
 *
 * \param root Root node of tree to destroy
 */
static void urldb_destroy_path_tree(struct path_data *root)
{
	struct path_data *p = root;

	do {
		if (p->children != NULL) {
			/* Descend first; nodes are freed on the way back */
			p = p->children;
		} else {
			/* q trails p: it is the node awaiting destruction */
			struct path_data *q = p;

			while (p != root) {
				if (p->next != NULL) {
					/* Move on to the sibling; q (the node
					 * just left) is freed after the loop */
					p = p->next;
					break;
				}

				/* No sibling left: step up to the parent
				 * before freeing the node just left */
				p = p->parent;

				urldb_destroy_path_node_content(q);
				free(q);

				q = p;
			}

			/* Frees either the node left behind by a sibling
			 * move, or root once the walk has returned to it */
			urldb_destroy_path_node_content(q);
			free(q);
		}
	} while (p != root);
}
2788 
2789 
2790 /**
2791  * Destroy a host tree
2792  *
2793  * \param root Root node of tree to destroy
2794  */
urldb_destroy_host_tree(struct host_part * root)2795 static void urldb_destroy_host_tree(struct host_part *root)
2796 {
2797 	struct host_part *a, *b;
2798 	struct path_data *p, *q;
2799 	struct prot_space_data *s, *t;
2800 
2801 	/* Destroy children */
2802 	for (a = root->children; a; a = b) {
2803 		b = a->next;
2804 		urldb_destroy_host_tree(a);
2805 	}
2806 
2807 	/* Now clean up paths */
2808 	for (p = root->paths.children; p; p = q) {
2809 		q = p->next;
2810 		urldb_destroy_path_tree(p);
2811 	}
2812 
2813 	/* Root path */
2814 	urldb_destroy_path_node_content(&root->paths);
2815 
2816 	/* Proctection space data */
2817 	for (s = root->prot_space; s; s = t) {
2818 		t = s->next;
2819 		urldb_destroy_prot_space(s);
2820 	}
2821 
2822 	/* And ourselves */
2823 	free(root->part);
2824 	free(root);
2825 }
2826 
2827 
2828 /**
2829  * Destroy a search tree
2830  *
2831  * \param root Root node of tree to destroy
2832  */
urldb_destroy_search_tree(struct search_node * root)2833 static void urldb_destroy_search_tree(struct search_node *root)
2834 {
2835 	/* Destroy children */
2836 	if (root->left != &empty)
2837 		urldb_destroy_search_tree(root->left);
2838 	if (root->right != &empty)
2839 		urldb_destroy_search_tree(root->right);
2840 
2841 	/* And destroy ourselves */
2842 	free(root);
2843 }
2844 
2845 
2846 /*************** External interface ***************/
2847 
2848 
2849 /* exported interface documented in content/urldb.h */
urldb_destroy(void)2850 void urldb_destroy(void)
2851 {
2852 	struct host_part *a, *b;
2853 	int i;
2854 
2855 	/* Clean up search trees */
2856 	for (i = 0; i < NUM_SEARCH_TREES; i++) {
2857 		if (search_trees[i] != &empty) {
2858 			urldb_destroy_search_tree(search_trees[i]);
2859 			search_trees[i] = &empty;
2860 		}
2861 	}
2862 
2863 	/* And database */
2864 	for (a = db_root.children; a; a = b) {
2865 		b = a->next;
2866 		urldb_destroy_host_tree(a);
2867 	}
2868 	memset(&db_root, 0, sizeof(db_root));
2869 
2870 	/* And the bloom filter */
2871 	if (url_bloom != NULL) {
2872 		bloom_destroy(url_bloom);
2873 		url_bloom = NULL;
2874 	}
2875 }
2876 
2877 
2878 /* exported interface documented in netsurf/url_db.h */
urldb_load(const char * filename)2879 nserror urldb_load(const char *filename)
2880 {
2881 #define MAXIMUM_URL_LENGTH 4096
2882 	char s[MAXIMUM_URL_LENGTH];
2883 	char host[256];
2884 	struct host_part *h;
2885 	int urls;
2886 	int i;
2887 	int version;
2888 	int length;
2889 	FILE *fp;
2890 
2891 	assert(filename);
2892 
2893 	NSLOG(netsurf, INFO, "Loading URL file %s", filename);
2894 
2895 	if (url_bloom == NULL)
2896 		url_bloom = bloom_create(BLOOM_SIZE);
2897 
2898 	fp = fopen(filename, "r");
2899 	if (!fp) {
2900 		NSLOG(netsurf, INFO, "Failed to open file '%s' for reading",
2901 		      filename);
2902 		return NSERROR_NOT_FOUND;
2903 	}
2904 
2905 	if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) {
2906 		fclose(fp);
2907 		return NSERROR_NEED_DATA;
2908 	}
2909 
2910 	version = atoi(s);
2911 	if (version < MIN_URL_FILE_VERSION) {
2912 		NSLOG(netsurf, INFO, "Unsupported URL file version.");
2913 		fclose(fp);
2914 		return NSERROR_INVALID;
2915 	}
2916 	if (version > URL_FILE_VERSION) {
2917 		NSLOG(netsurf, INFO, "Unknown URL file version.");
2918 		fclose(fp);
2919 		return NSERROR_INVALID;
2920 	}
2921 
2922 	while (fgets(host, sizeof host, fp)) {
2923 		time_t hsts_expiry = 0;
2924 		int hsts_include_sub_domains = 0;
2925 
2926 		/* get the hostname */
2927 		length = strlen(host) - 1;
2928 		host[length] = '\0';
2929 
2930 		/* skip data that has ended up with a host of '' */
2931 		if (length == 0) {
2932 			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2933 				break;
2934 			urls = atoi(s);
2935 			/* Eight fields/url */
2936 			for (i = 0; i < (8 * urls); i++) {
2937 				if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2938 					break;
2939 			}
2940 			continue;
2941 		}
2942 
2943 		if (version >= 107) {
2944 			char *p = host;
2945 			while (*p && *p != ' ') p++;
2946 			while (*p && *p == ' ') { *p = '\0'; p++; }
2947 			hsts_include_sub_domains = (*p == '1');
2948 			while (*p && *p != ' ') p++;
2949 			while (*p && *p == ' ') p++;
2950 			nsc_snptimet(p, strlen(p), &hsts_expiry);
2951 		}
2952 
2953 		h = urldb_add_host(host);
2954 		if (!h) {
2955 			NSLOG(netsurf, INFO, "Failed adding host: '%s'", host);
2956 			fclose(fp);
2957 			return NSERROR_NOMEM;
2958 		}
2959 		h->hsts.expires = hsts_expiry;
2960 		h->hsts.include_sub_domains = hsts_include_sub_domains;
2961 
2962 		/* read number of URLs */
2963 		if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2964 			break;
2965 		urls = atoi(s);
2966 
2967 		/* no URLs => try next host */
2968 		if (urls == 0) {
2969 			NSLOG(netsurf, INFO, "No URLs for '%s'", host);
2970 			continue;
2971 		}
2972 
2973 		/* load the non-corrupt data */
2974 		for (i = 0; i < urls; i++) {
2975 			struct path_data *p = NULL;
2976 			char scheme[64], ports[10];
2977 			char url[64 + 3 + 256 + 6 + 4096 + 1 + 1];
2978 			unsigned int port;
2979 			bool is_file = false;
2980 			nsurl *nsurl;
2981 			lwc_string *scheme_lwc, *fragment_lwc;
2982 			char *path_query;
2983 			size_t len;
2984 
2985 			if (!fgets(scheme, sizeof scheme, fp))
2986 				break;
2987 			length = strlen(scheme) - 1;
2988 			scheme[length] = '\0';
2989 
2990 			if (!fgets(ports, sizeof ports, fp))
2991 				break;
2992 			length = strlen(ports) - 1;
2993 			ports[length] = '\0';
2994 			port = atoi(ports);
2995 
2996 			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
2997 				break;
2998 			length = strlen(s) - 1;
2999 			s[length] = '\0';
3000 
3001 			if (!strcasecmp(host, "localhost") &&
3002 			    !strcasecmp(scheme, "file"))
3003 				is_file = true;
3004 
3005 			snprintf(url, sizeof url, "%s://%s%s%s%s",
3006 				 scheme,
3007 				 /* file URLs have no host */
3008 				 (is_file ? "" : host),
3009 				 (port ? ":" : ""),
3010 				 (port ? ports : ""),
3011 				 s);
3012 
3013 			/* TODO: store URLs in pre-parsed state, and make
3014 			 *       a nsurl_load to generate the nsurl more
3015 			 *       swiftly.
3016 			 *       Need a nsurl_save too.
3017 			 */
3018 			if (nsurl_create(url, &nsurl) != NSERROR_OK) {
3019 				NSLOG(netsurf, INFO, "Failed inserting '%s'",
3020 				      url);
3021 				fclose(fp);
3022 				return NSERROR_NOMEM;
3023 			}
3024 
3025 			if (url_bloom != NULL) {
3026 				uint32_t hash = nsurl_hash(nsurl);
3027 				bloom_insert_hash(url_bloom, hash);
3028 			}
3029 
3030 			/* Copy and merge path/query strings */
3031 			if (nsurl_get(nsurl, NSURL_PATH | NSURL_QUERY,
3032 				      &path_query, &len) != NSERROR_OK) {
3033 				NSLOG(netsurf, INFO, "Failed inserting '%s'",
3034 				      url);
3035 				fclose(fp);
3036 				return NSERROR_NOMEM;
3037 			}
3038 
3039 			scheme_lwc = nsurl_get_component(nsurl, NSURL_SCHEME);
3040 			fragment_lwc = nsurl_get_component(nsurl,
3041 							   NSURL_FRAGMENT);
3042 			p = urldb_add_path(scheme_lwc, port, h, path_query,
3043 					   fragment_lwc, nsurl);
3044 			if (!p) {
3045 				NSLOG(netsurf, INFO, "Failed inserting '%s'",
3046 				      url);
3047 				fclose(fp);
3048 				return NSERROR_NOMEM;
3049 			}
3050 			nsurl_unref(nsurl);
3051 			lwc_string_unref(scheme_lwc);
3052 			if (fragment_lwc != NULL)
3053 				lwc_string_unref(fragment_lwc);
3054 
3055 			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3056 				break;
3057 			if (p)
3058 				p->urld.visits = (unsigned int)atoi(s);
3059 
3060 			/* entry last use time */
3061 			if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) {
3062 				break;
3063 			}
3064 			if (p) {
3065 				nsc_snptimet(s, strlen(s) - 1, &p->urld.last_visit);
3066 			}
3067 
3068 			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3069 				break;
3070 			if (p)
3071 				p->urld.type = (content_type)atoi(s);
3072 
3073 			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3074 				break;
3075 
3076 
3077 			if (!fgets(s, MAXIMUM_URL_LENGTH, fp))
3078 				break;
3079 			length = strlen(s) - 1;
3080 			if (p && length > 0) {
3081 				s[length] = '\0';
3082 				p->urld.title = malloc(length + 1);
3083 				if (p->urld.title)
3084 					memcpy(p->urld.title, s, length + 1);
3085 			}
3086 		}
3087 	}
3088 
3089 	fclose(fp);
3090 	NSLOG(netsurf, INFO, "Successfully loaded URL file");
3091 #undef MAXIMUM_URL_LENGTH
3092 
3093 	return NSERROR_OK;
3094 }
3095 
3096 /* exported interface documented in netsurf/url_db.h */
urldb_save(const char * filename)3097 nserror urldb_save(const char *filename)
3098 {
3099 	FILE *fp;
3100 	int i;
3101 
3102 	assert(filename);
3103 
3104 	fp = fopen(filename, "w");
3105 	if (!fp) {
3106 		NSLOG(netsurf, INFO, "Failed to open file '%s' for writing",
3107 		      filename);
3108 		return NSERROR_SAVE_FAILED;
3109 	}
3110 
3111 	/* file format version number */
3112 	fprintf(fp, "%d\n", URL_FILE_VERSION);
3113 
3114 	for (i = 0; i != NUM_SEARCH_TREES; i++) {
3115 		urldb_save_search_tree(search_trees[i], fp);
3116 	}
3117 
3118 	fclose(fp);
3119 
3120 	return NSERROR_OK;
3121 }
3122 
3123 
3124 /* exported interface documented in content/urldb.h */
urldb_set_url_persistence(nsurl * url,bool persist)3125 nserror urldb_set_url_persistence(nsurl *url, bool persist)
3126 {
3127 	struct path_data *p;
3128 
3129 	assert(url);
3130 
3131 	p = urldb_find_url(url);
3132 	if (!p) {
3133 		return NSERROR_NOT_FOUND;
3134 	}
3135 
3136 	p->persistent = persist;
3137 
3138 	return NSERROR_OK;
3139 }
3140 
3141 
3142 /* exported interface documented in content/urldb.h */
urldb_add_url(nsurl * url)3143 bool urldb_add_url(nsurl *url)
3144 {
3145 	struct host_part *h;
3146 	struct path_data *p;
3147 	lwc_string *scheme;
3148 	lwc_string *port;
3149 	lwc_string *host;
3150 	lwc_string *fragment;
3151 	const char *host_str;
3152 	char *path_query = NULL;
3153 	size_t len;
3154 	bool match;
3155 	unsigned int port_int;
3156 
3157 	assert(url);
3158 
3159 	if (url_bloom == NULL)
3160 		url_bloom = bloom_create(BLOOM_SIZE);
3161 
3162 	if (url_bloom != NULL) {
3163 		uint32_t hash = nsurl_hash(url);
3164 		bloom_insert_hash(url_bloom, hash);
3165 	}
3166 
3167 	/* Copy and merge path/query strings */
3168 	if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &path_query, &len) !=
3169 	    NSERROR_OK) {
3170 		return false;
3171 	}
3172 	assert(path_query != NULL);
3173 
3174 	scheme = nsurl_get_component(url, NSURL_SCHEME);
3175 	if (scheme == NULL) {
3176 		free(path_query);
3177 		return false;
3178 	}
3179 
3180 	host = nsurl_get_component(url, NSURL_HOST);
3181 	if (host != NULL) {
3182 		host_str = lwc_string_data(host);
3183 		lwc_string_unref(host);
3184 
3185 	} else if (lwc_string_isequal(scheme, corestring_lwc_file, &match) ==
3186 		   lwc_error_ok && match == true) {
3187 		host_str = "localhost";
3188 
3189 	} else {
3190 		lwc_string_unref(scheme);
3191 		free(path_query);
3192 		return false;
3193 	}
3194 
3195 	fragment = nsurl_get_component(url, NSURL_FRAGMENT);
3196 
3197 	port = nsurl_get_component(url, NSURL_PORT);
3198 	if (port != NULL) {
3199 		port_int = atoi(lwc_string_data(port));
3200 		lwc_string_unref(port);
3201 	} else {
3202 		port_int = 0;
3203 	}
3204 
3205 	/* Get host entry */
3206 	h = urldb_add_host(host_str);
3207 
3208 	/* Get path entry */
3209 	if (h != NULL) {
3210 		p = urldb_add_path(scheme,
3211 				   port_int,
3212 				   h,
3213 				   path_query,
3214 				   fragment,
3215 				   url);
3216 	} else {
3217 		p = NULL;
3218 	}
3219 
3220 	lwc_string_unref(scheme);
3221 	if (fragment != NULL)
3222 		lwc_string_unref(fragment);
3223 
3224 	return (p != NULL);
3225 }
3226 
3227 
3228 /* exported interface documented in content/urldb.h */
urldb_set_url_title(nsurl * url,const char * title)3229 nserror urldb_set_url_title(nsurl *url, const char *title)
3230 {
3231 	struct path_data *p;
3232 	char *temp;
3233 
3234 	assert(url);
3235 
3236 	p = urldb_find_url(url);
3237 	if (p == NULL) {
3238 		return NSERROR_NOT_FOUND;
3239 	}
3240 
3241 	/* copy the parameter if necessary */
3242 	if (title != NULL) {
3243 		temp = strdup(title);
3244 		if (temp == NULL) {
3245 			return NSERROR_NOMEM;
3246 		}
3247 	} else {
3248 		temp = NULL;
3249 	}
3250 
3251 	free(p->urld.title);
3252 	p->urld.title = temp;
3253 
3254 	return NSERROR_OK;
3255 }
3256 
3257 
3258 /* exported interface documented in content/urldb.h */
urldb_set_url_content_type(nsurl * url,content_type type)3259 nserror urldb_set_url_content_type(nsurl *url, content_type type)
3260 {
3261 	struct path_data *p;
3262 
3263 	assert(url);
3264 
3265 	p = urldb_find_url(url);
3266 	if (!p) {
3267 		return NSERROR_NOT_FOUND;
3268 	}
3269 
3270 	p->urld.type = type;
3271 
3272 	return NSERROR_OK;
3273 }
3274 
3275 
3276 /* exported interface documented in content/urldb.h */
urldb_update_url_visit_data(nsurl * url)3277 nserror urldb_update_url_visit_data(nsurl *url)
3278 {
3279 	struct path_data *p;
3280 
3281 	assert(url);
3282 
3283 	p = urldb_find_url(url);
3284 	if (!p) {
3285 		return NSERROR_NOT_FOUND;
3286 	}
3287 
3288 	p->urld.last_visit = time(NULL);
3289 	p->urld.visits++;
3290 
3291 	return NSERROR_OK;
3292 }
3293 
3294 
3295 /* exported interface documented in content/urldb.h */
urldb_reset_url_visit_data(nsurl * url)3296 void urldb_reset_url_visit_data(nsurl *url)
3297 {
3298 	struct path_data *p;
3299 
3300 	assert(url);
3301 
3302 	p = urldb_find_url(url);
3303 	if (!p)
3304 		return;
3305 
3306 	p->urld.last_visit = (time_t)0;
3307 	p->urld.visits = 0;
3308 }
3309 
3310 
3311 /* exported interface documented in netsurf/url_db.h */
urldb_get_url_data(nsurl * url)3312 const struct url_data *urldb_get_url_data(nsurl *url)
3313 {
3314 	struct path_data *p;
3315 	struct url_internal_data *u;
3316 
3317 	assert(url);
3318 
3319 	p = urldb_find_url(url);
3320 	if (!p)
3321 		return NULL;
3322 
3323 	u = &p->urld;
3324 
3325 	return (const struct url_data *) u;
3326 }
3327 
3328 
3329 /* exported interface documented in content/urldb.h */
urldb_get_url(nsurl * url)3330 nsurl *urldb_get_url(nsurl *url)
3331 {
3332 	struct path_data *p;
3333 
3334 	assert(url);
3335 
3336 	p = urldb_find_url(url);
3337 	if (!p)
3338 		return NULL;
3339 
3340 	return p->url;
3341 }
3342 
3343 
3344 /* exported interface documented in netsurf/url_db.h */
urldb_set_auth_details(nsurl * url,const char * realm,const char * auth)3345 void urldb_set_auth_details(nsurl *url, const char *realm, const char *auth)
3346 {
3347 	struct path_data *p, *pi;
3348 	struct host_part *h;
3349 	struct prot_space_data *space, *space_alloc;
3350 	char *realm_alloc, *auth_alloc;
3351 	bool match;
3352 
3353 	assert(url && realm && auth);
3354 
3355 	/* add url, in case it's missing */
3356 	urldb_add_url(url);
3357 
3358 	p = urldb_find_url(url);
3359 
3360 	if (!p)
3361 		return;
3362 
3363 	/* Search for host_part */
3364 	for (pi = p; pi->parent != NULL; pi = pi->parent)
3365 		;
3366 	h = (struct host_part *)pi;
3367 
3368 	/* Search if given URL belongs to a protection space we already know of. */
3369 	for (space = h->prot_space; space; space = space->next) {
3370 		if (!strcmp(space->realm, realm) &&
3371 		    lwc_string_isequal(space->scheme, p->scheme,
3372 				       &match) == lwc_error_ok &&
3373 		    match == true &&
3374 		    space->port == p->port)
3375 			break;
3376 	}
3377 
3378 	if (space != NULL) {
3379 		/* Overrule existing auth. */
3380 		free(space->auth);
3381 		space->auth = strdup(auth);
3382 	} else {
3383 		/* Create a new protection space. */
3384 		space = space_alloc = malloc(sizeof(struct prot_space_data));
3385 		realm_alloc = strdup(realm);
3386 		auth_alloc = strdup(auth);
3387 
3388 		if (!space_alloc || !realm_alloc || !auth_alloc) {
3389 			free(space_alloc);
3390 			free(realm_alloc);
3391 			free(auth_alloc);
3392 			return;
3393 		}
3394 
3395 		space->scheme = lwc_string_ref(p->scheme);
3396 		space->port = p->port;
3397 		space->realm = realm_alloc;
3398 		space->auth = auth_alloc;
3399 		space->next = h->prot_space;
3400 		h->prot_space = space;
3401 	}
3402 
3403 	p->prot_space = space;
3404 }
3405 
3406 
3407 /* exported interface documented in netsurf/url_db.h */
urldb_get_auth_details(nsurl * url,const char * realm)3408 const char *urldb_get_auth_details(nsurl *url, const char *realm)
3409 {
3410 	struct path_data *p, *p_cur, *p_top;
3411 
3412 	assert(url);
3413 
3414 	/* add to the db, so our lookup will work */
3415 	urldb_add_url(url);
3416 
3417 	p = urldb_find_url(url);
3418 	if (!p)
3419 		return NULL;
3420 
3421 	/* Check for any auth details attached to the path_data node or any of
3422 	 * its parents.
3423 	 */
3424 	for (p_cur = p; p_cur != NULL; p_top = p_cur, p_cur = p_cur->parent) {
3425 		if (p_cur->prot_space) {
3426 			return p_cur->prot_space->auth;
3427 		}
3428 	}
3429 
3430 	/* Only when we have a realm (and canonical root of given URL), we can
3431 	 * uniquely locate the protection space.
3432 	 */
3433 	if (realm != NULL) {
3434 		const struct host_part *h = (const struct host_part *)p_top;
3435 		const struct prot_space_data *space;
3436 		bool match;
3437 
3438 		/* Search for a possible matching protection space. */
3439 		for (space = h->prot_space; space != NULL;
3440 		     space = space->next) {
3441 			if (!strcmp(space->realm, realm) &&
3442 			    lwc_string_isequal(space->scheme,
3443 					       p->scheme, &match) ==
3444 			    lwc_error_ok &&
3445 			    match == true &&
3446 			    space->port == p->port) {
3447 				p->prot_space = space;
3448 				return p->prot_space->auth;
3449 			}
3450 		}
3451 	}
3452 
3453 	return NULL;
3454 }
3455 
3456 
3457 /* exported interface documented in netsurf/url_db.h */
urldb_set_cert_permissions(nsurl * url,bool permit)3458 void urldb_set_cert_permissions(nsurl *url, bool permit)
3459 {
3460 	struct path_data *p;
3461 	struct host_part *h;
3462 
3463 	assert(url);
3464 
3465 	/* add url, in case it's missing */
3466 	urldb_add_url(url);
3467 
3468 	p = urldb_find_url(url);
3469 	if (!p)
3470 		return;
3471 
3472 	for (; p && p->parent; p = p->parent)
3473 		/* do nothing */;
3474 	assert(p);
3475 
3476 	h = (struct host_part *)p;
3477 
3478 	h->permit_invalid_certs = permit;
3479 }
3480 
3481 
3482 /* exported interface documented in content/urldb.h */
urldb_get_cert_permissions(nsurl * url)3483 bool urldb_get_cert_permissions(nsurl *url)
3484 {
3485 	struct path_data *p;
3486 	const struct host_part *h;
3487 
3488 	assert(url);
3489 
3490 	p = urldb_find_url(url);
3491 	if (!p)
3492 		return false;
3493 
3494 	for (; p && p->parent; p = p->parent)
3495 		/* do nothing */;
3496 	assert(p);
3497 
3498 	h = (const struct host_part *)p;
3499 
3500 	return h->permit_invalid_certs;
3501 }
3502 
3503 
3504 /* exported interface documented in content/urldb.h */
urldb_set_hsts_policy(struct nsurl * url,const char * header)3505 bool urldb_set_hsts_policy(struct nsurl *url, const char *header)
3506 {
3507 	struct path_data *p;
3508 	struct host_part *h;
3509 	lwc_string *host;
3510 	time_t now = time(NULL);
3511 	http_strict_transport_security *sts;
3512 	uint32_t max_age = 0;
3513 	nserror error;
3514 
3515 	assert(url);
3516 
3517 	host = nsurl_get_component(url, NSURL_HOST);
3518 	if (host != NULL) {
3519 		if (urldb__host_is_ip_address(lwc_string_data(host))) {
3520 			/* Host is IP: ignore */
3521 			lwc_string_unref(host);
3522 			return true;
3523 		} else if (lwc_string_length(host) == 0) {
3524 			/* Host is blank: ignore */
3525 			lwc_string_unref(host);
3526 			return true;
3527 		}
3528 
3529 		lwc_string_unref(host);
3530 	} else {
3531 		/* No host part: ignore */
3532 		return true;
3533 	}
3534 
3535 	/* add url, in case it's missing */
3536 	urldb_add_url(url);
3537 
3538 	p = urldb_find_url(url);
3539 	if (!p)
3540 		return false;
3541 
3542 	for (; p && p->parent; p = p->parent)
3543 		/* do nothing */;
3544 	assert(p);
3545 
3546 	h = (struct host_part *)p;
3547 	if (h->permit_invalid_certs) {
3548 		/* Transport is tainted: ignore */
3549 		return true;
3550 	}
3551 
3552 	error = http_parse_strict_transport_security(header, &sts);
3553 	if (error != NSERROR_OK) {
3554 		/* Parse failed: ignore */
3555 		return true;
3556 	}
3557 
3558 	h->hsts.include_sub_domains =
3559 		http_strict_transport_security_include_subdomains(sts);
3560 
3561 	max_age = http_strict_transport_security_max_age(sts);
3562 	if (max_age == 0) {
3563 		h->hsts.expires = 0;
3564 		h->hsts.include_sub_domains = false;
3565 	} else if ((time_t) (now + max_age) > h->hsts.expires) {
3566 		h->hsts.expires = now + max_age;
3567 	}
3568 
3569 	http_strict_transport_security_destroy(sts);
3570 
3571 	return true;
3572 }
3573 
3574 
3575 /* exported interface documented in content/urldb.h */
urldb_get_hsts_enabled(struct nsurl * url)3576 bool urldb_get_hsts_enabled(struct nsurl *url)
3577 {
3578 	struct path_data *p;
3579 	const struct host_part *h;
3580 	lwc_string *host;
3581 	time_t now = time(NULL);
3582 
3583 	assert(url);
3584 
3585 	host = nsurl_get_component(url, NSURL_HOST);
3586 	if (host != NULL) {
3587 		if (urldb__host_is_ip_address(lwc_string_data(host))) {
3588 			/* Host is IP: not enabled */
3589 			lwc_string_unref(host);
3590 			return false;
3591 		} else if (lwc_string_length(host) == 0) {
3592 			/* Host is blank: not enabled */
3593 			lwc_string_unref(host);
3594 			return false;
3595 		}
3596 
3597 		lwc_string_unref(host);
3598 	} else {
3599 		/* No host part: not enabled */
3600 		return false;
3601 	}
3602 
3603 	/* The URL must exist in the db in order to find HSTS policy, since
3604 	 * we search up the tree from the URL node, and policy from further
3605 	 * up may also apply. */
3606 	urldb_add_url(url);
3607 
3608 	p = urldb_find_url(url);
3609 	if (!p)
3610 		return false;
3611 
3612 	for (; p && p->parent; p = p->parent)
3613 		/* do nothing */;
3614 	assert(p);
3615 
3616 	h = (const struct host_part *)p;
3617 
3618 	/* Consult record for this host */
3619 	if (h->hsts.expires > now) {
3620 		/* Not expired */
3621 		return true;
3622 	}
3623 
3624 	/* Consult parent domains */
3625 	for (h = h->parent; h && h != &db_root; h = h->parent) {
3626 		if (h->hsts.expires > now && h->hsts.include_sub_domains) {
3627 			/* Not expired and subdomains included */
3628 			return true;
3629 		}
3630 	}
3631 
3632 	return false;
3633 }
3634 
3635 
3636 /* exported interface documented in netsurf/url_db.h */
3637 void
urldb_iterate_partial(const char * prefix,bool (* callback)(nsurl * url,const struct url_data * data))3638 urldb_iterate_partial(const char *prefix,
3639 		      bool (*callback)(nsurl *url, const struct url_data *data))
3640 {
3641 	char host[256];
3642 	char buf[260]; /* max domain + "www." */
3643 	const char *slash, *scheme_sep;
3644 	struct search_node *tree;
3645 	const struct host_part *h;
3646 
3647 	assert(prefix && callback);
3648 
3649 	/* strip scheme */
3650 	scheme_sep = strstr(prefix, "://");
3651 	if (scheme_sep)
3652 		prefix = scheme_sep + 3;
3653 
3654 	slash = strchr(prefix, '/');
3655 	tree = urldb_get_search_tree(prefix);
3656 
3657 	if (slash) {
3658 		/* if there's a slash in the input, then we can
3659 		 * assume that we're looking for a path */
3660 		snprintf(host, sizeof host, "%.*s",
3661 			 (int) (slash - prefix), prefix);
3662 
3663 		h = urldb_search_find(tree, host);
3664 		if (!h) {
3665 			int len = slash - prefix;
3666 
3667 			if (len <= 3 || strncasecmp(host, "www.", 4) != 0) {
3668 				snprintf(buf, sizeof buf, "www.%s", host);
3669 				h = urldb_search_find(
3670 					search_trees[ST_DN + 'w' - 'a'],
3671 					buf);
3672 				if (!h)
3673 					return;
3674 			} else
3675 				return;
3676 		}
3677 
3678 		if (h->paths.children) {
3679 			/* Have paths, iterate them */
3680 			urldb_iterate_partial_path(&h->paths, slash + 1,
3681 						   callback);
3682 		}
3683 
3684 	} else {
3685 		int len = strlen(prefix);
3686 
3687 		/* looking for hosts */
3688 		if (!urldb_iterate_partial_host(tree, prefix, callback))
3689 			return;
3690 
3691 		if (len <= 3 || strncasecmp(prefix, "www.", 4) != 0) {
3692 			/* now look for www.prefix */
3693 			snprintf(buf, sizeof buf, "www.%s", prefix);
3694 			if(!urldb_iterate_partial_host(
3695 				   search_trees[ST_DN + 'w' - 'a'],
3696 				   buf, callback))
3697 				return;
3698 		}
3699 	}
3700 }
3701 
3702 
3703 /* exported interface documented in netsurf/url_db.h */
3704 void
urldb_iterate_entries(bool (* callback)(nsurl * url,const struct url_data * data))3705 urldb_iterate_entries(bool (*callback)(nsurl *url, const struct url_data *data))
3706 {
3707 	int i;
3708 
3709 	assert(callback);
3710 
3711 	for (i = 0; i < NUM_SEARCH_TREES; i++) {
3712 		if (!urldb_iterate_entries_host(search_trees[i],
3713 						callback,
3714 						NULL)) {
3715 			break;
3716 		}
3717 	}
3718 }
3719 
3720 
3721 /* exported interface documented in content/urldb.h */
urldb_iterate_cookies(bool (* callback)(const struct cookie_data * data))3722 void urldb_iterate_cookies(bool (*callback)(const struct cookie_data *data))
3723 {
3724 	int i;
3725 
3726 	assert(callback);
3727 
3728 	for (i = 0; i < NUM_SEARCH_TREES; i++) {
3729 		if (!urldb_iterate_entries_host(search_trees[i],
3730 						NULL, callback))
3731 			break;
3732 	}
3733 }
3734 
3735 
3736 /* exported interface documented in content/urldb.h */
urldb_set_cookie(const char * header,nsurl * url,nsurl * referer)3737 bool urldb_set_cookie(const char *header, nsurl *url, nsurl *referer)
3738 {
3739 	const char *cur = header, *end;
3740 	lwc_string *path, *host, *scheme;
3741 	nsurl *urlt;
3742 	bool match;
3743 
3744 	assert(url && header);
3745 
3746 	/* Get defragmented URL, as 'urlt' */
3747 	if (nsurl_defragment(url, &urlt) != NSERROR_OK)
3748 		return NULL;
3749 
3750 	scheme = nsurl_get_component(url, NSURL_SCHEME);
3751 	if (scheme == NULL) {
3752 		nsurl_unref(urlt);
3753 		return false;
3754 	}
3755 
3756 	path = nsurl_get_component(url, NSURL_PATH);
3757 	if (path == NULL) {
3758 		lwc_string_unref(scheme);
3759 		nsurl_unref(urlt);
3760 		return false;
3761 	}
3762 
3763 	host = nsurl_get_component(url, NSURL_HOST);
3764 	if (host == NULL) {
3765 		lwc_string_unref(path);
3766 		lwc_string_unref(scheme);
3767 		nsurl_unref(urlt);
3768 		return false;
3769 	}
3770 
3771 	if (referer) {
3772 		lwc_string *rhost;
3773 
3774 		/* Ensure that url's host name domain matches
3775 		 * referer's (4.3.5) */
3776 		rhost = nsurl_get_component(referer, NSURL_HOST);
3777 		if (rhost == NULL) {
3778 			goto error;
3779 		}
3780 
3781 		/* Domain match host names */
3782 		if (lwc_string_isequal(host, rhost, &match) == lwc_error_ok &&
3783 		    match == false) {
3784 			const char *hptr;
3785 			const char *rptr;
3786 			const char *dot;
3787 			const char *host_data = lwc_string_data(host);
3788 			const char *rhost_data = lwc_string_data(rhost);
3789 
3790 			/* Ensure neither host nor rhost are IP addresses */
3791 			if (urldb__host_is_ip_address(host_data) ||
3792 			    urldb__host_is_ip_address(rhost_data)) {
3793 				/* IP address, so no partial match */
3794 				lwc_string_unref(rhost);
3795 				goto error;
3796 			}
3797 
3798 			/* Not exact match, so try the following:
3799 			 *
3800 			 * 1) Find the longest common suffix of host and rhost
3801 			 *    (may be all of host/rhost)
3802 			 * 2) Discard characters from the start of the suffix
3803 			 *    until the suffix starts with a dot
3804 			 *    (prevents foobar.com matching bar.com)
3805 			 * 3) Ensure the suffix is non-empty and contains
3806 			 *    embedded dots (to avoid permitting .com as a
3807 			 *    suffix)
3808 			 *
3809 			 * Note that the above in no way resembles the
3810 			 * domain matching algorithm found in RFC2109.
3811 			 * It does, however, model the real world rather
3812 			 * more accurately.
3813 			 */
3814 
3815 			/** \todo In future, we should consult a TLD service
3816 			 * instead of just looking for embedded dots.
3817 			 */
3818 
3819 			hptr = host_data + lwc_string_length(host) - 1;
3820 			rptr = rhost_data + lwc_string_length(rhost) - 1;
3821 
3822 			/* 1 */
3823 			while (hptr >= host_data && rptr >= rhost_data) {
3824 				if (*hptr != *rptr)
3825 					break;
3826 				hptr--;
3827 				rptr--;
3828 			}
3829 			/* Ensure we end up pointing at the start of the
3830 			 * common suffix. The above loop will exit pointing
3831 			 * to the byte before the start of the suffix. */
3832 			hptr++;
3833 
3834 			/* 2 */
3835 			while (*hptr != '\0' && *hptr != '.')
3836 				hptr++;
3837 
3838 			/* 3 */
3839 			if (*hptr == '\0' ||
3840 			    (dot = strchr(hptr + 1, '.')) == NULL ||
3841 			    *(dot + 1) == '\0') {
3842 				lwc_string_unref(rhost);
3843 				goto error;
3844 			}
3845 		}
3846 
3847 		lwc_string_unref(rhost);
3848 	}
3849 
3850 	end = cur + strlen(cur) - 2 /* Trailing CRLF */;
3851 
3852 	do {
3853 		struct cookie_internal_data *c;
3854 		char *dot;
3855 		size_t len;
3856 #ifdef WITH_NSPSL
3857 		const char *suffix;
3858 #endif
3859 
3860 		c = urldb_parse_cookie(url, &cur);
3861 		if (!c) {
3862 			/* failed => stop parsing */
3863 			goto error;
3864 		}
3865 
3866 		/* validate cookie */
3867 
3868 		/* 4.2.2:i Cookie must have NAME and VALUE */
3869 		if (!c->name || !c->value) {
3870 			urldb_free_cookie(c);
3871 			goto error;
3872 		}
3873 
3874 		/* 4.3.2:i Cookie path must be a prefix of URL path */
3875 		len = strlen(c->path);
3876 		if (len > lwc_string_length(path) ||
3877 		    strncmp(c->path, lwc_string_data(path),
3878 			    len) != 0) {
3879 			urldb_free_cookie(c);
3880 			goto error;
3881 		}
3882 
3883 #ifdef WITH_NSPSL
3884 		/* check domain is not a public suffix */
3885 		dot = c->domain;
3886 		if (*dot == '.') {
3887 			dot++;
3888 		}
3889 		suffix = nspsl_getpublicsuffix(dot);
3890 		if (suffix == NULL) {
3891 			NSLOG(netsurf, INFO,
3892 			      "domain %s was a public suffix domain", dot);
3893 			urldb_free_cookie(c);
3894 			goto error;
3895 		}
3896 #else
3897 		/* 4.3.2:ii Cookie domain must contain embedded dots */
3898 		dot = strchr(c->domain + 1, '.');
3899 		if (!dot || *(dot + 1) == '\0') {
3900 			/* no embedded dots */
3901 			urldb_free_cookie(c);
3902 			goto error;
3903 		}
3904 #endif
3905 
3906 		/* Domain match fetch host with cookie domain */
3907 		if (strcasecmp(lwc_string_data(host), c->domain) != 0) {
3908 			int hlen, dlen;
3909 			char *domain = c->domain;
3910 
3911 			/* c->domain must be a domain cookie here because:
3912 			 * c->domain is either:
3913 			 *   + specified in the header as a domain cookie
3914 			 *     (non-domain cookies in the header are ignored
3915 			 *      by urldb_parse_cookie / urldb_parse_avpair)
3916 			 *   + defaulted to the URL's host part
3917 			 *     (by urldb_parse_cookie if no valid domain was
3918 			 *      specified in the header)
3919 			 *
3920 			 * The latter will pass the strcasecmp above, which
3921 			 * leaves the former (i.e. a domain cookie)
3922 			 */
3923 			assert(c->domain[0] == '.');
3924 
3925 			/* 4.3.2:iii */
3926 			if (urldb__host_is_ip_address(lwc_string_data(host))) {
3927 				/* IP address, so no partial match */
3928 				urldb_free_cookie(c);
3929 				goto error;
3930 			}
3931 
3932 			hlen = lwc_string_length(host);
3933 			dlen = strlen(c->domain);
3934 
3935 			if (hlen <= dlen && hlen != dlen - 1) {
3936 				/* Partial match not possible */
3937 				urldb_free_cookie(c);
3938 				goto error;
3939 			}
3940 
3941 			if (hlen == dlen - 1) {
3942 				/* Relax matching to allow
3943 				 * host a.com to match .a.com */
3944 				domain++;
3945 				dlen--;
3946 			}
3947 
3948 			if (strcasecmp(lwc_string_data(host) + (hlen - dlen),
3949 				       domain)) {
3950 				urldb_free_cookie(c);
3951 				goto error;
3952 			}
3953 
3954 			/* 4.3.2:iv Ensure H contains no dots
3955 			 *
3956 			 * If you believe the spec, H should contain no
3957 			 * dots in _any_ cookie. Unfortunately, however,
3958 			 * reality differs in that many sites send domain
3959 			 * cookies of the form .foo.com from hosts such
3960 			 * as bar.bat.foo.com and then expect domain
3961 			 * matching to work. Thus we have to do what they
3962 			 * expect, regardless of any potential security
3963 			 * implications.
3964 			 *
3965 			 * This is what code conforming to the spec would
3966 			 * look like:
3967 			 *
3968 			 * for (int i = 0; i < (hlen - dlen); i++) {
3969 			 *	if (host[i] == '.') {
3970 			 *		urldb_free_cookie(c);
3971 			 *		goto error;
3972 			 *	}
3973 			 * }
3974 			 */
3975 		}
3976 
3977 		/* Now insert into database */
3978 		if (!urldb_insert_cookie(c, scheme, urlt))
3979 			goto error;
3980 	} while (cur < end);
3981 
3982 	lwc_string_unref(host);
3983 	lwc_string_unref(path);
3984 	lwc_string_unref(scheme);
3985 	nsurl_unref(urlt);
3986 
3987 	return true;
3988 
3989 error:
3990 	lwc_string_unref(host);
3991 	lwc_string_unref(path);
3992 	lwc_string_unref(scheme);
3993 	nsurl_unref(urlt);
3994 
3995 	return false;
3996 }
3997 
3998 
3999 /* exported interface documented in content/urldb.h */
urldb_get_cookie(nsurl * url,bool include_http_only)4000 char *urldb_get_cookie(nsurl *url, bool include_http_only)
4001 {
4002 	const struct path_data *p, *q;
4003 	const struct host_part *h;
4004 	lwc_string *path_lwc;
4005 	struct cookie_internal_data *c;
4006 	int count = 0, version = COOKIE_RFC2965;
4007 	struct cookie_internal_data **matched_cookies;
4008 	int matched_cookies_size = 20;
4009 	int ret_alloc = 4096, ret_used = 1;
4010 	const char *path;
4011 	char *ret;
4012 	lwc_string *scheme;
4013 	time_t now;
4014 	int i;
4015 	bool match;
4016 
4017 	assert(url != NULL);
4018 
4019 	/* The URL must exist in the db in order to find relevant cookies, since
4020 	 * we search up the tree from the URL node, and cookies from further
4021 	 * up also apply. */
4022 	urldb_add_url(url);
4023 
4024 	p = urldb_find_url(url);
4025 	if (!p)
4026 		return NULL;
4027 
4028 	scheme = p->scheme;
4029 
4030 	matched_cookies = malloc(matched_cookies_size *
4031 				 sizeof(struct cookie_internal_data *));
4032 	if (!matched_cookies)
4033 		return NULL;
4034 
4035 #define GROW_MATCHED_COOKIES						\
4036 	do {								\
4037 		if (count == matched_cookies_size) {			\
4038 			struct cookie_internal_data **temp;		\
4039 			temp = realloc(matched_cookies,			\
4040 				       (matched_cookies_size + 20) *	\
4041 				       sizeof(struct cookie_internal_data *)); \
4042 									\
4043 			if (temp == NULL) {				\
4044 				free(ret);				\
4045 				free(matched_cookies);			\
4046 				return NULL;				\
4047 			}						\
4048 									\
4049 			matched_cookies = temp;				\
4050 			matched_cookies_size += 20;			\
4051 		}							\
4052 	} while(0)
4053 
4054 	ret = malloc(ret_alloc);
4055 	if (!ret) {
4056 		free(matched_cookies);
4057 		return NULL;
4058 	}
4059 
4060 	ret[0] = '\0';
4061 
4062 	path_lwc = nsurl_get_component(url, NSURL_PATH);
4063 	if (path_lwc == NULL) {
4064 		free(ret);
4065 		free(matched_cookies);
4066 		return NULL;
4067 	}
4068 	path = lwc_string_data(path_lwc);
4069 	lwc_string_unref(path_lwc);
4070 
4071 	now = time(NULL);
4072 
4073 	if (*(p->segment) != '\0') {
4074 		/* Match exact path, unless directory, when prefix matching
4075 		 * will handle this case for us. */
4076 		for (q = p->parent->children; q; q = q->next) {
4077 			if (strcmp(q->segment, p->segment))
4078 				continue;
4079 
4080 			/* Consider all cookies associated with
4081 			 * this exact path */
4082 			for (c = q->cookies; c; c = c->next) {
4083 				if (c->expires != -1 && c->expires < now)
4084 					/* cookie has expired => ignore */
4085 					continue;
4086 
4087 				if (c->secure && lwc_string_isequal(
4088 					    q->scheme,
4089 					    corestring_lwc_https,
4090 					    &match) &&
4091 				    match == false)
4092 					/* secure cookie for insecure host.
4093 					 * ignore */
4094 					continue;
4095 
4096 				if (c->http_only && !include_http_only)
4097 					/* Ignore HttpOnly */
4098 					continue;
4099 
4100 				matched_cookies[count++] = c;
4101 
4102 				GROW_MATCHED_COOKIES;
4103 
4104 				if (c->version < (unsigned int)version)
4105 					version = c->version;
4106 
4107 				c->last_used = now;
4108 
4109 				cookie_manager_add((struct cookie_data *)c);
4110 			}
4111 		}
4112 	}
4113 
4114 	/* Now consider cookies whose paths prefix-match ours */
4115 	for (p = p->parent; p; p = p->parent) {
4116 		/* Find directory's path entry(ies) */
4117 		/* There are potentially multiple due to differing schemes */
4118 		for (q = p->children; q; q = q->next) {
4119 			if (*(q->segment) != '\0')
4120 				continue;
4121 
4122 			for (c = q->cookies; c; c = c->next) {
4123 				if (c->expires != -1 && c->expires < now)
4124 					/* cookie has expired => ignore */
4125 					continue;
4126 
4127 				if (c->secure && lwc_string_isequal(
4128 					    q->scheme,
4129 					    corestring_lwc_https,
4130 					    &match) &&
4131 				    match == false)
4132 					/* Secure cookie for insecure server
4133 					 * => ignore */
4134 					continue;
4135 
4136 				matched_cookies[count++] = c;
4137 
4138 				GROW_MATCHED_COOKIES;
4139 
4140 				if (c->version < (unsigned int) version)
4141 					version = c->version;
4142 
4143 				c->last_used = now;
4144 
4145 				cookie_manager_add((struct cookie_data *)c);
4146 			}
4147 		}
4148 
4149 		if (!p->parent) {
4150 			/* No parent, so bail here. This can't go in
4151 			 * the loop exit condition as we also want to
4152 			 * process the top-level node.
4153 			 *
4154 			 * If p->parent is NULL then p->cookies are
4155 			 * the domain cookies and thus we don't even
4156 			 * try matching against them.
4157 			 */
4158 			break;
4159 		}
4160 
4161 		/* Consider p itself - may be the result of Path=/foo */
4162 		for (c = p->cookies; c; c = c->next) {
4163 			if (c->expires != -1 && c->expires < now)
4164 				/* cookie has expired => ignore */
4165 				continue;
4166 
4167 			/* Ensure cookie path is a prefix of the resource */
4168 			if (strncmp(c->path, path, strlen(c->path)) != 0)
4169 				/* paths don't match => ignore */
4170 				continue;
4171 
4172 			if (c->secure && lwc_string_isequal(p->scheme,
4173 							    corestring_lwc_https,
4174 							    &match) &&
4175 			    match == false)
4176 				/* Secure cookie for insecure server
4177 				 * => ignore */
4178 				continue;
4179 
4180 			matched_cookies[count++] = c;
4181 
4182 			GROW_MATCHED_COOKIES;
4183 
4184 			if (c->version < (unsigned int) version)
4185 				version = c->version;
4186 
4187 			c->last_used = now;
4188 
4189 			cookie_manager_add((struct cookie_data *)c);
4190 		}
4191 
4192 	}
4193 
4194 	/* Finally consider domain cookies for hosts which domain match ours */
4195 	for (h = (const struct host_part *)p; h && h != &db_root;
4196 	     h = h->parent) {
4197 		for (c = h->paths.cookies; c; c = c->next) {
4198 			if (c->expires != -1 && c->expires < now)
4199 				/* cookie has expired => ignore */
4200 				continue;
4201 
4202 			/* Ensure cookie path is a prefix of the resource */
4203 			if (strncmp(c->path, path, strlen(c->path)) != 0)
4204 				/* paths don't match => ignore */
4205 				continue;
4206 
4207 			if (c->secure && lwc_string_isequal(scheme,
4208 							    corestring_lwc_https,
4209 							    &match) &&
4210 			    match == false)
4211 				/* secure cookie for insecure host. ignore */
4212 				continue;
4213 
4214 			matched_cookies[count++] = c;
4215 
4216 			GROW_MATCHED_COOKIES;
4217 
4218 			if (c->version < (unsigned int)version)
4219 				version = c->version;
4220 
4221 			c->last_used = now;
4222 
4223 			cookie_manager_add((struct cookie_data *)c);
4224 		}
4225 	}
4226 
4227 	if (count == 0) {
4228 		/* No cookies found */
4229 		free(ret);
4230 		free(matched_cookies);
4231 		return NULL;
4232 	}
4233 
4234 	/* and build output string */
4235 	if (version > COOKIE_NETSCAPE) {
4236 		sprintf(ret, "$Version=%d", version);
4237 		ret_used = strlen(ret) + 1;
4238 	}
4239 
4240 	for (i = 0; i < count; i++) {
4241 		if (!urldb_concat_cookie(matched_cookies[i], version,
4242 					 &ret_used, &ret_alloc, &ret)) {
4243 			free(ret);
4244 			free(matched_cookies);
4245 			return NULL;
4246 		}
4247 	}
4248 
4249 	if (version == COOKIE_NETSCAPE) {
4250 		/* Old-style cookies => no version & skip "; " */
4251 		memmove(ret, ret + 2, ret_used - 2);
4252 		ret_used -= 2;
4253 	}
4254 
4255 	/* Now, shrink the output buffer to the required size */
4256 	{
4257 		char *temp = realloc(ret, ret_used);
4258 		if (!temp) {
4259 			free(ret);
4260 			free(matched_cookies);
4261 			return NULL;
4262 		}
4263 
4264 		ret = temp;
4265 	}
4266 
4267 	free(matched_cookies);
4268 
4269 	return ret;
4270 
4271 #undef GROW_MATCHED_COOKIES
4272 }
4273 
4274 
4275 /* exported interface documented in content/urldb.h */
urldb_delete_cookie(const char * domain,const char * path,const char * name)4276 void urldb_delete_cookie(const char *domain, const char *path,
4277 			 const char *name)
4278 {
4279 	urldb_delete_cookie_hosts(domain, path, name, &db_root);
4280 }
4281 
4282 
4283 /* exported interface documented in content/urldb.h */
urldb_load_cookies(const char * filename)4284 void urldb_load_cookies(const char *filename)
4285 {
4286 	FILE *fp;
4287 	char s[16*1024];
4288 
4289 	assert(filename);
4290 
4291 	fp = fopen(filename, "r");
4292 	if (!fp)
4293 		return;
4294 
4295 #define FIND_T {				\
4296 		for (; *p && *p != '\t'; p++)	\
4297 			; /* do nothing */	\
4298 		if (p >= end) {			\
4299 			NSLOG(netsurf, INFO, "Overran input");	\
4300 			continue;		\
4301 		}				\
4302 		*p++ = '\0';			\
4303 	}
4304 
4305 #define SKIP_T {				\
4306 		for (; *p && *p == '\t'; p++)	\
4307 			; /* do nothing */	\
4308 		if (p >= end) {			\
4309 			NSLOG(netsurf, INFO, "Overran input");	\
4310 			continue;		\
4311 		}				\
4312 	}
4313 
4314 	while (fgets(s, sizeof s, fp)) {
4315 		char *p = s, *end = 0,
4316 			*domain, *path, *name, *value, *scheme, *url,
4317 			*comment;
4318 		int version, domain_specified, path_specified,
4319 			secure, http_only, no_destroy, value_quoted;
4320 		time_t expires, last_used;
4321 		struct cookie_internal_data *c;
4322 
4323 		if(s[0] == 0 || s[0] == '#')
4324 			/* Skip blank lines or comments */
4325 			continue;
4326 
4327 		s[strlen(s) - 1] = '\0'; /* lose terminating newline */
4328 		end = s + strlen(s);
4329 
4330 		/* Look for file version first
4331 		 * (all input is ignored until this is read)
4332 		 */
4333 		if (strncasecmp(s, "Version:", 8) == 0) {
4334 			FIND_T; SKIP_T; loaded_cookie_file_version = atoi(p);
4335 
4336 			if (loaded_cookie_file_version <
4337 			    MIN_COOKIE_FILE_VERSION) {
4338 				NSLOG(netsurf, INFO,
4339 				      "Unsupported Cookie file version");
4340 				break;
4341 			}
4342 
4343 			continue;
4344 		} else if (loaded_cookie_file_version == 0) {
4345 			/* Haven't yet seen version; skip this input */
4346 			continue;
4347 		}
4348 
4349 		/* One cookie/line */
4350 
4351 		/* Parse input */
4352 		FIND_T; version = atoi(s);
4353 		SKIP_T; domain = p; FIND_T;
4354 		SKIP_T; domain_specified = atoi(p); FIND_T;
4355 		SKIP_T; path = p; FIND_T;
4356 		SKIP_T; path_specified = atoi(p); FIND_T;
4357 		SKIP_T; secure = atoi(p); FIND_T;
4358 		if (loaded_cookie_file_version > 101) {
4359 			/* Introduced in version 1.02 */
4360 			SKIP_T; http_only = atoi(p); FIND_T;
4361 		} else {
4362 			http_only = 0;
4363 		}
4364 		SKIP_T; expires = (time_t)atoi(p); FIND_T;
4365 		SKIP_T; last_used = (time_t)atoi(p); FIND_T;
4366 		SKIP_T; no_destroy = atoi(p); FIND_T;
4367 		SKIP_T; name = p; FIND_T;
4368 		SKIP_T; value = p; FIND_T;
4369 		if (loaded_cookie_file_version > 100) {
4370 			/* Introduced in version 1.01 */
4371 			SKIP_T;	value_quoted = atoi(p); FIND_T;
4372 		} else {
4373 			value_quoted = 0;
4374 		}
4375 		SKIP_T; scheme = p; FIND_T;
4376 		SKIP_T; url = p; FIND_T;
4377 
4378 		/* Comment may have no content, so don't
4379 		 * use macros as they'll break */
4380 		for (; *p && *p == '\t'; p++)
4381 			; /* do nothing */
4382 		comment = p;
4383 
4384 		assert(p <= end);
4385 
4386 		/* Now create cookie */
4387 		c = malloc(sizeof(struct cookie_internal_data));
4388 		if (!c)
4389 			break;
4390 
4391 		c->name = strdup(name);
4392 		c->value = strdup(value);
4393 		c->value_was_quoted = value_quoted;
4394 		c->comment = strdup(comment);
4395 		c->domain_from_set = domain_specified;
4396 		c->domain = strdup(domain);
4397 		c->path_from_set = path_specified;
4398 		c->path = strdup(path);
4399 		c->expires = expires;
4400 		c->last_used = last_used;
4401 		c->secure = secure;
4402 		c->http_only = http_only;
4403 		c->version = version;
4404 		c->no_destroy = no_destroy;
4405 
4406 		if (!(c->name && c->value && c->comment &&
4407 		      c->domain && c->path)) {
4408 			urldb_free_cookie(c);
4409 			break;
4410 		}
4411 
4412 		if (c->domain[0] != '.') {
4413 			lwc_string *scheme_lwc = NULL;
4414 			nsurl *url_nsurl = NULL;
4415 
4416 			assert(scheme[0] != 'u');
4417 
4418 			if (nsurl_create(url, &url_nsurl) != NSERROR_OK) {
4419 				urldb_free_cookie(c);
4420 				break;
4421 			}
4422 			scheme_lwc = nsurl_get_component(url_nsurl,
4423 							 NSURL_SCHEME);
4424 
4425 			/* And insert it into database */
4426 			if (!urldb_insert_cookie(c, scheme_lwc, url_nsurl)) {
4427 				/* Cookie freed for us */
4428 				nsurl_unref(url_nsurl);
4429 				lwc_string_unref(scheme_lwc);
4430 				break;
4431 			}
4432 			nsurl_unref(url_nsurl);
4433 			lwc_string_unref(scheme_lwc);
4434 
4435 		} else {
4436 			if (!urldb_insert_cookie(c, NULL, NULL)) {
4437 				/* Cookie freed for us */
4438 				break;
4439 			}
4440 		}
4441 	}
4442 
4443 #undef SKIP_T
4444 #undef FIND_T
4445 
4446 	fclose(fp);
4447 }
4448 
4449 
4450 /* exported interface documented in content/urldb.h */
urldb_save_cookies(const char * filename)4451 void urldb_save_cookies(const char *filename)
4452 {
4453 	FILE *fp;
4454 	int cookie_file_version = max(loaded_cookie_file_version,
4455 				      COOKIE_FILE_VERSION);
4456 
4457 	assert(filename);
4458 
4459 	fp = fopen(filename, "w");
4460 	if (!fp)
4461 		return;
4462 
4463 	fprintf(fp, "# NetSurf cookies file.\n"
4464 		"#\n"
4465 		"# Lines starting with a '#' are comments, "
4466 		"blank lines are ignored.\n"
4467 		"#\n"
4468 		"# All lines prior to \"Version:\t%d\" are discarded.\n"
4469 		"#\n"
4470 		"# Version\tDomain\tDomain from Set-Cookie\tPath\t"
4471 		"Path from Set-Cookie\tSecure\tHTTP-Only\tExpires\tLast used\t"
4472 		"No destroy\tName\tValue\tValue was quoted\tScheme\t"
4473 		"URL\tComment\n",
4474 		cookie_file_version);
4475 	fprintf(fp, "Version:\t%d\n", cookie_file_version);
4476 
4477 	urldb_save_cookie_hosts(fp, &db_root);
4478 
4479 	fclose(fp);
4480 }
4481 
4482 
4483 /* exported interface documented in netsurf/url_db.h */
urldb_dump(void)4484 void urldb_dump(void)
4485 {
4486 	int i;
4487 
4488 	urldb_dump_hosts(&db_root);
4489 
4490 	for (i = 0; i != NUM_SEARCH_TREES; i++) {
4491 		urldb_dump_search(search_trees[i], 0);
4492 	}
4493 }
4494 
4495 
4496 
4497 
4498