1 /*
2  * Copyright 2011 Michael Drake <tlsa@netsurf-browser.org>
3  *
4  * This file is part of NetSurf, http://www.netsurf-browser.org/
5  *
6  * NetSurf is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; version 2 of the License.
9  *
10  * NetSurf is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 /**
20  * \file
21  * NetSurf URL handling implementation.
22  *
23  * This is the common implementation of all URL handling within the
24  * browser. This implementation is based upon RFC3986 although this has
 * been superseded by https://url.spec.whatwg.org/ which is based on
26  * actual contemporary implementations.
27  *
28  * Care must be taken with character encodings within this module as
29  * the specifications work with specific ascii ranges and must not be
30  * affected by locale. Hence the c library character type functions
31  * are not used.
32  */
33 
34 #include <assert.h>
35 #include <libwapcaplet/libwapcaplet.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <strings.h>
39 
40 #include "netsurf/inttypes.h"
41 
42 #include "utils/ascii.h"
43 #include "utils/corestrings.h"
44 #include "utils/errors.h"
45 #include "utils/idna.h"
46 #include "utils/log.h"
47 #include "utils/nsurl.h"
48 #include "utils/nsurl/private.h"
49 #include "utils/utils.h"
50 
51 
/**
 * Marker set, indicating positions of sections within a URL string.
 *
 * Every size_t member is an offset from the first character of the URL
 * string, as filled in by nsurl__get_string_markers().
 */
struct url_markers {
	size_t start; /**< start of URL (after any leading whitespace) */
	size_t scheme_end; /**< ':' ending the scheme; equals start if none */
	size_t authority; /**< start of the authority section */

	size_t colon_first; /**< first ':' seen in authority; equals authority if none */
	size_t at; /**< first '@' seen in authority; equals authority if none */
	size_t colon_last; /**< latest ':' after the first; equals authority if none */

	size_t path; /**< end of authority / start of path */
	size_t query; /**< end of path; the '?' if a query is present */
	size_t fragment; /**< end of query; the '#' if a fragment is present */

	size_t end; /**< end of URL (before any trailing whitespace) */

	enum nsurl_scheme_type scheme_type; /**< scheme classification found while parsing */
};
70 
71 
/**
 * Sections of a URL
 *
 * Selects which span of a struct url_markers set is normalised by
 * nsurl__create_from_section().
 */
enum url_sections {
	URL_SCHEME,		/**< start to scheme_end */
	URL_CREDENTIALS,	/**< authority to at (username and password) */
	URL_HOST,		/**< at to path (host and port) */
	URL_PATH,		/**< path to query */
	URL_QUERY,		/**< query to fragment, without leading '?' */
	URL_FRAGMENT		/**< fragment to end, without leading '#' */
};
81 
82 
/**
 * Convert a four bit value into its upper case hexadecimal character.
 *
 * \param digit value to convert; must be less than 16.
 * \return the matching character, in the range 0-9 or A-F
 */
inline static char digit2uppercase_hex(unsigned char digit)
{
	assert(digit < 16);

	if (digit < 10) {
		/* Decimal digits map straight onto '0'..'9' */
		return (char)('0' + digit);
	}

	/* Remaining values, 10..15, map onto 'A'..'F' */
	return (char)('A' + (digit - 10));
}
93 
/**
 * Test whether a character belongs to the RFC3986 unreserved set.
 *
 * \param c character to classify.
 * \return true if the character is unreserved else false.
 */
static bool nsurl__is_unreserved(unsigned char c)
{
	/* From RFC3986 section 2.3 (unreserved characters)
	 *
	 *      unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
	 *
	 * Explicit ASCII codes are compared so that the classification
	 * can never be influenced by locale.
	 */
	if (c >= 0x30 && c <= 0x39) {
		return true; /* DIGIT: 0-9 */
	}

	if (c >= 0x41 && c <= 0x5A) {
		return true; /* ALPHA: A-Z */
	}

	if (c >= 0x61 && c <= 0x7A) {
		return true; /* ALPHA: a-z */
	}

	switch (c) {
	case 0x2D: /* "-" */
	case 0x2E: /* "." */
	case 0x5F: /* "_" */
	case 0x7E: /* "~" */
		return true;
	default:
		return false;
	}
}
143 
/**
 * Test whether a character may appear in a URL without percent escaping.
 *
 * Only printable ASCII characters can be left unescaped, and a handful
 * of those must still be escaped.
 *
 * \param c character to classify.
 * \return true if the character should not be escaped else false.
 */
static bool nsurl__is_no_escape(unsigned char c)
{
	/* Control characters, space, DEL and all bytes with the top bit
	 * set are always escaped. */
	if (c <= 0x20 || c >= 0x7F) {
		return false;
	}

	/* Printable characters which nevertheless need escaping */
	switch (c) {
	case 0x22: /* double quote */
	case 0x25: /* percent */
	case 0x3C: /* less than */
	case 0x3E: /* greater than */
	case 0x5C: /* backslash */
	case 0x5E: /* caret */
	case 0x60: /* backtick */
	case 0x7B: /* opening brace */
	case 0x7D: /* closing brace */
		return false;
	default:
		return true;
	}
}
190 
191 
192 /**
193  * Obtains a set of markers delimiting sections in a URL string
194  *
195  * \param url_s		URL string
196  * \param markers	Updated to mark sections in the URL string
197  * \param joining	True iff URL string is a relative URL for joining
198  */
nsurl__get_string_markers(const char * const url_s,struct url_markers * markers,bool joining)199 static void nsurl__get_string_markers(const char * const url_s,
200 		struct url_markers *markers, bool joining)
201 {
202 	const char *pos = url_s; /** current position in url_s */
203 	bool is_http = false;
204 	bool trailing_whitespace = false;
205 
206 	/* Initialise marker set */
207 	struct url_markers marker = { 0, 0, 0,   0, 0, 0,
208 				      0, 0, 0,   0, NSURL_SCHEME_OTHER };
209 
210 	/* Skip any leading whitespace in url_s */
211 	while (ascii_is_space(*pos))
212 		pos++;
213 
214 	/* Record start point */
215 	marker.start = pos - url_s;
216 
217 	marker.scheme_end = marker.authority = marker.colon_first = marker.at =
218 			marker.colon_last = marker.path = marker.start;
219 
220 	if (*pos == '\0') {
221 		/* Nothing but whitespace, early exit */
222 		marker.query = marker.fragment = marker.end = marker.path;
223 		*markers = marker;
224 		return;
225 	}
226 
227 	/* Get scheme */
228 	if (ascii_is_alpha(*pos)) {
229 		pos++;
230 
231 		while (*pos != ':' && *pos != '\0') {
232 			if (!ascii_is_alphanumerical(*pos) && (*pos != '+') &&
233 					(*pos != '-') && (*pos != '.')) {
234 				/* This character is not valid in the
235 				 * scheme */
236 				break;
237 			}
238 			pos++;
239 		}
240 
241 		if (*pos == ':') {
242 			/* This delimits the end of the scheme */
243 			size_t off;
244 
245 			marker.scheme_end = pos - url_s;
246 
247 			off = marker.scheme_end - marker.start;
248 
249 			/* Detect http(s) and mailto for scheme specifc
250 			 * normalisation */
251 			if (off == SLEN("http") &&
252 					(((*(pos - off + 0) == 'h') ||
253 					  (*(pos - off + 0) == 'H')) &&
254 					 ((*(pos - off + 1) == 't') ||
255 					  (*(pos - off + 1) == 'T')) &&
256 					 ((*(pos - off + 2) == 't') ||
257 					  (*(pos - off + 2) == 'T')) &&
258 					 ((*(pos - off + 3) == 'p') ||
259 					  (*(pos - off + 3) == 'P')))) {
260 				marker.scheme_type = NSURL_SCHEME_HTTP;
261 				is_http = true;
262 			} else if (off == SLEN("https") &&
263 					(((*(pos - off + 0) == 'h') ||
264 					  (*(pos - off + 0) == 'H')) &&
265 					 ((*(pos - off + 1) == 't') ||
266 					  (*(pos - off + 1) == 'T')) &&
267 					 ((*(pos - off + 2) == 't') ||
268 					  (*(pos - off + 2) == 'T')) &&
269 					 ((*(pos - off + 3) == 'p') ||
270 					  (*(pos - off + 3) == 'P')) &&
271 					 ((*(pos - off + 4) == 's') ||
272 					  (*(pos - off + 4) == 'S')))) {
273 				marker.scheme_type = NSURL_SCHEME_HTTPS;
274 				is_http = true;
275 			} else if (off == SLEN("file") &&
276 					(((*(pos - off + 0) == 'f') ||
277 					  (*(pos - off + 0) == 'F')) &&
278 					 ((*(pos - off + 1) == 'i') ||
279 					  (*(pos - off + 1) == 'I')) &&
280 					 ((*(pos - off + 2) == 'l') ||
281 					  (*(pos - off + 2) == 'L')) &&
282 					 ((*(pos - off + 3) == 'e') ||
283 					  (*(pos - off + 3) == 'E')))) {
284 				marker.scheme_type = NSURL_SCHEME_FILE;
285 			} else if (off == SLEN("ftp") &&
286 					(((*(pos - off + 0) == 'f') ||
287 					  (*(pos - off + 0) == 'F')) &&
288 					 ((*(pos - off + 1) == 't') ||
289 					  (*(pos - off + 1) == 'T')) &&
290 					 ((*(pos - off + 2) == 'p') ||
291 					  (*(pos - off + 2) == 'P')))) {
292 				marker.scheme_type = NSURL_SCHEME_FTP;
293 			} else if (off == SLEN("mailto") &&
294 					(((*(pos - off + 0) == 'm') ||
295 					  (*(pos - off + 0) == 'M')) &&
296 					 ((*(pos - off + 1) == 'a') ||
297 					  (*(pos - off + 1) == 'A')) &&
298 					 ((*(pos - off + 2) == 'i') ||
299 					  (*(pos - off + 2) == 'I')) &&
300 					 ((*(pos - off + 3) == 'l') ||
301 					  (*(pos - off + 3) == 'L')) &&
302 					 ((*(pos - off + 4) == 't') ||
303 					  (*(pos - off + 4) == 'T')) &&
304 					 ((*(pos - off + 5) == 'o') ||
305 					  (*(pos - off + 5) == 'O')))) {
306 				marker.scheme_type = NSURL_SCHEME_MAILTO;
307 			} else if (off == SLEN("data") &&
308 					(((*(pos - off + 0) == 'd') ||
309 					  (*(pos - off + 0) == 'D')) &&
310 					 ((*(pos - off + 1) == 'a') ||
311 					  (*(pos - off + 1) == 'A')) &&
312 					 ((*(pos - off + 2) == 't') ||
313 					  (*(pos - off + 2) == 'T')) &&
314 					 ((*(pos - off + 3) == 'a') ||
315 					  (*(pos - off + 3) == 'A')))) {
316 				marker.scheme_type = NSURL_SCHEME_DATA;
317 			}
318 
319 			/* Skip over colon */
320 			pos++;
321 
322 			/* Mark place as start of authority */
323 			marker.authority = marker.colon_first = marker.at =
324 					marker.colon_last = marker.path =
325 					pos - url_s;
326 
327 		} else {
328 			/* Not found a scheme  */
329 			if (joining == false) {
330 				/* Assuming no scheme == http */
331 				marker.scheme_type = NSURL_SCHEME_HTTP;
332 				is_http = true;
333 			}
334 		}
335 	}
336 
337 	/* Get authority
338 	 *
339 	 * Two slashes always indicates the start of an authority.
340 	 *
341 	 * We are more relaxed in the case of http:
342 	 *   a. when joining, one or more slashes indicates start of authority
343 	 *   b. when not joining, we assume authority if no scheme was present
344 	 * and in the case of mailto: when we assume there is an authority.
345 	 */
346 	if ((*pos == '/' && *(pos + 1) == '/') ||
347 			(is_http && ((joining && *pos == '/') ||
348 					(joining == false &&
349 					marker.scheme_end != marker.start))) ||
350 			marker.scheme_type == NSURL_SCHEME_MAILTO) {
351 
352 		/* Skip over leading slashes */
353 		if (*pos == '/') {
354 			if (is_http == false) {
355 				if (*pos == '/') pos++;
356 				if (*pos == '/') pos++;
357 			} else {
358 				while (*pos == '/')
359 					pos++;
360 			}
361 
362 			marker.authority = marker.colon_first = marker.at =
363 					marker.colon_last = marker.path =
364 					pos - url_s;
365 		}
366 
367 		/* Need to get (or complete) the authority */
368 		while (*pos != '\0') {
369 			if (*pos == '/' || *pos == '?' || *pos == '#') {
370 				/* End of the authority */
371 				break;
372 
373 			} else if (marker.scheme_type != NSURL_SCHEME_MAILTO &&
374 					*pos == ':' && marker.colon_first ==
375 					marker.authority) {
376 				/* could be username:password or host:port
377 				 * separator */
378 				marker.colon_first = pos - url_s;
379 
380 			} else if (marker.scheme_type != NSURL_SCHEME_MAILTO &&
381 					*pos == ':' && marker.colon_first !=
382 					marker.authority) {
383 				/* could be host:port separator */
384 				marker.colon_last = pos - url_s;
385 
386 			} else if (*pos == '@' && marker.at ==
387 					marker.authority) {
388 				/* Credentials @ host separator */
389 				marker.at = pos - url_s;
390 			}
391 
392 			pos++;
393 		}
394 
395 		marker.path = pos - url_s;
396 
397 	} else if ((*pos == '\0' || *pos == '/') &&
398 			joining == false && is_http == true) {
399 		marker.path = pos - url_s;
400 	}
401 
402 	/* Get path
403 	 *
404 	 * Needs to start with '/' if there's no authority
405 	 */
406 	if (*pos == '/' || ((marker.path == marker.authority) &&
407 			(*pos != '?') && (*pos != '#') && (*pos != '\0'))) {
408 		while (*(++pos) != '\0') {
409 			if (*pos == '?' || *pos == '#') {
410 				/* End of the path */
411 				break;
412 			}
413 		}
414 	}
415 
416 	marker.query = pos - url_s;
417 
418 	/* Get query */
419 	if (*pos == '?') {
420 		while (*(++pos) != '\0') {
421 			if (*pos == '#') {
422 				/* End of the query */
423 				break;
424 			}
425 		}
426 	}
427 
428 	marker.fragment = pos - url_s;
429 
430 	/* Get fragment */
431 	if (*pos == '#') {
432 		while (*(++pos) != '\0')
433 			;
434 	}
435 
436 	/* We got to the end of url_s.
437 	 * Need to skip back over trailing whitespace to find end of URL */
438 	pos--;
439 	if (pos >= url_s && ascii_is_space(*pos)) {
440 		trailing_whitespace = true;
441 		while (pos >= url_s && ascii_is_space(*pos))
442 			pos--;
443 	}
444 
445 	marker.end = pos + 1 - url_s;
446 
447 	if (trailing_whitespace == true) {
448 		/* Ensure last url section doesn't pass end */
449 		if (marker.fragment > marker.end)
450 			marker.fragment = marker.end;
451 		if (marker.query > marker.end)
452 			marker.query = marker.end;
453 		if (marker.path > marker.end)
454 			marker.path = marker.end;
455 		if (marker.colon_last > marker.end)
456 			marker.colon_last = marker.end;
457 		if (marker.at > marker.end)
458 			marker.at = marker.end;
459 		if (marker.colon_last > marker.end)
460 			marker.colon_last = marker.end;
461 		if (marker.fragment > marker.end)
462 			marker.fragment = marker.end;
463 	}
464 
465 	NSLOG(netsurf, DEEPDEBUG,
466 	      "marker.start: %"PRIsizet, marker.start);
467 	NSLOG(netsurf, DEEPDEBUG,
468 	      "marker.scheme_end: %"PRIsizet, marker.scheme_end);
469 	NSLOG(netsurf, DEEPDEBUG,
470 	      "marker.authority: %"PRIsizet, marker.authority);
471 
472 	NSLOG(netsurf, DEEPDEBUG,
473 	      "marker.colon_first: %"PRIsizet, marker.colon_first);
474 	NSLOG(netsurf, DEEPDEBUG,
475 	      "marker.at: %"PRIsizet, marker.at);
476 	NSLOG(netsurf, DEEPDEBUG,
477 	      "marker.colon_last: %"PRIsizet, marker.colon_last);
478 
479 	NSLOG(netsurf, DEEPDEBUG,
480 	      "marker.path: %"PRIsizet, marker.path);
481 	NSLOG(netsurf, DEEPDEBUG,
482 	      "marker.query: %"PRIsizet, marker.query);
483 	NSLOG(netsurf, DEEPDEBUG,
484 	      "marker.fragment: %"PRIsizet, marker.fragment);
485 
486 	NSLOG(netsurf, DEEPDEBUG,
487 	      "marker.end: %"PRIsizet, marker.end);
488 
489 	/* Got all the URL components pegged out now */
490 	*markers = marker;
491 }
492 
493 
/**
 * Remove dot segments from a path, as per rfc 3986, 5.2.4
 *
 * The output is not '\0' terminated; use the returned size.  No more
 * characters are written to \a output than are present in \a path, so an
 * output buffer as long as the input path is sufficient.
 *
 * \param path		path to remove dot segments from ('\0' terminated)
 * \param output	path with dot segments removed
 * \return size of output
 */
static size_t nsurl__remove_dot_segments(char *path, char *output)
{
	char *path_pos = path;
	char *output_pos = output;

	while (*path_pos != '\0') {
		NSLOG(netsurf, DEEPDEBUG, " in:%s", path_pos);
		NSLOG(netsurf, DEEPDEBUG, "out:%.*s",
				(int)(output_pos - output), output);

		if (*path_pos == '.') {
			/* All cases where the remaining input starts with a
			 * dot are handled here, so that the end of path
			 * checks for "." and ".." are reachable. */
			if (*(path_pos + 1) == '.' &&
					*(path_pos + 2) == '/') {
				/* Found prefix of "../" */
				path_pos += SLEN("../");
				continue;

			} else if (*(path_pos + 1) == '/') {
				/* Found prefix of "./" */
				path_pos += SLEN("./");
				continue;

			} else if (*(path_pos + 1) == '\0') {
				/* Found "." at end of path */

				/* End of input path */
				break;

			} else if (*(path_pos + 1) == '.' &&
					*(path_pos + 2) == '\0') {
				/* Found ".." at end of path */

				/* End of input path */
				break;
			}
			/* Otherwise an ordinary segment that merely starts
			 * with a dot; copied below. */

		} else if (*path_pos == '/' && *(path_pos + 1) == '.') {
			if (*(path_pos + 2) == '/') {
				/* Found prefix of "/./" */
				path_pos += SLEN("/.");
				continue;

			} else if (*(path_pos + 2) == '\0') {
				/* Found "/." at end of path */
				*(output_pos++) = '/';

				/* End of input path */
				break;

			} else if (*(path_pos + 2) == '.') {
				if (*(path_pos + 3) == '/') {
					/* Found prefix of "/../" */
					path_pos += SLEN("/..");

					/* Drop the last output segment,
					 * including its leading '/' */
					if (output_pos > output)
						output_pos--;
					while (output_pos > output &&
							*output_pos != '/')
						output_pos--;

					continue;

				} else if (*(path_pos + 3) == '\0') {
					/* Found "/.." at end of path */

					/* Drop the last output segment,
					 * keeping its leading '/' */
					while (output_pos > output &&
							*(output_pos - 1) != '/')
						output_pos--;

					/* End of input path */
					break;
				}
			}
		}

		/* Copy first character into output path */
		*output_pos++ = *path_pos++;

		/* Copy up to but not including next '/' */
		while ((*path_pos != '/') && (*path_pos != '\0'))
			*output_pos++ = *path_pos++;
	}

	return output_pos - output;
}
585 
586 
587 /**
588  * Get the length of the longest section
589  *
590  * \param m	markers delimiting url sections in a string
591  * \return the length of the longest section
592  */
nsurl__get_longest_section(struct url_markers * m)593 static size_t nsurl__get_longest_section(struct url_markers *m)
594 {
595 	size_t length = m->scheme_end - m->start;	/* scheme */
596 
597 	if (length < m->at - m->authority)		/* credentials */
598 		length = m->at - m->authority;
599 
600 	if (length < m->path - m->at)			/* host */
601 		length = m->path - m->at;
602 
603 	if (length < m->query - m->path)		/* path */
604 		length = m->query - m->path;
605 
606 	if (length < m->fragment - m->query)		/* query */
607 		length = m->fragment - m->query;
608 
609 	if (length < m->end - m->fragment)		/* fragment */
610 		length = m->end - m->fragment;
611 
612 	return length;
613 }
614 
615 
616 /**
617  * Create the components of a NetSurf URL object for a section of a URL string
618  *
619  * \param url_s		URL string
620  * \param section	Sets which section of URL string is to be normalised
621  * \param pegs		Set of markers delimiting the URL string's sections
622  * \param pos_norm	A buffer large enough for the normalised string (*3 + 1)
623  * \param url		A NetSurf URL object, to which components may be added
624  * \return NSERROR_OK on success, appropriate error otherwise
625  *
626  * The section of url_s is normalised appropriately.
627  */
nsurl__create_from_section(const char * const url_s,const enum url_sections section,const struct url_markers * pegs,char * pos_norm,struct nsurl_components * url)628 static nserror nsurl__create_from_section(const char * const url_s,
629 		const enum url_sections section,
630 		const struct url_markers *pegs,
631 		char *pos_norm,
632 		struct nsurl_components *url)
633 {
634 	nserror ret;
635 	int ascii_offset;
636 	int start = 0;
637 	int end = 0;
638 	const char *pos;
639 	const char *pos_url_s;
640 	char *norm_start = pos_norm;
641 	char *host;
642 	size_t copy_len;
643 	size_t length;
644 	size_t host_len;
645 	enum {
646 		NSURL_F_NO_PORT		= (1 << 0)
647 	} flags = 0;
648 
649 	switch (section) {
650 	case URL_SCHEME:
651 		start = pegs->start;
652 		end = pegs->scheme_end;
653 		break;
654 
655 	case URL_CREDENTIALS:
656 		start = pegs->authority;
657 		end = pegs->at;
658 		break;
659 
660 	case URL_HOST:
661 		start = (pegs->at == pegs->authority &&
662 				*(url_s + pegs->at) != '@') ?
663 				pegs->at :
664 				pegs->at + 1;
665 		end = pegs->path;
666 		break;
667 
668 	case URL_PATH:
669 		start = pegs->path;
670 		end = pegs->query;
671 		break;
672 
673 	case URL_QUERY:
674 		start = (*(url_s + pegs->query) != '?') ?
675 				pegs->query :
676 				pegs->query + 1;
677 		end = pegs->fragment;
678 		break;
679 
680 	case URL_FRAGMENT:
681 		start = (*(url_s + pegs->fragment) != '#') ?
682 				pegs->fragment :
683 				pegs->fragment + 1;
684 		end = pegs->end;
685 		break;
686 	}
687 
688 	if (end < start)
689 		end = start;
690 
691 	length = end - start;
692 
693 	/* Stage 1: Normalise the required section */
694 
695 	pos = pos_url_s = url_s + start;
696 	copy_len = 0;
697 	for (; pos < url_s + end; pos++) {
698 		if (*pos == '%' && (pos + 2 < url_s + end)) {
699 			/* Might be an escaped character needing unescaped */
700 
701 			/* Find which character which was escaped */
702 			ascii_offset = ascii_hex_to_value_2_chars(*(pos + 1),
703 					*(pos + 2));
704 
705 			if (ascii_offset < 0) {
706 				/* % with invalid hex digits. */
707 				copy_len++;
708 				continue;
709 			}
710 
711 			if ((section != URL_SCHEME && section != URL_HOST) &&
712 				(nsurl__is_unreserved(ascii_offset) == false)) {
713 				/* This character should be escaped after all,
714 				 * just let it get copied */
715 				copy_len += 3;
716 				pos += 2;
717 				continue;
718 			}
719 
720 			if (copy_len > 0) {
721 				/* Copy up to here */
722 				memcpy(pos_norm, pos_url_s, copy_len);
723 				pos_norm += copy_len;
724 				copy_len = 0;
725 			}
726 
727 			/* Put the unescaped character in the normalised URL */
728 			*(pos_norm++) = (char)ascii_offset;
729 			pos += 2;
730 			pos_url_s = pos + 1;
731 
732 			length -= 2;
733 
734 		} else if ((section != URL_SCHEME && section != URL_HOST) &&
735 				(nsurl__is_no_escape(*pos) == false)) {
736 
737 			/* This needs to be escaped */
738 			if (copy_len > 0) {
739 				/* Copy up to here */
740 				memcpy(pos_norm, pos_url_s, copy_len);
741 				pos_norm += copy_len;
742 				copy_len = 0;
743 			}
744 
745 			/* escape */
746 			*(pos_norm++) = '%';
747 			*(pos_norm++) = digit2uppercase_hex(
748 					((unsigned char)*pos) >> 4);
749 			*(pos_norm++) = digit2uppercase_hex(
750 					((unsigned char)*pos) & 0xf);
751 			pos_url_s = pos + 1;
752 
753 			length += 2;
754 
755 		} else if ((section == URL_SCHEME || section == URL_HOST) &&
756 				ascii_is_alpha_upper(*pos)) {
757 			/* Lower case this letter */
758 
759 			if (copy_len > 0) {
760 				/* Copy up to here */
761 				memcpy(pos_norm, pos_url_s, copy_len);
762 				pos_norm += copy_len;
763 				copy_len = 0;
764 			}
765 			/* Copy lower cased letter into normalised URL */
766 			*(pos_norm++) = ascii_to_lower(*pos);
767 			pos_url_s = pos + 1;
768 
769 		} else {
770 			/* This character is safe in normalised URL */
771 			copy_len++;
772 		}
773 	}
774 
775 	if (copy_len > 0) {
776 		/* Copy up to here */
777 		memcpy(pos_norm, pos_url_s, copy_len);
778 		pos_norm += copy_len;
779 	}
780 
781 	/* Mark end of section */
782 	(*pos_norm) = '\0';
783 
784 	/* Stage 2: Create the URL components for the required section */
785 	switch (section) {
786 	case URL_SCHEME:
787 		if (length == 0) {
788 			/* No scheme, assuming http */
789 			url->scheme = lwc_string_ref(corestring_lwc_http);
790 		} else {
791 			/* Add scheme to URL */
792 			if (lwc_intern_string(norm_start, length,
793 					&url->scheme) != lwc_error_ok) {
794 				return NSERROR_NOMEM;
795 			}
796 		}
797 
798 		break;
799 
800 	case URL_CREDENTIALS:
801 		url->username = NULL;
802 		url->password = NULL;
803 
804 		/* file: URLs don't have credentials */
805 		if (url->scheme_type == NSURL_SCHEME_FILE) {
806 			break;
807 		}
808 
809 		if (length != 0 && *norm_start != ':') {
810 			char *sec_start = norm_start;
811 			if (pegs->colon_first != pegs->authority &&
812 					pegs->at > pegs->colon_first + 1) {
813 				/* there's a password */
814 				sec_start += pegs->colon_first -
815 						pegs->authority + 1;
816 				if (lwc_intern_string(sec_start,
817 						pegs->at - pegs->colon_first -1,
818 						&url->password) !=
819 						lwc_error_ok) {
820 					return NSERROR_NOMEM;
821 				}
822 
823 				/* update start pos and length for username */
824 				sec_start = norm_start;
825 				length -= pegs->at - pegs->colon_first;
826 			} else if (pegs->colon_first != pegs->authority &&
827 					pegs->at == pegs->colon_first + 1) {
828 				/* strip username colon */
829 				length--;
830 			}
831 
832 			/* Username */
833 			if (lwc_intern_string(sec_start, length,
834 					&url->username) != lwc_error_ok) {
835 				return NSERROR_NOMEM;
836 			}
837 		}
838 
839 		break;
840 
841 	case URL_HOST:
842 		url->host = NULL;
843 		url->port = NULL;
844 
845 		/* file: URLs don't have a host */
846 		if (url->scheme_type == NSURL_SCHEME_FILE) {
847 			break;
848 		}
849 
850 		if (length != 0) {
851 			size_t colon = 0;
852 			char *sec_start = norm_start;
853 			if (pegs->at < pegs->colon_first &&
854 					pegs->colon_last == pegs->authority) {
855 				/* There's one colon and it's after @ marker */
856 				colon = pegs->colon_first;
857 			} else if (pegs->colon_last != pegs->authority) {
858 				/* There's more than one colon */
859 				colon = pegs->colon_last;
860 			} else {
861 				/* There's no colon that could be a port
862 				 * separator */
863 				flags |= NSURL_F_NO_PORT;
864 			}
865 
866 			if (!(flags & NSURL_F_NO_PORT)) {
867 				/* Determine whether colon is a port separator
868 				 */
869 				sec_start += colon - pegs->at;
870 				while (++sec_start < norm_start + length) {
871 					if (!ascii_is_digit(*sec_start)) {
872 						/* Character after port isn't a
873 						 * digit; not a port separator
874 						 */
875 						flags |= NSURL_F_NO_PORT;
876 						break;
877 					}
878 				}
879 			}
880 
881 			if (!(flags & NSURL_F_NO_PORT)) {
882 				/* There's a port */
883 				size_t skip = (pegs->at == pegs->authority) ?
884 						1 : 0;
885 				sec_start = norm_start + colon - pegs->at +
886 						skip;
887 				if (url->scheme != NULL &&
888 						url->scheme_type ==
889 						NSURL_SCHEME_HTTP &&
890 						length -
891 						(colon - pegs->at + skip) == 2 &&
892 						*sec_start == '8' &&
893 						*(sec_start + 1) == '0') {
894 					/* Scheme is http, and port is default
895 					 * (80) */
896 					flags |= NSURL_F_NO_PORT;
897 				}
898 
899 				if (length <= (colon - pegs->at + skip)) {
900 					/* No space for a port after the colon
901 					 */
902 					flags |= NSURL_F_NO_PORT;
903 				}
904 
905 				/* Add non-redundant ports to NetSurf URL */
906 				sec_start = norm_start + colon - pegs->at +
907 						skip;
908 				if (!(flags & NSURL_F_NO_PORT) &&
909 						lwc_intern_string(sec_start,
910 						length -
911 						(colon - pegs->at + skip),
912 						&url->port) != lwc_error_ok) {
913 					return NSERROR_NOMEM;
914 				}
915 
916 				/* update length for host */
917 				skip = (pegs->at == pegs->authority) ? 0 : 1;
918 				length = colon - pegs->at - skip;
919 			}
920 
921 			/* host */
922 			/* Encode host according to IDNA2008 */
923 			ret = idna_encode(norm_start, length, &host, &host_len);
924 			if (ret == NSERROR_OK) {
925 				/* valid idna encoding */
926 				if (lwc_intern_string(host, host_len,
927 						&url->host) != lwc_error_ok) {
928 					return NSERROR_NOMEM;
929 				}
930 				free(host);
931 			} else {
932 				/* fall back to straight interning */
933 				if (lwc_intern_string(norm_start, length,
934 						      &url->host) != lwc_error_ok) {
935 					return NSERROR_NOMEM;
936 				}
937 			}
938 		}
939 
940 		break;
941 
942 	case URL_PATH:
943 		if (length != 0) {
944 			if (lwc_intern_string(norm_start, length,
945 					&url->path) != lwc_error_ok) {
946 				return NSERROR_NOMEM;
947 			}
948 		} else if ((url->host != NULL &&
949 				url->scheme_type != NSURL_SCHEME_MAILTO) ||
950 				url->scheme_type == NSURL_SCHEME_FILE) {
951 			/* Set empty path to "/" if:
952 			 *   - there's a host and its not a mailto: URL
953 			 *   - its a file: URL
954 			 */
955 			if (lwc_intern_string("/", SLEN("/"),
956 					&url->path) != lwc_error_ok) {
957 				return NSERROR_NOMEM;
958 			}
959 		} else {
960 			url->path = NULL;
961 		}
962 
963 		break;
964 
965 	case URL_QUERY:
966 		if (length != 0) {
967 			if (lwc_intern_string(norm_start, length,
968 					&url->query) != lwc_error_ok) {
969 				return NSERROR_NOMEM;
970 			}
971 		} else {
972 			url->query = NULL;
973 		}
974 
975 		break;
976 
977 	case URL_FRAGMENT:
978 		if (length != 0) {
979 			if (lwc_intern_string(norm_start, length,
980 					&url->fragment) != lwc_error_ok) {
981 				return NSERROR_NOMEM;
982 			}
983 		} else {
984 			url->fragment = NULL;
985 		}
986 
987 		break;
988 	}
989 
990 	return NSERROR_OK;
991 }
992 
993 
994 /**
995  * Get nsurl string info; total length, component lengths, & components present
996  *
997  * \param url		NetSurf URL components
998  * \param parts		Which parts of the URL are required in the string
999  * \param url_l		Updated to total string length
1000  * \param lengths	Updated with individual component lengths
1001  * \param pflags	Updated to contain relevant string flags
1002  */
nsurl__get_string_data(const struct nsurl_components * url,nsurl_component parts,size_t * url_l,struct nsurl_component_lengths * lengths,enum nsurl_string_flags * pflags)1003 static void nsurl__get_string_data(const struct nsurl_components *url,
1004 		nsurl_component parts, size_t *url_l,
1005 		struct nsurl_component_lengths *lengths,
1006 		enum nsurl_string_flags *pflags)
1007 {
1008 	enum nsurl_string_flags flags = *pflags;
1009 	*url_l = 0;
1010 
1011 	/* Intersection of required parts and available parts gives
1012 	 * the output parts */
1013 	if (url->scheme && parts & NSURL_SCHEME) {
1014 		flags |= NSURL_F_SCHEME;
1015 
1016 		lengths->scheme = lwc_string_length(url->scheme);
1017 		*url_l += lengths->scheme;
1018 	}
1019 
1020 	if (url->username && parts & NSURL_USERNAME) {
1021 		flags |= NSURL_F_USERNAME;
1022 
1023 		lengths->username = lwc_string_length(url->username);
1024 		*url_l += lengths->username;
1025 	}
1026 
1027 	if (url->password && parts & NSURL_PASSWORD) {
1028 		flags |= NSURL_F_PASSWORD;
1029 
1030 		lengths->password = lwc_string_length(url->password);
1031 		*url_l += SLEN(":") + lengths->password;
1032 	}
1033 
1034 	if (url->host && parts & NSURL_HOST) {
1035 		flags |= NSURL_F_HOST;
1036 
1037 		lengths->host = lwc_string_length(url->host);
1038 		*url_l += lengths->host;
1039 	}
1040 
1041 	if (url->port && parts & NSURL_PORT) {
1042 		flags |= NSURL_F_PORT;
1043 
1044 		lengths->port = lwc_string_length(url->port);
1045 		*url_l += SLEN(":") + lengths->port;
1046 	}
1047 
1048 	if (url->path && parts & NSURL_PATH) {
1049 		flags |= NSURL_F_PATH;
1050 
1051 		lengths->path = lwc_string_length(url->path);
1052 		*url_l += lengths->path;
1053 	}
1054 
1055 	if (url->query && parts & NSURL_QUERY) {
1056 		flags |= NSURL_F_QUERY;
1057 
1058 		lengths->query = lwc_string_length(url->query);
1059 		*url_l += lengths->query;
1060 	}
1061 
1062 	if (url->fragment && parts & NSURL_FRAGMENT) {
1063 		flags |= NSURL_F_FRAGMENT;
1064 
1065 		lengths->fragment = lwc_string_length(url->fragment);
1066 		*url_l += lengths->fragment;
1067 	}
1068 
1069 	/* Turn on any spanned punctuation */
1070 	if ((flags & NSURL_F_SCHEME) && (parts > NSURL_SCHEME)) {
1071 		flags |= NSURL_F_SCHEME_PUNCTUATION;
1072 
1073 		*url_l += SLEN(":");
1074 	}
1075 
1076 	if ((flags & NSURL_F_SCHEME) && (flags > NSURL_F_SCHEME) &&
1077 			url->path && lwc_string_data(url->path)[0] == '/') {
1078 		flags |= NSURL_F_AUTHORITY_PUNCTUATION;
1079 
1080 		*url_l += SLEN("//");
1081 	}
1082 
1083 	if ((flags & (NSURL_F_USERNAME | NSURL_F_PASSWORD)) &&
1084 				flags & NSURL_F_HOST) {
1085 		flags |= NSURL_F_CREDENTIALS_PUNCTUATION;
1086 
1087 		*url_l += SLEN("@");
1088 	}
1089 
1090 	/* spanned query question mark */
1091 	if ((flags & ~(NSURL_F_QUERY | NSURL_F_FRAGMENT)) &&
1092 	    (flags & NSURL_F_QUERY)) {
1093 		flags |= NSURL_F_QUERY_PUNCTUATION;
1094 
1095 		*url_l += SLEN("?");
1096 	}
1097 
1098 	/* spanned fragment hash mark */
1099 	if ((flags & ~NSURL_F_FRAGMENT) && (flags & NSURL_F_FRAGMENT)) {
1100 		flags |= NSURL_F_FRAGMENT_PUNCTUATION;
1101 
1102 		*url_l += SLEN("#");
1103 	}
1104 
1105 	*pflags = flags;
1106 }
1107 
1108 
1109 /**
1110  * Copy url string into provided buffer
1111  *
1112  * \param url		NetSurf URL components
1113  * \param url_s		Updated to contain the string
1114  * \param l		Individual component lengths
1115  * \param flags		String flags
1116  */
nsurl__get_string(const struct nsurl_components * url,char * url_s,struct nsurl_component_lengths * l,enum nsurl_string_flags flags)1117 static void nsurl__get_string(const struct nsurl_components *url, char *url_s,
1118 		struct nsurl_component_lengths *l,
1119 		enum nsurl_string_flags flags)
1120 {
1121 	char *pos;
1122 
1123 	/* Copy the required parts into the url string */
1124 	pos = url_s;
1125 
1126 	if (flags & NSURL_F_SCHEME) {
1127 		memcpy(pos, lwc_string_data(url->scheme), l->scheme);
1128 		pos += l->scheme;
1129 	}
1130 
1131 	if (flags & NSURL_F_SCHEME_PUNCTUATION) {
1132 		*(pos++) = ':';
1133 	}
1134 
1135 	if (flags & NSURL_F_AUTHORITY_PUNCTUATION) {
1136 		*(pos++) = '/';
1137 		*(pos++) = '/';
1138 	}
1139 
1140 	if (flags & NSURL_F_USERNAME) {
1141 		memcpy(pos, lwc_string_data(url->username), l->username);
1142 		pos += l->username;
1143 	}
1144 
1145 	if (flags & NSURL_F_PASSWORD) {
1146 		*(pos++) = ':';
1147 		memcpy(pos, lwc_string_data(url->password), l->password);
1148 		pos += l->password;
1149 	}
1150 
1151 	if (flags & NSURL_F_CREDENTIALS_PUNCTUATION) {
1152 		*(pos++) = '@';
1153 	}
1154 
1155 	if (flags & NSURL_F_HOST) {
1156 		memcpy(pos, lwc_string_data(url->host), l->host);
1157 		pos += l->host;
1158 	}
1159 
1160 	if (flags & NSURL_F_PORT) {
1161 		*(pos++) = ':';
1162 		memcpy(pos, lwc_string_data(url->port), l->port);
1163 		pos += l->port;
1164 	}
1165 
1166 	if (flags & NSURL_F_PATH) {
1167 		memcpy(pos, lwc_string_data(url->path), l->path);
1168 		pos += l->path;
1169 	}
1170 
1171 	if (flags & NSURL_F_QUERY) {
1172 		if (flags & NSURL_F_QUERY_PUNCTUATION)
1173 			*(pos++) = '?';
1174 		memcpy(pos, lwc_string_data(url->query), l->query);
1175 		pos += l->query;
1176 	}
1177 
1178 	if (flags & NSURL_F_FRAGMENT) {
1179 		if (flags & NSURL_F_FRAGMENT_PUNCTUATION)
1180 			*(pos++) = '#';
1181 		memcpy(pos, lwc_string_data(url->fragment), l->fragment);
1182 		pos += l->fragment;
1183 	}
1184 
1185 	*pos = '\0';
1186 }
1187 
1188 
1189 /* exported interface, documented in nsurl.h */
nsurl__components_to_string(const struct nsurl_components * components,nsurl_component parts,size_t pre_padding,char ** url_s_out,size_t * url_l_out)1190 nserror nsurl__components_to_string(
1191 		const struct nsurl_components *components,
1192 		nsurl_component parts, size_t pre_padding,
1193 		char **url_s_out, size_t *url_l_out)
1194 {
1195 	struct nsurl_component_lengths str_len = { 0, 0, 0, 0,  0, 0, 0, 0 };
1196 	enum nsurl_string_flags str_flags = 0;
1197 	size_t url_l;
1198 	char *url_s;
1199 
1200 	assert(components != NULL);
1201 
1202 	/* Get the string length and find which parts of url need copied */
1203 	nsurl__get_string_data(components, parts, &url_l,
1204 			&str_len, &str_flags);
1205 
1206 	if (url_l == 0) {
1207 		return NSERROR_BAD_URL;
1208 	}
1209 
1210 	/* Allocate memory for url string */
1211 	url_s = malloc(pre_padding + url_l + 1); /* adding 1 for '\0' */
1212 	if (url_s == NULL) {
1213 		return NSERROR_NOMEM;
1214 	}
1215 
1216 	/* Copy the required parts into the url string */
1217 	nsurl__get_string(components, url_s + pre_padding, &str_len, str_flags);
1218 
1219 	*url_s_out = url_s;
1220 	*url_l_out = url_l;
1221 
1222 	return NSERROR_OK;
1223 }
1224 
1225 
1226 /**
1227  * Calculate hash value
1228  *
1229  * \param url		NetSurf URL object to set hash value for
1230  */
nsurl__calc_hash(nsurl * url)1231 void nsurl__calc_hash(nsurl *url)
1232 {
1233 	uint32_t hash = 0;
1234 
1235 	if (url->components.scheme)
1236 		hash ^= lwc_string_hash_value(url->components.scheme);
1237 
1238 	if (url->components.username)
1239 		hash ^= lwc_string_hash_value(url->components.username);
1240 
1241 	if (url->components.password)
1242 		hash ^= lwc_string_hash_value(url->components.password);
1243 
1244 	if (url->components.host)
1245 		hash ^= lwc_string_hash_value(url->components.host);
1246 
1247 	if (url->components.port)
1248 		hash ^= lwc_string_hash_value(url->components.port);
1249 
1250 	if (url->components.path)
1251 		hash ^= lwc_string_hash_value(url->components.path);
1252 
1253 	if (url->components.query)
1254 		hash ^= lwc_string_hash_value(url->components.query);
1255 
1256 	url->hash = hash;
1257 }
1258 
1259 
1260 /******************************************************************************
1261  * NetSurf URL Public API                                                     *
1262  ******************************************************************************/
1263 
1264 /* exported interface, documented in nsurl.h */
nsurl_create(const char * const url_s,nsurl ** url)1265 nserror nsurl_create(const char * const url_s, nsurl **url)
1266 {
1267 	struct url_markers m;
1268 	struct nsurl_components c;
1269 	size_t length;
1270 	char *buff;
1271 	nserror e = NSERROR_OK;
1272 	bool match;
1273 
1274 	assert(url_s != NULL);
1275 
1276 	/* Peg out the URL sections */
1277 	nsurl__get_string_markers(url_s, &m, false);
1278 
1279 	/* Get the length of the longest section */
1280 	length = nsurl__get_longest_section(&m);
1281 
1282 	/* Allocate enough memory to url escape the longest section */
1283 	buff = malloc(length * 3 + 1);
1284 	if (buff == NULL)
1285 		return NSERROR_NOMEM;
1286 
1287 	/* Set scheme type */
1288 	c.scheme_type = m.scheme_type;
1289 
1290 	/* Build NetSurf URL object from sections */
1291 	e |= nsurl__create_from_section(url_s, URL_SCHEME, &m, buff, &c);
1292 	e |= nsurl__create_from_section(url_s, URL_CREDENTIALS, &m, buff, &c);
1293 	e |= nsurl__create_from_section(url_s, URL_HOST, &m, buff, &c);
1294 	e |= nsurl__create_from_section(url_s, URL_PATH, &m, buff, &c);
1295 	e |= nsurl__create_from_section(url_s, URL_QUERY, &m, buff, &c);
1296 	e |= nsurl__create_from_section(url_s, URL_FRAGMENT, &m, buff, &c);
1297 
1298 	/* Finished with buffer */
1299 	free(buff);
1300 
1301 	if (e != NSERROR_OK) {
1302 		nsurl__components_destroy(&c);
1303 		return NSERROR_NOMEM;
1304 	}
1305 
1306 	/* Validate URL */
1307 	if ((lwc_string_isequal(c.scheme, corestring_lwc_http,
1308 			&match) == lwc_error_ok && match == true) ||
1309 			(lwc_string_isequal(c.scheme, corestring_lwc_https,
1310 			&match) == lwc_error_ok && match == true)) {
1311 		/* http, https must have host */
1312 		if (c.host == NULL) {
1313 			nsurl__components_destroy(&c);
1314 			return NSERROR_BAD_URL;
1315 		}
1316 	}
1317 
1318 	e = nsurl__components_to_string(&c, NSURL_WITH_FRAGMENT,
1319 			offsetof(nsurl, string), (char **)url, &length);
1320 	if (e != NSERROR_OK) {
1321 		return e;
1322 	}
1323 
1324 	(*url)->components = c;
1325 	(*url)->length = length;
1326 
1327 	/* Get the nsurl's hash */
1328 	nsurl__calc_hash(*url);
1329 
1330 	/* Give the URL a reference */
1331 	(*url)->count = 1;
1332 
1333 	return NSERROR_OK;
1334 }
1335 
1336 
1337 /* exported interface, documented in nsurl.h */
nsurl_join(const nsurl * base,const char * rel,nsurl ** joined)1338 nserror nsurl_join(const nsurl *base, const char *rel, nsurl **joined)
1339 {
1340 	struct url_markers m;
1341 	struct nsurl_components c;
1342 	size_t length;
1343 	char *buff;
1344 	char *buff_pos;
1345 	char *buff_start;
1346 	nserror error = 0;
1347 	enum {
1348 		NSURL_F_REL		=  0,
1349 		NSURL_F_BASE_SCHEME	= (1 << 0),
1350 		NSURL_F_BASE_AUTHORITY	= (1 << 1),
1351 		NSURL_F_BASE_PATH	= (1 << 2),
1352 		NSURL_F_MERGED_PATH	= (1 << 3),
1353 		NSURL_F_BASE_QUERY	= (1 << 4)
1354 	} joined_parts;
1355 
1356 	assert(base != NULL);
1357 	assert(rel != NULL);
1358 
1359 	NSLOG(netsurf, DEEPDEBUG, "base: \"%s\", rel: \"%s\"",
1360 			nsurl_access(base), rel);
1361 
1362 	/* Peg out the URL sections */
1363 	nsurl__get_string_markers(rel, &m, true);
1364 
1365 	/* Get the length of the longest section */
1366 	length = nsurl__get_longest_section(&m);
1367 
1368 	/* Initially assume that the joined URL can be formed entierly from
1369 	 * the relative URL.
1370 	 */
1371 	joined_parts = NSURL_F_REL;
1372 
1373 	/* Update joined_compnents to indicate any required parts from the
1374 	 * base URL.
1375 	 */
1376 	if (m.scheme_end - m.start <= 0) {
1377 		/* The relative url has no scheme.
1378 		 * Use base URL's scheme. */
1379 		joined_parts |= NSURL_F_BASE_SCHEME;
1380 
1381 		if (m.path - m.authority <= 0) {
1382 			/* The relative URL has no authority.
1383 			 * Use base URL's authority. */
1384 			joined_parts |= NSURL_F_BASE_AUTHORITY;
1385 
1386 			if (m.query - m.path <= 0) {
1387 				/* The relative URL has no path.
1388 				 * Use base URL's path. */
1389 				joined_parts |= NSURL_F_BASE_PATH;
1390 
1391 				if (m.fragment - m.query <= 0) {
1392 					/* The relative URL has no query.
1393 					 * Use base URL's query. */
1394 					joined_parts |= NSURL_F_BASE_QUERY;
1395 				}
1396 
1397 			} else if (*(rel + m.path) != '/') {
1398 				/* Relative URL has relative path */
1399 				joined_parts |= NSURL_F_MERGED_PATH;
1400 			}
1401 		}
1402 	}
1403 
1404 	/* Allocate enough memory to url escape the longest section, plus
1405 	 * space for path merging (if required).
1406 	 */
1407 	if (joined_parts & NSURL_F_MERGED_PATH) {
1408 		/* Need to merge paths */
1409 		length += (base->components.path != NULL) ?
1410 				lwc_string_length(base->components.path) : 0;
1411 	}
1412 	length *= 4;
1413 	/* Plus space for removing dots from path */
1414 	length += (m.query - m.path) + ((base->components.path != NULL) ?
1415 			lwc_string_length(base->components.path) : 0);
1416 
1417 	buff = malloc(length + 5);
1418 	if (buff == NULL) {
1419 		return NSERROR_NOMEM;
1420 	}
1421 
1422 	buff_pos = buff;
1423 
1424 	/* Form joined URL from base or rel components, as appropriate */
1425 
1426 	if (joined_parts & NSURL_F_BASE_SCHEME) {
1427 		c.scheme_type = base->components.scheme_type;
1428 
1429 		c.scheme = nsurl__component_copy(base->components.scheme);
1430 	} else {
1431 		c.scheme_type = m.scheme_type;
1432 
1433 		error = nsurl__create_from_section(rel, URL_SCHEME, &m,	buff, &c);
1434 		if (error != NSERROR_OK) {
1435 			free(buff);
1436 			return error;
1437 		}
1438 	}
1439 
1440 	if (joined_parts & NSURL_F_BASE_AUTHORITY) {
1441 		c.username = nsurl__component_copy(base->components.username);
1442 		c.password = nsurl__component_copy(base->components.password);
1443 		c.host = nsurl__component_copy(base->components.host);
1444 		c.port = nsurl__component_copy(base->components.port);
1445 	} else {
1446 		error = nsurl__create_from_section(rel, URL_CREDENTIALS, &m,
1447 						   buff, &c);
1448 		if (error == NSERROR_OK) {
1449 			error = nsurl__create_from_section(rel, URL_HOST, &m,
1450 							   buff, &c);
1451 		}
1452 		if (error != NSERROR_OK) {
1453 			free(buff);
1454 			return error;
1455 		}
1456 	}
1457 
1458 	if (joined_parts & NSURL_F_BASE_PATH) {
1459 		c.path = nsurl__component_copy(base->components.path);
1460 
1461 	} else if (joined_parts & NSURL_F_MERGED_PATH) {
1462 		struct url_markers m_path;
1463 		size_t new_length;
1464 
1465 		/* RFC3986 said to append relative path to "/" if the
1466 		 * base path had no path and an authority.
1467 		 *
1468 		 * However, that specification is redundant, and base paths
1469 		 * are normalised, so file, http, and https URLs will always
1470 		 * have a non-empty path.  (Empty paths become "/".)
1471 		 */
1472 
1473 		{
1474 			/* Append relative path to all but last segment of
1475 			 * base path. */
1476 			size_t path_end = lwc_string_length(
1477 					base->components.path);
1478 			const char *path = lwc_string_data(
1479 					base->components.path);
1480 
1481 			while (*(path + path_end) != '/' &&
1482 					path_end != 0) {
1483 				path_end--;
1484 			}
1485 			if (*(path + path_end) == '/')
1486 				path_end++;
1487 
1488 			/* Copy the base part */
1489 			memcpy(buff_pos, path, path_end);
1490 			buff_pos += path_end;
1491 
1492 			/* Copy the relative part */
1493 			memcpy(buff_pos, rel + m.path, m.query - m.path);
1494 			buff_pos += m.query - m.path;
1495 		}
1496 
1497 		/* add termination to string */
1498 		*buff_pos++ = '\0';
1499 
1500 		new_length = nsurl__remove_dot_segments(buff, buff_pos);
1501 
1502 		m_path.path = 0;
1503 		m_path.query = new_length;
1504 
1505 		buff_start = buff_pos + new_length;
1506 		error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path,
1507 				buff_start, &c);
1508 		if (error != NSERROR_OK) {
1509 			free(buff);
1510 			return error;
1511 		}
1512 
1513 	} else {
1514 		struct url_markers m_path;
1515 		size_t new_length;
1516 
1517 		memcpy(buff_pos, rel + m.path, m.query - m.path);
1518 		buff_pos += m.query - m.path;
1519 		*(buff_pos++) = '\0';
1520 
1521 		new_length = nsurl__remove_dot_segments(buff, buff_pos);
1522 
1523 		m_path.path = 0;
1524 		m_path.query = new_length;
1525 
1526 		buff_start = buff_pos + new_length;
1527 
1528 		error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path,
1529 				buff_start, &c);
1530 		if (error != NSERROR_OK) {
1531 			free(buff);
1532 			return error;
1533 		}
1534 	}
1535 
1536 	if (joined_parts & NSURL_F_BASE_QUERY) {
1537 		c.query = nsurl__component_copy(base->components.query);
1538 	} else {
1539 		error = nsurl__create_from_section(rel, URL_QUERY, &m,
1540 				buff, &c);
1541 		if (error != NSERROR_OK) {
1542 			free(buff);
1543 			return error;
1544 		}
1545 	}
1546 
1547 	error = nsurl__create_from_section(rel, URL_FRAGMENT, &m, buff, &c);
1548 
1549 	/* Free temporary buffer */
1550 	free(buff);
1551 
1552 	if (error != NSERROR_OK) {
1553 		return error;
1554 	}
1555 
1556 	error = nsurl__components_to_string(&c, NSURL_WITH_FRAGMENT,
1557 			offsetof(nsurl, string), (char **)joined, &length);
1558 	if (error != NSERROR_OK) {
1559 		return error;
1560 	}
1561 
1562 	(*joined)->components = c;
1563 	(*joined)->length = length;
1564 
1565 	/* Get the nsurl's hash */
1566 	nsurl__calc_hash(*joined);
1567 
1568 	/* Give the URL a reference */
1569 	(*joined)->count = 1;
1570 
1571 	return NSERROR_OK;
1572 }
1573