1 /* url.c
2  * Separate a url into pieces, turn a relative file into an absolute url.
3  * This file is part of the edbrowse project, released under GPL.
4  */
5 
6 #include "eb.h"
7 
8 struct PROTOCOL {
9 	const char prot[MAXPROTLEN];
10 	int port;
11 	bool free_syntax;
12 	bool need_slashes;
13 	bool need_slash_after_host;
14 } protocols[] = {
15 	{
16 	"file", 0, true, true, false}, {
17 	"http", 80, false, true, true}, {
18 	"https", 443, false, true, true}, {
19 	"pop3", 110, false, true, true}, {
20 	"pop3s", 995, false, true, true}, {
21 	"imap", 220, false, true, true}, {
22 	"imaps", 993, false, true, true}, {
23 	"smtp", 25, false, true, true}, {
24 	"submission", 587, false, true, true}, {
25 	"smtps", 465, false, true, true}, {
26 	"proxy", 3128, false, true, true}, {
27 	"ftp", 21, false, true, true}, {
28 	"sftp", 22, false, true, true}, {
29 	"scp", 22, false, true, true}, {
30 	"ftps", 990, false, true, true}, {
31 	"tftp", 69, false, true, true}, {
32 	"rtsp", 554, false, true, true}, {
33 	"pnm", 7070, false, true, true}, {
34 	"finger", 79, false, true, true}, {
35 	"smb", 139, false, true, true}, {
36 	"mailto", 0, false, false, false}, {
37 	"telnet", 23, false, false, false}, {
38 	"tn3270", 0, false, false, false}, {
39 	"data", 0, true, false, false}, {
40 	"javascript", 0, true, false, false}, {
41 	"git", 0, false, true, false}, {
42 	"svn", 0, false, true, false}, {
43 	"gopher", 70, false, true, true}, {
44 	"magnet", 0, false, false, false}, {
45 	"irc", 0, false, true, false}, {
46 "", 0},};
47 
protocolByName(const char * p,int l)48 static int protocolByName(const char *p, int l)
49 {
50 	int i;
51 	for (i = 0; protocols[i].prot[0]; i++)
52 		if (strlen(protocols[i].prot) == l &&
53 		    memEqualCI(protocols[i].prot, p, l))
54 			return i;
55 	return -1;
56 }				/* protocolByName */
57 
/*
 * Unpercent the host component of a url, not the data component.
 * Works in place: + becomes space and %xx becomes the byte it encodes
 * (a decoded null byte is replaced by a space).
 * Decoding stops once ? # or control-a has been copied, or once a slash has
 * been copied that is not part of the leading "scheme:/" or "scheme://":
 * a slash in first position, a slash more than 16 bytes in, or a slash
 * that does not follow ':' or '/'.
 * Whatever is left after that point is moved down unchanged.
 */
void unpercentURL(char *url)
{
	char c, *u, *w;
	int n;
	u = w = url;
	while ((c = *u)) {
		++u;
		if (c == '+')
			c = ' ';
		/* ctype functions need a value in unsigned char range */
		if (c == '%' && isxdigit((unsigned char)u[0])
		    && isxdigit((unsigned char)u[1])) {
			c = fromHex(u[0], u[1]);
			u += 2;
		}
		if (!c)
			c = ' ';	/* should never happen */
		*w++ = c;
		if (strchr("?#\1", c))
			break;
		if (c != '/')
			continue;
		/* n counts the bytes written so far, this slash included */
		n = w - url;
		if (n == 1 || n > 16)
			break;
		if (w[-2] != ':' && w[-2] != '/')
			break;
	}
	strmove(w, u);
}				/* unpercentURL */
87 
/*
 * Unpercent an entire string, in place.
 * + becomes space, %xx becomes the byte it encodes; a % that is not
 * followed by two hex digits is copied as is.
 * A decoded null byte is replaced by a space so the string is not cut short.
 */
void unpercentString(char *s)
{
	char c, *u, *w;
	u = w = s;
	while ((c = *u)) {
		++u;
		if (c == '+')
			c = ' ';
		/* ctype functions need a value in unsigned char range */
		if (c == '%' && isxdigit((unsigned char)u[0])
		    && isxdigit((unsigned char)u[1])) {
			c = fromHex(u[0], u[1]);
			u += 2;
		}
		if (!c)
			c = ' ';	/* should never happen */
		*w++ = c;
	}
	*w = 0;
}				/* unpercentString */
107 
108 /*
109  * Function: percentURL
110  * Arguments:
111  ** start: pointer to start of input string
112   ** end: pointer to end of input string.
113  * Return value: A new string with the url encoded.
114  * There is an extra byte, room for / at the end.
115  * This function copies its input to a dynamically-allocated buffer,
116  * while performing the following transformation.  Change backslash to slash,
117  * and percent-escape some of the reserved characters as per RFC3986.
118  * Some of the chars retain their reserved semantics and should not be changed.
119  * This is a friggin guess!
120  * All characters in the area between start and end, not including end,
121  * are copied or transformed, except the hash, which is removed.
122  * This function is used to sanitize user-supplied URLs.  */
123 
124 /* these punctuations are percentable, anywhere in a url.
125  * The order is important.
126  * Google has commas in encoded URLs, and wikipedia has parentheses,
127  * so those are (sort of) ok. */
128 static const char percentable[] = "+,()'\"\\<>!*[]$";
129 static const char hexdigits[] = "0123456789abcdef";
130 #define ESCAPED_CHAR_LENGTH 3
131 
percentURL(const char * start,const char * end)132 char *percentURL(const char *start, const char *end)
133 {
134 	int bytes_to_alloc;
135 	char *new_copy;
136 	const char *in_pointer;
137 	char *out_pointer;
138 	char *frag;
139 
140 	if (!end)
141 		end = start + strlen(start);
142 	bytes_to_alloc = end - start + 2;
143 	new_copy = NULL;
144 	in_pointer = NULL;
145 	out_pointer = NULL;
146 
147 	for (in_pointer = start; in_pointer < end; in_pointer++)
148 		if (*in_pointer <= ' ' || strchr(percentable, *in_pointer))
149 			bytes_to_alloc += (ESCAPED_CHAR_LENGTH - 1);
150 
151 	new_copy = allocMem(bytes_to_alloc);
152 	out_pointer = new_copy;
153 	for (in_pointer = start; in_pointer < end; in_pointer++) {
154 		if (*in_pointer == '\\')
155 			*out_pointer++ = '/';
156 		else if (*in_pointer <= ' ' || strchr(percentable, *in_pointer)) {
157 			*out_pointer++ = '%';
158 			*out_pointer++ =
159 			    hexdigits[(uchar) (*in_pointer & 0xf0) >> 4];
160 			*out_pointer++ = hexdigits[(*in_pointer & 0x0f)];
161 		} else
162 			*out_pointer++ = *in_pointer;
163 	}
164 	*out_pointer = '\0';
165 /* excise #hash, required by some web servers */
166 	frag = findHash(new_copy);
167 	if (frag)
168 		*frag = 0;
169 
170 	return new_copy;
171 }				/* percentURL */
172 
173 // For debugging only.
looksPercented(const char * start,const char * end)174 bool looksPercented(const char *start, const char *end)
175 {
176 	const char *s;
177 	if (!end)
178 		end = start + strlen(start);
179 	for (s = start; s < end; ++s)
180 		if (*s < ' ' || strchr(percentable + 5, *s))
181 			return false;
182 	return true;
183 }				/* looksPercented */
184 
185 /* escape & < > for display on a web page */
htmlEscape0(const char * s,bool do_and)186 char *htmlEscape0(const char *s, bool do_and)
187 {
188 	char *t;
189 	int l;
190 	if (!s)
191 		return 0;
192 	if (!*s)
193 		return emptyString;
194 	t = initString(&l);
195 	for (; *s; ++s) {
196 		if (*s == '&' && do_and) {
197 			stringAndString(&t, &l, "&amp;");
198 			continue;
199 		}
200 		if (*s == '<') {
201 			stringAndString(&t, &l, "&lt;");
202 			continue;
203 		}
204 		if (*s == '>') {
205 			stringAndString(&t, &l, "&gt;");
206 			continue;
207 		}
208 		stringAndChar(&t, &l, *s);
209 	}
210 	return t;
211 }				/* htmlEscape0 */
212 
/* Decide if it looks like a web url, even though it has no protocol.
 * Don't do this in a href context  <a href=www.google.com>
 * resolveURL() sets hrefContext while it is working. */
static bool hrefContext;

/* The candidate host is everything up to the first / ? # or control-a,
 * less a trailing .browse and a trailing :digits.
 * It must consist of letters, digits, dots and hyphens, with at least two
 * embedded dots, and then be either a dotted quad, or end in a well known
 * suffix, or start with www. */
static bool httpDefault(const char *url)
{
	static const char *const domainSuffix[] = {
		"com", "biz", "info", "net", "org", "gov", "edu", "us", "uk",
		"au",
		"ca", "de", "jp", "nz", 0
	};
	const char *scan, *lastdot, *end;
	int dots, len, i;

	if (hrefContext)
		return false;

	end = url + strcspn(url, "/?#\1");
	if (end - url > 7 && stringEqual(end - 7, ".browse"))
		end -= 7;

	/* a colon followed by nothing but digits is a port, not part of the host */
	scan = strrchr(url, ':');
	if (scan && scan < end) {
		const char *colon = scan++;
		while (isdigitByte(*scan))
			++scan;
		if (scan == end)
			end = colon;
	}

	/* only domain characters allowed */
	for (scan = url; scan < end; ++scan) {
		if (isalnumByte(*scan) || *scan == '.' || *scan == '-')
			continue;
		return false;
	}

	/* need at least two embedded dots, none of them adjacent */
	dots = 0;
	lastdot = NULL;
	for (scan = url + 1; scan < end - 1; ++scan) {
		if (*scan != '.' || scan[-1] == '.' || scan[1] == '.')
			continue;
		++dots;
		lastdot = scan;
	}
	if (dots < 2)
		return false;

	/* All digits, like an ip address, is ok. */
	if (dots == 3) {
		for (scan = url; scan < end; ++scan)
			if (!isdigitByte(*scan) && *scan != '.')
				break;
		if (scan == end)
			return true;
	}

	/* Look for standard domain suffix */
	++lastdot;
	len = end - lastdot;
	for (i = 0; domainSuffix[i]; ++i) {
		if (memEqualCI(lastdot, domainSuffix[i], len)
		    && !domainSuffix[i][len])
			return true;
	}

	/* www.anything.xx is ok */
	return len >= 2 && memEqualCI(url, "www.", 4);
}				/* httpDefault */
270 
271 /*********************************************************************
272 From wikipedia url
273 scheme://domain:port/path?query_string#fragment_id
274 but I allow, at the end of this, control a followed by post data, with the
275 understanding that there should not be query_string and post data simultaneously.
276 *********************************************************************/
277 
parseURL(const char * url,const char ** proto,int * prlen,const char ** user,int * uslen,const char ** pass,int * palen,const char ** host,int * holen,const char ** portloc,int * port,const char ** data,int * dalen,const char ** post,bool * freep)278 static bool parseURL(const char *url, const char **proto, int *prlen, const char **user, int *uslen, const char **pass, int *palen,	/* ftp protocol */
279 		     const char **host, int *holen,
280 		     const char **portloc, int *port,
281 		     const char **data, int *dalen, const char **post,
282 		     bool * freep)
283 {
284 	const char *p, *q, *pp;
285 	int a;
286 
287 	if (proto)
288 		*proto = NULL;
289 	if (prlen)
290 		*prlen = 0;
291 	if (user)
292 		*user = NULL;
293 	if (uslen)
294 		*uslen = 0;
295 	if (pass)
296 		*pass = NULL;
297 	if (palen)
298 		*palen = 0;
299 	if (host)
300 		*host = NULL;
301 	if (holen)
302 		*holen = 0;
303 	if (portloc)
304 		*portloc = 0;
305 	if (port)
306 		*port = 0;
307 	if (data)
308 		*data = NULL;
309 	if (dalen)
310 		*dalen = 0;
311 	if (post)
312 		*post = NULL;
313 	if (freep)
314 		*freep = false;
315 
316 	if (!url)
317 		return false;
318 
319 /* Find the leading protocol:// */
320 	a = -1;
321 	p = strchr(url, ':');
322 	if (p) {
323 		for (q = url; q < p; ++q)
324 			if (!isalnumByte(*q))
325 				break;
326 		if (q < p)
327 			p = 0;
328 		if (isdigit(url[0]))
329 			p = 0;
330 	}
331 
332 	if (p) {
333 		q = p + 1;
334 		if (*q == '/')
335 			++q;
336 		if (*q == '/')
337 			++q;
338 		skipWhite(&q);
339 
340 		if (!*q) {
341 // You have to have something after the colon
342 // but javascript: is technically a url, I guess
343 			if (strncmp(url, "javascript:", 11))
344 				return false;
345 		}
346 
347 		if (proto)
348 			*proto = url;
349 		if (prlen)
350 			*prlen = p - url;
351 		a = protocolByName(url, p - url);
352 #if 0
353 // not sure why I had this code
354 		if (a < 0 && q == p + 1)
355 			return false;
356 #endif
357 		if (a >= 0 && !protocols[a].need_slashes)
358 			++p;
359 		else
360 			p = q;
361 	} else if (httpDefault(url)) {
362 		static const char http[] = "http://";
363 		if (proto)
364 			*proto = http;
365 		if (prlen)
366 			*prlen = 4;
367 		a = 1;
368 		p = url;
369 	} else
370 		return false;
371 
372 	if (a < 0 || protocols[a].free_syntax) {
373 		if (data)
374 			*data = p;
375 		if (dalen)
376 			*dalen = strlen(p);
377 		if (freep)
378 			*freep = true;
379 		return true;
380 	}
381 
382 	if (a < 0)
383 		return true;	// don't know anything else
384 
385 /* find the end of the domain */
386 	q = p + strcspn(p, "@?#/\1");
387 	if (*q == '@') {	/* user:password@host */
388 		pp = strchr(p, ':');
389 		if (!pp || pp > q) {	/* no password */
390 			if (user)
391 				*user = p;
392 			if (uslen)
393 				*uslen = q - p;
394 		} else {
395 			if (user)
396 				*user = p;
397 			if (uslen)
398 				*uslen = pp - p;
399 			if (pass)
400 				*pass = pp + 1;
401 			if (palen)
402 				*palen = q - pp - 1;
403 		}
404 		p = q + 1;
405 	}
406 
407 /* again, look for the end of the domain */
408 	q = p + strcspn(p, ":?#/\1");
409 // only domain characters allowed
410 	for (pp = p; pp < q; ++pp)
411 		if (!isalnumByte(*pp) && *pp != '.' && *pp != '-')
412 			return false;
413 	if (host)
414 		*host = p;
415 	if (holen) {
416 		*holen = q - p;
417 // Watch out. Accessing document.cookie from javascript calls this function,
418 // and we might have .browse on the end of the domain, which causes trouble.
419 		if(*holen > 7 && stringEqual(q - 7, ".browse"))
420 			*holen -= 7;
421 	}
422 	if (*q == ':') {	/* port specified */
423 		int n;
424 		const char *cc, *pp = q + strcspn(q, "/?#\1");
425 		if (pp > q + 1) {
426 			n = strtol(q + 1, (char **)&cc, 10);
427 			if (cc != pp || !isdigitByte(q[1])) {
428 //                              setError(MSG_BadPort);
429 				return false;
430 			}
431 			if (port)
432 				*port = n;
433 		}
434 		if (portloc)
435 			*portloc = q;
436 		q = pp;		/* up to the slash */
437 	} else {
438 		if (port)
439 			*port = protocols[a].port;
440 	}			/* colon or not */
441 
442 /* Skip past /, but not ? or # */
443 	if (*q == '/')
444 		q++;
445 	p = q;
446 
447 /* post data is handled separately */
448 	q = p + strcspn(p, "\1");
449 	if (data)
450 		*data = p;
451 	if (dalen)
452 		*dalen = q - p;
453 	if (post)
454 		*post = *q ? q + 1 : NULL;
455 	return true;
456 }				/* parseURL */
457 
/* Does this string parse as a url?  None of the pieces are wanted. */
bool isURL(const char *url)
{
	bool ok = parseURL(url, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
			   NULL, NULL, NULL, NULL, NULL, NULL, NULL);
	return ok;
}				/* isURL */
462 
isSQL(const char * s)463 bool isSQL(const char *s)
464 {
465 	char c = *s;
466 	const char *c1 = 0;
467 
468 	if (!sqlPresent)
469 		goto no;
470 
471 	if (isURL(s))
472 		goto no;
473 
474 // look for word] or word:word]
475 	if (!isalphaByte(c))
476 		goto no;
477 
478 	for (++s; (c = *s); ++s) {
479 		if (c == '_')
480 			continue;
481 		if (isalnumByte(c))
482 			continue;
483 		if (c == ':') {
484 			if (c1)
485 				goto no;
486 			c1 = s;
487 			continue;
488 		}
489 		if (c == ']')
490 			goto yes;
491 	}
492 
493 no:
494 	return false;
495 
496 yes:
497 	return true;
498 }				/* isSQL */
499 
// non-FTP URLs are always browsable.  FTP URLs are browsable if they end with
// a slash. gopher urls are a bit more complicated, not yet implemented.
// Anything that is not a url is not browsable.
bool isBrowseableURL(const char *url)
{
	if (!isURL(url))
		return false;
	if (!memEqualCI(url, "ftp://", 6))
		return true;
	return url[strlen(url) - 1] == '/';
}				/* isBrowseableURL */
510 
/* True if u is not null and starts with data: in any mix of case. */
bool isDataURI(const char *u)
{
	if (!u)
		return false;
	return memEqualCI(u, "data:", 5);
}				/* isDataURI */
515 
516 /* Helper functions to return pieces of the URL.
517  * Makes a copy, so you can have your 0 on the end.
518  * Return 0 for an error, and "" if that piece is missing. */
519 
getProtURL(const char * url)520 const char *getProtURL(const char *url)
521 {
522 	static char buf[MAXPROTLEN];
523 	int l;
524 	const char *s;
525 	if (!parseURL(url, &s, &l, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
526 		return 0;
527 	if (l >= MAXPROTLEN)
528 		l = MAXPROTLEN - 1;
529 	memcpy(buf, s, l);
530 	buf[l] = 0;
531 	return buf;
532 }				/* getProtURL */
533 
// Is this a url without http:// in front?
// parseURL reports the protocol at the start of url, unless url is a
// recognized format like www.foo.bar.com, in which case it reports the
// static string "http://" instead.  So a protocol that is not at the
// start of url is one that was implied.
bool missingProtURL(const char *url)
{
	const char *prot;
	bool parsed = parseURL(url, &prot, NULL, NULL, NULL, NULL, NULL, NULL,
			       NULL, NULL, NULL, NULL, NULL, NULL, NULL);
	return parsed && prot != url;
}
544 
545 static char hostbuf[MAXHOSTLEN];
getHostURL(const char * url)546 const char *getHostURL(const char *url)
547 {
548 	int l;
549 	const char *s;
550 	char *t;
551 	char c, d;
552 	bool fs;
553 	bool rc = parseURL(url, 0, 0, 0, 0, 0, 0, &s, &l, 0, 0, 0, 0, 0, &fs);
554 	if (!rc || fs)
555 		return 0;
556 	if (!s)
557 		return emptyString;
558 	if (l >= sizeof(hostbuf)) {
559 		setError(MSG_DomainLong);
560 		return 0;
561 	}
562 	memcpy(hostbuf, s, l);
563 	if (l && hostbuf[l - 1] == '.')
564 		--l;
565 	hostbuf[l] = 0;
566 /* domain names must be ascii, with no spaces */
567 	d = 0;
568 	for (s = t = hostbuf; (c = *s); ++s) {
569 		c &= 0x7f;
570 		if (c == ' ')
571 			continue;
572 		if (c == '.' && d == '.')
573 			continue;
574 		*t++ = d = c;
575 	}
576 	*t = 0;
577 	return hostbuf;
578 }				/* getHostURL */
579 
getProtHostURL(const char * url,char * pp,char * hp)580 bool getProtHostURL(const char *url, char *pp, char *hp)
581 {
582 	int l1, l2;
583 	const char *s1, *s2;
584 	bool fs;
585 	if (!parseURL(url, &s1, &l1, 0, 0, 0, 0, &s2, &l2, 0, 0, 0, 0, 0, &fs))
586 		return false;
587 	if (pp) {
588 		*pp = 0;
589 		if (s1) {
590 			if (l1 >= MAXPROTLEN)
591 				l1 = MAXPROTLEN - 1;
592 			memcpy(pp, s1, l1);
593 			pp[l1] = 0;
594 		}
595 	}
596 	if (hp) {
597 		*hp = 0;
598 		if (s2) {
599 			if (l2 >= MAXHOSTLEN)
600 				l2 = MAXHOSTLEN - 1;
601 			memcpy(hp, s2, l2);
602 			hp[l2] = 0;
603 		}
604 	}
605 	return true;
606 }				/* getProtHostURL */
607 
608 // return user:password. Fails only if user or password too long.
getCredsURL(const char * url,char * buf)609 int getCredsURL(const char *url, char *buf)
610 {
611 	int l1, l2;
612 	const char *s1, *s2;
613 	bool fs;
614 	bool rc =
615 	    parseURL(url, 0, 0, &s1, &l1, &s2, &l2, 0, 0, 0, 0, 0, 0, 0, &fs);
616 	strcpy(buf, ":");
617 	if (!rc || fs)
618 		return 0;
619 	if (s1 && l1 > MAXUSERPASS)
620 		return 1;
621 	if (s2 && l2 > MAXUSERPASS)
622 		return 2;
623 	if (s1)
624 		strncpy(buf, s1, l1);
625 	else
626 		l1 = 0;
627 	buf[l1++] = ':';
628 	if (s2)
629 		strncpy(buf + l1, s2, l2);
630 	else
631 		l2 = 0;
632 	buf[l1 + l2] = 0;
633 	return 0;
634 }
635 
/* Return a pointer, into url, to the data that follows the host,
 * or that follows the scheme when the syntax is free.
 * Return 0 if url does not parse. */
const char *getDataURL(const char *url)
{
	const char *data;

	if (!parseURL(url, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
		      NULL, NULL, &data, NULL, NULL, NULL))
		return 0;
	return data;
}				/* getDataURL */
644 
// Like getDataURL, but also return null for free syntax.
static const char *getDataURL1(const char *url)
{
	const char *data;
	bool free_syntax;

	if (!parseURL(url, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
		      NULL, NULL, &data, NULL, NULL, &free_syntax))
		return 0;
	return free_syntax ? 0 : data;
}				/* getDataURL1 */
655 
getDirURL(const char * url,const char ** start_p,const char ** end_p)656 void getDirURL(const char *url, const char **start_p, const char **end_p)
657 {
658 	const char *dir = getDataURL1(url);
659 	const char *end;
660 	static const char myslash[] = "/";
661 	if (!dir || dir == url)
662 		goto slash;
663 	if (!strchr("#?\1", *dir)) {
664 		if (*--dir != '/')
665 			i_printfExit(MSG_BadDirSlash, url);
666 	}
667 	if (*dir == '#')	/* special case */
668 		end = dir;
669 	else
670 		end = strpbrk(dir, "?\1");
671 	if (!end)
672 		end = dir + strlen(dir);
673 	while (end > dir && end[-1] != '/')
674 		--end;
675 	if (end > dir) {
676 		*start_p = dir;
677 		*end_p = end;
678 		return;
679 	}
680 slash:
681 	*start_p = myslash;
682 	*end_p = myslash + 1;
683 }				/* getDirURL */
684 
/* #tag is only meaningful after the last slash.
 * Return a pointer to the first # at or after the last slash of s,
 * or anywhere in s if it has no slash; null if there is none. */
char *findHash(const char *s)
{
	const char *last_slash = strrchr(s, '/');
	const char *from = last_slash ? last_slash : s;

	return (char *)strchr(from, '#');
}				/* findHash */
693 
694 /* extract the file piece of a pathname or url */
695 /* This is for debugPrint or w/, so could be chopped for convenience */
getFileURL(const char * url,bool chophash)696 char *getFileURL(const char *url, bool chophash)
697 {
698 	const char *s;
699 	const char *e;
700 	s = strrchr(url, '/');
701 	if (s)
702 		++s;
703 	else
704 		s = url;
705 	e = 0;
706 	if (isURL(url)) {
707 		chophash = true;
708 		e = strpbrk(s, "?\1");
709 	}
710 	if (!e)
711 		e = s + strlen(s);
712 	if (chophash) {
713 		const char *h = findHash(s);
714 		if (h)
715 			e = h;
716 	}
717 // if slash at the end then back up to the prior slash
718 // /.browse is like / at the end
719 	if (s > url && (e == s || (e - s == 7 && !strncmp(s, ".browse", 7)))) {
720 		while (s > url && s[-1] == '/')
721 			--s;
722 		e = s;
723 		while (s > url && s[-1] != '/')
724 			--s;
725 	}
726 /* don't retain the .browse suffix on a url */
727 	if (e - s > 7 && stringEqual(e - 7, ".browse"))
728 		e -= 7;
729 	if (e - s > 64)
730 		e = s + 64;
731 	if (e == s)
732 		strcpy(hostbuf, "/");
733 	else {
734 		strncpy(hostbuf, s, e - s);
735 		hostbuf[e - s] = 0;
736 	}
737 	return hostbuf;
738 }				/* getFileURL */
739 
/* Report where the port is written in the url (*portloc, pointing to its
 * colon, or null if no port is written) and what the port is (*port).
 * Return false if url does not parse or has free syntax. */
bool getPortLocURL(const char *url, const char **portloc, int *port)
{
	bool free_syntax;

	if (!parseURL(url, 0, 0, 0, 0, 0, 0, 0, 0, portloc, port, 0, 0, 0,
		      &free_syntax))
		return false;
	return !free_syntax;
}				/* getPortLocURL */
749 
/* Return the port of a url, as given or as implied by its scheme.
 * Return 0 if url does not parse or has free syntax. */
int getPortURL(const char *url)
{
	int port;
	bool free_syntax;

	if (!parseURL(url, 0, 0, 0, 0, 0, 0, 0, 0, 0, &port, 0, 0, 0,
		      &free_syntax))
		return 0;
	return free_syntax ? 0 : port;
}				/* getPortURL */
759 
/* Only the first character is examined: true if it is p or P. */
bool isProxyURL(const char *url)
{
	const char first = url[0];

	return first == 'p' || first == 'P';
}
764 
/*
 * copyPathSegment: append one path segment to the string *dest, whose
 * length is *destlen.  The segment runs from the first character of *src
 * (a slash) up to, but not including, the next slash or the end of the string.
 * Advance *src to point to the character succeeding the copied text.
 */
static void copyPathSegment(char **src, char **dest, int *destlen)
{
	char *from = *src;
	/* the leading slash, plus everything before the next one */
	int n = 1 + strcspn(from + 1, "/");

	stringAndBytes(dest, destlen, from, n);
	*src = from + n;
}				/* copyPathSegment */
777 
/*
 * Remove the rightmost component of a path,
 * including the preceding slash, if any.
 * With no slash the whole path is emptied.
 * The string is cut in place and *pathLen is set to its new length.
 */
static void snipLastSegment(char **path, int *pathLen)
{
	char *cut = strrchr(*path, '/');

	if (!cut)
		cut = *path;
	*cut = '\0';
	*pathLen = cut - *path;
}				/* snipLastSegment */
790 
squashDirectories(char * url)791 static void squashDirectories(char *url)
792 {
793 	char *dd = (char *)getDataURL(url);
794 	char *s, *end;
795 	char *inPath = NULL;
796 	char *outPath;
797 	int outPathLen = 0;
798 	char *rest = NULL;
799 
800 	outPath = initString(&outPathLen);
801 	if (memEqualCI(url, "javascript:", 11))
802 		return;
803 	if (!dd || dd == url)
804 		return;
805 	if (!*dd)
806 		return;
807 	if (strchr("#?\1", *dd))
808 		return;
809 	--dd;
810 /* dd could point to : in bogus code such as <A href=crap:foobar> */
811 /* crap: looks like a slashless protocol, perhaps unknown to us. */
812 	if (*dd == ':')
813 		return;
814 	if (*dd != '/')
815 		i_printfExit(MSG_BadSlash, url);
816 	end = dd + strcspn(dd, "?\1");
817 	rest = cloneString(end);
818 	inPath = pullString1(dd, end);
819 	s = inPath;
820 
821 /* The following algorithm is straight out of RFC 3986, section 5.2.4. */
822 /* We can ignore several steps because of a loop invariant: */
823 /* After the test, *s is always a slash. */
824 	while (*s) {
825 		if (!strncmp(s, "/./", 3))
826 			s += 2;	/* Point s at 2nd slash */
827 		else if (!strcmp(s, "/.")) {
828 			s[1] = '\0';
829 			/* We'll copy the segment "/" on the next iteration. */
830 			/* And that will be the final iteration of the loop. */
831 		} else if (!strncmp(s, "/../", 4)) {
832 			s += 3;	/* Point s at 2nd slash */
833 			snipLastSegment(&outPath, &outPathLen);
834 		} else if (!strcmp(s, "/..")) {
835 			s[1] = '\0';
836 			snipLastSegment(&outPath, &outPathLen);
837 			/* As above, copy "/" on the next and final iteration. */
838 		} else
839 			copyPathSegment(&s, &outPath, &outPathLen);
840 	}
841 	*dd = '\0';
842 	strcat(url, outPath);
843 	strcat(url, rest);
844 	nzFree(inPath);
845 	nzFree(outPath);
846 	nzFree(rest);
847 }				/* squashDirectories */
848 
resolveURL(const char * base,const char * rel)849 char *resolveURL(const char *base, const char *rel)
850 {
851 	char *n;		/* new url */
852 	const char *s, *p;
853 	char *q;
854 	int l;
855 
856 	if (memEqualCI(rel, "data:", 5))
857 		return cloneString(rel);
858 
859 	debugPrint(5, "resolve(%s|%s)", base, rel);
860 	hrefContext = true;
861 	if (!base)
862 		base = emptyString;
863 	if (!rel)
864 		rel = emptyString;
865 	n = allocString(strlen(base) + strlen(rel) + 12);
866 
867 	if (rel[0] == '#') {
868 /* This is an anchor for the current document, don't resolve. */
869 /* I assume the base does not have a #fragment on the end; that is not part of the base. */
870 /* Thus I won't get url#foo#bar */
871 		strcpy(n, rel);
872 out_n:
873 		debugPrint(5, "= %s", n);
874 		hrefContext = false;
875 		return n;
876 	}
877 
878 	if (rel[0] == '?' || rel[0] == '\1') {
879 /* setting or changing get or post data */
880 		strcpy(n, base);
881 		for (q = n; *q && *q != '\1' && *q != '?'; q++) ;
882 		strcpy(q, rel);
883 		goto out_n;
884 	}
885 
886 	if (rel[0] == '/' && rel[1] == '/') {
887 		if ((s = strstr(base, "//"))) {
888 			strncpy(n, base, s - base);
889 			n[s - base] = 0;
890 		} else
891 			strcpy(n, "http:");
892 		strcat(n, rel);
893 		goto squash;
894 	}
895 
896 	if (parseURL(rel, &s, &l, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) > 0) {
897 /* has a protocol */
898 		n[0] = 0;
899 		if (s != rel) {
900 /* It didn't have http in front of it before, put it on now. */
901 /* This is old; it shouldn't happen any more. */
902 			strncpy(n, s, l);
903 			strcpy(n + l, "://");
904 		}
905 		strcat(n, rel);
906 		goto squash;
907 	}
908 // at this point rel is not a url.
909 	s = base;
910 	if (rel[0] == '/') {
911 		s = getDataURL(base);
912 		if (!s) {
913 			strcpy(n, rel);
914 			goto squash;
915 		}
916 		if (!*s) {
917 			if (s - base >= 7 && stringEqual(s - 7, ".browse"))
918 				s -= 7;
919 			if (s > base && s[-1] == '/')
920 				--s;
921 		} else if (!strchr("#?\1", *s)) {
922 			--s;
923 		} else if (s[-1] == '/')
924 			--s;
925 		l = s - base;
926 		strncpy(n, base, l);
927 		strcpy(n + l, rel);
928 		goto squash;
929 	}
930 /* This is a relative change, paste it on after the last slash */
931 	s = base;
932 	if (parseURL(base, 0, 0, 0, 0, 0, 0, &p, 0, 0, 0, 0, 0, 0, 0) > 0 && p)
933 		s = p;
934 	for (p = 0; *s; ++s) {
935 		if (*s == '/')
936 			p = s;
937 		if (strchr("#?\1", *s))
938 			break;
939 	}
940 	if (!p) {
941 		if (isURL(base))
942 			p = s;
943 		else
944 			p = base;
945 	}
946 	l = p - base;
947 	if (l) {
948 		strncpy(n, base, l);
949 		n[l++] = '/';
950 	}
951 	strcpy(n + l, rel);
952 
953 squash:
954 	squashDirectories(n);
955 	goto out_n;
956 }				/* resolveURL */
957 
/* Are these two urls the same?
 * The post data must agree exactly.  The #hash is ignored, as is a leading
 * http:// and a trailing .browse; what is left is compared byte for byte.
 * Null is never the same as anything.
 * This routine could be, should be, more sophisticated. */
bool sameURL(const char *s, const char *t)
{
	const char *s_end, *t_end, *hash;
	int len;

	if (!s || !t)
		return false;

/* check for post data at the end */
	s_end = strchr(s, '\1');
	if (!s_end)
		s_end = s + strlen(s);
	t_end = strchr(t, '\1');
	if (!t_end)
		t_end = t + strlen(t);
	if (!stringEqual(s_end, t_end))
		return false;

/* lop off hash */
	hash = findHash(s);
	if (hash)
		s_end = hash;
	hash = findHash(t);
	if (hash)
		t_end = hash;

/* It's ok if one says http and the other implies it. */
	if (memEqualCI(s, "http://", 7))
		s += 7;
	if (memEqualCI(t, "http://", 7))
		t += 7;

	if (s_end - s >= 7 && stringEqual(s_end - 7, ".browse"))
		s_end -= 7;
	if (t_end - t >= 7 && stringEqual(t_end - 7, ".browse"))
		t_end -= 7;

	len = s_end - s;
	if (len != t_end - t)
		return false;
	return memcmp(s, t, len) == 0;
}				/* sameURL */
998 
/* Find some helpful text to print in place of an image.
 * Not sure why we would need more than 1000 chars for this,
 * so return a static buffer, which the next call overwrites.
 * Return 0 if base is null, is just "#", or starts with javascript.
 * Text of more than ten characters that looks like a url is boiled down:
 * the query or post data goes, then a common suffix such as .html,
 * then the leading directories. */
char *altText(const char *base)
{
	static char buf[1000];
	int len, n;
	char *s;
	if (!base)
		return 0;
/* print only after the null check, %s needs a real string */
	debugPrint(6, "altText(%s)", base);
	if (stringEqual(base, "#"))
		return 0;
	if (memEqualCI(base, "javascript", 10))
		return 0;
	strncpy(buf, base, sizeof(buf) - 1);
/* strncpy does not terminate when it truncates */
	buf[sizeof(buf) - 1] = 0;
	spaceCrunch(buf, true, false);
	len = strlen(buf);
/* remove punctuation mark from the end of a sentence or phrase */
	if (len >= 2 && !isalnumByte(buf[len - 1]) && isalnumByte(buf[len - 2]))
		buf[--len] = 0;
/* strip leading whitespace */
	while (len && isspaceByte(buf[0]))
		strmove(buf, buf + 1), --len;
	if (len > 10) {
/* see whether it's a phrase/sentence or a pathname/url */
		if (!isURL(buf))
			return buf;	/* looks like words */
/* Ok, now we believe it's a pathname or url */
/* get rid of post or get data */
		s = strpbrk(buf, "?\1");
		if (s)
			*s = 0;
/* get rid of common suffix */
		s = strrchr(buf, '.');
		if (s) {
/* get rid of trailing .html */
			static const char *const suffix[] = {
				"html", "htm", "shtml", "shtm", "php", "asp",
				"cgi", "rm",
				"ram",
				"gif", "jpg", "bmp",
				0
			};
			n = stringInListCI(suffix, s + 1);
/* a dot at the very end goes too */
			if (n >= 0 || s[1] == 0)
				*s = 0;
		}
/* Get rid of everything up to the last slash, leaving the file name */
/* A slash at the very end is dropped, then we look again. */
retry:
		s = strrchr(buf, '/');
		if (s && s - buf >= 12) {
			if (!s[1]) {
				*s = 0;
				goto retry;
			}
			strmove(buf, s + 1);
		}
	}			/* more than ten characters */
	return buf;
}				/* altText */
1060 
1061 /* get post data ready for a url. */
encodePostData(const char * s,const char * keep_chars)1062 char *encodePostData(const char *s, const char *keep_chars)
1063 {
1064 	char *post, c;
1065 	int l;
1066 	char buf[4];
1067 
1068 	if (!s)
1069 		return 0;
1070 	if (s == emptyString)
1071 		return emptyString;
1072 	if (!keep_chars)
1073 		keep_chars = "-._~()";
1074 	post = initString(&l);
1075 	while ((c = *s++)) {
1076 		if (isalnumByte(c))
1077 			goto putc;
1078 		if (strchr(keep_chars, c))
1079 			goto putc;
1080 		sprintf(buf, "%%%02X", (uchar) c);
1081 		stringAndString(&post, &l, buf);
1082 		continue;
1083 putc:
1084 		stringAndChar(&post, &l, c);
1085 	}
1086 	return post;
1087 }				/* encodePostData */
1088 
/*
 * Decode one character of url-encoded text.
 * c has already been read; *sp points to the character after it.
 * + decodes to space.  % followed by two hex digits decodes to the byte
 * they encode, or to a space if that byte is null, and *sp is moved past
 * the two digits.  Anything else, including a % that is not followed by
 * two hex digits, is returned unchanged and *sp is left alone.
 */
static char dohex(char c, const char **sp)
{
	const char *s = *sp;
	char d, e;
	if (c == '+')
		return ' ';
	if (c != '%')
		return c;
/* Look at the second digit only if the first is good; */
/* if the first is the terminating null there is nothing beyond it to read. */
	d = s[0];
	if (!isxdigit((unsigned char)d))
		return c;	/* should never happen */
	e = s[1];
	if (!isxdigit((unsigned char)e))
		return c;	/* should never happen */
	d = fromHex(d, e);
	if (!d)
		d = ' ';	/* don't allow nulls */
	*sp = s + 2;
	return d;
}				/* dohex */
1107 
decodePostData(const char * data,const char * name,int seqno)1108 char *decodePostData(const char *data, const char *name, int seqno)
1109 {
1110 	const char *s, *n, *t;
1111 	char *ns = 0, *w = 0;
1112 	int j = 0;
1113 	char c;
1114 
1115 	if (!seqno && !name)
1116 		i_printfExit(MSG_DecodePost);
1117 
1118 	for (s = data; *s; s = (*t ? t + 1 : t)) {
1119 		n = 0;
1120 		t = strchr(s, '&');
1121 		if (!t)
1122 			t = s + strlen(s);
1123 /* select attribute by number */
1124 		++j;
1125 		if (j == seqno)
1126 			w = ns = allocString(t - s + 1);
1127 		if (seqno && !w)
1128 			continue;
1129 		if (name)
1130 			n = name;
1131 		while (s < t && (c = *s) != '=') {
1132 			++s;
1133 			c = dohex(c, &s);
1134 			if (n) {
1135 /* I don't know if this is suppose to be case insensitive all the time,
1136  * though there are situations when it must be, as in
1137  * mailto:address?Subject=blah-blah */
1138 				if (isalphaByte(c)) {
1139 					if (!((c ^ *n) & 0xdf))
1140 						++n;
1141 					else
1142 						n = 0;
1143 				} else if (c == *n)
1144 					++n;
1145 				else
1146 					n = 0;
1147 			}
1148 			if (w)
1149 				*w++ = c;
1150 		}
1151 
1152 		if (s == t) {	/* no equals, just a string */
1153 			if (name)
1154 				continue;
1155 			*w = 0;
1156 			return ns;
1157 		}
1158 		if (w)
1159 			*w++ = c;
1160 		++s;		/* skip past equals */
1161 		if (name) {
1162 			if (!n)
1163 				continue;
1164 			if (*n)
1165 				continue;
1166 			w = ns = allocString(t - s + 1);
1167 		}
1168 
1169 /* At this point we have a match */
1170 		while (s < t) {
1171 			c = *s++;
1172 			c = dohex(c, &s);
1173 			*w++ = c;
1174 		}
1175 		*w = 0;
1176 		return ns;
1177 	}
1178 
1179 	return 0;
1180 }				/* decodePostData */
1181 
/* Take apart a mailto url.
 * *addr_p gets a copy of the address, which is everything after an
 * optional mailto: and before the first / or ?.
 * *subj_p and *body_p get the decoded subject= and body= items that follow
 * the ?, or 0 if there is no such item.
 * Any of the three destinations may be null. */
void decodeMailURL(const char *url, char **addr_p, char **subj_p, char **body_p)
{
	const char *query;

	if (memEqualCI(url, "mailto:", 7))
		url += 7;

	if (addr_p)
		*addr_p = pullString1(url, url + strcspn(url, "/?"));
	if (subj_p)
		*subj_p = 0;
	if (body_p)
		*body_p = 0;

	query = strchr(url, '?');
	if (!query)
		return;
	++query;

	if (subj_p)
		*subj_p = decodePostData(query, "subject", 0);
	if (body_p)
		*body_p = decodePostData(query, "body", 0);
}				/* decodeMailURL */
1203 
1204 // Does a url match a pattern, from an entry in .ebrc
1205 // edbrowse.org matches edbrowse.org and foo.edbrowse.org
1206 // edbrowse.org/foo matches edbrowse.org/foo/bar
patternMatchURL(const char * url,const char * pattern)1207 bool patternMatchURL(const char *url, const char *pattern)
1208 {
1209 	char prot[MAXPROTLEN], host[MAXHOSTLEN];
1210 	const char *path, *q;
1211 	int hl, dl, ql;
1212 	if (!url || !pattern)
1213 		return false;
1214 	if (!url[0] || !pattern[0])
1215 		return false;
1216 // This function has to be threadsafe, so I call getProtHostURL,
1217 // which is also threadsafe.
1218 	if (!getProtHostURL(url, prot, host))
1219 		return false;
1220 	hl = strlen(host);
1221 	path = getDataURL(url);
1222 	q = strchr(pattern, '/');
1223 	if (!q)
1224 		q = pattern + strlen(pattern);
1225 	dl = q - pattern;
1226 	if (dl > hl)
1227 		return false;
1228 	if (!memEqualCI(pattern, host + hl - dl, dl))
1229 		return false;
1230 	if (*q == '/') {
1231 		++q;
1232 		if (hl != dl || !path)
1233 			return false;
1234 		ql = strlen(q);
1235 		return !strncmp(q, path, ql) &&
1236 		    (path[ql] == 0 || path[ql] == '/');
1237 	}			/* domain/path was specified */
1238 	return hl == dl || host[hl - dl - 1] == '.';
1239 }
1240