1 /* url.c
2 * Separate a url into pieces, turn a relative file into an absolute url.
3 * This file is part of the edbrowse project, released under GPL.
4 */
5
6 #include "eb.h"
7
8 struct PROTOCOL {
9 const char prot[MAXPROTLEN];
10 int port;
11 bool free_syntax;
12 bool need_slashes;
13 bool need_slash_after_host;
14 } protocols[] = {
15 {
16 "file", 0, true, true, false}, {
17 "http", 80, false, true, true}, {
18 "https", 443, false, true, true}, {
19 "pop3", 110, false, true, true}, {
20 "pop3s", 995, false, true, true}, {
21 "imap", 220, false, true, true}, {
22 "imaps", 993, false, true, true}, {
23 "smtp", 25, false, true, true}, {
24 "submission", 587, false, true, true}, {
25 "smtps", 465, false, true, true}, {
26 "proxy", 3128, false, true, true}, {
27 "ftp", 21, false, true, true}, {
28 "sftp", 22, false, true, true}, {
29 "scp", 22, false, true, true}, {
30 "ftps", 990, false, true, true}, {
31 "tftp", 69, false, true, true}, {
32 "rtsp", 554, false, true, true}, {
33 "pnm", 7070, false, true, true}, {
34 "finger", 79, false, true, true}, {
35 "smb", 139, false, true, true}, {
36 "mailto", 0, false, false, false}, {
37 "telnet", 23, false, false, false}, {
38 "tn3270", 0, false, false, false}, {
39 "data", 0, true, false, false}, {
40 "javascript", 0, true, false, false}, {
41 "git", 0, false, true, false}, {
42 "svn", 0, false, true, false}, {
43 "gopher", 70, false, true, true}, {
44 "magnet", 0, false, false, false}, {
45 "irc", 0, false, true, false}, {
46 "", 0},};
47
protocolByName(const char * p,int l)48 static int protocolByName(const char *p, int l)
49 {
50 int i;
51 for (i = 0; protocols[i].prot[0]; i++)
52 if (strlen(protocols[i].prot) == l &&
53 memEqualCI(protocols[i].prot, p, l))
54 return i;
55 return -1;
56 } /* protocolByName */
57
/* Unpercent the host component of a url, not the data component.
 * The string is rewritten in place: + becomes space and %xx becomes the
 * byte it encodes (a decoded null byte is replaced by a space).
 * Decoding stops after the first ? # or control-a, or after the first slash
 * that is not part of the leading scheme:// ; whatever follows is moved
 * down unchanged. */
void unpercentURL(char *url)
{
	char c, *u, *w;
	int n;
	u = w = url;
	while ((c = *u)) {
		++u;
		if (c == '+')
			c = ' ';
		/* Cast before calling isxdigit(): a negative char is undefined there.
		 * The && chain stops at the terminator, so u[1] is safe to read. */
		if (c == '%' && isxdigit((unsigned char)u[0])
		    && isxdigit((unsigned char)u[1])) {
			c = fromHex(u[0], u[1]);
			u += 2;
		}
		if (!c)
			c = ' ';	/* should never happen */
		*w++ = c;
		if (strchr("?#\1", c))
			break;
		if (c != '/')
			continue;
		/* A slash ends the host part unless it is one of the slashes of
		 * scheme:// near the front of the string. */
		n = w - url;
		if (n == 1 || n > 16)
			break;
		if (w[-2] != ':' && w[-2] != '/')
			break;
	}
	strmove(w, u);
}				/* unpercentURL */
87
/* Unpercent an entire string, in place.
 * + becomes space and %xx becomes the byte it encodes; a decoded null byte
 * is replaced by a space so the result stays a single C string. */
void unpercentString(char *s)
{
	char c, *u, *w;
	u = w = s;
	while ((c = *u)) {
		++u;
		if (c == '+')
			c = ' ';
		/* Cast before calling isxdigit(): a negative char is undefined there.
		 * The && chain stops at the terminator, so u[1] is safe to read. */
		if (c == '%' && isxdigit((unsigned char)u[0])
		    && isxdigit((unsigned char)u[1])) {
			c = fromHex(u[0], u[1]);
			u += 2;
		}
		if (!c)
			c = ' ';	/* should never happen */
		*w++ = c;
	}
	*w = 0;
}				/* unpercentString */
107
108 /*
109 * Function: percentURL
110 * Arguments:
111 ** start: pointer to start of input string
112 ** end: pointer to end of input string.
113 * Return value: A new string with the url encoded.
114 * There is an extra byte, room for / at the end.
115 * This function copies its input to a dynamically-allocated buffer,
116 * while performing the following transformation. Change backslash to slash,
117 * and percent-escape some of the reserved characters as per RFC3986.
118 * Some of the chars retain their reserved semantics and should not be changed.
119 * This is a friggin guess!
120 * All characters in the area between start and end, not including end,
121 * are copied or transformed, except the hash, which is removed.
122 * This function is used to sanitize user-supplied URLs. */
123
124 /* these punctuations are percentable, anywhere in a url.
125 * The order is important.
126 * Google has commas in encoded URLs, and wikipedia has parentheses,
127 * so those are (sort of) ok. */
128 static const char percentable[] = "+,()'\"\\<>!*[]$";
129 static const char hexdigits[] = "0123456789abcdef";
130 #define ESCAPED_CHAR_LENGTH 3
131
percentURL(const char * start,const char * end)132 char *percentURL(const char *start, const char *end)
133 {
134 int bytes_to_alloc;
135 char *new_copy;
136 const char *in_pointer;
137 char *out_pointer;
138 char *frag;
139
140 if (!end)
141 end = start + strlen(start);
142 bytes_to_alloc = end - start + 2;
143 new_copy = NULL;
144 in_pointer = NULL;
145 out_pointer = NULL;
146
147 for (in_pointer = start; in_pointer < end; in_pointer++)
148 if (*in_pointer <= ' ' || strchr(percentable, *in_pointer))
149 bytes_to_alloc += (ESCAPED_CHAR_LENGTH - 1);
150
151 new_copy = allocMem(bytes_to_alloc);
152 out_pointer = new_copy;
153 for (in_pointer = start; in_pointer < end; in_pointer++) {
154 if (*in_pointer == '\\')
155 *out_pointer++ = '/';
156 else if (*in_pointer <= ' ' || strchr(percentable, *in_pointer)) {
157 *out_pointer++ = '%';
158 *out_pointer++ =
159 hexdigits[(uchar) (*in_pointer & 0xf0) >> 4];
160 *out_pointer++ = hexdigits[(*in_pointer & 0x0f)];
161 } else
162 *out_pointer++ = *in_pointer;
163 }
164 *out_pointer = '\0';
165 /* excise #hash, required by some web servers */
166 frag = findHash(new_copy);
167 if (frag)
168 *frag = 0;
169
170 return new_copy;
171 } /* percentURL */
172
173 // For debugging only.
looksPercented(const char * start,const char * end)174 bool looksPercented(const char *start, const char *end)
175 {
176 const char *s;
177 if (!end)
178 end = start + strlen(start);
179 for (s = start; s < end; ++s)
180 if (*s < ' ' || strchr(percentable + 5, *s))
181 return false;
182 return true;
183 } /* looksPercented */
184
185 /* escape & < > for display on a web page */
htmlEscape0(const char * s,bool do_and)186 char *htmlEscape0(const char *s, bool do_and)
187 {
188 char *t;
189 int l;
190 if (!s)
191 return 0;
192 if (!*s)
193 return emptyString;
194 t = initString(&l);
195 for (; *s; ++s) {
196 if (*s == '&' && do_and) {
197 stringAndString(&t, &l, "&");
198 continue;
199 }
200 if (*s == '<') {
201 stringAndString(&t, &l, "<");
202 continue;
203 }
204 if (*s == '>') {
205 stringAndString(&t, &l, ">");
206 continue;
207 }
208 stringAndChar(&t, &l, *s);
209 }
210 return t;
211 } /* htmlEscape0 */
212
/* Decide if it looks like a web url, even though it has no protocol. */
/* Don't do this in a href context <a href=www.google.com> */
static bool hrefContext;
static bool httpDefault(const char *url)
{
	static const char *const domainSuffix[] = {
		"com", "biz", "info", "net", "org", "gov", "edu", "us", "uk",
		"au",
		"ca", "de", "jp", "nz", 0
	};
	const char *stop, *colon, *cur, *tld = 0;
	int dots = 0, tldLen, i;

	if (hrefContext)
		return false;

	/* The candidate domain runs up to the path, query, fragment or post data. */
	stop = url + strcspn(url, "/?#\1");
	if (stop - url > 7 && stringEqual(stop - 7, ".browse"))
		stop -= 7;

	/* Drop a trailing :port, if the last colon starts an all-digit run. */
	colon = strrchr(url, ':');
	if (colon && colon < stop) {
		cur = colon + 1;
		while (isdigitByte(*cur))
			++cur;
		if (cur == stop)
			stop = colon;
	}

	// only domain characters allowed
	for (cur = url; cur < stop; ++cur) {
		if (isalnumByte(*cur) || *cur == '.' || *cur == '-')
			continue;
		return false;
	}

	/* need at least two embedded dots, none of them adjacent to another dot */
	for (cur = url + 1; cur < stop - 1; ++cur) {
		if (*cur != '.' || cur[-1] == '.' || cur[1] == '.')
			continue;
		++dots;
		tld = cur;
	}
	if (dots < 2)
		return false;

	/* All digits, like an ip address, is ok. */
	if (dots == 3) {
		cur = url;
		while (cur < stop && (isdigitByte(*cur) || *cur == '.'))
			++cur;
		if (cur == stop)
			return true;
	}

	/* Look for standard domain suffix after the last dot */
	++tld;
	tldLen = stop - tld;
	for (i = 0; domainSuffix[i]; ++i) {
		const char *suffix = domainSuffix[i];
		if (memEqualCI(tld, suffix, tldLen) && !suffix[tldLen])
			return true;
	}

	/* www.anything.xx is ok */
	return tldLen >= 2 && memEqualCI(url, "www.", 4);
}				/* httpDefault */
270
271 /*********************************************************************
272 From wikipedia url
273 scheme://domain:port/path?query_string#fragment_id
274 but I allow, at the end of this, control a followed by post data, with the
275 understanding that there should not be query_string and post data simultaneously.
276 *********************************************************************/
277
parseURL(const char * url,const char ** proto,int * prlen,const char ** user,int * uslen,const char ** pass,int * palen,const char ** host,int * holen,const char ** portloc,int * port,const char ** data,int * dalen,const char ** post,bool * freep)278 static bool parseURL(const char *url, const char **proto, int *prlen, const char **user, int *uslen, const char **pass, int *palen, /* ftp protocol */
279 const char **host, int *holen,
280 const char **portloc, int *port,
281 const char **data, int *dalen, const char **post,
282 bool * freep)
283 {
284 const char *p, *q, *pp;
285 int a;
286
287 if (proto)
288 *proto = NULL;
289 if (prlen)
290 *prlen = 0;
291 if (user)
292 *user = NULL;
293 if (uslen)
294 *uslen = 0;
295 if (pass)
296 *pass = NULL;
297 if (palen)
298 *palen = 0;
299 if (host)
300 *host = NULL;
301 if (holen)
302 *holen = 0;
303 if (portloc)
304 *portloc = 0;
305 if (port)
306 *port = 0;
307 if (data)
308 *data = NULL;
309 if (dalen)
310 *dalen = 0;
311 if (post)
312 *post = NULL;
313 if (freep)
314 *freep = false;
315
316 if (!url)
317 return false;
318
319 /* Find the leading protocol:// */
320 a = -1;
321 p = strchr(url, ':');
322 if (p) {
323 for (q = url; q < p; ++q)
324 if (!isalnumByte(*q))
325 break;
326 if (q < p)
327 p = 0;
328 if (isdigit(url[0]))
329 p = 0;
330 }
331
332 if (p) {
333 q = p + 1;
334 if (*q == '/')
335 ++q;
336 if (*q == '/')
337 ++q;
338 skipWhite(&q);
339
340 if (!*q) {
341 // You have to have something after the colon
342 // but javascript: is technically a url, I guess
343 if (strncmp(url, "javascript:", 11))
344 return false;
345 }
346
347 if (proto)
348 *proto = url;
349 if (prlen)
350 *prlen = p - url;
351 a = protocolByName(url, p - url);
352 #if 0
353 // not sure why I had this code
354 if (a < 0 && q == p + 1)
355 return false;
356 #endif
357 if (a >= 0 && !protocols[a].need_slashes)
358 ++p;
359 else
360 p = q;
361 } else if (httpDefault(url)) {
362 static const char http[] = "http://";
363 if (proto)
364 *proto = http;
365 if (prlen)
366 *prlen = 4;
367 a = 1;
368 p = url;
369 } else
370 return false;
371
372 if (a < 0 || protocols[a].free_syntax) {
373 if (data)
374 *data = p;
375 if (dalen)
376 *dalen = strlen(p);
377 if (freep)
378 *freep = true;
379 return true;
380 }
381
382 if (a < 0)
383 return true; // don't know anything else
384
385 /* find the end of the domain */
386 q = p + strcspn(p, "@?#/\1");
387 if (*q == '@') { /* user:password@host */
388 pp = strchr(p, ':');
389 if (!pp || pp > q) { /* no password */
390 if (user)
391 *user = p;
392 if (uslen)
393 *uslen = q - p;
394 } else {
395 if (user)
396 *user = p;
397 if (uslen)
398 *uslen = pp - p;
399 if (pass)
400 *pass = pp + 1;
401 if (palen)
402 *palen = q - pp - 1;
403 }
404 p = q + 1;
405 }
406
407 /* again, look for the end of the domain */
408 q = p + strcspn(p, ":?#/\1");
409 // only domain characters allowed
410 for (pp = p; pp < q; ++pp)
411 if (!isalnumByte(*pp) && *pp != '.' && *pp != '-')
412 return false;
413 if (host)
414 *host = p;
415 if (holen) {
416 *holen = q - p;
417 // Watch out. Accessing document.cookie from javascript calls this function,
418 // and we might have .browse on the end of the domain, which causes trouble.
419 if(*holen > 7 && stringEqual(q - 7, ".browse"))
420 *holen -= 7;
421 }
422 if (*q == ':') { /* port specified */
423 int n;
424 const char *cc, *pp = q + strcspn(q, "/?#\1");
425 if (pp > q + 1) {
426 n = strtol(q + 1, (char **)&cc, 10);
427 if (cc != pp || !isdigitByte(q[1])) {
428 // setError(MSG_BadPort);
429 return false;
430 }
431 if (port)
432 *port = n;
433 }
434 if (portloc)
435 *portloc = q;
436 q = pp; /* up to the slash */
437 } else {
438 if (port)
439 *port = protocols[a].port;
440 } /* colon or not */
441
442 /* Skip past /, but not ? or # */
443 if (*q == '/')
444 q++;
445 p = q;
446
447 /* post data is handled separately */
448 q = p + strcspn(p, "\1");
449 if (data)
450 *data = p;
451 if (dalen)
452 *dalen = q - p;
453 if (post)
454 *post = *q ? q + 1 : NULL;
455 return true;
456 } /* parseURL */
457
/* Does parseURL accept this string as a url? None of the pieces are wanted. */
bool isURL(const char *url)
{
	bool accepted = parseURL(url,
				 NULL, NULL,	/* protocol */
				 NULL, NULL,	/* user */
				 NULL, NULL,	/* password */
				 NULL, NULL,	/* host */
				 NULL, NULL,	/* port */
				 NULL, NULL,	/* data */
				 NULL,	/* post */
				 NULL);	/* free syntax */
	return accepted;
}				/* isURL */
462
isSQL(const char * s)463 bool isSQL(const char *s)
464 {
465 char c = *s;
466 const char *c1 = 0;
467
468 if (!sqlPresent)
469 goto no;
470
471 if (isURL(s))
472 goto no;
473
474 // look for word] or word:word]
475 if (!isalphaByte(c))
476 goto no;
477
478 for (++s; (c = *s); ++s) {
479 if (c == '_')
480 continue;
481 if (isalnumByte(c))
482 continue;
483 if (c == ':') {
484 if (c1)
485 goto no;
486 c1 = s;
487 continue;
488 }
489 if (c == ']')
490 goto yes;
491 }
492
493 no:
494 return false;
495
496 yes:
497 return true;
498 } /* isSQL */
499
// A url that does not begin with ftp:// is always browsable.
// An ftp url is browsable only if it ends in a slash.
// gopher urls are a bit more complicated, not yet implemented.
// Anything that is not a url is not browsable.
bool isBrowseableURL(const char *url)
{
	if (!isURL(url))
		return false;
	if (!memEqualCI(url, "ftp://", 6))
		return true;
	return url[strlen(url) - 1] == '/';
}				/* isBrowseableURL */
510
/* True if u is non-null and begins with data: in any mix of case. */
bool isDataURI(const char *u)
{
	if (!u)
		return false;
	return memEqualCI(u, "data:", 5);
}				/* isDataURI */
515
516 /* Helper functions to return pieces of the URL.
517 * Makes a copy, so you can have your 0 on the end.
518 * Return 0 for an error, and "" if that piece is missing. */
519
getProtURL(const char * url)520 const char *getProtURL(const char *url)
521 {
522 static char buf[MAXPROTLEN];
523 int l;
524 const char *s;
525 if (!parseURL(url, &s, &l, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
526 return 0;
527 if (l >= MAXPROTLEN)
528 l = MAXPROTLEN - 1;
529 memcpy(buf, s, l);
530 buf[l] = 0;
531 return buf;
532 } /* getProtURL */
533
// Is this a url without http:// in front?
bool missingProtURL(const char *url)
{
	const char *prot;

	if (!parseURL(url, &prot, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
		return false;	// not a url
	// parseURL reports the protocol as the start of url when it is written
	// out. For a recognized bare domain like www.foo.bar.com it reports
	// its own static "http://" instead, so the pointers differ.
	if (prot == url)
		return false;
	return true;
}
544
545 static char hostbuf[MAXHOSTLEN];
getHostURL(const char * url)546 const char *getHostURL(const char *url)
547 {
548 int l;
549 const char *s;
550 char *t;
551 char c, d;
552 bool fs;
553 bool rc = parseURL(url, 0, 0, 0, 0, 0, 0, &s, &l, 0, 0, 0, 0, 0, &fs);
554 if (!rc || fs)
555 return 0;
556 if (!s)
557 return emptyString;
558 if (l >= sizeof(hostbuf)) {
559 setError(MSG_DomainLong);
560 return 0;
561 }
562 memcpy(hostbuf, s, l);
563 if (l && hostbuf[l - 1] == '.')
564 --l;
565 hostbuf[l] = 0;
566 /* domain names must be ascii, with no spaces */
567 d = 0;
568 for (s = t = hostbuf; (c = *s); ++s) {
569 c &= 0x7f;
570 if (c == ' ')
571 continue;
572 if (c == '.' && d == '.')
573 continue;
574 *t++ = d = c;
575 }
576 *t = 0;
577 return hostbuf;
578 } /* getHostURL */
579
getProtHostURL(const char * url,char * pp,char * hp)580 bool getProtHostURL(const char *url, char *pp, char *hp)
581 {
582 int l1, l2;
583 const char *s1, *s2;
584 bool fs;
585 if (!parseURL(url, &s1, &l1, 0, 0, 0, 0, &s2, &l2, 0, 0, 0, 0, 0, &fs))
586 return false;
587 if (pp) {
588 *pp = 0;
589 if (s1) {
590 if (l1 >= MAXPROTLEN)
591 l1 = MAXPROTLEN - 1;
592 memcpy(pp, s1, l1);
593 pp[l1] = 0;
594 }
595 }
596 if (hp) {
597 *hp = 0;
598 if (s2) {
599 if (l2 >= MAXHOSTLEN)
600 l2 = MAXHOSTLEN - 1;
601 memcpy(hp, s2, l2);
602 hp[l2] = 0;
603 }
604 }
605 return true;
606 } /* getProtHostURL */
607
608 // return user:password. Fails only if user or password too long.
getCredsURL(const char * url,char * buf)609 int getCredsURL(const char *url, char *buf)
610 {
611 int l1, l2;
612 const char *s1, *s2;
613 bool fs;
614 bool rc =
615 parseURL(url, 0, 0, &s1, &l1, &s2, &l2, 0, 0, 0, 0, 0, 0, 0, &fs);
616 strcpy(buf, ":");
617 if (!rc || fs)
618 return 0;
619 if (s1 && l1 > MAXUSERPASS)
620 return 1;
621 if (s2 && l2 > MAXUSERPASS)
622 return 2;
623 if (s1)
624 strncpy(buf, s1, l1);
625 else
626 l1 = 0;
627 buf[l1++] = ':';
628 if (s2)
629 strncpy(buf + l1, s2, l2);
630 else
631 l2 = 0;
632 buf[l1 + l2] = 0;
633 return 0;
634 }
635
/* Return a pointer, into url itself, to the data that parseURL reports,
 * or 0 if url is not a url. */
const char *getDataURL(const char *url)
{
	const char *data;

	if (parseURL(url, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, &data, 0, 0, 0))
		return data;
	return 0;
}				/* getDataURL */
644
// Like getDataURL, but also return null for free syntax
static const char *getDataURL1(const char *url)
{
	const char *data;
	bool freeSyntax;

	if (!parseURL
	    (url, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, &data, 0, 0, &freeSyntax))
		return 0;
	if (freeSyntax)
		return 0;
	return data;
}				/* getDataURL1 */
655
getDirURL(const char * url,const char ** start_p,const char ** end_p)656 void getDirURL(const char *url, const char **start_p, const char **end_p)
657 {
658 const char *dir = getDataURL1(url);
659 const char *end;
660 static const char myslash[] = "/";
661 if (!dir || dir == url)
662 goto slash;
663 if (!strchr("#?\1", *dir)) {
664 if (*--dir != '/')
665 i_printfExit(MSG_BadDirSlash, url);
666 }
667 if (*dir == '#') /* special case */
668 end = dir;
669 else
670 end = strpbrk(dir, "?\1");
671 if (!end)
672 end = dir + strlen(dir);
673 while (end > dir && end[-1] != '/')
674 --end;
675 if (end > dir) {
676 *start_p = dir;
677 *end_p = end;
678 return;
679 }
680 slash:
681 *start_p = myslash;
682 *end_p = myslash + 1;
683 } /* getDirURL */
684
/* #tag is only meaningful after the last slash.
 * Return the first # at or after the last slash of s,
 * or anywhere in s if it has no slash; null if there is none. */
char *findHash(const char *s)
{
	const char *lastSlash = strrchr(s, '/');
	const char *from = lastSlash ? lastSlash : s;

	return (char *)strchr(from, '#');
}				/* findHash */
693
694 /* extract the file piece of a pathname or url */
695 /* This is for debugPrint or w/, so could be chopped for convenience */
getFileURL(const char * url,bool chophash)696 char *getFileURL(const char *url, bool chophash)
697 {
698 const char *s;
699 const char *e;
700 s = strrchr(url, '/');
701 if (s)
702 ++s;
703 else
704 s = url;
705 e = 0;
706 if (isURL(url)) {
707 chophash = true;
708 e = strpbrk(s, "?\1");
709 }
710 if (!e)
711 e = s + strlen(s);
712 if (chophash) {
713 const char *h = findHash(s);
714 if (h)
715 e = h;
716 }
717 // if slash at the end then back up to the prior slash
718 // /.browse is like / at the end
719 if (s > url && (e == s || (e - s == 7 && !strncmp(s, ".browse", 7)))) {
720 while (s > url && s[-1] == '/')
721 --s;
722 e = s;
723 while (s > url && s[-1] != '/')
724 --s;
725 }
726 /* don't retain the .browse suffix on a url */
727 if (e - s > 7 && stringEqual(e - 7, ".browse"))
728 e -= 7;
729 if (e - s > 64)
730 e = s + 64;
731 if (e == s)
732 strcpy(hostbuf, "/");
733 else {
734 strncpy(hostbuf, s, e - s);
735 hostbuf[e - s] = 0;
736 }
737 return hostbuf;
738 } /* getFileURL */
739
/* Fetch the port of a url, and where in the url it was specified.
 * portloc and port are filled in by parseURL.
 * Returns false if url is not a url or has free syntax. */
bool getPortLocURL(const char *url, const char **portloc, int *port)
{
	bool freeSyntax;

	if (!parseURL
	    (url, 0, 0, 0, 0, 0, 0, 0, 0, portloc, port, 0, 0, 0, &freeSyntax))
		return false;
	return !freeSyntax;
}				/* getPortLocURL */
749
/* Return the port of a url, as reported by parseURL.
 * Returns 0 if url is not a url or has free syntax. */
int getPortURL(const char *url)
{
	int port;
	bool freeSyntax;

	if (!parseURL
	    (url, 0, 0, 0, 0, 0, 0, 0, 0, 0, &port, 0, 0, 0, &freeSyntax))
		return 0;
	if (freeSyntax)
		return 0;
	return port;
}				/* getPortURL */
759
/* A quick test: does the url begin with the letter p, in either case? */
bool isProxyURL(const char *url)
{
	char first = url[0];

	return first == 'p' || first == 'P';
}
764
/*
 * copyPathSegment: append one path segment from *src to the string *dest.
 * The segment is the leftmost character (a slash) plus everything
 * up to, but not including, the next slash or the end of the string.
 * *src is advanced to the character that follows the copied text.
 */
static void copyPathSegment(char **src, char **dest, int *destlen)
{
	char *segment = *src;
	int length = 1 + strcspn(segment + 1, "/");

	stringAndBytes(dest, destlen, segment, length);
	*src = segment + length;
}				/* copyPathSegment */
777
/*
 * Remove the rightmost component of a path,
 * including the preceding slash, if any.
 * The path is cut in place and *pathLen is set to the new length;
 * a path with no slash becomes empty.
 */
static void snipLastSegment(char **path, int *pathLen)
{
	char *start = *path;
	char *cut = strrchr(start, '/');

	if (!cut)
		cut = start;
	*cut = '\0';
	*pathLen = cut - start;
}				/* snipLastSegment */
790
squashDirectories(char * url)791 static void squashDirectories(char *url)
792 {
793 char *dd = (char *)getDataURL(url);
794 char *s, *end;
795 char *inPath = NULL;
796 char *outPath;
797 int outPathLen = 0;
798 char *rest = NULL;
799
800 outPath = initString(&outPathLen);
801 if (memEqualCI(url, "javascript:", 11))
802 return;
803 if (!dd || dd == url)
804 return;
805 if (!*dd)
806 return;
807 if (strchr("#?\1", *dd))
808 return;
809 --dd;
810 /* dd could point to : in bogus code such as <A href=crap:foobar> */
811 /* crap: looks like a slashless protocol, perhaps unknown to us. */
812 if (*dd == ':')
813 return;
814 if (*dd != '/')
815 i_printfExit(MSG_BadSlash, url);
816 end = dd + strcspn(dd, "?\1");
817 rest = cloneString(end);
818 inPath = pullString1(dd, end);
819 s = inPath;
820
821 /* The following algorithm is straight out of RFC 3986, section 5.2.4. */
822 /* We can ignore several steps because of a loop invariant: */
823 /* After the test, *s is always a slash. */
824 while (*s) {
825 if (!strncmp(s, "/./", 3))
826 s += 2; /* Point s at 2nd slash */
827 else if (!strcmp(s, "/.")) {
828 s[1] = '\0';
829 /* We'll copy the segment "/" on the next iteration. */
830 /* And that will be the final iteration of the loop. */
831 } else if (!strncmp(s, "/../", 4)) {
832 s += 3; /* Point s at 2nd slash */
833 snipLastSegment(&outPath, &outPathLen);
834 } else if (!strcmp(s, "/..")) {
835 s[1] = '\0';
836 snipLastSegment(&outPath, &outPathLen);
837 /* As above, copy "/" on the next and final iteration. */
838 } else
839 copyPathSegment(&s, &outPath, &outPathLen);
840 }
841 *dd = '\0';
842 strcat(url, outPath);
843 strcat(url, rest);
844 nzFree(inPath);
845 nzFree(outPath);
846 nzFree(rest);
847 } /* squashDirectories */
848
resolveURL(const char * base,const char * rel)849 char *resolveURL(const char *base, const char *rel)
850 {
851 char *n; /* new url */
852 const char *s, *p;
853 char *q;
854 int l;
855
856 if (memEqualCI(rel, "data:", 5))
857 return cloneString(rel);
858
859 debugPrint(5, "resolve(%s|%s)", base, rel);
860 hrefContext = true;
861 if (!base)
862 base = emptyString;
863 if (!rel)
864 rel = emptyString;
865 n = allocString(strlen(base) + strlen(rel) + 12);
866
867 if (rel[0] == '#') {
868 /* This is an anchor for the current document, don't resolve. */
869 /* I assume the base does not have a #fragment on the end; that is not part of the base. */
870 /* Thus I won't get url#foo#bar */
871 strcpy(n, rel);
872 out_n:
873 debugPrint(5, "= %s", n);
874 hrefContext = false;
875 return n;
876 }
877
878 if (rel[0] == '?' || rel[0] == '\1') {
879 /* setting or changing get or post data */
880 strcpy(n, base);
881 for (q = n; *q && *q != '\1' && *q != '?'; q++) ;
882 strcpy(q, rel);
883 goto out_n;
884 }
885
886 if (rel[0] == '/' && rel[1] == '/') {
887 if ((s = strstr(base, "//"))) {
888 strncpy(n, base, s - base);
889 n[s - base] = 0;
890 } else
891 strcpy(n, "http:");
892 strcat(n, rel);
893 goto squash;
894 }
895
896 if (parseURL(rel, &s, &l, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) > 0) {
897 /* has a protocol */
898 n[0] = 0;
899 if (s != rel) {
900 /* It didn't have http in front of it before, put it on now. */
901 /* This is old; it shouldn't happen any more. */
902 strncpy(n, s, l);
903 strcpy(n + l, "://");
904 }
905 strcat(n, rel);
906 goto squash;
907 }
908 // at this point rel is not a url.
909 s = base;
910 if (rel[0] == '/') {
911 s = getDataURL(base);
912 if (!s) {
913 strcpy(n, rel);
914 goto squash;
915 }
916 if (!*s) {
917 if (s - base >= 7 && stringEqual(s - 7, ".browse"))
918 s -= 7;
919 if (s > base && s[-1] == '/')
920 --s;
921 } else if (!strchr("#?\1", *s)) {
922 --s;
923 } else if (s[-1] == '/')
924 --s;
925 l = s - base;
926 strncpy(n, base, l);
927 strcpy(n + l, rel);
928 goto squash;
929 }
930 /* This is a relative change, paste it on after the last slash */
931 s = base;
932 if (parseURL(base, 0, 0, 0, 0, 0, 0, &p, 0, 0, 0, 0, 0, 0, 0) > 0 && p)
933 s = p;
934 for (p = 0; *s; ++s) {
935 if (*s == '/')
936 p = s;
937 if (strchr("#?\1", *s))
938 break;
939 }
940 if (!p) {
941 if (isURL(base))
942 p = s;
943 else
944 p = base;
945 }
946 l = p - base;
947 if (l) {
948 strncpy(n, base, l);
949 n[l++] = '/';
950 }
951 strcpy(n + l, rel);
952
953 squash:
954 squashDirectories(n);
955 goto out_n;
956 } /* resolveURL */
957
/* Are these two urls the same?
 * The post data must match exactly; the #hash, a leading http://,
 * and a trailing .browse are ignored; what remains is compared byte for byte.
 * This routine could be, should be, more sophisticated. */
bool sameURL(const char *s, const char *t)
{
	const char *sEnd, *tEnd, *hash;
	int len;

	if (!s || !t)
		return false;

	/* check for post data at the end */
	sEnd = strchr(s, '\1');
	if (!sEnd)
		sEnd = s + strlen(s);
	tEnd = strchr(t, '\1');
	if (!tEnd)
		tEnd = t + strlen(t);
	if (!stringEqual(sEnd, tEnd))
		return false;

	/* lop off hash */
	hash = findHash(s);
	if (hash)
		sEnd = hash;
	hash = findHash(t);
	if (hash)
		tEnd = hash;

	/* It's ok if one says http and the other implies it. */
	if (memEqualCI(s, "http://", 7))
		s += 7;
	if (memEqualCI(t, "http://", 7))
		t += 7;

	if (sEnd - s >= 7 && stringEqual(sEnd - 7, ".browse"))
		sEnd -= 7;
	if (tEnd - t >= 7 && stringEqual(tEnd - 7, ".browse"))
		tEnd -= 7;

	len = sEnd - s;
	if (len != tEnd - t)
		return false;
	return memcmp(s, t, len) == 0;
}				/* sameURL */
998
/* Find some helpful text to print in place of an image.
 * Not sure why we would need more than 1000 chars for this,
 * so return a static buffer, which the next call overwrites.
 * Returns 0 if base is null, is just "#", or starts with javascript.
 * Text that looks like a url or pathname is reduced to a file name;
 * anything else is returned as is, after tidying the white space. */
char *altText(const char *base)
{
	static char buf[1000];
	int len, n;
	char *s;

	/* Test for null before the trace, which formats base with %s. */
	if (!base)
		return 0;
	debugPrint(6, "altText(%s)", base);
	if (stringEqual(base, "#"))
		return 0;
	if (memEqualCI(base, "javascript", 10))
		return 0;

	/* strncpy never writes the last byte of buf; set it explicitly. */
	strncpy(buf, base, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = 0;
	spaceCrunch(buf, true, false);
	len = strlen(buf);
	/* remove punctuation mark from the end of a sentence or phrase */
	if (len >= 2 && !isalnumByte(buf[len - 1]) && isalnumByte(buf[len - 2]))
		buf[--len] = 0;
	/* strip leading whitespace */
	while (len && isspaceByte(buf[0]))
		strmove(buf, buf + 1), --len;
	if (len > 10) {
		/* see whether it's a phrase/sentence or a pathname/url */
		if (!isURL(buf))
			return buf;	/* looks like words */
		/* Ok, now we believe it's a pathname or url */
		/* get rid of post or get data */
		s = strpbrk(buf, "?\1");
		if (s)
			*s = 0;
		/* get rid of common suffix */
		s = strrchr(buf, '.');
		if (s) {
			/* get rid of trailing .html */
			static const char *const suffix[] = {
				"html", "htm", "shtml", "shtm", "php", "asp",
				"cgi", "rm",
				"ram",
				"gif", "jpg", "bmp",
				0
			};
			n = stringInListCI(suffix, s + 1);
			if (n >= 0 || s[1] == 0)
				*s = 0;
		}
		/* Get rid of everything up to the last slash, leaving the file name */
		/* A slash close to the front is left alone. */
retry:
		s = strrchr(buf, '/');
		if (s && s - buf >= 12) {
			if (!s[1]) {
				/* trailing slash: drop it and look again */
				*s = 0;
				goto retry;
			}
			strmove(buf, s + 1);
		}
	}			/* more than ten characters */
	return buf;
}				/* altText */
1060
1061 /* get post data ready for a url. */
encodePostData(const char * s,const char * keep_chars)1062 char *encodePostData(const char *s, const char *keep_chars)
1063 {
1064 char *post, c;
1065 int l;
1066 char buf[4];
1067
1068 if (!s)
1069 return 0;
1070 if (s == emptyString)
1071 return emptyString;
1072 if (!keep_chars)
1073 keep_chars = "-._~()";
1074 post = initString(&l);
1075 while ((c = *s++)) {
1076 if (isalnumByte(c))
1077 goto putc;
1078 if (strchr(keep_chars, c))
1079 goto putc;
1080 sprintf(buf, "%%%02X", (uchar) c);
1081 stringAndString(&post, &l, buf);
1082 continue;
1083 putc:
1084 stringAndChar(&post, &l, c);
1085 }
1086 return post;
1087 } /* encodePostData */
1088
/* Decode one character of url-encoded text.
 * c is the character just read, and *sp points to the text after it.
 * + becomes space. % followed by two hex digits becomes the byte they
 * encode, or a space if that byte is null, and *sp moves past the digits.
 * Anything else, including a % without two hex digits, is returned
 * unchanged, with *sp left alone. */
static char dohex(char c, const char **sp)
{
	const char *s = *sp;
	char d;

	if (c == '+')
		return ' ';
	if (c != '%')
		return c;
	/* Look at s[1] only if s[0] is a hex digit. If % is the last character
	 * of the string then s[0] is the terminator, and s[1] lies beyond it.
	 * The casts keep isxdigit() defined for bytes with the high bit set. */
	if (!isxdigit((unsigned char)s[0]) || !isxdigit((unsigned char)s[1]))
		return c;	/* should never happen */
	d = fromHex(s[0], s[1]);
	if (!d)
		d = ' ';	/* don't allow nulls */
	*sp = s + 2;
	return d;
}				/* dohex */
1107
decodePostData(const char * data,const char * name,int seqno)1108 char *decodePostData(const char *data, const char *name, int seqno)
1109 {
1110 const char *s, *n, *t;
1111 char *ns = 0, *w = 0;
1112 int j = 0;
1113 char c;
1114
1115 if (!seqno && !name)
1116 i_printfExit(MSG_DecodePost);
1117
1118 for (s = data; *s; s = (*t ? t + 1 : t)) {
1119 n = 0;
1120 t = strchr(s, '&');
1121 if (!t)
1122 t = s + strlen(s);
1123 /* select attribute by number */
1124 ++j;
1125 if (j == seqno)
1126 w = ns = allocString(t - s + 1);
1127 if (seqno && !w)
1128 continue;
1129 if (name)
1130 n = name;
1131 while (s < t && (c = *s) != '=') {
1132 ++s;
1133 c = dohex(c, &s);
1134 if (n) {
1135 /* I don't know if this is suppose to be case insensitive all the time,
1136 * though there are situations when it must be, as in
1137 * mailto:address?Subject=blah-blah */
1138 if (isalphaByte(c)) {
1139 if (!((c ^ *n) & 0xdf))
1140 ++n;
1141 else
1142 n = 0;
1143 } else if (c == *n)
1144 ++n;
1145 else
1146 n = 0;
1147 }
1148 if (w)
1149 *w++ = c;
1150 }
1151
1152 if (s == t) { /* no equals, just a string */
1153 if (name)
1154 continue;
1155 *w = 0;
1156 return ns;
1157 }
1158 if (w)
1159 *w++ = c;
1160 ++s; /* skip past equals */
1161 if (name) {
1162 if (!n)
1163 continue;
1164 if (*n)
1165 continue;
1166 w = ns = allocString(t - s + 1);
1167 }
1168
1169 /* At this point we have a match */
1170 while (s < t) {
1171 c = *s++;
1172 c = dohex(c, &s);
1173 *w++ = c;
1174 }
1175 *w = 0;
1176 return ns;
1177 }
1178
1179 return 0;
1180 } /* decodePostData */
1181
/* Take apart a mailto: url.
 * The address is what follows mailto: up to the first / or ?
 * The subject and body are the fields by those names after the ?
 * and come back as 0 if there is no ? or no such field.
 * Any of the three output pointers may be null.
 * Each string handed back is a new one, from pullString1 or decodePostData. */
void decodeMailURL(const char *url, char **addr_p, char **subj_p, char **body_p)
{
	const char *addrEnd, *query;

	if (memEqualCI(url, "mailto:", 7))
		url += 7;

	addrEnd = url + strcspn(url, "/?");
	if (addr_p)
		*addr_p = pullString1(url, addrEnd);
	if (subj_p)
		*subj_p = 0;
	if (body_p)
		*body_p = 0;

	query = strchr(url, '?');
	if (!query)
		return;
	++query;
	if (subj_p)
		*subj_p = decodePostData(query, "subject", 0);
	if (body_p)
		*body_p = decodePostData(query, "body", 0);
}				/* decodeMailURL */
1203
1204 // Does a url match a pattern, from an entry in .ebrc
1205 // edbrowse.org matches edbrowse.org and foo.edbrowse.org
1206 // edbrowse.org/foo matches edbrowse.org/foo/bar
patternMatchURL(const char * url,const char * pattern)1207 bool patternMatchURL(const char *url, const char *pattern)
1208 {
1209 char prot[MAXPROTLEN], host[MAXHOSTLEN];
1210 const char *path, *q;
1211 int hl, dl, ql;
1212 if (!url || !pattern)
1213 return false;
1214 if (!url[0] || !pattern[0])
1215 return false;
1216 // This function has to be threadsafe, so I call getProtHostURL,
1217 // which is also threadsafe.
1218 if (!getProtHostURL(url, prot, host))
1219 return false;
1220 hl = strlen(host);
1221 path = getDataURL(url);
1222 q = strchr(pattern, '/');
1223 if (!q)
1224 q = pattern + strlen(pattern);
1225 dl = q - pattern;
1226 if (dl > hl)
1227 return false;
1228 if (!memEqualCI(pattern, host + hl - dl, dl))
1229 return false;
1230 if (*q == '/') {
1231 ++q;
1232 if (hl != dl || !path)
1233 return false;
1234 ql = strlen(q);
1235 return !strncmp(q, path, ql) &&
1236 (path[ql] == 0 || path[ql] == '/');
1237 } /* domain/path was specified */
1238 return hl == dl || host[hl - dl - 1] == '.';
1239 }
1240