1 /*
2 * $LynxId: HTParse.c,v 1.98 2021/07/27 21:29:49 tom Exp $
3 *
4 * Parse HyperText Document Address HTParse.c
5 * ================================
6 */
7
8 #include <HTUtils.h>
9 #include <HTParse.h>
10
11 #include <LYUtils.h>
12 #include <LYLeaks.h>
13 #include <LYStrings.h>
14 #include <LYCharUtils.h>
15 #include <LYGlobalDefs.h>
16
17 #ifdef HAVE_ALLOCA_H
18 #include <alloca.h>
19 #else
20 #ifdef __MINGW32__
21 #include <malloc.h>
22 #endif /* __MINGW32__ */
23 #endif
24
25 #ifdef USE_IDN2
26 #include <idn2.h>
27 #define FreeIdna(out) idn2_free(out)
28 #elif defined(USE_IDNA)
29 #include <idna.h>
30 #include <idn-free.h>
31 #define FreeIdna(out) idn_free(out)
32 #define IDN2_OK IDNA_SUCCESS
33 #endif
34
35 #define HEX_ESCAPE '%'
36
/*
 * The pieces of a URL as located by scan().  Every member is either NULL or
 * points into the buffer that scan() was given (which scan() cuts up by
 * writing NUL bytes over the separators).
 */
struct struct_parts {
    char *access;		/* scheme: the text before the first ':' */
    char *host;			/* text following a leading "//" */
    char *absolute;		/* path text following a leading '/' */
    char *relative;		/* remaining text when there is no leading '/' */
    char *search;		/* treated normally as part of path */
    char *anchor;		/* text following the first '#' */
};
45
#if 0				/* for debugging */
/*
 * Dump every member of a struct_parts to the trace file, labelled with the
 * variable name and the source line of the SHOW_PARTS() call.  NULL members
 * are passed through NONNULL() before printing.
 */
static void show_parts(const char *name, struct struct_parts *parts, int line)
{
    if (!(TRACE))
	return;

    CTRACE((tfp, "struct_parts(%s) %s@%d\n", name, __FILE__, line));
    CTRACE((tfp, " access '%s'\n", NONNULL(parts->access)));
    CTRACE((tfp, " host '%s'\n", NONNULL(parts->host)));
    CTRACE((tfp, " absolute '%s'\n", NONNULL(parts->absolute)));
    CTRACE((tfp, " relative '%s'\n", NONNULL(parts->relative)));
    CTRACE((tfp, " search '%s'\n", NONNULL(parts->search)));
    CTRACE((tfp, " anchor '%s'\n", NONNULL(parts->anchor)));
}

#define SHOW_PARTS(name) show_parts(#name, &name, __LINE__)
#else
#define SHOW_PARTS(name)	/* nothing */
#endif
63
/*	Strip white space off a string.				HTStrip()
 *	-------------------------------
 *
 * White space here means blank, tab and newline only.
 *
 * On entry,
 *	s	points to a writable, NUL-terminated string.
 * On exit,
 *	Return value points to first non-white character, or to the
 *	terminating NUL if there is none.
 *	All trailing white space is OVERWRITTEN with zero.
 */
char *HTStrip(char *s)
{
#define SPACE(c) (((c) == ' ') || ((c) == '\t') || ((c) == '\n'))
    char *end = s + strlen(s);

    /*
     * Zap trailing blanks.  The test looks at the character *before* 'end',
     * so the pointer never moves in front of the buffer, even for an empty
     * or all-blank string.
     */
    while (end > s && SPACE(end[-1]))
	*--end = '\0';

    /*
     * Strip leading blanks; stops at the NUL at the latest.
     */
    while (SPACE(*s))
	s++;
    return s;
}
89
90 /* Scan a filename for its constituents. scan()
91 * -------------------------------------
92 *
93 * On entry,
94 * name points to a document name which may be incomplete.
95 * On exit,
96 * absolute or relative may be nonzero (but not both).
97 * host, anchor and access may be nonzero if they were specified.
98 * Any which are nonzero point to zero terminated strings.
99 */
scan(char * name,struct struct_parts * parts)100 static void scan(char *name,
101 struct struct_parts *parts)
102 {
103 char *after_access;
104 char *p;
105
106 parts->access = NULL;
107 parts->host = NULL;
108 parts->absolute = NULL;
109 parts->relative = NULL;
110 parts->search = NULL; /* normally not used - kw */
111 parts->anchor = NULL;
112
113 /*
114 * Scan left-to-right for a scheme (access).
115 */
116 after_access = name;
117 for (p = name; *p; p++) {
118 if (*p == ':') {
119 *p = '\0';
120 parts->access = name; /* Access name has been specified */
121 after_access = (p + 1);
122 break;
123 }
124 if (*p == '/' || *p == '#' || *p == ';' || *p == '?')
125 break;
126 }
127
128 /*
129 * Scan left-to-right for a fragment (anchor).
130 */
131 for (p = after_access; *p; p++) {
132 if (*p == '#') {
133 parts->anchor = (p + 1);
134 *p = '\0'; /* terminate the rest */
135 break; /* leave things after first # alone - kw */
136 }
137 }
138
139 /*
140 * Scan left-to-right for a host or absolute path.
141 */
142 p = after_access;
143 if (*p == '/') {
144 if (p[1] == '/') {
145 parts->host = (p + 2); /* host has been specified */
146 *p = '\0'; /* Terminate access */
147 p = StrChr(parts->host, '/'); /* look for end of host name if any */
148 if (p != NULL) {
149 *p = '\0'; /* Terminate host */
150 parts->absolute = (p + 1); /* Root has been found */
151 } else {
152 p = StrChr(parts->host, '?');
153 if (p != NULL) {
154 *p = '\0'; /* Terminate host */
155 parts->search = (p + 1);
156 }
157 }
158 } else {
159 parts->absolute = (p + 1); /* Root found but no host */
160 }
161 } else {
162 parts->relative = (*after_access) ?
163 after_access : NULL; /* NULL for "" */
164 }
165
166 /*
167 * Check schemes that commonly have unescaped hashes.
168 */
169 if (parts->access && parts->anchor &&
170 /* optimize */ StrChr("lnsdLNSD", *parts->access) != NULL) {
171 if ((!parts->host && strcasecomp(parts->access, "lynxcgi")) ||
172 !strcasecomp(parts->access, "nntp") ||
173 !strcasecomp(parts->access, "snews") ||
174 !strcasecomp(parts->access, "news") ||
175 !strcasecomp(parts->access, "data")) {
176 /*
177 * Access specified but no host and not a lynxcgi URL, so the
178 * anchor may not really be one, e.g., news:j462#36487@foo.bar, or
179 * it's an nntp or snews URL, or news URL with a host. Restore the
180 * '#' in the address.
181 */
182 /* but only if we have found a path component of which this will
183 * become part. - kw */
184 if (parts->relative || parts->absolute) {
185 *(parts->anchor - 1) = '#';
186 parts->anchor = NULL;
187 }
188 }
189 }
190 } /*scan */
191
/*
 * Scratch-buffer allocation.  alloca() is used only when it is available
 * and leak-checking is off; otherwise the buffer comes from malloc() and
 * LYalloca_free() really frees it.
 */
#if !defined(HAVE_ALLOCA) || defined(LY_FIND_LEAKS)
#define LYalloca(x)        malloc((size_t)(x))
#define LYalloca_free(x)   free((void *)(x))
#else
#define LYalloca(x)        alloca((size_t)(x))
#define LYalloca_free(x)   {}
#endif
199
/*
 * Like StrChr(), but when 'ch' does not occur the result points to the
 * terminating NUL of 'string' rather than being NULL.
 */
static char *strchr_or_end(char *string, int ch)
{
    char *found = StrChr(string, ch);

    return (found != 0) ? found : (string + strlen(string));
}
209
210 /*
211 * Given a host specification that may end with a port number, e.g.,
212 * foobar:123
213 * point to the ':' which begins the ":port" to make it simple to handle the
214 * substring.
215 *
216 * If no port is found (or a syntax error), return null.
217 */
HTParsePort(char * host,int * portp)218 char *HTParsePort(char *host, int *portp)
219 {
220 int brackets = 0;
221 char *result = NULL;
222
223 *portp = 0;
224 if (host != NULL) {
225 while (*host != '\0' && result == 0) {
226 switch (*host++) {
227 case ':':
228 if (brackets == 0 && isdigit(UCH(*host))) {
229 char *next = NULL;
230
231 *portp = (int) strtol(host, &next, 10);
232 if (next != 0 && next != host && *next == '\0') {
233 result = (host - 1);
234 CTRACE((tfp, "HTParsePort %d\n", *portp));
235 }
236 }
237 break;
238 case '[': /* for ipv6 */
239 ++brackets;
240 break;
241 case ']': /* for ipv6 */
242 --brackets;
243 break;
244 }
245 }
246 }
247 return result;
248 }
249
250 #if defined(USE_IDNA) || defined(USE_IDN2)
/*
 * Return the value (0..15) of a hexadecimal digit in either case, or -1 if
 * 'ch' is not a hexadecimal digit.
 */
static int hex_decode(int ch)
{
    if (ch >= '0' && ch <= '9')
	return ch - '0';
    if (ch >= 'a' && ch <= 'f')
	return 10 + (ch - 'a');
    if (ch >= 'A' && ch <= 'F')
	return 10 + (ch - 'A');
    return -1;
}
263
264 /*
265 * Convert in-place the given hostname to IDNA form. That requires up to 64
266 * characters, and we've allowed for that, with MIN_PARSE.
267 */
convert_to_idna(char * host)268 static void convert_to_idna(char *host)
269 {
270 size_t length = strlen(host);
271 char *endhost = host + length;
272 char *buffer = malloc(length + 1);
273 char *params = malloc(length + 1);
274 char *output = NULL;
275 char *src, *dst;
276 int code;
277 int hi, lo;
278
279 if (buffer != NULL && params != NULL) {
280 code = TRUE;
281 *params = '\0';
282 for (dst = buffer, src = host; src < endhost; ++dst) {
283 int ch = *src++;
284
285 if (RFC_3986_GEN_DELIMS(ch)) {
286 strcpy(params, src - 1);
287 *dst = '\0';
288 break;
289 } else if (ch == HEX_ESCAPE) {
290 if ((src + 1) < endhost
291 && (hi = hex_decode(src[0])) >= 0
292 && (lo = hex_decode(src[1])) >= 0) {
293
294 *dst = (char) ((hi << 4) | lo);
295 src += 2;
296 } else {
297 CTRACE((tfp, "convert_to_idna: `%s' is malformed\n", host));
298 code = FALSE;
299 break;
300 }
301 } else {
302 *dst = (char) ch;
303 }
304 }
305 if (code) {
306 *dst = '\0';
307 #ifdef USE_IDN2
308 #if (!defined(IDN2_VERSION_NUMBER) || IDN2_VERSION_NUMBER < 0x02000003)
309 /*
310 * Older libidn2 mishandles STD3, stripping underscores.
311 */
312 if (strchr(buffer, '_') != NULL) {
313 code = -1;
314 } else
315 #endif
316 switch (LYidnaMode) {
317 case LYidna2003:
318 code = idn2_to_ascii_8z(buffer, &output, IDN2_TRANSITIONAL);
319 break;
320 case LYidna2008:
321 /* IDNA2008 rules without the TR46 amendments */
322 code = idn2_to_ascii_8z(buffer, &output, 0);
323 break;
324 case LYidnaTR46:
325 code = idn2_to_ascii_8z(buffer, &output, IDN2_NONTRANSITIONAL
326 | IDN2_NFC_INPUT);
327 break;
328 case LYidnaCompat:
329 /* IDNA2008 */
330 code = idn2_to_ascii_8z(buffer, &output, IDN2_NONTRANSITIONAL
331 | IDN2_NFC_INPUT);
332 if (code == IDN2_DISALLOWED) {
333 /* IDNA2003 - compatible */
334 code = idn2_to_ascii_8z(buffer, &output, IDN2_TRANSITIONAL);
335 }
336 break;
337 }
338 #else
339 code = idna_to_ascii_8z(buffer, &output, IDNA_USE_STD3_ASCII_RULES);
340 #endif
341 if (code == IDN2_OK) {
342 CTRACE((tfp, "convert_to_idna: `%s' -> `%s': OK\n", buffer, output));
343 strcpy(host, output);
344 strcat(host, params);
345 } else {
346 CTRACE((tfp, "convert_to_idna: `%s': %s\n",
347 buffer,
348 idna_strerror((Idna_rc) code)));
349 }
350 if (output)
351 FreeIdna(output);
352 }
353 }
354 free(buffer);
355 free(params);
356 }
357 #define MIN_PARSE 80
358 #else
359 #define MIN_PARSE 8
360 #endif
361
362 /* Parse a Name relative to another name. HTParse()
363 * --------------------------------------
364 *
365 * This returns those parts of a name which are given (and requested)
366 * substituting bits from the related name where necessary.
367 *
368 * Originally based on RFC 1808, some details in RFC 3986 are used.
369 *
370 * On entry,
371 * aName A filename given
372 * relatedName A name relative to which aName is to be parsed
373 * wanted A mask for the bits which are wanted.
374 *
375 * On exit,
376 * returns A pointer to a malloc'd string which MUST BE FREED
377 */
HTParse(const char * aName,const char * relatedName,int wanted)378 char *HTParse(const char *aName,
379 const char *relatedName,
380 int wanted)
381 {
382 char *result = NULL;
383 char *tail = NULL; /* a pointer to the end of the 'result' string */
384 char *return_value = NULL;
385 size_t len, len1, len2;
386 size_t need;
387 char *name = NULL;
388 char *rel = NULL;
389 char *p, *q;
390 char *acc_method;
391 struct struct_parts given, related;
392
393 CTRACE((tfp, "HTParse: aName:`%s'\n", aName));
394 CTRACE((tfp, " relatedName:`%s'\n", relatedName));
395
396 if (wanted & (PARSE_STRICTPATH | PARSE_QUERY)) { /* if detail wanted... */
397 if ((wanted & (PARSE_STRICTPATH | PARSE_QUERY))
398 == (PARSE_STRICTPATH | PARSE_QUERY)) /* if strictpath AND query */
399 wanted |= PARSE_PATH; /* then treat as if PARSE_PATH wanted */
400 if (wanted & PARSE_PATH) /* if PARSE_PATH wanted */
401 wanted &= ~(PARSE_STRICTPATH | PARSE_QUERY); /* ignore details */
402 }
403 /* *INDENT-OFF* */
404 CTRACE((tfp, " want:%s%s%s%s%s%s%s\n",
405 wanted & PARSE_PUNCTUATION ? " punc" : "",
406 wanted & PARSE_ANCHOR ? " anchor" : "",
407 wanted & PARSE_PATH ? " path" : "",
408 wanted & PARSE_HOST ? " host" : "",
409 wanted & PARSE_ACCESS ? " access" : "",
410 wanted & PARSE_STRICTPATH ? " PATH" : "",
411 wanted & PARSE_QUERY ? " QUERY" : ""));
412 /* *INDENT-ON* */
413
414 /*
415 * Allocate the temporary string. Optimized.
416 */
417 len1 = strlen(aName) + 1;
418 len2 = strlen(relatedName) + 1;
419 len = len1 + len2 + MIN_PARSE; /* Lots of space: more than enough */
420
421 need = (len * 2 + len1 + len2);
422 if (need > (size_t) max_uri_size ||
423 (int) need < (int) len1 ||
424 (int) need < (int) len2)
425 return StrAllocCopy(return_value, "");
426
427 result = tail = (char *) LYalloca(need);
428 if (result == NULL) {
429 outofmem(__FILE__, "HTParse");
430 }
431 *result = '\0';
432 name = result + len;
433 rel = name + len1;
434
435 /*
436 * Make working copy of the input string to cut up.
437 */
438 MemCpy(name, aName, len1);
439
440 /*
441 * Cut up the string into URL fields.
442 */
443 scan(name, &given);
444 SHOW_PARTS(given);
445
446 /*
447 * Now related string.
448 */
449 if ((given.access && given.host && given.absolute) || !*relatedName) {
450 /*
451 * Inherit nothing!
452 */
453 related.access = NULL;
454 related.host = NULL;
455 related.absolute = NULL;
456 related.relative = NULL;
457 related.search = NULL;
458 related.anchor = NULL;
459 } else {
460 MemCpy(rel, relatedName, len2);
461 scan(rel, &related);
462 }
463 SHOW_PARTS(related);
464
465 /*
466 * Handle the scheme (access) field.
467 */
468 if (given.access && given.host && !given.relative && !given.absolute) {
469 if (!strcmp(given.access, "http") ||
470 !strcmp(given.access, "https") ||
471 !strcmp(given.access, "ftp")) {
472
473 /*
474 * Assume root.
475 */
476 given.absolute = empty_string;
477 }
478 }
479 acc_method = given.access ? given.access : related.access;
480 if (wanted & PARSE_ACCESS) {
481 if (acc_method) {
482 strcpy(tail, acc_method);
483 tail += strlen(tail);
484 if (wanted & PARSE_PUNCTUATION) {
485 *tail++ = ':';
486 *tail = '\0';
487 }
488 }
489 }
490
491 /*
492 * If different schemes, inherit nothing.
493 *
494 * We'll try complying with RFC 1808 and the Fielding draft, and inherit
495 * nothing if both schemes are given, rather than only when they differ,
496 * except for file URLs - FM
497 *
498 * After trying it for a while, it's still premature, IHMO, to go along
499 * with it, so this is back to inheriting for identical schemes whether or
500 * not they are "file". If you want to try it again yourself, uncomment
501 * the strcasecomp() below. - FM
502 */
503 if ((given.access && related.access) &&
504 ( /* strcasecomp(given.access, "file") || */
505 strcmp(given.access, related.access))) {
506 related.host = NULL;
507 related.absolute = NULL;
508 related.relative = NULL;
509 related.search = NULL;
510 related.anchor = NULL;
511 }
512
513 /*
514 * Handle the host field.
515 */
516 if (wanted & PARSE_HOST) {
517 if (given.host || related.host) {
518 if (wanted & PARSE_PUNCTUATION) {
519 *tail++ = '/';
520 *tail++ = '/';
521 }
522 strcpy(tail, given.host ? given.host : related.host);
523 /*
524 * Ignore default port numbers, and trailing dots on FQDNs, which
525 * will only cause identical addresses to look different. (related
526 * is already a clean url).
527 */
528 {
529 char *p2, *h;
530 int portnumber;
531 int gen_delims = 0;
532
533 if ((p2 = HTSkipToAt(result, &gen_delims)) != NULL
534 && gen_delims == 0) {
535 tail = (p2 + 1);
536 }
537 p2 = HTParsePort(result, &portnumber);
538 if (p2 != NULL && acc_method != NULL) {
539 /*
540 * Port specified.
541 */
542 #define ACC_METHOD(a,b) (!strcmp(acc_method, a) && (portnumber == b))
543 if (ACC_METHOD("http", 80) ||
544 ACC_METHOD("https", 443) ||
545 ACC_METHOD("gopher", 70) ||
546 ACC_METHOD("ftp", 21) ||
547 ACC_METHOD("wais", 210) ||
548 ACC_METHOD("nntp", 119) ||
549 ACC_METHOD("news", 119) ||
550 ACC_METHOD("newspost", 119) ||
551 ACC_METHOD("newsreply", 119) ||
552 ACC_METHOD("snews", 563) ||
553 ACC_METHOD("snewspost", 563) ||
554 ACC_METHOD("snewsreply", 563) ||
555 ACC_METHOD("finger", 79) ||
556 ACC_METHOD("telnet", 23) ||
557 ACC_METHOD("tn3270", 23) ||
558 ACC_METHOD("rlogin", 513) ||
559 ACC_METHOD("cso", 105))
560 *p2 = '\0'; /* It is the default: ignore it */
561 }
562 if (p2 == NULL) {
563 int len3 = (int) strlen(tail);
564
565 if (len3 > 0) {
566 h = tail + len3 - 1; /* last char of hostname */
567 if (*h == '.')
568 *h = '\0'; /* chop final . */
569 }
570 } else if (p2 != result) {
571 h = p2;
572 h--; /* End of hostname */
573 if (*h == '.') {
574 /*
575 * Slide p2 over h.
576 */
577 while (*p2 != '\0')
578 *h++ = *p2++;
579 *h = '\0'; /* terminate */
580 }
581 }
582 }
583 #if defined(USE_IDNA) || defined(USE_IDN2)
584 /*
585 * Depending on locale-support, we could have a literal UTF-8
586 * string as a host name, or a URL-encoded form of that.
587 */
588 convert_to_idna(tail);
589 #endif
590 }
591 }
592
593 /*
594 * Trim any blanks from the result so far - there's no excuse for blanks
595 * in a hostname. Also update the tail here.
596 */
597 tail = LYRemoveBlanks(result);
598
599 /*
600 * If host in given or related was ended directly with a '?' (no slash),
601 * fake the search part into absolute. This is the only case search is
602 * returned from scan. A host must have been present. this restores the
603 * '?' at which the host part had been truncated in scan, we have to do
604 * this after host part handling is done. - kw
605 */
606 if (given.search && *(given.search - 1) == '\0') {
607 given.absolute = given.search - 1;
608 given.absolute[0] = '?';
609 } else if (related.search && !related.absolute &&
610 *(related.search - 1) == '\0') {
611 related.absolute = related.search - 1;
612 related.absolute[0] = '?';
613 }
614
615 /*
616 * If different hosts, inherit no path.
617 */
618 if (given.host && related.host)
619 if (strcmp(given.host, related.host) != 0) {
620 related.absolute = NULL;
621 related.relative = NULL;
622 related.anchor = NULL;
623 }
624
625 /*
626 * Handle the path.
627 */
628 if (wanted & (PARSE_PATH | PARSE_STRICTPATH | PARSE_QUERY)) {
629 int want_detail = (wanted & (PARSE_STRICTPATH | PARSE_QUERY));
630
631 if (acc_method && !given.absolute && given.relative) {
632 /*
633 * Treat all given nntp or snews paths, or given paths for news
634 * URLs with a host, as absolute.
635 */
636 switch (*acc_method) {
637 case 'N':
638 case 'n':
639 if (!strcasecomp(acc_method, "nntp") ||
640 (!strcasecomp(acc_method, "news") &&
641 !strncasecomp(result, "news://", 7))) {
642 given.absolute = given.relative;
643 given.relative = NULL;
644 }
645 break;
646 case 'S':
647 case 's':
648 if (!strcasecomp(acc_method, "snews")) {
649 given.absolute = given.relative;
650 given.relative = NULL;
651 }
652 break;
653 }
654 }
655
656 if (given.absolute) { /* All is given */
657 char *base = tail;
658
659 if (wanted & PARSE_PUNCTUATION)
660 *tail++ = '/';
661 strcpy(tail, given.absolute);
662 HTSimplify(base, TRUE);
663 CTRACE((tfp, "HTParse: (ABS)\n"));
664 } else if (related.absolute) { /* Adopt path not name */
665 char *base = tail;
666
667 *tail++ = '/';
668 strcpy(tail, related.absolute);
669 if (given.relative) {
670 /* RFC 1808 part 4 step 5 (if URL path is empty) */
671 /* a) if given has params, add/replace that */
672 if (given.relative[0] == ';') {
673 strcpy(strchr_or_end(tail, ';'), given.relative);
674 }
675 /* b) if given has query, add/replace that */
676 else if (given.relative[0] == '?') {
677 strcpy(strchr_or_end(tail, '?'), given.relative);
678 }
679 /* otherwise fall through to RFC 1808 part 4 step 6 */
680 else {
681 p = StrChr(tail, '?'); /* Search part? */
682 if (p == NULL)
683 p = (tail + strlen(tail) - 1);
684 for (; *p != '/'; p--) ; /* last / */
685 p[1] = '\0'; /* Remove filename */
686 strcat(p, given.relative); /* Add given one */
687 }
688 HTSimplify(base, FALSE);
689 if (*base == '\0')
690 strcpy(base, "/");
691 } else {
692 HTSimplify(base, TRUE);
693 }
694 if (base[0] == '/' && base[1] == '/') {
695 char *pz;
696
697 for (pz = base; (pz[0] = pz[1]) != '\0'; ++pz) ;
698 }
699 CTRACE((tfp, "HTParse: (Related-ABS)\n"));
700 } else if (given.relative) {
701 strcpy(tail, given.relative); /* what we've got */
702 HTSimplify(tail, FALSE);
703 CTRACE((tfp, "HTParse: (REL)\n"));
704 } else if (related.relative) {
705 strcpy(tail, related.relative);
706 HTSimplify(tail, FALSE);
707 CTRACE((tfp, "HTParse: (Related-REL)\n"));
708 } else { /* No inheritance */
709 if (!isLYNXCGI(aName) &&
710 !isLYNXEXEC(aName) &&
711 !isLYNXPROG(aName)) {
712 *tail++ = '/';
713 *tail = '\0';
714 } else {
715 HTSimplify(tail, FALSE);
716 }
717 if (!strcmp(result, "news:/"))
718 result[5] = '*';
719 CTRACE((tfp, "HTParse: (No inheritance)\n"));
720 }
721 if (want_detail) {
722 p = StrChr(tail, '?'); /* Search part? */
723 if (p) {
724 if (PARSE_STRICTPATH) {
725 *p = '\0';
726 } else {
727 if (!(wanted & PARSE_PUNCTUATION))
728 p++;
729 do {
730 *tail++ = *p;
731 } while (*p++);
732 }
733 } else {
734 if (wanted & PARSE_QUERY)
735 *tail = '\0';
736 }
737 }
738 }
739
740 /*
741 * Handle the fragment (anchor). Never inherit.
742 */
743 if (wanted & PARSE_ANCHOR) {
744 if (given.anchor && *given.anchor) {
745 tail += strlen(tail);
746 if (wanted & PARSE_PUNCTUATION)
747 *tail++ = '#';
748 strcpy(tail, given.anchor);
749 }
750 }
751
752 /*
753 * If there are any blanks remaining in the string, escape them as needed.
754 * See the discussion in LYLegitimizeHREF() for example.
755 */
756 if ((p = StrChr(result, ' ')) != 0) {
757 switch (is_url(result)) {
758 case UNKNOWN_URL_TYPE:
759 CTRACE((tfp, "HTParse: ignore:`%s'\n", result));
760 break;
761 case LYNXEXEC_URL_TYPE:
762 case LYNXPROG_URL_TYPE:
763 case LYNXCGI_URL_TYPE:
764 case LYNXPRINT_URL_TYPE:
765 case LYNXHIST_URL_TYPE:
766 case LYNXDOWNLOAD_URL_TYPE:
767 case LYNXKEYMAP_URL_TYPE:
768 case LYNXIMGMAP_URL_TYPE:
769 case LYNXCOOKIE_URL_TYPE:
770 case LYNXCACHE_URL_TYPE:
771 case LYNXDIRED_URL_TYPE:
772 case LYNXOPTIONS_URL_TYPE:
773 case LYNXCFG_URL_TYPE:
774 case LYNXCOMPILE_OPTS_URL_TYPE:
775 case LYNXMESSAGES_URL_TYPE:
776 CTRACE((tfp, "HTParse: spaces:`%s'\n", result));
777 break;
778 case NOT_A_URL_TYPE:
779 default:
780 CTRACE((tfp, "HTParse: encode:`%s'\n", result));
781 do {
782 q = p + strlen(p) + 2;
783
784 while (q != p + 1) {
785 q[0] = q[-2];
786 --q;
787 }
788 p[0] = HEX_ESCAPE;
789 p[1] = '2';
790 p[2] = '0';
791 } while ((p = StrChr(result, ' ')) != 0);
792 break;
793 }
794 }
795 CTRACE((tfp, "HTParse: result:`%s'\n", result));
796
797 StrAllocCopy(return_value, result);
798 LYalloca_free(result);
799
800 /* FIXME: could be optimized using HTParse() internals */
801 if (*relatedName &&
802 ((wanted & PARSE_ALL_WITHOUT_ANCHOR) == PARSE_ALL_WITHOUT_ANCHOR)) {
803 /*
804 * Check whether to fill in localhost. - FM
805 */
806 LYFillLocalFileURL(&return_value, relatedName);
807 CTRACE((tfp, "pass LYFillLocalFile:`%s'\n", return_value));
808 }
809
810 return return_value; /* exactly the right length */
811 }
812
813 /* HTParseAnchor(), fast HTParse() specialization
814 * ----------------------------------------------
815 *
816 * On exit,
817 * returns A pointer within input string (probably to its end '\0')
818 */
HTParseAnchor(const char * aName)819 const char *HTParseAnchor(const char *aName)
820 {
821 const char *p = aName;
822
823 for (; *p && *p != '#'; p++) {
824 ;
825 }
826 if (*p == '#') {
827 /* the safe way based on HTParse() -
828 * keeping in mind scan() peculiarities on schemes:
829 */
830 struct struct_parts given;
831 size_t need = ((unsigned) ((p - aName) + (int) strlen(p) + 1));
832 char *name;
833
834 if (need > (size_t) max_uri_size) {
835 p += strlen(p);
836 } else {
837 name = (char *) LYalloca(need);
838
839 if (name == NULL) {
840 outofmem(__FILE__, "HTParseAnchor");
841 }
842 strcpy(name, aName);
843 scan(name, &given);
844 LYalloca_free(name);
845
846 p++; /*next to '#' */
847 if (given.anchor == NULL) {
848 for (; *p; p++) /*scroll to end '\0' */
849 ;
850 }
851 }
852 }
853 return p;
854 }
855
856 /* Simplify a filename. HTSimplify()
857 * --------------------
858 *
859 * A unix-style file is allowed to contain the sequence xxx/../ which may
860 * be replaced by "" , and the sequence "/./" which may be replaced by "/".
861 * Simplification helps us recognize duplicate filenames.
862 *
863 * RFC 3986 section 5.2.4 says to do this whether or not the path was relative.
864 */
HTSimplify(char * filename,BOOL absolute)865 void HTSimplify(char *filename, BOOL absolute)
866 {
867 #define MY_FMT "HTParse HTSimplify\t(%s)"
868 #ifdef NO_LYNX_TRACE
869 #define debug_at(at) /* nothing */
870 #define atln "?"
871 #else
872 const char *atln;
873
874 #define debug_at(at) atln = at
875 #endif
876 char *mark;
877 char *p;
878 size_t limit;
879
880 CTRACE2(TRACE_HTPARSE,
881 (tfp, MY_FMT " %s\n",
882 filename,
883 absolute ? "ABS" : "REL"));
884
885 if (LYIsPathSep(*filename) && !absolute)
886 ++filename;
887 mark = filename;
888 limit = strlen(filename);
889
890 for (p = filename; *p; ++p) {
891 if (*p == '?' || *p == '#') {
892 limit = (size_t) (p - filename);
893 break;
894 }
895 }
896 while ((limit != 0) && (*filename != '\0')) {
897 size_t trim = 0;
898 size_t skip = 0;
899 size_t last = 0;
900
901 debug_at("?");
902 p = filename;
903 if (limit >= 2 && !memcmp(p, "./", 2)) { /* 2A */
904 debug_at("2A");
905 trim = 2;
906 } else if (limit >= 3 && !memcmp(p, "../", 3)) {
907 debug_at("2A2");
908 trim = 3;
909 } else if (limit >= 3 && !memcmp(p, "/./", 3)) { /* 2B */
910 debug_at("2B");
911 trim = 2;
912 skip = 1;
913 } else if (limit == 2 && !memcmp(p, "/.", 2)) {
914 debug_at("2B2");
915 trim = 1;
916 skip = 1;
917 } else if (limit >= 4 && !memcmp(p, "/../", 4)) { /* 2C */
918 debug_at("2C");
919 trim = 3;
920 skip = 1;
921 last = 1;
922 } else if (limit == 3 && !memcmp(p, "/..", 3)) {
923 debug_at("2C2");
924 trim = 2;
925 skip = 1;
926 last = 1;
927 } else if (limit == 2 && !memcmp(p, "..", 2)) { /* 2D */
928 debug_at("2D");
929 trim = 2;
930 } else if (limit == 1 && !memcmp(p, ".", 1)) {
931 debug_at("2D2");
932 trim = 1;
933 }
934 if (trim) {
935 CTRACE2(TRACE_HTPARSE,
936 (tfp, MY_FMT " trim %lu/%lu (%.*s) '%.*s' @%s\n",
937 mark, (unsigned long) trim, (unsigned long) limit,
938 (int) trim, p + skip, (int) limit, p, atln));
939 }
940 if (last) {
941 char *prior = filename;
942
943 if (prior != mark) {
944 --prior;
945 while (prior != mark && *prior != '/') {
946 --prior;
947 }
948 }
949 if (prior != filename) {
950 trim += (size_t) (filename - prior);
951 limit += (size_t) (filename - prior);
952 filename = prior;
953 CTRACE2(TRACE_HTPARSE,
954 (tfp, MY_FMT " TRIM %lu/%lu (%.*s)\n",
955 mark, (unsigned long) trim, (unsigned long) limit,
956 (int) trim, filename + skip));
957 }
958 }
959 if (trim) {
960 limit -= trim;
961 for (p = filename;; ++p) {
962 if ((p[0] = p[trim]) == '\0') {
963 break;
964 }
965 if (skip) {
966 p[0] = '/';
967 skip = 0;
968 }
969 }
970 CTRACE2(TRACE_HTPARSE,
971 (tfp, MY_FMT " loop %lu\n", mark, (unsigned long) limit));
972 } else {
973 if (*filename == '/') {
974 ++filename;
975 --limit;
976 }
977 while ((limit != 0) && (*filename != '/')) {
978 ++filename;
979 --limit;
980 }
981 }
982 }
983 CTRACE2(TRACE_HTPARSE, (tfp, MY_FMT " done\n", mark));
984 #undef MY_FMT
985 }
986
987 /* Make Relative Name. HTRelative()
988 * -------------------
989 *
990 * This function creates and returns a string which gives an expression of
991 * one address as related to another. Where there is no relation, an absolute
992 * address is returned.
993 *
994 * On entry,
995 * Both names must be absolute, fully qualified names of nodes
996 * (no anchor bits)
997 *
998 * On exit,
999 * The return result points to a newly allocated name which, if
1000 * parsed by HTParse relative to relatedName, will yield aName.
1001 * The caller is responsible for freeing the resulting name later.
1002 *
1003 */
HTRelative(const char * aName,const char * relatedName)1004 char *HTRelative(const char *aName,
1005 const char *relatedName)
1006 {
1007 char *result = NULL;
1008 const char *p = aName;
1009 const char *q = relatedName;
1010 const char *after_access = NULL;
1011 const char *path = NULL;
1012 const char *last_slash = NULL;
1013 int slashes = 0;
1014
1015 for (; *p; p++, q++) { /* Find extent of match */
1016 if (*p != *q)
1017 break;
1018 if (*p == ':')
1019 after_access = p + 1;
1020 if (*p == '/') {
1021 last_slash = p;
1022 slashes++;
1023 if (slashes == 3)
1024 path = p;
1025 }
1026 }
1027
1028 /* q, p point to the first non-matching character or zero */
1029
1030 if (!after_access) { /* Different access */
1031 StrAllocCopy(result, aName);
1032 } else if (slashes < 3) { /* Different nodes */
1033 StrAllocCopy(result, after_access);
1034 } else if (slashes == 3) { /* Same node, different path */
1035 StrAllocCopy(result, path);
1036 } else { /* Some path in common */
1037 unsigned levels = 0;
1038
1039 for (; *q && (*q != '#'); q++)
1040 if (*q == '/')
1041 levels++;
1042 result = typecallocn(char, 3 * levels + strlen(last_slash) + 1);
1043
1044 if (result == NULL)
1045 outofmem(__FILE__, "HTRelative");
1046
1047 result[0] = '\0';
1048 for (; levels; levels--)
1049 strcat(result, "../");
1050 strcat(result, last_slash + 1);
1051 }
1052 CTRACE((tfp,
1053 "HTparse: `%s' expressed relative to\n `%s' is\n `%s'.\n",
1054 aName, relatedName, result));
1055 return result;
1056 }
1057
/*
 * Allocate a char buffer via typecallocn() whose size is the distance from
 * 'base' to 'next' plus 'extra'.  The arguments are parenthesized so that
 * any expression may be passed, and 'extra' is converted as a whole to
 * size_t rather than having only its first term cast to int.
 */
#define AlloCopy(next,base,extra) \
	typecallocn(char, ((size_t) ((next) - (base)) + (size_t) (extra)))
1060
1061 /* Escape undesirable characters using % HTEscape()
1062 * -------------------------------------
1063 *
1064 * This function takes a pointer to a string in which
1065 * some characters may be unacceptable unescaped.
1066 * It returns a string which has these characters
1067 * represented by a '%' character followed by two hex digits.
1068 *
1069 * Unlike HTUnEscape(), this routine returns a calloc'd string.
1070 */
/* *INDENT-OFF* */
/*
 * Classification of the characters 32..127, indexed by (code - 32).  Each
 * entry is a set of bits which is tested against the caller's mask:
 *
 *	Bit 0		xalpha		-- see HTFile.h
 *	Bit 1		xpalpha		-- as xalpha but with plus.
 *	Bit 2 ...	path		-- as xpalphas but with /
 */
static const unsigned char isAcceptable[96] =
    /*   0 1 2 3 4 5 6 7 8 9 A B C D E F */
    {    0,0,0,0,0,0,0,0,0,0,7,6,0,7,7,4,	/* 2x   !"#$%&'()*+,-./  */
	 7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0,	/* 3x  0123456789:;<=>?  */
	 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* 4x  @ABCDEFGHIJKLMNO  */
	 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7,	/* 5X  PQRSTUVWXYZ[\]^_  */
	 0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* 6x  `abcdefghijklmno  */
	 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0 };	/* 7X  pqrstuvwxyz{|}~  DEL */
/* *INDENT-ON* */

/* digits used when writing a %XX escape */
static const char *hex = "0123456789ABCDEF";

/*
 * True if character code 'a' may be left unescaped.  NOTE: this tests
 * against a variable named 'mask' which must be in scope where the macro is
 * used.  The argument is evaluated more than once.
 */
#define ACCEPTABLE(a)	( (a) >= 32 && (a) < 128 && ((isAcceptable[(a) - 32]) & mask))
1090
HTEscape(const char * str,unsigned mask)1091 char *HTEscape(const char *str,
1092 unsigned mask)
1093 {
1094 const char *p;
1095 char *q;
1096 char *result;
1097 size_t unacceptable = 0;
1098
1099 for (p = str; *p; p++)
1100 if (!ACCEPTABLE(UCH(TOASCII(*p))))
1101 unacceptable++;
1102 result = AlloCopy(p, str, (unacceptable * 2) + 1);
1103
1104 if (result == NULL)
1105 outofmem(__FILE__, "HTEscape");
1106
1107 for (q = result, p = str; *p; p++) {
1108 unsigned char a = UCH(TOASCII(*p));
1109
1110 if (!ACCEPTABLE(a)) {
1111 *q++ = HEX_ESCAPE; /* Means hex coming */
1112 *q++ = hex[a >> 4];
1113 *q++ = hex[a & 15];
1114 } else
1115 *q++ = *p;
1116 }
1117 *q = '\0'; /* Terminate */
1118 return result;
1119 }
1120
1121 /* Escape unsafe characters using % HTEscapeUnsafe()
1122 * --------------------------------
1123 *
1124 * This function takes a pointer to a string in which
1125 * some characters may be that may be unsafe are unescaped.
1126 * It returns a string which has these characters
1127 * represented by a '%' character followed by two hex digits.
1128 *
1129 * Unlike HTUnEscape(), this routine returns a malloc'd string.
1130 */
1131 #define UNSAFE(ch) (((ch) <= 32) || ((ch) >= 127))
1132
HTEscapeUnsafe(const char * str)1133 char *HTEscapeUnsafe(const char *str)
1134 {
1135 const char *p;
1136 char *q;
1137 char *result;
1138 size_t unacceptable = 0;
1139
1140 for (p = str; *p; p++)
1141 if (UNSAFE(UCH(TOASCII(*p))))
1142 unacceptable++;
1143 result = AlloCopy(p, str, (unacceptable * 2) + 1);
1144
1145 if (result == NULL)
1146 outofmem(__FILE__, "HTEscapeUnsafe");
1147
1148 for (q = result, p = str; *p; p++) {
1149 unsigned char a = UCH(TOASCII(*p));
1150
1151 if (UNSAFE(a)) {
1152 *q++ = HEX_ESCAPE; /* Means hex coming */
1153 *q++ = hex[a >> 4];
1154 *q++ = hex[a & 15];
1155 } else
1156 *q++ = *p;
1157 }
1158 *q = '\0'; /* Terminate */
1159 return result;
1160 }
1161
1162 /* Escape undesirable characters using % but space to +. HTEscapeSP()
1163 * -----------------------------------------------------
1164 *
1165 * This function takes a pointer to a string in which
1166 * some characters may be unacceptable unescaped.
1167 * It returns a string which has these characters
1168 * represented by a '%' character followed by two hex digits,
1169 * except that spaces are converted to '+' instead of %2B.
1170 *
1171 * Unlike HTUnEscape(), this routine returns a calloced string.
1172 */
HTEscapeSP(const char * str,unsigned mask)1173 char *HTEscapeSP(const char *str,
1174 unsigned mask)
1175 {
1176 const char *p;
1177 char *q;
1178 char *result;
1179 size_t unacceptable = 0;
1180
1181 for (p = str; *p; p++)
1182 if (!(*p == ' ' || ACCEPTABLE(UCH(TOASCII(*p)))))
1183 unacceptable++;
1184 result = AlloCopy(p, str, (unacceptable * 2) + 1);
1185
1186 if (result == NULL)
1187 outofmem(__FILE__, "HTEscape");
1188
1189 for (q = result, p = str; *p; p++) {
1190 unsigned char a = UCH(TOASCII(*p));
1191
1192 if (a == 32) {
1193 *q++ = '+';
1194 } else if (!ACCEPTABLE(a)) {
1195 *q++ = HEX_ESCAPE; /* Means hex coming */
1196 *q++ = hex[a >> 4];
1197 *q++ = hex[a & 15];
1198 } else {
1199 *q++ = *p;
1200 }
1201 }
1202 *q = '\0'; /* Terminate */
1203 return result;
1204 }
1205
1206 /* Decode %xx escaped characters. HTUnEscape()
1207 * ------------------------------
1208 *
1209 * This function takes a pointer to a string in which some
1210 * characters may have been encoded in %xy form, where xy is
1211 * the ASCII hex code for character 16x+y.
1212 * The string is converted in place, as it will never grow.
1213 */
/*
 * Helper for the unescape routines: numeric value of one hex digit.
 * '0'..'9' give 0..9 and 'A'..'F' give 10..15; anything else is treated as
 * a lower-case letter (so 'a'..'f' give 10..15).  The argument is not
 * validated here; callers check it with isxdigit() first.
 */
static char from_hex(int c)
{
    int value;

    if (c >= '0' && c <= '9') {
        value = c - '0';
    } else if (c >= 'A' && c <= 'F') {
        value = c - 'A' + 10;
    } else {
        /* accept small letters just in case */
        value = c - 'a' + 10;
    }
    return (char) value;
}
1220
HTUnEscape(char * str)1221 char *HTUnEscape(char *str)
1222 {
1223 char *p = str;
1224 char *q = str;
1225
1226 if (!(p && *p))
1227 return str;
1228
1229 while (*p != '\0') {
1230 if (*p == HEX_ESCAPE &&
1231 /*
1232 * Tests shouldn't be needed, but better safe than sorry.
1233 */
1234 p[1] && p[2] &&
1235 isxdigit(UCH(p[1])) &&
1236 isxdigit(UCH(p[2]))) {
1237 p++;
1238 if (*p)
1239 *q = (char) (from_hex(*p++) * 16);
1240 if (*p) {
1241 /*
1242 * Careful! FROMASCII() may evaluate its arg more than once!
1243 */
1244 /* S/390 -- gil -- 0221 */
1245 *q = (char) (*q + from_hex(*p++));
1246 }
1247 *q = FROMASCII(*q);
1248 q++;
1249 } else {
1250 *q++ = *p++;
1251 }
1252 }
1253
1254 *q = '\0';
1255 return str;
1256
1257 } /* HTUnEscape */
1258
1259 /* Decode some %xx escaped characters. HTUnEscapeSome()
1260 * ----------------------------------- Klaus Weide
1261 * (kweide@tezcat.com)
1262 * This function takes a pointer to a string in which some
1263 * characters may have been encoded in %xy form, where xy is
1264 * the ASCII hex code for character 16x+y, and a pointer to
1265 * a second string containing one or more characters which
1266 * should be unescaped if escaped in the first string.
1267 * The first string is converted in place, as it will never grow.
1268 */
HTUnEscapeSome(char * str,const char * do_trans)1269 char *HTUnEscapeSome(char *str,
1270 const char *do_trans)
1271 {
1272 char *p = str;
1273 char *q = str;
1274 char testcode;
1275
1276 if (p == NULL || *p == '\0' || do_trans == NULL || *do_trans == '\0')
1277 return str;
1278
1279 while (*p != '\0') {
1280 if (*p == HEX_ESCAPE &&
1281 p[1] && p[2] && /* tests shouldn't be needed, but.. */
1282 isxdigit(UCH(p[1])) &&
1283 isxdigit(UCH(p[2])) &&
1284 (testcode = (char) FROMASCII(from_hex(p[1]) * 16 +
1285 from_hex(p[2]))) && /* %00 no good */
1286 StrChr(do_trans, testcode)) { /* it's one of the ones we want */
1287 *q++ = testcode;
1288 p += 3;
1289 } else {
1290 *q++ = *p++;
1291 }
1292 }
1293
1294 *q = '\0';
1295 return str;
1296
1297 } /* HTUnEscapeSome */
/* *INDENT-OFF* */
/*
 * Per-character flags for ASCII codes 32..127, indexed by (code - 32).
 *
 *	Bit 0 (value 1)	-- the character forces the word to be quoted
 *	Bit 1 (value 2)	-- the character needs a \escape inside the quotes
 */
static const unsigned char crfc[96] = {
    /* SP !  "  #  $  %  &  '  */
       1, 0, 3, 0, 0, 0, 0, 0,
    /* (  )  *  +  ,  -  .  /  */
       1, 1, 0, 0, 1, 0, 1, 0,
    /* 0  1  2  3  4  5  6  7  */
       0, 0, 0, 0, 0, 0, 0, 0,
    /* 8  9  :  ;  <  =  >  ?  */
       0, 0, 1, 1, 1, 0, 1, 0,
    /* @  A  B  C  D  E  F  G  */
       1, 0, 0, 0, 0, 0, 0, 0,
    /* H  I  J  K  L  M  N  O  */
       0, 0, 0, 0, 0, 0, 0, 0,
    /* P  Q  R  S  T  U  V  W  */
       0, 0, 0, 0, 0, 0, 0, 0,
    /* X  Y  Z  [  \  ]  ^  _  */
       0, 0, 0, 1, 2, 1, 0, 0,
    /* `  a  b  c  d  e  f  g  */
       0, 0, 0, 0, 0, 0, 0, 0,
    /* h  i  j  k  l  m  n  o  */
       0, 0, 0, 0, 0, 0, 0, 0,
    /* p  q  r  s  t  u  v  w  */
       0, 0, 0, 0, 0, 0, 0, 0,
    /* x  y  z  {  |  }  ~ DEL */
       0, 0, 0, 0, 0, 0, 0, 3
};
/* *INDENT-ON* */
1312
/*
 * ASCII codes, written as octal numbers so that they do not depend on the
 * host character set.
 */
enum {
    ASCII_TAB = 011,		/* horizontal tab */
    ASCII_LF = 012,		/* line feed */
    ASCII_CR = 015,		/* carriage return */
    ASCII_SPC = 040,		/* space */
    ASCII_BAK = 0134		/* backslash */
};
1318
1319 /*
1320 * Turn a string which is not a RFC 822 token into a quoted-string. - KW
1321 * The "quoted" parameter tells whether we need the beginning/ending quote
1322 * marks. If not, the caller will provide them -TD
1323 */
HTMake822Word(char ** str,int quoted)1324 void HTMake822Word(char **str,
1325 int quoted)
1326 {
1327 const char *p;
1328 char *q;
1329 char *result;
1330 unsigned char a;
1331 unsigned added = 0;
1332
1333 if (isEmpty(*str)) {
1334 StrAllocCopy(*str, quoted ? "\"\"" : "");
1335 return;
1336 }
1337 for (p = *str; *p; p++) {
1338 a = UCH(TOASCII(*p)); /* S/390 -- gil -- 0240 */
1339 if (a < 32 || a >= 128 ||
1340 ((crfc[a - 32]) & 1)) {
1341 if (!added)
1342 added = 2;
1343 if (a >= 160 || a == '\t')
1344 continue;
1345 if (a == '\r' || a == '\n')
1346 added += 2;
1347 else if ((a & 127) < 32 || ((crfc[a - 32]) & 2))
1348 added++;
1349 }
1350 }
1351 if (!added)
1352 return;
1353 result = AlloCopy(p, *str, added + 1);
1354 if (result == NULL)
1355 outofmem(__FILE__, "HTMake822Word");
1356
1357 q = result;
1358 if (quoted)
1359 *q++ = '"';
1360 /*
1361 * Having converted the character to ASCII, we can't use symbolic
1362 * escape codes, since they're in the host character set, which
1363 * is not necessarily ASCII. Thus we use octal escape codes instead.
1364 * -- gil (Paul Gilmartin) <pg@sweng.stortek.com>
1365 */
1366 /* S/390 -- gil -- 0268 */
1367 for (p = *str; *p; p++) {
1368 a = UCH(TOASCII(*p));
1369 if ((a != ASCII_TAB) &&
1370 ((a & 127) < ASCII_SPC ||
1371 (a < 128 && ((crfc[a - 32]) & 2))))
1372 *q++ = ASCII_BAK;
1373 *q++ = *p;
1374 if (a == ASCII_LF ||
1375 (a == ASCII_CR && (TOASCII(*(p + 1)) != ASCII_LF)))
1376 *q++ = ' ';
1377 }
1378 if (quoted)
1379 *q++ = '"';
1380 *q = '\0'; /* Terminate */
1381 FREE(*str);
1382 *str = result;
1383 }
1384