1 /* Parse HyperText Document Address HTParse.c
2 ** ================================
3 */
4
5 #include "HTParse.h"
6 #define TRACE 0
7
/*
** Release a heap pointer and reset it to NULL.
** The do { } while (0) wrapper makes the expansion a single statement, so
** "if (c) FREE(p); else ..." parses as intended.  The argument is evaluated
** more than once, so it must be a side-effect-free lvalue.
*/
#define FREE(x) do { if (x) { free(x); (x) = NULL; } } while (0)
9
/*
** Pieces of an address as split up by scan().
** Every member is either NULL (piece not present) or points at a
** zero-terminated substring inside the buffer that was handed to scan().
*/
struct struct_parts {
    char *access;		/* scheme: the text before the first ':' */
    char *host;			/* text following a leading "//" */
    char *absolute;		/* path that followed a leading '/' */
    char *relative;		/* path that did not start with '/' */
    /* no separate "search" member: a query is treated as part of the path */
    char *anchor;		/* fragment: the text after '#' */
};
18
19 /* Strings of any length
20 ** ---------------------
21 */
22 PUBLIC int strcasecomp
ARGS2(CONST char *,a,CONST char *,b)23 ARGS2(CONST char *, a, CONST char *, b)
24 {
25 CONST char *p = a;
26 CONST char *q = b;
27
28 for (p = a, q = b; *p && *q; p++, q++) {
29 int diff = TOLOWER(*p) - TOLOWER(*q);
30 if (diff)
31 return diff;
32 }
33 if (*p)
34 return 1; /* p was longer than q */
35 if (*q)
36 return -1; /* p was shorter than q */
37 return 0; /* Exact match */
38 }
39
40 /* With count limit
41 ** ----------------
42 */
43 PUBLIC int strncasecomp
ARGS3(CONST char *,a,CONST char *,b,int,n)44 ARGS3(CONST char *, a, CONST char *, b, int, n)
45 {
46 CONST char *p = a;
47 CONST char *q = b;
48
49 for (p = a, q = b;; p++, q++) {
50 int diff;
51 if (p == (a + n))
52 return 0; /* Match up to n characters */
53 if (!(*p && *q))
54 return (*p - *q);
55 diff = TOLOWER(*p) - TOLOWER(*q);
56 if (diff)
57 return diff;
58 }
59 /* NOTREACHED */
60 }
61
62 /* Allocate a new copy of a string, and returns it
63 */
64 PUBLIC char *HTSACopy
ARGS2(char **,dest,CONST char *,src)65 ARGS2(char **, dest, CONST char *, src)
66 {
67 FREE(*dest);
68 if (src) {
69 *dest = (char *) malloc(strlen(src) + 1);
70 if (*dest == NULL)
71 outofmem(__FILE__, "HTSACopy");
72 strcpy(*dest, src);
73 }
74 return *dest;
75 }
76 /* String Allocate and Concatenate
77 */
78 PUBLIC char *HTSACat
ARGS2(char **,dest,CONST char *,src)79 ARGS2(char **, dest, CONST char *, src)
80 {
81 if (src && *src) {
82 if (*dest) {
83 int length = strlen(*dest);
84 *dest = (char *) realloc(*dest, length + strlen(src) + 1);
85 if (*dest == NULL)
86 outofmem(__FILE__, "HTSACat");
87 strcpy(*dest + length, src);
88 } else {
89 *dest = (char *) malloc(strlen(src) + 1);
90 if (*dest == NULL)
91 outofmem(__FILE__, "HTSACat");
92 strcpy(*dest, src);
93 }
94 }
95 return *dest;
96 }
97
98
99
100 /* Strip white space off a string. HTStrip()
101 ** -------------------------------
102 **
103 ** On exit,
104 ** Return value points to first non-white character, or to 0 if none.
105 ** All trailing white space is OVERWRITTEN with zero.
106 */
107 PUBLIC char *HTStrip
ARGS1(char *,s)108 ARGS1(char *, s)
109 {
110 #define SPACE(c) ((c == ' ') || (c == '\t') || (c == '\n'))
111 char *p = s;
112 for (p = s; *p; p++); /* Find end of string */
113 for (p--; p >= s; p--) {
114 if (SPACE(*p))
115 *p = '\0'; /* Zap trailing blanks */
116 else
117 break;
118 }
119 while (SPACE(*s))
120 s++; /* Strip leading blanks */
121 return s;
122 }
123
124 /* Scan a filename for its consituents. scan()
125 ** ------------------------------------
126 **
127 ** On entry,
128 ** name points to a document name which may be incomplete.
129 ** On exit,
130 ** absolute or relative may be nonzero (but not both).
131 ** host, anchor and access may be nonzero if they were specified.
132 ** Any which are nonzero point to zero terminated strings.
133 */
134 PRIVATE void scan
ARGS2(char *,name,struct struct_parts *,parts)135 ARGS2(char *, name, struct struct_parts *, parts)
136 {
137 char *after_access;
138 char *p;
139 /* int length = strlen (name); */
140
141 parts->access = NULL;
142 parts->host = NULL;
143 parts->absolute = NULL;
144 parts->relative = NULL;
145 parts->anchor = NULL;
146
147 /*
148 ** Scan left-to-right for a scheme (access).
149 */
150 after_access = name;
151 for (p = name; *p; p++) {
152 if (*p == ':') {
153 *p = '\0';
154 parts->access = name; /* Access name has been specified */
155 after_access = (p + 1);
156 break;
157 }
158 if (*p == '/' || *p == '#' || *p == ';' || *p == '?')
159 break;
160 }
161
162 #ifdef NOTDEFINED
163 for (p = (name + length - 1); p >= name; p--) {
164 #endif /* NOTDEFINED */
165 /*
166 ** Scan left-to-right for a fragment (anchor).
167 */
168 for (p = after_access; *p; p++) {
169 if (*p == '#') {
170 parts->anchor = (p + 1);
171 *p = '\0'; /* terminate the rest */
172 }
173 }
174
175 /*
176 ** Scan left-to-right for a host or absolute path.
177 */
178 p = after_access;
179 if (*p == '/') {
180 if (p[1] == '/') {
181 parts->host = (p + 2); /* host has been specified */
182 *p = '\0'; /* Terminate access */
183 p = strchr(parts->host, '/'); /* look for end of host name if any */
184 if (p != NULL) {
185 *p = '\0'; /* Terminate host */
186 parts->absolute = (p + 1); /* Root has been found */
187 }
188 } else {
189 parts->absolute = (p + 1); /* Root found but no host */
190 }
191 } else {
192 parts->relative = (*after_access) ? after_access : NULL; /* NULL for
193 * "" */
194 }
195
196 /*
197 ** Check schemes that commonly have unescaped hashes.
198 */
199 if (parts->access && parts->anchor) {
200 if ((!parts->host && strcasecomp(parts->access, "lynxcgi")) ||
201 !strcasecomp(parts->access, "nntp") ||
202 !strcasecomp(parts->access, "snews") ||
203 !strcasecomp(parts->access, "news") ||
204 !strcasecomp(parts->access, "data")) {
205 /*
206 * Access specified but no host and not a lynxcgi URL, so the
207 * anchor may not really be one, e.g., news:j462#36487@foo.bar,
208 * or it's an nntp or snews URL, or news URL with a host.
209 * Restore the '#' in the address.
210 */
211 *(parts->anchor - 1) = '#';
212 parts->anchor = NULL;
213 }
214 }
215 #ifdef NOT_DEFINED /* search is just treated as part of path */
216 {
217 char *p = (relative ? relative : absolute);
218 if (p != NULL) {
219 char *q = strchr(p, '?'); /* Any search string? */
220 if (q != NULL) {
221 *q = '\0'; /* If so, chop that off. */
222 parts->search = (q + 1);
223 }
224 }
225 }
226 #endif /* NOT_DEFINED */
227 } /* scan */
228
229
230 /* Parse a Name relative to another name. HTParse()
231 ** --------------------------------------
232 **
233 ** This returns those parts of a name which are given (and requested)
234 ** substituting bits from the related name where necessary.
235 **
236 ** On entry,
237 ** aName A filename given
238 ** relatedName A name relative to which aName is to be parsed
239 ** wanted A mask for the bits which are wanted.
240 **
241 ** On exit,
242 ** returns A pointer to a malloc'd string which MUST BE FREED
243 */
244 PUBLIC char *HTParse ARGS3(CONST char *, aName,
245 CONST char *, relatedName, int, wanted) {
246 char *result = NULL;
247 char *return_value = NULL;
248 int len;
249 char *name = NULL;
250 char *rel = NULL;
251 char *p;
252 char *access;
253 struct struct_parts given, related;
254
255 if (TRACE)
256 fprintf(stderr,
257 "HTParse: aName:%s relatedName:%s\n", aName, relatedName);
258
259 /*
260 ** Allocate the output string.
261 */
262 len = strlen(aName) + strlen(relatedName) + 10;
263 result = (char *) malloc(len); /* Lots of space: more than enough */
264 if (result == NULL)
265 outofmem(__FILE__, "HTParse");
266 result[0] = '\0'; /* Clear string */
267
268 /*
269 ** Make working copies of the input strings to cut up.
270 */
271 StrAllocCopy(name, aName);
272 StrAllocCopy(rel, relatedName);
273
274 /*
275 ** Cut up the strings into URL fields.
276 */
277 scan(name, &given);
278 scan(rel, &related);
279
280 /*
281 ** Handle the scheme (access) field.
282 */
283 if (given.access && given.host && !given.relative && !given.absolute) {
284 if (!strcmp(given.access, "http") ||
285 !strcmp(given.access, "https") || !strcmp(given.access, "ftp"))
286 /*
287 ** Assume root.
288 */
289 given.absolute = "";
290 }
291 access = given.access ? given.access : related.access;
292 if (wanted & PARSE_ACCESS) {
293 if (access) {
294 strcat(result, access);
295 if (wanted & PARSE_PUNCTUATION)
296 strcat(result, ":");
297 }
298 }
299
300 /*
301 ** If different schemes, inherit nothing.
302 **
303 ** We'll try complying with RFC 1808 and
304 ** the Fielding draft, and inherit nothing
305 ** if both schemes are given, rather than
306 ** only when they differ, except for
307 ** file URLs - FM
308 **
309 ** After trying it for a while, it's still
310 ** premature, IHMO, to go along with it, so
311 ** this is back to inheriting for identical
312 ** schemes whether or not they are "file".
313 ** If you want to try it again yourself,
314 ** uncomment the strncasecomp() below. - FM
315 */
316 if ((given.access && related.access) && ( /* strcasecomp(given.access,
317 * "file") || */
318 strcmp(given.access,
319 related.access))) {
320 related.host = NULL;
321 related.absolute = NULL;
322 related.relative = NULL;
323 related.anchor = NULL;
324 }
325
326 /*
327 ** Handle the host field.
328 */
329 if (wanted & PARSE_HOST)
330 if (given.host || related.host) {
331 char *tail = result + strlen(result);
332 if (wanted & PARSE_PUNCTUATION)
333 strcat(result, "//");
334 strcat(result, given.host ? given.host : related.host);
335 #define CLEAN_URLS
336 #ifdef CLEAN_URLS
337 /*
338 ** Ignore default port numbers, and trailing dots on FQDNs,
339 ** which will only cause identical addresses to look different.
340 */
341 {
342 char *p, *h;
343 p = strchr(tail, ':');
344 if (p != NULL && !isdigit((unsigned char) p[1]))
345 /*
346 ** Colon not followed by a port number.
347 */
348 *p = '\0';
349 if (p != NULL && p != '\0' && access != NULL) {
350 /*
351 ** Port specified.
352 */
353 if ((!strcmp(access, "http") && !strcmp(p, ":80")) ||
354 (!strcmp(access, "gopher") && !strcmp(p, ":70")) ||
355 (!strcmp(access, "ftp") && !strcmp(p, ":21")) ||
356 (!strcmp(access, "wais") && !strcmp(p, ":210")) ||
357 (!strcmp(access, "nntp") && !strcmp(p, ":119")) ||
358 (!strcmp(access, "news") && !strcmp(p, ":119")) ||
359 (!strcmp(access, "snews") && !strcmp(p, ":563")) ||
360 (!strcmp(access, "finger") && !strcmp(p, ":79")) ||
361 (!strcmp(access, "cso") && !strcmp(p, ":105")))
362 *p = '\0'; /* It is the default: ignore it */
363 }
364 if (p == NULL) {
365 int len = strlen(tail);
366
367 if (len > 0) {
368 h = tail + len - 1; /* last char of hostname */
369 if (*h == '.')
370 *h = '\0'; /* chop final . */
371 }
372 } else {
373 h = p;
374 h--; /* End of hostname */
375 if (*h == '.') {
376 /*
377 ** Slide p over h.
378 */
379 while (*p != '\0')
380 *h++ = *p++;
381 *h = '\0'; /* terminate */
382 }
383 }
384 }
385 #endif /* CLEAN_URLS */
386 }
387
388 /*
389 ** If different hosts, inherit no path.
390 */
391 if (given.host && related.host)
392 if (strcmp(given.host, related.host) != 0) {
393 related.absolute = NULL;
394 related.relative = NULL;
395 related.anchor = NULL;
396 }
397
398 /*
399 ** Handle the path.
400 */
401 if (wanted & PARSE_PATH) {
402 if (access && !given.absolute && given.relative) {
403 if (!strcasecomp(access, "nntp") ||
404 !strcasecomp(access, "snews") ||
405 (!strcasecomp(access, "news") &&
406 !strncasecomp(result, "news://", 7))) {
407 /*
408 * Treat all given nntp or snews paths,
409 * or given paths for news URLs with a host,
410 * as absolute.
411 */
412 given.absolute = given.relative;
413 given.relative = NULL;
414 }
415 }
416 if (given.absolute) { /* All is given */
417 if (wanted & PARSE_PUNCTUATION)
418 strcat(result, "/");
419 strcat(result, given.absolute);
420 if (TRACE)
421 fprintf(stderr, "1\n");
422 } else if (related.absolute) { /* Adopt path not name */
423 strcat(result, "/");
424 strcat(result, related.absolute);
425 if (given.relative) {
426 p = strchr(result, '?'); /* Search part? */
427 if (p == NULL)
428 p = (result + strlen(result) - 1);
429 for (; *p != '/'; p--); /* last / */
430 p[1] = '\0'; /* Remove filename */
431 strcat(result, given.relative); /* Add given one */
432 HTSimplify(result);
433 }
434 if (TRACE)
435 fprintf(stderr, "2\n");
436 } else if (given.relative) {
437 strcat(result, given.relative); /* what we've got */
438 if (TRACE)
439 fprintf(stderr, "3\n");
440 } else if (related.relative) {
441 strcat(result, related.relative);
442 if (TRACE)
443 fprintf(stderr, "4\n");
444 } else { /* No inheritance */
445 if (strncasecomp(aName, "lynxcgi:", 8) &&
446 strncasecomp(aName, "lynxexec:", 9) &&
447 strncasecomp(aName, "lynxprog:", 9)) {
448 strcat(result, "/");
449 }
450 if (!strcmp(result, "news:/"))
451 result[5] = '*';
452 if (TRACE)
453 fprintf(stderr, "5\n");
454 }
455 }
456
457 /*
458 ** Handle the fragment (anchor).
459 */
460 if (wanted & PARSE_ANCHOR)
461 if ((given.anchor && *given.anchor) || (!given.anchor && related.anchor)) {
462 if (wanted & PARSE_PUNCTUATION)
463 strcat(result, "#");
464 strcat(result, (given.anchor) ? given.anchor : related.anchor);
465 }
466 if (TRACE)
467 fprintf(stderr, "HTParse: result:%s\n", result);
468 FREE(rel);
469 FREE(name);
470
471 StrAllocCopy(return_value, result);
472 FREE(result);
473
474 return return_value; /* exactly the right length */
475 }
476
477 /* Simplify a filename. HTSimplify()
478 ** --------------------
479 **
480 ** A unix-style file is allowed to contain the seqeunce xxx/../ which may
481 ** be replaced by "" , and the seqeunce "/./" which may be replaced by "/".
482 ** Simplification helps us recognize duplicate filenames.
483 **
484 ** Thus, /etc/junk/../fred becomes /etc/fred
485 ** /etc/junk/./fred becomes /etc/junk/fred
486 **
487 ** but we should NOT change
488 ** http://fred.xxx.edu/../..
489 **
490 ** or ../../albert.html
491 */
492 PUBLIC void HTSimplify ARGS1(char *, filename) {
493 char *p;
494 char *q, *q1;
495
496 if (filename == NULL)
497 return;
498
499 if ((filename[0] && filename[1]) && strchr(filename, '/') != NULL) {
500 for (p = (filename + 2); *p; p++) {
501 if (*p == '/') {
502 if ((p[1] == '.') && (p[2] == '.') &&
503 (p[3] == '/' || p[3] == '\0')) {
504 /*
505 ** Handle "/../" or "/..".
506 */
507 for (q = (p - 1); (q >= filename) && (*q != '/'); q--)
508 /*
509 ** Back up to previous slash or beginning of string.
510 */
511 ;
512 if ((q[0] == '/') && strncmp(q, "/../", 4) &&
513 !((q - 1) > filename && q[-1] == '/')) {
514 /*
515 ** Not at beginning of string or in a
516 ** host field, so remove the "/xxx/..".
517 */
518 q1 = (p + 3);
519 p = q;
520 while (*q1 != '\0')
521 *p++ = *q1++;
522 *p = '\0'; /* terminate */
523 #ifdef NOTDEFINED
524 /*
525 ** Make sure filename has at least one slash.
526 */
527 if (*filename == '\0') {
528 *filename = '/';
529 *(filename + 1) = '\0';
530 }
531 #endif /* NOTDEFINED */
532 /*
533 ** Start again with previous slash.
534 */
535 p = (q - 1);
536 }
537 } else if (p[1] == '.' && p[2] == '/') {
538 /*
539 ** Handle "./" by removing the characters.
540 */
541 q = p;
542 q1 = (p + 2);
543 while (*q1 != '\0')
544 *q++ = *q1++;
545 *q = '\0'; /* terminate */
546 p--;
547 } else if (p[1] == '.' && p[2] == '\0') {
548 /*
549 ** Handle terminal "." by removing the character.
550 */
551 p[1] = '\0';
552 }
553 }
554 }
555 }
556 }
557
558 /* Make Relative Name. HTRelative()
559 ** -------------------
560 **
561 ** This function creates and returns a string which gives an expression of
562 ** one address as related to another. Where there is no relation, an absolute
563 ** address is retured.
564 **
565 ** On entry,
566 ** Both names must be absolute, fully qualified names of nodes
567 ** (no anchor bits)
568 **
569 ** On exit,
570 ** The return result points to a newly allocated name which, if
571 ** parsed by HTParse relative to relatedName, will yield aName.
572 ** The caller is responsible for freeing the resulting name later.
573 **
574 */
575 PUBLIC char *HTRelative ARGS2(CONST char *, aName, CONST char *, relatedName) {
576 char *result = NULL;
577 CONST char *p = aName;
578 CONST char *q = relatedName;
579 CONST char *after_access = NULL;
580 CONST char *path = NULL;
581 CONST char *last_slash = NULL;
582 int slashes = 0;
583
584 for (; *p; p++, q++) { /* Find extent of match */
585 if (*p != *q)
586 break;
587 if (*p == ':')
588 after_access = p + 1;
589 if (*p == '/') {
590 last_slash = p;
591 slashes++;
592 if (slashes == 3)
593 path = p;
594 }
595 }
596
597 /* q, p point to the first non-matching character or zero */
598
599 if (!after_access) { /* Different access */
600 StrAllocCopy(result, aName);
601 } else if (slashes < 3) { /* Different nodes */
602 StrAllocCopy(result, after_access);
603 } else if (slashes == 3) { /* Same node, different path */
604 StrAllocCopy(result, path);
605 } else { /* Some path in common */
606 int levels = 0;
607 for (; *q && (*q != '#'); q++)
608 if (*q == '/')
609 levels++;
610 result = (char *) malloc(3 * levels + strlen(last_slash) + 1);
611 if (result == NULL)
612 outofmem(__FILE__, "HTRelative");
613 result[0] = '\0';
614 for (; levels; levels--)
615 strcat(result, "../");
616 strcat(result, last_slash + 1);
617 }
618 if (TRACE)
619 fprintf(stderr, "HT: `%s' expressed relative to\n `%s' is\n `%s'.",
620 aName, relatedName, result);
621 return result;
622 }
623