1 /*              Parse HyperText Document Address                HTParse.c
2    **           ================================
3  */
4 
5 #include "HTParse.h"
6 #define TRACE 0
7 
/*
 *  Release x if it is non-NULL and reset it to NULL.
 *  The do/while(0) wrapper makes "FREE(p);" a single statement, so the
 *  macro can be used as the body of an if/else without capturing the else.
 *  Note that x is evaluated more than once: pass a plain lvalue.
 */
#define FREE(x) do { if (x) { free(x); (x) = NULL; } } while (0)
9 
/*
 *  The pieces of an address as located by scan().  scan() sets every
 *  member either to NULL or to a pointer into the string it was handed;
 *  no member owns separately allocated storage.
 */
struct struct_parts {
    char *access;               /* scheme: the text before the first ':' */
    char *host;                 /* text following "//", up to the next '/' */
    char *absolute;             /* path that followed a leading '/' */
    char *relative;             /* path that had no leading '/' */
    /* there is no search member: a "?..." part stays inside the path */
    char *anchor;               /* fragment: the text following '#' */
};
18 
19 /*      Strings of any length
20    **      ---------------------
21  */
22 PUBLIC int      strcasecomp
ARGS2(CONST char *,a,CONST char *,b)23 ARGS2(CONST char *, a, CONST char *, b)
24 {
25     CONST char     *p = a;
26     CONST char     *q = b;
27 
28     for (p = a, q = b; *p && *q; p++, q++) {
29         int             diff = TOLOWER(*p) - TOLOWER(*q);
30         if (diff)
31             return diff;
32     }
33     if (*p)
34         return 1;               /* p was longer than q */
35     if (*q)
36         return -1;              /* p was shorter than q */
37     return 0;                   /* Exact match */
38 }
39 
40 /*      With count limit
41    **      ----------------
42  */
43 PUBLIC int      strncasecomp
ARGS3(CONST char *,a,CONST char *,b,int,n)44 ARGS3(CONST char *, a, CONST char *, b, int, n)
45 {
46     CONST char     *p = a;
47     CONST char     *q = b;
48 
49     for (p = a, q = b;; p++, q++) {
50         int             diff;
51         if (p == (a + n))
52             return 0;           /* Match up to n characters */
53         if (!(*p && *q))
54             return (*p - *q);
55         diff = TOLOWER(*p) - TOLOWER(*q);
56         if (diff)
57             return diff;
58     }
59     /* NOTREACHED */
60 }
61 
62 /*      Allocate a new copy of a string, and returns it
63  */
64 PUBLIC char    *HTSACopy
ARGS2(char **,dest,CONST char *,src)65 ARGS2(char **, dest, CONST char *, src)
66 {
67     FREE(*dest);
68     if (src) {
69         *dest = (char *) malloc(strlen(src) + 1);
70         if (*dest == NULL)
71             outofmem(__FILE__, "HTSACopy");
72         strcpy(*dest, src);
73     }
74     return *dest;
75 }
76 /*      String Allocate and Concatenate
77  */
78 PUBLIC char    *HTSACat
ARGS2(char **,dest,CONST char *,src)79 ARGS2(char **, dest, CONST char *, src)
80 {
81     if (src && *src) {
82         if (*dest) {
83             int             length = strlen(*dest);
84             *dest = (char *) realloc(*dest, length + strlen(src) + 1);
85             if (*dest == NULL)
86                 outofmem(__FILE__, "HTSACat");
87             strcpy(*dest + length, src);
88         } else {
89             *dest = (char *) malloc(strlen(src) + 1);
90             if (*dest == NULL)
91                 outofmem(__FILE__, "HTSACat");
92             strcpy(*dest, src);
93         }
94     }
95     return *dest;
96 }
97 
98 
99 
100 /*      Strip white space off a string.                         HTStrip()
101    **   -------------------------------
102    **
103    ** On exit,
104    **   Return value points to first non-white character, or to 0 if none.
105    **   All trailing white space is OVERWRITTEN with zero.
106  */
107 PUBLIC char    *HTStrip
ARGS1(char *,s)108 ARGS1(char *, s)
109 {
110 #define SPACE(c) ((c == ' ') || (c == '\t') || (c == '\n'))
111     char           *p = s;
112     for (p = s; *p; p++);       /* Find end of string */
113     for (p--; p >= s; p--) {
114         if (SPACE(*p))
115             *p = '\0';          /* Zap trailing blanks */
116         else
117             break;
118     }
119     while (SPACE(*s))
120         s++;                    /* Strip leading blanks */
121     return s;
122 }
123 
124 /*      Scan a filename for its consituents.                    scan()
125    **   ------------------------------------
126    **
127    ** On entry,
128    **   name    points to a document name which may be incomplete.
129    ** On exit,
130    **      absolute or relative may be nonzero (but not both).
131    **   host, anchor and access may be nonzero if they were specified.
132    **   Any which are nonzero point to zero terminated strings.
133  */
134 PRIVATE void    scan
ARGS2(char *,name,struct struct_parts *,parts)135 ARGS2(char *, name, struct struct_parts *, parts)
136 {
137     char           *after_access;
138     char           *p;
139     /* int length = strlen (name); */
140 
141     parts->access = NULL;
142     parts->host = NULL;
143     parts->absolute = NULL;
144     parts->relative = NULL;
145     parts->anchor = NULL;
146 
147     /*
148      **  Scan left-to-right for a scheme (access).
149      */
150     after_access = name;
151     for (p = name; *p; p++) {
152         if (*p == ':') {
153             *p = '\0';
154             parts->access = name;       /* Access name has been specified */
155             after_access = (p + 1);
156             break;
157         }
158         if (*p == '/' || *p == '#' || *p == ';' || *p == '?')
159             break;
160     }
161 
162 #ifdef NOTDEFINED
163     for (p = (name + length - 1); p >= name; p--) {
164 #endif                          /* NOTDEFINED */
165         /*
166          **  Scan left-to-right for a fragment (anchor).
167          */
168         for (p = after_access; *p; p++) {
169             if (*p == '#') {
170                 parts->anchor = (p + 1);
171                 *p = '\0';      /* terminate the rest */
172             }
173         }
174 
175         /*
176          **  Scan left-to-right for a host or absolute path.
177          */
178         p = after_access;
179         if (*p == '/') {
180             if (p[1] == '/') {
181                 parts->host = (p + 2);  /* host has been specified */
182                 *p = '\0';      /* Terminate access */
183                 p = strchr(parts->host, '/');   /* look for end of host name if any */
184                 if (p != NULL) {
185                     *p = '\0';  /* Terminate host */
186                     parts->absolute = (p + 1);  /* Root has been found */
187                 }
188             } else {
189                 parts->absolute = (p + 1);      /* Root found but no host */
190             }
191         } else {
192             parts->relative = (*after_access) ? after_access : NULL;    /* NULL for
193                                                                          * "" */
194         }
195 
196         /*
197          **  Check schemes that commonly have unescaped hashes.
198          */
199         if (parts->access && parts->anchor) {
200             if ((!parts->host && strcasecomp(parts->access, "lynxcgi")) ||
201                 !strcasecomp(parts->access, "nntp") ||
202                 !strcasecomp(parts->access, "snews") ||
203                 !strcasecomp(parts->access, "news") ||
204                 !strcasecomp(parts->access, "data")) {
205                 /*
206                  *  Access specified but no host and not a lynxcgi URL, so the
207                  *  anchor may not really be one, e.g., news:j462#36487@foo.bar,
208                  *  or it's an nntp or snews URL, or news URL with a host.
209                  *  Restore the '#' in the address.
210                  */
211                 *(parts->anchor - 1) = '#';
212                 parts->anchor = NULL;
213             }
214         }
215 #ifdef NOT_DEFINED              /* search is just treated as part of path */
216         {
217             char           *p = (relative ? relative : absolute);
218             if (p != NULL) {
219                 char           *q = strchr(p, '?');     /* Any search string? */
220                 if (q != NULL) {
221                     *q = '\0';  /* If so, chop that off. */
222                     parts->search = (q + 1);
223                 }
224             }
225         }
226 #endif                          /* NOT_DEFINED */
227     }                           /* scan */
228 
229 
230 /*      Parse a Name relative to another name.                  HTParse()
231    **   --------------------------------------
232    **
233    **   This returns those parts of a name which are given (and requested)
234    **   substituting bits from the related name where necessary.
235    **
236    ** On entry,
237    **   aName           A filename given
238    **      relatedName     A name relative to which aName is to be parsed
239    **      wanted          A mask for the bits which are wanted.
240    **
241    ** On exit,
242    **   returns         A pointer to a malloc'd string which MUST BE FREED
243  */
244     PUBLIC char    *HTParse ARGS3(CONST char *, aName,
245                                   CONST char *, relatedName, int, wanted) {
246         char           *result = NULL;
247         char           *return_value = NULL;
248         int             len;
249         char           *name = NULL;
250         char           *rel = NULL;
251         char           *p;
252         char           *access;
253         struct struct_parts given, related;
254 
255         if (TRACE)
256             fprintf(stderr,
257                     "HTParse: aName:%s   relatedName:%s\n", aName, relatedName);
258 
259         /*
260          **  Allocate the output string.
261          */
262         len = strlen(aName) + strlen(relatedName) + 10;
263         result = (char *) malloc(len);  /* Lots of space: more than enough */
264         if (result == NULL)
265             outofmem(__FILE__, "HTParse");
266         result[0] = '\0';       /* Clear string */
267 
268         /*
269          **  Make working copies of the input strings to cut up.
270          */
271         StrAllocCopy(name, aName);
272         StrAllocCopy(rel, relatedName);
273 
274         /*
275          **  Cut up the strings into URL fields.
276          */
277         scan(name, &given);
278         scan(rel, &related);
279 
280         /*
281          **  Handle the scheme (access) field.
282          */
283         if (given.access && given.host && !given.relative && !given.absolute) {
284             if (!strcmp(given.access, "http") ||
285                 !strcmp(given.access, "https") || !strcmp(given.access, "ftp"))
286                 /*
287                  **  Assume root.
288                  */
289                 given.absolute = "";
290         }
291         access = given.access ? given.access : related.access;
292         if (wanted & PARSE_ACCESS) {
293             if (access) {
294                 strcat(result, access);
295                 if (wanted & PARSE_PUNCTUATION)
296                     strcat(result, ":");
297             }
298         }
299 
300         /*
301          **  If different schemes, inherit nothing.
302          **
303          **  We'll try complying with RFC 1808 and
304          **  the Fielding draft, and inherit nothing
305          **  if both schemes are given, rather than
306          **  only when they differ, except for
307          **  file URLs - FM
308          **
309          **  After trying it for a while, it's still
310          **  premature, IHMO, to go along with it, so
311          **  this is back to inheriting for identical
312          **  schemes whether or not they are "file".
313          **  If you want to try it again yourself,
314          **  uncomment the strncasecomp() below. - FM
315          */
316         if ((given.access && related.access) && (       /* strcasecomp(given.access,
317                                                          * "file") || */
318                                                     strcmp(given.access,
319                                                            related.access))) {
320             related.host = NULL;
321             related.absolute = NULL;
322             related.relative = NULL;
323             related.anchor = NULL;
324         }
325 
326         /*
327          **  Handle the host field.
328          */
329         if (wanted & PARSE_HOST)
330             if (given.host || related.host) {
331                 char           *tail = result + strlen(result);
332                 if (wanted & PARSE_PUNCTUATION)
333                     strcat(result, "//");
334                 strcat(result, given.host ? given.host : related.host);
335 #define CLEAN_URLS
336 #ifdef CLEAN_URLS
337                 /*
338                  **  Ignore default port numbers, and trailing dots on FQDNs,
339                  **  which will only cause identical addresses to look different.
340                  */
341                 {
342                     char           *p, *h;
343                     p = strchr(tail, ':');
344                     if (p != NULL && !isdigit((unsigned char) p[1]))
345                         /*
346                          **  Colon not followed by a port number.
347                          */
348                         *p = '\0';
349                     if (p != NULL && p != '\0' && access != NULL) {
350                         /*
351                          **  Port specified.
352                          */
353                         if ((!strcmp(access, "http") && !strcmp(p, ":80")) ||
354                             (!strcmp(access, "gopher") && !strcmp(p, ":70")) ||
355                             (!strcmp(access, "ftp") && !strcmp(p, ":21")) ||
356                             (!strcmp(access, "wais") && !strcmp(p, ":210")) ||
357                             (!strcmp(access, "nntp") && !strcmp(p, ":119")) ||
358                             (!strcmp(access, "news") && !strcmp(p, ":119")) ||
359                             (!strcmp(access, "snews") && !strcmp(p, ":563")) ||
360                             (!strcmp(access, "finger") && !strcmp(p, ":79")) ||
361                             (!strcmp(access, "cso") && !strcmp(p, ":105")))
362                             *p = '\0';  /* It is the default: ignore it */
363                     }
364                     if (p == NULL) {
365                         int             len = strlen(tail);
366 
367                         if (len > 0) {
368                             h = tail + len - 1; /* last char of hostname */
369                             if (*h == '.')
370                                 *h = '\0';      /* chop final . */
371                         }
372                     } else {
373                         h = p;
374                         h--;    /* End of hostname */
375                         if (*h == '.') {
376                             /*
377                              **  Slide p over h.
378                              */
379                             while (*p != '\0')
380                                 *h++ = *p++;
381                             *h = '\0';  /* terminate */
382                         }
383                     }
384                 }
385 #endif                          /* CLEAN_URLS */
386             }
387 
388         /*
389          **  If different hosts, inherit no path.
390          */
391         if (given.host && related.host)
392             if (strcmp(given.host, related.host) != 0) {
393                 related.absolute = NULL;
394                 related.relative = NULL;
395                 related.anchor = NULL;
396             }
397 
398         /*
399          **  Handle the path.
400          */
401         if (wanted & PARSE_PATH) {
402             if (access && !given.absolute && given.relative) {
403                 if (!strcasecomp(access, "nntp") ||
404                     !strcasecomp(access, "snews") ||
405                     (!strcasecomp(access, "news") &&
406                      !strncasecomp(result, "news://", 7))) {
407                     /*
408                      *  Treat all given nntp or snews paths,
409                      *  or given paths for news URLs with a host,
410                      *  as absolute.
411                      */
412                     given.absolute = given.relative;
413                     given.relative = NULL;
414                 }
415             }
416             if (given.absolute) {       /* All is given */
417                 if (wanted & PARSE_PUNCTUATION)
418                     strcat(result, "/");
419                 strcat(result, given.absolute);
420                 if (TRACE)
421                     fprintf(stderr, "1\n");
422             } else if (related.absolute) {      /* Adopt path not name */
423                 strcat(result, "/");
424                 strcat(result, related.absolute);
425                 if (given.relative) {
426                     p = strchr(result, '?');    /* Search part? */
427                     if (p == NULL)
428                         p = (result + strlen(result) - 1);
429                     for (; *p != '/'; p--);     /* last / */
430                     p[1] = '\0';        /* Remove filename */
431                     strcat(result, given.relative);     /* Add given one */
432                     HTSimplify(result);
433                 }
434                 if (TRACE)
435                     fprintf(stderr, "2\n");
436             } else if (given.relative) {
437                 strcat(result, given.relative); /* what we've got */
438                 if (TRACE)
439                     fprintf(stderr, "3\n");
440             } else if (related.relative) {
441                 strcat(result, related.relative);
442                 if (TRACE)
443                     fprintf(stderr, "4\n");
444             } else {            /* No inheritance */
445                 if (strncasecomp(aName, "lynxcgi:", 8) &&
446                     strncasecomp(aName, "lynxexec:", 9) &&
447                     strncasecomp(aName, "lynxprog:", 9)) {
448                     strcat(result, "/");
449                 }
450                 if (!strcmp(result, "news:/"))
451                     result[5] = '*';
452                 if (TRACE)
453                     fprintf(stderr, "5\n");
454             }
455         }
456 
457         /*
458          **  Handle the fragment (anchor).
459          */
460         if (wanted & PARSE_ANCHOR)
461             if ((given.anchor && *given.anchor) || (!given.anchor && related.anchor)) {
462                 if (wanted & PARSE_PUNCTUATION)
463                     strcat(result, "#");
464                 strcat(result, (given.anchor) ? given.anchor : related.anchor);
465             }
466         if (TRACE)
467             fprintf(stderr, "HTParse: result:%s\n", result);
468         FREE(rel);
469         FREE(name);
470 
471         StrAllocCopy(return_value, result);
472         FREE(result);
473 
474         return return_value;    /* exactly the right length */
475     }
476 
477 /*      Simplify a filename.                            HTSimplify()
478    **   --------------------
479    **
480    **  A unix-style file is allowed to contain the seqeunce xxx/../ which may
481    **  be replaced by "" , and the seqeunce "/./" which may be replaced by "/".
482    **  Simplification helps us recognize duplicate filenames.
483    **
484    **   Thus,   /etc/junk/../fred       becomes /etc/fred
485    **           /etc/junk/./fred        becomes /etc/junk/fred
486    **
487    **      but we should NOT change
488    **           http://fred.xxx.edu/../..
489    **
490    **   or      ../../albert.html
491  */
492     PUBLIC void HTSimplify ARGS1(char *, filename) {
493         char           *p;
494         char           *q, *q1;
495 
496         if (filename == NULL)
497             return;
498 
499         if ((filename[0] && filename[1]) && strchr(filename, '/') != NULL) {
500             for (p = (filename + 2); *p; p++) {
501                 if (*p == '/') {
502                     if ((p[1] == '.') && (p[2] == '.') &&
503                         (p[3] == '/' || p[3] == '\0')) {
504                         /*
505                          **  Handle "/../" or "/..".
506                          */
507                         for (q = (p - 1); (q >= filename) && (*q != '/'); q--)
508                             /*
509                              **  Back up to previous slash or beginning of string.
510                              */
511                             ;
512                         if ((q[0] == '/') && strncmp(q, "/../", 4) &&
513                             !((q - 1) > filename && q[-1] == '/')) {
514                             /*
515                              **  Not at beginning of string or in a
516                              **  host field, so remove the "/xxx/..".
517                              */
518                             q1 = (p + 3);
519                             p = q;
520                             while (*q1 != '\0')
521                                 *p++ = *q1++;
522                             *p = '\0';  /* terminate */
523 #ifdef NOTDEFINED
524                             /*
525                              **  Make sure filename has at least one slash.
526                              */
527                             if (*filename == '\0') {
528                                 *filename = '/';
529                                 *(filename + 1) = '\0';
530                             }
531 #endif                          /* NOTDEFINED */
532                             /*
533                              **  Start again with previous slash.
534                              */
535                             p = (q - 1);
536                         }
537                     } else if (p[1] == '.' && p[2] == '/') {
538                         /*
539                          **  Handle "./" by removing the characters.
540                          */
541                         q = p;
542                         q1 = (p + 2);
543                         while (*q1 != '\0')
544                             *q++ = *q1++;
545                         *q = '\0';      /* terminate */
546                         p--;
547                     } else if (p[1] == '.' && p[2] == '\0') {
548                         /*
549                          **  Handle terminal "." by removing the character.
550                          */
551                         p[1] = '\0';
552                     }
553                 }
554             }
555         }
556     }
557 
558 /*      Make Relative Name.                                     HTRelative()
559    **   -------------------
560    **
561    ** This function creates and returns a string which gives an expression of
562    ** one address as related to another. Where there is no relation, an absolute
563    ** address is retured.
564    **
565    **  On entry,
566    **   Both names must be absolute, fully qualified names of nodes
567    **   (no anchor bits)
568    **
569    **  On exit,
570    **   The return result points to a newly allocated name which, if
571    **   parsed by HTParse relative to relatedName, will yield aName.
572    **   The caller is responsible for freeing the resulting name later.
573    **
574  */
575     PUBLIC char    *HTRelative ARGS2(CONST char *, aName, CONST char *, relatedName) {
576         char           *result = NULL;
577         CONST char     *p = aName;
578         CONST char     *q = relatedName;
579         CONST char     *after_access = NULL;
580         CONST char     *path = NULL;
581         CONST char     *last_slash = NULL;
582         int             slashes = 0;
583 
584         for (; *p; p++, q++) {  /* Find extent of match */
585             if (*p != *q)
586                 break;
587             if (*p == ':')
588                 after_access = p + 1;
589             if (*p == '/') {
590                 last_slash = p;
591                 slashes++;
592                 if (slashes == 3)
593                     path = p;
594             }
595         }
596 
597         /* q, p point to the first non-matching character or zero */
598 
599         if (!after_access) {    /* Different access */
600             StrAllocCopy(result, aName);
601         } else if (slashes < 3) {       /* Different nodes */
602             StrAllocCopy(result, after_access);
603         } else if (slashes == 3) {      /* Same node, different path */
604             StrAllocCopy(result, path);
605         } else {                /* Some path in common */
606             int             levels = 0;
607             for (; *q && (*q != '#'); q++)
608                 if (*q == '/')
609                     levels++;
610             result = (char *) malloc(3 * levels + strlen(last_slash) + 1);
611             if (result == NULL)
612                 outofmem(__FILE__, "HTRelative");
613             result[0] = '\0';
614             for (; levels; levels--)
615                 strcat(result, "../");
616             strcat(result, last_slash + 1);
617         }
618         if (TRACE)
619             fprintf(stderr, "HT: `%s' expressed relative to\n    `%s' is\n   `%s'.",
620                     aName, relatedName, result);
621         return result;
622     }
623