1 /*								      HTParse.c
2 **	URI MANAGEMENT
3 **
4 **	(c) COPYRIGHT MIT 1995.
5 **	Please first read the full copyright statement in the file COPYRIGH.
6 **	@(#) $Id$
7 **
8 ** history:
9 **	May 12 94	TAB added as legal char in HTCleanTelnetString
10 **
11 */
12 
13 /* Library include files */
14 #include "wwwsys.h"
15 #include "WWWUtil.h"
16 #include "HTParse.h"					 /* Implemented here */
17 
/*
**	Components of a URI as split apart by scan().
**
**	scan() clears the whole structure first, so a member is either NULL
**	(component not present) or a pointer into the string handed to
**	scan(), which has been cut up with zero terminators.
*/
typedef struct _HTURI {
    char *access;	/* Scheme name, e.g. "http" (historically "access") */
    char *host;		/* Text between "//" and the next '/' */
    char *absolute;	/* Path following a leading '/' (the '/' is not included) */
    char *relative;	/* Path that does not start with '/' */
    char *fragment;	/* Text following the first '#' */
} HTURI;
25 
26 /*	Scan a filename for its consituents
27 **	-----------------------------------
28 **
29 ** On entry,
30 **	name	points to a document name which may be incomplete.
31 ** On exit,
32 **      absolute or relative may be nonzero (but not both).
33 **	host, fragment and access may be nonzero if they were specified.
34 **	Any which are nonzero point to zero terminated strings.
35 */
scan(char * name,HTURI * parts)36 PRIVATE void scan (char * name, HTURI * parts)
37 {
38     char * p;
39     char * after_access = name;
40     memset(parts, '\0', sizeof(HTURI));
41 
42     /* Look for fragment identifier */
43     if ((p = strchr(name, '#')) != NULL) {
44 	*p++ = '\0';
45 	parts->fragment = p;
46     }
47 
48 
49     if ((p = strchr(name, ' ')) != NULL) *p++ = '\0';
50 
51     for(p=name; *p; p++) {
52 
53 	/*
54 	** Look for any whitespace. This is very bad for pipelining as it
55 	** makes the request invalid
56 	*/
57 	if (isspace((int) *p)) {
58 	    char *orig=p, *dest=p+1;
59 	    while ((*orig++ = *dest++));
60 	    p = p-1;
61 	}
62 	if (*p=='/' || *p=='#' || *p=='?')
63 	    break;
64 	if (*p==':') {
65 		*p = 0;
66 		parts->access = after_access; /* Scheme has been specified */
67 
68 /* The combination of gcc, the "-O" flag and the HP platform is
69    unhealthy. The following three lines is a quick & dirty fix, but is
70    not recommended. Rather, turn off "-O". */
71 
72 /*		after_access = p;*/
73 /*		while (*after_access == 0)*/
74 /*		    after_access++;*/
75 
76 		after_access = p+1;
77 
78 		if (0==strcasecomp("URL", parts->access)) {
79 		    parts->access = NULL;  /* Ignore IETF's URL: pre-prefix */
80 		} else break;
81 	}
82     }
83 
84     p = after_access;
85     if (*p=='/'){
86 	if (p[1]=='/') {
87 	    parts->host = p+2;		/* host has been specified 	*/
88 	    *p=0;			/* Terminate access 		*/
89 	    p=strchr(parts->host,'/');	/* look for end of host name if any */
90 	    if(p) {
91 	        *p=0;			/* Terminate host */
92 	        parts->absolute = p+1;		/* Root has been found */
93 	    }
94 	} else {
95 	    parts->absolute = p+1;		/* Root found but no host */
96 	}
97     } else {
98         parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
99     }
100 }
101 
102 
103 /*	Parse a Name relative to another name
104 **	-------------------------------------
105 **
106 **	This returns those parts of a name which are given (and requested)
107 **	substituting bits from the related name where necessary.
108 **
109 ** On entry,
110 **	aName		A filename given
111 **      relatedName     A name relative to which aName is to be parsed. Give
112 **                      it an empty string if aName is absolute.
113 **      wanted          A mask for the bits which are wanted.
114 **
115 ** On exit,
116 **	returns		A pointer to a malloc'd string which MUST BE FREED
117 */
HTParse(const char * aName,const char * relatedName,int wanted)118 PUBLIC char * HTParse (const char *aName, const char *relatedName, int wanted)
119 {
120     char * result = 0;
121     char * return_value = 0;
122     int len;
123     char * name = 0;
124     char * rel = 0;
125     char * p;
126     char * access;
127     HTURI given, related;
128 
129     if (!aName) return NULL;
130     if (!relatedName)        /* HWL 23/8/94: dont dump due to NULL */
131         relatedName = "";
132 
133     /* Make working copies of input strings to cut up: */
134     len = strlen(aName)+strlen(relatedName)+10;
135     if ((result=(char *) HT_MALLOC(len)) == NULL) /* Lots of space: more than enough */
136 	HT_OUTOFMEM("parse space");
137     StrAllocCopy(name, aName);
138     StrAllocCopy(rel, relatedName);
139 
140     scan(name, &given);
141     scan(rel,  &related);
142     result[0]=0;		/* Clear string  */
143     access = given.access ? given.access : related.access;
144     if (wanted & PARSE_ACCESS)
145         if (access) {
146 	    strcat(result, access);
147 	    if(wanted & PARSE_PUNCTUATION) strcat(result, ":");
148 	}
149 
150     if (given.access && related.access)	/* If different, inherit nothing. */
151         if (strcmp(given.access, related.access)!=0) {
152 	    related.host=0;
153 	    related.absolute=0;
154 	    related.relative=0;
155 	    related.fragment=0;
156 	}
157 
158     if (wanted & PARSE_HOST)
159         if(given.host || related.host) {
160 	    if(wanted & PARSE_PUNCTUATION) strcat(result, "//");
161 	    strcat(result, given.host ? given.host : related.host);
162 	}
163 
164     if (given.host && related.host)  /* If different hosts, inherit no path. */
165         if (strcmp(given.host, related.host)!=0) {
166 	    related.absolute=0;
167 	    related.relative=0;
168 	    related.fragment=0;
169 	}
170 
171     if (wanted & PARSE_PATH) {
172         if(given.absolute) {				/* All is given */
173 	    if(wanted & PARSE_PUNCTUATION) strcat(result, "/");
174 	    strcat(result, given.absolute);
175 	} else if(related.absolute) {	/* Adopt path not name */
176 	    strcat(result, "/");
177 	    strcat(result, related.absolute);
178 	    if (given.relative) {
179 		p = strchr(result, '?');	/* Search part? */
180 		if (!p) p=result+strlen(result)-1;
181 		for (; *p!='/'; p--);	/* last / */
182 		p[1]=0;					/* Remove filename */
183 		strcat(result, given.relative);		/* Add given one */
184 #if 0
185 		result = HTSimplify (&result);
186 #endif
187 	    }
188 	} else if(given.relative) {
189 	    strcat(result, given.relative);		/* what we've got */
190 	} else if(related.relative) {
191 	    strcat(result, related.relative);
192 	} else {  /* No inheritance */
193 	    strcat(result, "/");
194 	}
195     }
196 
197     if (wanted & PARSE_VIEW)
198 	if(given.fragment || related.fragment) {
199 	    if(given.absolute && given.fragment) {   /*Fixes for relURLs...*/
200 		if(wanted & PARSE_PUNCTUATION) strcat(result, "#");
201 		strcat(result, given.fragment);
202 	    } else if (!(given.absolute) && !(given.fragment)) {
203 		strcat(result, "");
204 	    } else {
205 		if(wanted & PARSE_PUNCTUATION) strcat(result, "#");
206 		strcat(result, given.fragment ? given.fragment : related.fragment);
207 	    }
208 	}
209     HT_FREE(rel);
210     HT_FREE(name);
211 
212     StrAllocCopy(return_value, result);
213     HT_FREE(result);
214     return return_value;		/* exactly the right length */
215 }
216 
217 
218 /*
219 **	Canonicalizes the URL in the following manner starting from the host
220 **	pointer:
221 **
222 **	1) The host name is converted to lowercase
223 **	2) Chop off port if `:80' (http), `:70' (gopher), or `:21' (ftp)
224 **
225 **	Return: OK	The position of the current path part of the URL
226 **			which might be the old one or a new one.
227 */
HTCanon(char ** filename,char * host)228 PRIVATE char * HTCanon (char ** filename, char * host)
229 {
230     char *newname = NULL;
231     char *port;
232     char *strptr;
233     char *path;
234     char *access = host-3;
235 
236     while (access>*filename && *(access-1)!='/')       /* Find access method */
237 	access--;
238     if ((path = strchr(host, '/')) == NULL)			/* Find path */
239 	path = host + strlen(host);
240     if ((strptr = strchr(host, '@')) != NULL && strptr<path)	   /* UserId */
241 	host = strptr;
242     if ((port = strchr(host, ':')) != NULL && port>path)      /* Port number */
243 	port = NULL;
244 
245     strptr = host;				    /* Convert to lower-case */
246     while (strptr<path) {
247 	*strptr = TOLOWER(*strptr);
248 	strptr++;
249     }
250 
251     /* Does the URL contain a full domain name? This also works for a
252        numerical host name. The domain name is already made lower-case
253        and without a trailing dot. */
254 #if 0
255     if (((strptr = strchr(host, '.')) == NULL || strptr >= path) &&
256 	strncasecomp(host, "localhost", 9)) {
257 	const char *domain = HTGetDomainName();
258 	if (domain && *domain) {
259 	    if ((newname = (char *) HT_CALLOC(1, strlen(*filename) + strlen(domain)+2)) == NULL)
260 		HT_OUTOFMEM("HTCanon");
261 	    if (port)
262 		strncpy(newname, *filename, (int) (port-*filename));
263 	    else
264 		strncpy(newname, *filename, (int) (path-*filename));
265 	    strcat(newname, ".");
266 	    strcat(newname, domain);
267 	}
268     } else 					  /* Look for a trailing dot */
269 #endif
270     {
271 	char *dot = port ? port : path;
272 	if (dot > *filename && *--dot=='.') {
273 	    char *orig=dot, *dest=dot+1;
274 	    while((*orig++ = *dest++));
275 	    if (port) port--;
276 	    path--;
277 	}
278     }
279     /* Chop off port if `:', `:80' (http), `:70' (gopher), or `:21' (ftp) */
280     if (port) {
281 	if (!*(port+1) || *(port+1)=='/') {
282 	    if (!newname) {
283 		char *orig=port, *dest=port+1;
284 		while((*orig++ = *dest++));
285 	    }
286 	} else if ((!strncmp(access, "http", 4) &&
287 	     (*(port+1)=='8'&&*(port+2)=='0'&&(*(port+3)=='/'||!*(port+3)))) ||
288 	    (!strncmp(access, "gopher", 6) &&
289 	     (*(port+1)=='7'&&*(port+2)=='0'&&(*(port+3)=='/'||!*(port+3)))) ||
290 	    (!strncmp(access, "ftp", 3) &&
291 	     (*(port+1)=='2'&&*(port+2)=='1'&&(*(port+3)=='/'||!*(port+3))))) {
292 	    if (!newname) {
293 		char *orig=port, *dest=port+3;
294 		while((*orig++ = *dest++));
295 		path -= 3;   	       /* Update path position, Henry Minsky */
296 	    }
297 	} else if (newname)
298 	    strncat(newname, port, (int) (path-port));
299     }
300 
301     if (newname) {
302 	char *newpath = newname+strlen(newname);
303 	strcat(newname, path);
304 	path = newpath;
305 	HT_FREE(*filename);				    /* Free old copy */
306 	*filename = newname;
307     }
308     return path;
309 }
310 
311 /*
312 **  Search the URL and determine whether it is a relative or absolute URL.
313 **  We check to see if there is a ":" before any "/", "?", and "#". If this
314 **  is the case then we say it is absolute. Otherwise it is relative.
315 */
HTURL_isAbsolute(const char * url)316 PUBLIC BOOL HTURL_isAbsolute (const char * url)
317 {
318     if (url) {
319 	const char * ptr = url;
320 	while (*ptr) {
321 	    if (*ptr == ':') return YES;
322 	    if (*ptr == '/' || *ptr == '?' || *ptr == '#') break;
323 	    ptr ++;
324 	}
325     }
326     return NO;
327 }
328 
329 /*	        Simplify a URI
330 //		--------------
331 // A URI is allowed to contain the seqeunce xxx/../ which may be
332 // replaced by "" , and the seqeunce "/./" which may be replaced by "/".
333 // Simplification helps us recognize duplicate URIs.
334 //
335 //	Thus, 	/etc/junk/../fred 	becomes	/etc/fred
336 //		/etc/junk/./fred	becomes	/etc/junk/fred
337 //
338 //      but we should NOT change
339 //		http://fred.xxx.edu/../..
340 //
341 //	or	../../albert.html
342 //
343 // In order to avoid empty URLs the following URLs become:
344 //
345 //		/fred/..		becomes /fred/..
346 //		/fred/././..		becomes /fred/..
347 //		/fred/.././junk/.././	becomes /fred/..
348 //
349 // If more than one set of `://' is found (several proxies in cascade) then
350 // only the part after the last `://' is simplified.
351 //
352 // Returns: A string which might be the old one or a new one.
353 */
HTSimplify(char ** url)354 PUBLIC char *HTSimplify (char ** url)
355 {
356     char *path;
357     char *p;
358     if (!url || !*url) {
359 	HTTRACE(URI_TRACE, "HTSimplify.. Nothing done\n");
360 	return *url;
361     }
362     HTTRACE(URI_TRACE, "HTSimplify.. `%s\' " _ *url);
363 
364     /* Find any scheme name */
365     if ((path = strstr(*url, "://")) != NULL) {		   /* Find host name */
366 	char *newptr;
367 	char *access = *url;
368 	while (access<path && (*access=TOLOWER(*access))) access++;
369 	path += 3;
370 	while ((newptr = strstr(path, "://")) != NULL)        /* For proxies */
371 	    path = newptr+3;
372 	path = HTCanon(url, path);       	      /* We have a host name */
373     } else if ((path = strstr(*url, ":/")) != NULL) {
374 	path += 2;
375     } else
376 	path = *url;
377     if (*path == '/' && *(path+1)=='/') {	  /* Some URLs start //<foo> */
378 	path += 1;
379     } else if (!strncmp(path, "news:", 5)) {
380 	char *ptr = strchr(path+5, '@');
381 	if (!ptr) ptr = path+5;
382 	while (*ptr) {			    /* Make group or host lower case */
383 	    *ptr = TOLOWER(*ptr);
384 	    ptr++;
385 	}
386 	HTTRACE(URI_TRACE, "into\n............ `%s'\n" _ *url);
387 	return *url;		      /* Doesn't need to do any more */
388     }
389     if ((p = path)) {
390 	char *end;
391 	if (!((end = strchr(path, ';')) || (end = strchr(path, '?')) ||
392 	      (end = strchr(path, '#'))))
393 	    end = path+strlen(path);
394 
395 	/* Parse string second time to simplify */
396 	p = path;
397 	while(p<end) {
398 	    if (*p=='/') {
399 		if (p>*url && *(p+1)=='.' && (*(p+2)=='/' || !*(p+2))) {
400 		    char *orig = p+1;
401 		    char *dest = (*(p+2)!='/') ? p+2 : p+3;
402 		    while ((*orig++ = *dest++)); /* Remove a slash and a dot */
403 		    end = orig-1;
404 		} else if (*(p+1)=='.' && *(p+2)=='.' && (*(p+3)=='/' || !*(p+3))) {
405 		    char *q = p;
406 		    while (q>path && *--q!='/');	       /* prev slash */
407 		    if (strncmp(q, "/../", 4)) {
408 			char *orig = q+1;
409 			char *dest = (*(p+3)!='/') ? p+3 : p+4;
410 			while ((*orig++ = *dest++));	   /* Remove /xxx/.. */
411 			end = orig-1;
412 			p = q;		      /* Start again with prev slash */
413 		    } else
414 			p++;
415 		} else if (*(p+1)=='/') {
416 		    while (*(p+1)=='/') {
417 			char *orig=p, *dest=p+1;
418 			while ((*orig++ = *dest++));  /* Remove multiple /'s */
419 			end = orig-1;
420 		    }
421 		} else
422 		    p++;
423 	    } else
424 		p++;
425 	}
426     }
427 
428     /*
429     **  Check for host/../.. kind of things
430     */
431     while (*path=='/' && *(path+1)=='.' && *(path+2)=='.' &&
432 	   (!*(path+3) || *(path+3)=='/')) {
433 	char * orig = path;
434 	char * dest = path+3;
435 	while ((*orig++ = *dest++));
436     }
437     HTTRACE(URI_TRACE, "into\n............ `%s'\n" _ *url);
438     return *url;
439 }
440 
441 /*		Make Relative Name
442 **		------------------
443 **
444 ** This function creates and returns a string which gives an expression of
445 ** one address as related to another. Where there is no relation, an absolute
446 ** address is retured.
447 **
448 **  On entry,
449 **	Both names must be absolute, fully qualified names of nodes
450 **	(no fragment bits)
451 **
452 **  On exit,
453 **	The return result points to a newly allocated name which, if
454 **	parsed by HTParse relative to relatedName, will yield aName.
455 **	The caller is responsible for freeing the resulting name later.
456 **
457 */
HTRelative(const char * aName,const char * relatedName)458 PUBLIC char * HTRelative (const char * aName, const char * relatedName)
459 {
460     char * result = 0;
461     const char *p = aName;
462     const char *q = relatedName;
463     const char * after_access = NULL;
464     const char * path = 0;
465     const char * last_slash = 0;
466     int slashes = 0;
467 
468     for(;*p; p++, q++) {	/* Find extent of match */
469     	if (*p!=*q) break;
470 	if (*p==':') if (!after_access) after_access = p+1;
471 	if (*p=='/') {
472 	    last_slash = p;
473 	    slashes++;
474 	    if (slashes==3) path=p;
475 	}
476     }
477 
478     /* q, p point to the first non-matching character or zero */
479 
480     if (!after_access) {			/* Different access */
481         StrAllocCopy(result, aName);
482     } else if (slashes<3){			/* Different nodes */
483     	StrAllocCopy(result, after_access);
484     } else {					/* Some path in common */
485         int levels= 0;
486         for(; *q && *q!='#' && *q!=';' && *q!='?'; q++) if (*q=='/') levels++;
487 	if ((result = (char  *) HT_MALLOC(3*levels + strlen(last_slash) + 4)) == NULL)
488 	    HT_OUTOFMEM("HTRelative");
489 	*result = '\0';
490 	if (!levels) strcat(result, "./");
491 	for(;levels; levels--)strcat(result, "../");
492 	strcat(result, last_slash+1);
493 	if (!*result) strcat(result, "./");
494     }
495     HTTRACE(URI_TRACE, "HTRelative.. `%s' expressed relative to  `%s' is `%s'\n" _
496 		aName _ relatedName _ result);
497 #if 0
498     {
499 	char * absstr = HTParse(result, relatedName, PARSE_ALL);
500 	HTSimplify(&absstr);
501 	HTTRACE(URI_TRACE, "HTRelative.. `%s' made absolute based on `%s' is `%s'\n" _
502 		result _ relatedName _ absstr);
503 	if (strcmp(absstr, aName) != 0) HTTRACE(URI_TRACE, "THEY DIFFER!!!\n");
504 	HT_FREE(absstr);
505     }
506 #endif
507     return result;
508 }
509 
510 /*							HTCleanTelnetString()
511  *	Make sure that the given string doesn't contain characters that
512  *	could cause security holes, such as newlines in ftp, gopher,
513  *	news or telnet URLs; more specifically: allows everything between
514  *	ASCII 20-7E, and also A0-FE, inclusive. Also TAB ('\t') allowed!
515  *
516  * On entry,
517  *	str	the string that is *modified* if necessary.  The
518  *		string will be truncated at the first illegal
519  *		character that is encountered.
520  * On exit,
521  *	returns	YES, if the string was modified.
522  *		NO, otherwise.
523  */
HTCleanTelnetString(char * str)524 PUBLIC BOOL HTCleanTelnetString (char * str)
525 {
526     char * cur = str;
527 
528     if (!str) return NO;
529 
530     while (*cur) {
531 	int a = TOASCII((unsigned char) *cur);
532 	if (a != 0x9 && (a < 0x20 || (a > 0x7E && a < 0xA0) ||  a > 0xFE)) {
533 	    HTTRACE(URI_TRACE, "Illegal..... character in URL: \"%s\"\n" _ str);
534 	    *cur = 0;
535 	    HTTRACE(URI_TRACE, "Truncated... \"%s\"\n" _ str);
536 	    return YES;
537 	}
538 	cur++;
539     }
540     return NO;
541 }
542 
543