1 /* HTParse.c
2 ** URI MANAGEMENT
3 **
4 ** (c) COPYRIGHT MIT 1995.
5 ** Please first read the full copyright statement in the file COPYRIGH.
6 ** @(#) $Id$
7 **
8 ** history:
9 ** May 12 94 TAB added as legal char in HTCleanTelnetString
10 **
11 */
12
13 /* Library include files */
14 #include "wwwsys.h"
15 #include "WWWUtil.h"
16 #include "HTParse.h" /* Implemented here */
17
/*
** The pieces of a URI as split up by scan(). scan() clears the whole
** structure first and then points the fields it finds into the (modified)
** name buffer, so a field is either NULL or a zero terminated string.
*/
typedef struct _HTURI {
    char *access;       /* Now known as "scheme" */
    char *host;         /* Text between "//" and the next "/" */
    char *absolute;     /* Path following a leading "/" */
    char *relative;     /* Path that does not start with "/" */
    char *fragment;     /* Text following the first "#" */
} HTURI;
25
/* Scan a filename for its constituents
27 ** -----------------------------------
28 **
29 ** On entry,
30 ** name points to a document name which may be incomplete.
31 ** On exit,
32 ** absolute or relative may be nonzero (but not both).
33 ** host, fragment and access may be nonzero if they were specified.
34 ** Any which are nonzero point to zero terminated strings.
35 */
scan(char * name,HTURI * parts)36 PRIVATE void scan (char * name, HTURI * parts)
37 {
38 char * p;
39 char * after_access = name;
40 memset(parts, '\0', sizeof(HTURI));
41
42 /* Look for fragment identifier */
43 if ((p = strchr(name, '#')) != NULL) {
44 *p++ = '\0';
45 parts->fragment = p;
46 }
47
48
49 if ((p = strchr(name, ' ')) != NULL) *p++ = '\0';
50
51 for(p=name; *p; p++) {
52
53 /*
54 ** Look for any whitespace. This is very bad for pipelining as it
55 ** makes the request invalid
56 */
57 if (isspace((int) *p)) {
58 char *orig=p, *dest=p+1;
59 while ((*orig++ = *dest++));
60 p = p-1;
61 }
62 if (*p=='/' || *p=='#' || *p=='?')
63 break;
64 if (*p==':') {
65 *p = 0;
66 parts->access = after_access; /* Scheme has been specified */
67
68 /* The combination of gcc, the "-O" flag and the HP platform is
69 unhealthy. The following three lines is a quick & dirty fix, but is
70 not recommended. Rather, turn off "-O". */
71
72 /* after_access = p;*/
73 /* while (*after_access == 0)*/
74 /* after_access++;*/
75
76 after_access = p+1;
77
78 if (0==strcasecomp("URL", parts->access)) {
79 parts->access = NULL; /* Ignore IETF's URL: pre-prefix */
80 } else break;
81 }
82 }
83
84 p = after_access;
85 if (*p=='/'){
86 if (p[1]=='/') {
87 parts->host = p+2; /* host has been specified */
88 *p=0; /* Terminate access */
89 p=strchr(parts->host,'/'); /* look for end of host name if any */
90 if(p) {
91 *p=0; /* Terminate host */
92 parts->absolute = p+1; /* Root has been found */
93 }
94 } else {
95 parts->absolute = p+1; /* Root found but no host */
96 }
97 } else {
98 parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
99 }
100 }
101
102
103 /* Parse a Name relative to another name
104 ** -------------------------------------
105 **
106 ** This returns those parts of a name which are given (and requested)
107 ** substituting bits from the related name where necessary.
108 **
109 ** On entry,
110 ** aName A filename given
111 ** relatedName A name relative to which aName is to be parsed. Give
112 ** it an empty string if aName is absolute.
113 ** wanted A mask for the bits which are wanted.
114 **
115 ** On exit,
116 ** returns A pointer to a malloc'd string which MUST BE FREED
117 */
HTParse(const char * aName,const char * relatedName,int wanted)118 PUBLIC char * HTParse (const char *aName, const char *relatedName, int wanted)
119 {
120 char * result = 0;
121 char * return_value = 0;
122 int len;
123 char * name = 0;
124 char * rel = 0;
125 char * p;
126 char * access;
127 HTURI given, related;
128
129 if (!aName) return NULL;
130 if (!relatedName) /* HWL 23/8/94: dont dump due to NULL */
131 relatedName = "";
132
133 /* Make working copies of input strings to cut up: */
134 len = strlen(aName)+strlen(relatedName)+10;
135 if ((result=(char *) HT_MALLOC(len)) == NULL) /* Lots of space: more than enough */
136 HT_OUTOFMEM("parse space");
137 StrAllocCopy(name, aName);
138 StrAllocCopy(rel, relatedName);
139
140 scan(name, &given);
141 scan(rel, &related);
142 result[0]=0; /* Clear string */
143 access = given.access ? given.access : related.access;
144 if (wanted & PARSE_ACCESS)
145 if (access) {
146 strcat(result, access);
147 if(wanted & PARSE_PUNCTUATION) strcat(result, ":");
148 }
149
150 if (given.access && related.access) /* If different, inherit nothing. */
151 if (strcmp(given.access, related.access)!=0) {
152 related.host=0;
153 related.absolute=0;
154 related.relative=0;
155 related.fragment=0;
156 }
157
158 if (wanted & PARSE_HOST)
159 if(given.host || related.host) {
160 if(wanted & PARSE_PUNCTUATION) strcat(result, "//");
161 strcat(result, given.host ? given.host : related.host);
162 }
163
164 if (given.host && related.host) /* If different hosts, inherit no path. */
165 if (strcmp(given.host, related.host)!=0) {
166 related.absolute=0;
167 related.relative=0;
168 related.fragment=0;
169 }
170
171 if (wanted & PARSE_PATH) {
172 if(given.absolute) { /* All is given */
173 if(wanted & PARSE_PUNCTUATION) strcat(result, "/");
174 strcat(result, given.absolute);
175 } else if(related.absolute) { /* Adopt path not name */
176 strcat(result, "/");
177 strcat(result, related.absolute);
178 if (given.relative) {
179 p = strchr(result, '?'); /* Search part? */
180 if (!p) p=result+strlen(result)-1;
181 for (; *p!='/'; p--); /* last / */
182 p[1]=0; /* Remove filename */
183 strcat(result, given.relative); /* Add given one */
184 #if 0
185 result = HTSimplify (&result);
186 #endif
187 }
188 } else if(given.relative) {
189 strcat(result, given.relative); /* what we've got */
190 } else if(related.relative) {
191 strcat(result, related.relative);
192 } else { /* No inheritance */
193 strcat(result, "/");
194 }
195 }
196
197 if (wanted & PARSE_VIEW)
198 if(given.fragment || related.fragment) {
199 if(given.absolute && given.fragment) { /*Fixes for relURLs...*/
200 if(wanted & PARSE_PUNCTUATION) strcat(result, "#");
201 strcat(result, given.fragment);
202 } else if (!(given.absolute) && !(given.fragment)) {
203 strcat(result, "");
204 } else {
205 if(wanted & PARSE_PUNCTUATION) strcat(result, "#");
206 strcat(result, given.fragment ? given.fragment : related.fragment);
207 }
208 }
209 HT_FREE(rel);
210 HT_FREE(name);
211
212 StrAllocCopy(return_value, result);
213 HT_FREE(result);
214 return return_value; /* exactly the right length */
215 }
216
217
218 /*
219 ** Canonicalizes the URL in the following manner starting from the host
220 ** pointer:
221 **
222 ** 1) The host name is converted to lowercase
223 ** 2) Chop off port if `:80' (http), `:70' (gopher), or `:21' (ftp)
224 **
225 ** Return: OK The position of the current path part of the URL
226 ** which might be the old one or a new one.
227 */
HTCanon(char ** filename,char * host)228 PRIVATE char * HTCanon (char ** filename, char * host)
229 {
230 char *newname = NULL;
231 char *port;
232 char *strptr;
233 char *path;
234 char *access = host-3;
235
236 while (access>*filename && *(access-1)!='/') /* Find access method */
237 access--;
238 if ((path = strchr(host, '/')) == NULL) /* Find path */
239 path = host + strlen(host);
240 if ((strptr = strchr(host, '@')) != NULL && strptr<path) /* UserId */
241 host = strptr;
242 if ((port = strchr(host, ':')) != NULL && port>path) /* Port number */
243 port = NULL;
244
245 strptr = host; /* Convert to lower-case */
246 while (strptr<path) {
247 *strptr = TOLOWER(*strptr);
248 strptr++;
249 }
250
251 /* Does the URL contain a full domain name? This also works for a
252 numerical host name. The domain name is already made lower-case
253 and without a trailing dot. */
254 #if 0
255 if (((strptr = strchr(host, '.')) == NULL || strptr >= path) &&
256 strncasecomp(host, "localhost", 9)) {
257 const char *domain = HTGetDomainName();
258 if (domain && *domain) {
259 if ((newname = (char *) HT_CALLOC(1, strlen(*filename) + strlen(domain)+2)) == NULL)
260 HT_OUTOFMEM("HTCanon");
261 if (port)
262 strncpy(newname, *filename, (int) (port-*filename));
263 else
264 strncpy(newname, *filename, (int) (path-*filename));
265 strcat(newname, ".");
266 strcat(newname, domain);
267 }
268 } else /* Look for a trailing dot */
269 #endif
270 {
271 char *dot = port ? port : path;
272 if (dot > *filename && *--dot=='.') {
273 char *orig=dot, *dest=dot+1;
274 while((*orig++ = *dest++));
275 if (port) port--;
276 path--;
277 }
278 }
279 /* Chop off port if `:', `:80' (http), `:70' (gopher), or `:21' (ftp) */
280 if (port) {
281 if (!*(port+1) || *(port+1)=='/') {
282 if (!newname) {
283 char *orig=port, *dest=port+1;
284 while((*orig++ = *dest++));
285 }
286 } else if ((!strncmp(access, "http", 4) &&
287 (*(port+1)=='8'&&*(port+2)=='0'&&(*(port+3)=='/'||!*(port+3)))) ||
288 (!strncmp(access, "gopher", 6) &&
289 (*(port+1)=='7'&&*(port+2)=='0'&&(*(port+3)=='/'||!*(port+3)))) ||
290 (!strncmp(access, "ftp", 3) &&
291 (*(port+1)=='2'&&*(port+2)=='1'&&(*(port+3)=='/'||!*(port+3))))) {
292 if (!newname) {
293 char *orig=port, *dest=port+3;
294 while((*orig++ = *dest++));
295 path -= 3; /* Update path position, Henry Minsky */
296 }
297 } else if (newname)
298 strncat(newname, port, (int) (path-port));
299 }
300
301 if (newname) {
302 char *newpath = newname+strlen(newname);
303 strcat(newname, path);
304 path = newpath;
305 HT_FREE(*filename); /* Free old copy */
306 *filename = newname;
307 }
308 return path;
309 }
310
311 /*
312 ** Search the URL and determine whether it is a relative or absolute URL.
313 ** We check to see if there is a ":" before any "/", "?", and "#". If this
314 ** is the case then we say it is absolute. Otherwise it is relative.
315 */
HTURL_isAbsolute(const char * url)316 PUBLIC BOOL HTURL_isAbsolute (const char * url)
317 {
318 if (url) {
319 const char * ptr = url;
320 while (*ptr) {
321 if (*ptr == ':') return YES;
322 if (*ptr == '/' || *ptr == '?' || *ptr == '#') break;
323 ptr ++;
324 }
325 }
326 return NO;
327 }
328
329 /* Simplify a URI
330 // --------------
// A URI is allowed to contain the sequence xxx/../ which may be
// replaced by "" , and the sequence "/./" which may be replaced by "/".
333 // Simplification helps us recognize duplicate URIs.
334 //
335 // Thus, /etc/junk/../fred becomes /etc/fred
336 // /etc/junk/./fred becomes /etc/junk/fred
337 //
338 // but we should NOT change
339 // http://fred.xxx.edu/../..
340 //
341 // or ../../albert.html
342 //
343 // In order to avoid empty URLs the following URLs become:
344 //
345 // /fred/.. becomes /fred/..
346 // /fred/././.. becomes /fred/..
347 // /fred/.././junk/.././ becomes /fred/..
348 //
349 // If more than one set of `://' is found (several proxies in cascade) then
350 // only the part after the last `://' is simplified.
351 //
352 // Returns: A string which might be the old one or a new one.
353 */
HTSimplify(char ** url)354 PUBLIC char *HTSimplify (char ** url)
355 {
356 char *path;
357 char *p;
358 if (!url || !*url) {
359 HTTRACE(URI_TRACE, "HTSimplify.. Nothing done\n");
360 return *url;
361 }
362 HTTRACE(URI_TRACE, "HTSimplify.. `%s\' " _ *url);
363
364 /* Find any scheme name */
365 if ((path = strstr(*url, "://")) != NULL) { /* Find host name */
366 char *newptr;
367 char *access = *url;
368 while (access<path && (*access=TOLOWER(*access))) access++;
369 path += 3;
370 while ((newptr = strstr(path, "://")) != NULL) /* For proxies */
371 path = newptr+3;
372 path = HTCanon(url, path); /* We have a host name */
373 } else if ((path = strstr(*url, ":/")) != NULL) {
374 path += 2;
375 } else
376 path = *url;
377 if (*path == '/' && *(path+1)=='/') { /* Some URLs start //<foo> */
378 path += 1;
379 } else if (!strncmp(path, "news:", 5)) {
380 char *ptr = strchr(path+5, '@');
381 if (!ptr) ptr = path+5;
382 while (*ptr) { /* Make group or host lower case */
383 *ptr = TOLOWER(*ptr);
384 ptr++;
385 }
386 HTTRACE(URI_TRACE, "into\n............ `%s'\n" _ *url);
387 return *url; /* Doesn't need to do any more */
388 }
389 if ((p = path)) {
390 char *end;
391 if (!((end = strchr(path, ';')) || (end = strchr(path, '?')) ||
392 (end = strchr(path, '#'))))
393 end = path+strlen(path);
394
395 /* Parse string second time to simplify */
396 p = path;
397 while(p<end) {
398 if (*p=='/') {
399 if (p>*url && *(p+1)=='.' && (*(p+2)=='/' || !*(p+2))) {
400 char *orig = p+1;
401 char *dest = (*(p+2)!='/') ? p+2 : p+3;
402 while ((*orig++ = *dest++)); /* Remove a slash and a dot */
403 end = orig-1;
404 } else if (*(p+1)=='.' && *(p+2)=='.' && (*(p+3)=='/' || !*(p+3))) {
405 char *q = p;
406 while (q>path && *--q!='/'); /* prev slash */
407 if (strncmp(q, "/../", 4)) {
408 char *orig = q+1;
409 char *dest = (*(p+3)!='/') ? p+3 : p+4;
410 while ((*orig++ = *dest++)); /* Remove /xxx/.. */
411 end = orig-1;
412 p = q; /* Start again with prev slash */
413 } else
414 p++;
415 } else if (*(p+1)=='/') {
416 while (*(p+1)=='/') {
417 char *orig=p, *dest=p+1;
418 while ((*orig++ = *dest++)); /* Remove multiple /'s */
419 end = orig-1;
420 }
421 } else
422 p++;
423 } else
424 p++;
425 }
426 }
427
428 /*
429 ** Check for host/../.. kind of things
430 */
431 while (*path=='/' && *(path+1)=='.' && *(path+2)=='.' &&
432 (!*(path+3) || *(path+3)=='/')) {
433 char * orig = path;
434 char * dest = path+3;
435 while ((*orig++ = *dest++));
436 }
437 HTTRACE(URI_TRACE, "into\n............ `%s'\n" _ *url);
438 return *url;
439 }
440
441 /* Make Relative Name
442 ** ------------------
443 **
444 ** This function creates and returns a string which gives an expression of
445 ** one address as related to another. Where there is no relation, an absolute
** address is returned.
447 **
448 ** On entry,
449 ** Both names must be absolute, fully qualified names of nodes
450 ** (no fragment bits)
451 **
452 ** On exit,
453 ** The return result points to a newly allocated name which, if
454 ** parsed by HTParse relative to relatedName, will yield aName.
455 ** The caller is responsible for freeing the resulting name later.
456 **
457 */
HTRelative(const char * aName,const char * relatedName)458 PUBLIC char * HTRelative (const char * aName, const char * relatedName)
459 {
460 char * result = 0;
461 const char *p = aName;
462 const char *q = relatedName;
463 const char * after_access = NULL;
464 const char * path = 0;
465 const char * last_slash = 0;
466 int slashes = 0;
467
468 for(;*p; p++, q++) { /* Find extent of match */
469 if (*p!=*q) break;
470 if (*p==':') if (!after_access) after_access = p+1;
471 if (*p=='/') {
472 last_slash = p;
473 slashes++;
474 if (slashes==3) path=p;
475 }
476 }
477
478 /* q, p point to the first non-matching character or zero */
479
480 if (!after_access) { /* Different access */
481 StrAllocCopy(result, aName);
482 } else if (slashes<3){ /* Different nodes */
483 StrAllocCopy(result, after_access);
484 } else { /* Some path in common */
485 int levels= 0;
486 for(; *q && *q!='#' && *q!=';' && *q!='?'; q++) if (*q=='/') levels++;
487 if ((result = (char *) HT_MALLOC(3*levels + strlen(last_slash) + 4)) == NULL)
488 HT_OUTOFMEM("HTRelative");
489 *result = '\0';
490 if (!levels) strcat(result, "./");
491 for(;levels; levels--)strcat(result, "../");
492 strcat(result, last_slash+1);
493 if (!*result) strcat(result, "./");
494 }
495 HTTRACE(URI_TRACE, "HTRelative.. `%s' expressed relative to `%s' is `%s'\n" _
496 aName _ relatedName _ result);
497 #if 0
498 {
499 char * absstr = HTParse(result, relatedName, PARSE_ALL);
500 HTSimplify(&absstr);
501 HTTRACE(URI_TRACE, "HTRelative.. `%s' made absolute based on `%s' is `%s'\n" _
502 result _ relatedName _ absstr);
503 if (strcmp(absstr, aName) != 0) HTTRACE(URI_TRACE, "THEY DIFFER!!!\n");
504 HT_FREE(absstr);
505 }
506 #endif
507 return result;
508 }
509
510 /* HTCleanTelnetString()
511 * Make sure that the given string doesn't contain characters that
512 * could cause security holes, such as newlines in ftp, gopher,
513 * news or telnet URLs; more specifically: allows everything between
514 * ASCII 20-7E, and also A0-FE, inclusive. Also TAB ('\t') allowed!
515 *
516 * On entry,
517 * str the string that is *modified* if necessary. The
518 * string will be truncated at the first illegal
519 * character that is encountered.
520 * On exit,
521 * returns YES, if the string was modified.
522 * NO, otherwise.
523 */
HTCleanTelnetString(char * str)524 PUBLIC BOOL HTCleanTelnetString (char * str)
525 {
526 char * cur = str;
527
528 if (!str) return NO;
529
530 while (*cur) {
531 int a = TOASCII((unsigned char) *cur);
532 if (a != 0x9 && (a < 0x20 || (a > 0x7E && a < 0xA0) || a > 0xFE)) {
533 HTTRACE(URI_TRACE, "Illegal..... character in URL: \"%s\"\n" _ str);
534 *cur = 0;
535 HTTRACE(URI_TRACE, "Truncated... \"%s\"\n" _ str);
536 return YES;
537 }
538 cur++;
539 }
540 return NO;
541 }
542
543