1 /*
2    netrik -- The ANTRIK Internet Viewer
3    Copyright (C) Olaf D. Buddenhagen AKA antrik, et al (see AUTHORS)
4    Published under the GNU GPL; see LICENSE for details.
5 */
6 /*
7  * url.c -- URL handling functions
8  *
9  * (C) 2001, 2002 antrik
10  *     2002 Patrice Neff
11  */
12 #include <ctype.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 
17 #include "debug.h"
18 #include "url.h"
19 
/*
 * Forward declarations of the file-local (static) helpers, so that they can
 * be called before their definitions further down in this file.
 */
static void dump_url(const struct Url *url);    /* print components of a split URL */
static void store_component(const char *start, const char *end, char **component);    /* store a URL field extracted by split_url() */
static void str_append(char **str, const char *append);    /* append string to end of other string, reallocating memory */
23 
24 /* print components of a split URL */
dump_url(url)25 static void dump_url(url)
26 const struct Url *url;
27 {
28    debug_printf("   full url: %s\n", url->full_url);
29    debug_printf("   full path: %s\n", url->path);
30    debug_printf("   protocol: %s\n", url->proto.str);
31    debug_printf("   host: %s\n", url->host);
32    debug_printf("   port: %d\n", url->port);
33    debug_printf("   directory: %s\n", url->dir);
34    debug_printf("   file name: %s\n", url->name);
35    debug_printf("   parameters: %s\n", url->params);
36    debug_printf("   fragment identifier: %s\n", url->frag);
37 }
38 
/*
 * Store a section representing one URL field extracted by split_url() as a
 * newly allocated string.
 *
 * start:     first character of the field
 * end:       one past the last character of the field
 * component: where to store the result. Receives a malloc()ed, NUL-terminated
 *            copy of the characters in [start, end), or NULL if that range is
 *            empty (end<=start). The previous value of "*component" is
 *            overwritten without being freed; the caller owns the new string.
 *
 * Terminates the program with exit status 1 if the allocation fails.
 */
static void store_component(start, end, component)
const char	*start;
const char	*end;
char		**component;    /* where to store */
{
   const int	len=end-start;

   if(len<=0) {    /* nothing between start and end -> field is not present */
      *component=NULL;
      return;
   }

   *component=malloc((size_t)len+1);
   if(*component==NULL) {    /* test the allocation result itself, not the address of the output slot */
      fprintf(stderr, "memory allocation error while parsing URL (in function store_component)\n");
      exit(1);
   }

   memcpy(*component, start, (size_t)len);
   (*component)[len]='\0';    /* the source section is not terminated at "end" */
}
60 
/*
 * Parse a URL (split the URL string into its fields).
 *
 * The input is first copied into a local buffer, replacing every byte that is
 * not a graphic ASCII character (isascii() && isgraph()), as well as the
 * single quote, by a "%hh" escape. The escaped copy is then scanned one
 * character at a time by a state machine: the current character (or '$' for
 * any character without special meaning) is OR-ed with the current parse mode,
 * and the combined value selects the action in one big switch. Some mode
 * changes need the current character to be looked at again in the new mode;
 * this is requested through the "recycle" flag.
 *
 * url: the URL string to parse; the string itself is not modified.
 *
 * Returns a newly allocated struct Url. Components that are not present in
 * the input stay NULL; "proto.str" starts out as an empty string and
 * "proto.type" as PT_UNKNOWN; "port" stays 0 unless a host is given, in which
 * case it defaults to 80. "full_url" and "path" are not filled in here. On an
 * unknown protocol or an unexpected character a message is written to stderr
 * and the struct is returned early with "proto.type" set to PT_INTERNAL.
 *
 * Terminates the program with exit status 1 if memory allocation fails.
 */
struct Url *split_url(url)
const char	*url;
{
   struct Url	*components;

   char		*url_char;    /* current position in URL string */

   char		*word_start;    /* starting position of currently parsed url field */
   char		*name_start;    /* start of file name (component of path after last '/') */

   /* mode to parse next char in (<<8 to allow simple combining with "dispatch_char") */
   enum {
      PM_START=0x000,
      PM_SLASH=0x100,    /* '/' at beginning of URL */
      PM_PROTO=0x200,    /* inside protocol specification */
      PM_HOST_START1=0x300,    /* first '/' of "//" introducing host */
      PM_HOST_START2=0x400,    /* second '/' of "//" introducing host */
      PM_HOST=0x500,    /* inside host name */
      PM_PORT=0x600,    /* inside port number */
      PM_PATH=0x700,    /* inside directory/name string */
      PM_PARA=0x800,    /* inside CGI parameter list */
      PM_FRAG=0x900,    /* inside fragment identifier */
      PM_END=0xa00    /* '\0' ending URL string */
   } parse_mode=PM_START;

   char		escaped_url[3*strlen(url)+1];    /* enough for worst case of all characters needing escaping */


   DMSG(("parsing URL...\n"));

   components=malloc(sizeof(struct Url));
   if(components==NULL) {
      fprintf(stderr, "memory allocation error while parsing URL\n");
      exit(1);
   }
   /* set default values */
   {
      /* positional initializer; depends on the field order of struct Url, which is declared outside this file */
      const struct Url	def={NULL, NULL, {NULL, PT_UNKNOWN}, NULL, 0, NULL, NULL, NULL, NULL, 0, 0};

      memcpy(components, &def, sizeof(def));
      components->proto.str=strdup("");    /* must be present when setting PT_INTERNAL due to error */
      if(components->proto.str==NULL) {
	 fprintf(stderr, "memory allocation error while splitting URL\n");
	 exit(1);
      }
   }

   /* Escape invalid chars in URL. */
   {
      char	*src_pos;
      char	*write_pos;

      for(src_pos=(char *)url, write_pos=escaped_url; *src_pos; ++src_pos) {    /* all chars in original URL */
	 const unsigned char	chr=*src_pos;

	 if(isascii(chr) && isgraph(chr) && chr!='\'')    /* normal char (see hacking-load.* for explanation!) */
	    write_pos+=snprintf(write_pos, sizeof(escaped_url) - (write_pos-escaped_url), "%c", chr);    /* just store char */
	 else    /* needs to be escaped */
	    write_pos+=snprintf(write_pos, sizeof(escaped_url) - (write_pos-escaped_url), "%%%.2x", (int)chr);    /* store %hh */
      }
      *write_pos=0;
   }

   /* the loop deliberately also processes the terminating '\0'; it only stops once the state machine has reached PM_END */
   for(name_start=word_start=url_char=escaped_url; parse_mode!=PM_END; ++url_char) {    /* all chars in URL string */
      int	dispatch_char;    /* input character after modification for dispatch */
      int	dispatch;    /* combined state variable used to determine action in main dispatcher */
      int	recycle=0;    /* need additional dispatch pass (sometimes necessary after switching parsing mode) */

      DMSG(("%c", *url_char));

      /* strchr() also matches the terminating '\0' of the search string, so the end of the URL is dispatched like a special character */
      if(strchr(":/?#", *url_char)==NULL)    /* normal character */
	 dispatch_char='$';    /* indicated by '$' (shared dispatch case) */
      else    /* characters which may have special function */
	 dispatch_char=*url_char;    /* handled each separately in dispatcher */

      do {    /* while recycle (additional pass necessary) */
	 recycle=0;
	 dispatch=dispatch_char|parse_mode;    /* dispatch on new char and current state */
	 switch(dispatch) {

	    /* URL starts with normal char -> assume it is a protocol specification */
	    case '$'|PM_START:
	       parse_mode=PM_PROTO;
	       /* intentionally falls through: nothing else to do for the first protocol char */

	    /* normal protocol string char -> go ahead */
	    case '$'|PM_PROTO:
	       break;

	    /* URL starts with '/' */
	    case '/'|PM_START:
	       name_start=url_char+1;    /* might be path start... (see '/'|PM_PATH below) */
	       parse_mode=PM_SLASH;
	       break;

	    /* URL start with host ("//") */
	    case '/'|PM_SLASH:
	       parse_mode=PM_HOST_START2;    /* host will follow */
	       recycle=1;
	       break;

	    /* URL starts with path (with single '/') -> skip to path parsing */
	    case '$'|PM_SLASH:
	    case '?'|PM_SLASH:
	    case '#'|PM_SLASH:
	    case '\0'|PM_SLASH:

	    /* URL starts neither with '/' nor with word -> also skip to path parsing (which will skip to next component) */
	    case '?'|PM_START:
	    case '#'|PM_START:
	    case '\0'|PM_START:

	    /* URL starts with word, but first word doesn't end with ':' => it was not the protocol specification -> also skip to path parsing */
	    case '/'|PM_PROTO:
	    case '?'|PM_PROTO:
	    case '#'|PM_PROTO:
	    case '\0'|PM_PROTO:
	       parse_mode=PM_PATH;
	       recycle=1;
	       break;

	    /* protocol specification end */
	    case ':'|PM_PROTO:
	       free(components->proto.str);    /* drop the default empty string before storing the real one */
	       store_component(word_start, url_char, &components->proto.str);

	       /* test for known protocols */
	       /* NOTE(review): POSIX declares strcasecmp() in <strings.h>, which this file does not include itself -- confirm it is pulled in through another header */
	       if(components->proto.str!=NULL) {
		  if(strcasecmp(components->proto.str, "http")==0)
		     components->proto.type=PT_HTTP;
		  else if(strcasecmp(components->proto.str, "ftp")==0)
		     components->proto.type=PT_FTP;
		  else if(strcasecmp(components->proto.str, "file")==0)
		     components->proto.type=PT_FILE;
		  else {
		     fprintf(stderr, "unknown protocol %s\n", components->proto.str);
		     components->proto.type=PT_INTERNAL;
		     return components;
		  }
	       } else
		  components->proto.type=PT_UNKNOWN;

	       parse_mode=PM_HOST_START1;
	       break;

	    /* first '/' of "//" introducing host specification */
	    case '/'|PM_HOST_START1:
	       parse_mode=PM_HOST_START2;
	       break;

	    /* second '/' */
	    case '/'|PM_HOST_START2:
	       parse_mode=PM_HOST;
	       word_start=url_char+1;    /* host name starts after the "//" */
	       break;

	    /* host name char */
	    case '$'|PM_HOST:
	       break;

	    /* host name end */
	    case ':'|PM_HOST:
	    case '/'|PM_HOST:
	    case '?'|PM_HOST:
	    case '#'|PM_HOST:
	    case '\0'|PM_HOST:
	       store_component(word_start, url_char, &components->host);

	       parse_mode=PM_PORT;
	       if(*url_char==':')    /* port number follows host name */
		  word_start=url_char+1;
	       else {    /* something else than port number follows -> needs to take another look at the current char */
		  word_start=url_char;
		  recycle=1;
	       }
	       break;

	    /* port number char */
	    case '$'|PM_PORT:
	       break;

	    /* port number end */
	    case '/'|PM_PORT:
	    case '?'|PM_PORT:
	    case '#'|PM_PORT:
	    case '\0'|PM_PORT:
	       components->port=atoi(word_start);    /* atoi() stops at the first non-digit, i.e. at the character ending the port */
	       if(components->port==0)    /* no port specified -> use default port */
		  components->port=80;

	       name_start=word_start=url_char;
	       parse_mode=PM_PATH;
	       recycle=1;    /* path start always needs additional pass (for setting "name_start" properly) */
	       break;

	    /* normal path char */
	    case '$'|PM_PATH:
	    case ':'|PM_PATH:
	       break;

	    /* '/' in path -> previous word wasn't file name, but the following one might be */
	    case '/'|PM_PATH:
	       name_start=url_char+1;    /* file name starts after last '/' */
	       break;

	    /* path end */
	    case '?'|PM_PATH:
	    case '#'|PM_PATH:
	    case '\0'|PM_PATH:
	       if((url_char-name_start==1 && !strncmp(name_start, ".", 1)) || (url_char-name_start==2 && !strncmp(name_start, "..", 2))) {    /* path ends in "." or ".." => this is *not* the file name, but a dir component! */
		  char	save_char=*url_char;
		  *url_char='/';    /* let it end with a slash like a dir should... */
		  store_component(word_start, url_char+1, &components->dir);    /* whole path, and the appended '/', are directory */
		  *url_char=save_char;    /* need to restore the original char at this position, as it is important for further processing */
	       } else {
		  store_component(word_start, name_start, &components->dir);    /* path ends at last '/' */
		  store_component(name_start, url_char, &components->name);    /* file name is path component after last '/' */
	       }

	       word_start=url_char+1;
	       parse_mode=PM_PARA;
	       if(*url_char!='?')    /* no parameter list follows -> let the parameter state handle this char ('#' or '\0') */
		  recycle=1;
	       break;

	    /* parameter list char */
	    case '$'|PM_PARA:
	    case ':'|PM_PARA:
	    case '/'|PM_PARA:    /* '/' and '?' treated as normal chars in parameter list */
	    case '?'|PM_PARA:
	       break;

	    /* parameter list end */
	    case '#'|PM_PARA:
	    case '\0'|PM_PARA:
	       store_component(word_start, url_char, &components->params);

	       word_start=url_char+1;
	       parse_mode=PM_FRAG;
	       if(*url_char!='#')    /* no fragment identifier follows -> let the fragment state handle the '\0' */
		  recycle=1;
	       break;

	    /* fragment identifier char */
	    case '$'|PM_FRAG:
	       break;

	    /* fragment identifier end == url end */
	    case '\0'|PM_FRAG:
	       store_component(word_start, url_char, &components->frag);

	       parse_mode=PM_END;
	       break;

	    /* every combination not listed above (e.g. ':' as very first character, or ':', '/', '?', '#' inside the fragment identifier) is a parse error */
	    default:
	       fprintf(stderr, "\nURL parse error (unexpected character)\n");
	       components->proto.type=PT_INTERNAL;
	       return components;
	 }    /* switch(dispatch) */
      } while(recycle);    /* additional dispatch pass after parsing mode change */
   }    /* for all chars in URL string */

#ifdef DEBUG
   if(cfg.debug) {
      debug_printf("\nextracted URL components:\n");
      dump_url(components);
   }
#endif

   return components;
}
332 
333 /* free memory used by a parsed URL struct */
free_url(url)334 void free_url(url)
335 struct Url	*url;
336 {
337    DMSG(("freeing url components...\n"));
338 
339    if(url->full_url!=NULL)
340       free(url->full_url);
341    if(url->proto.str!=NULL)
342       free(url->proto.str);
343    if(url->host!=NULL)
344       free(url->host);
345    if(url->dir!=NULL)
346       free(url->dir);
347    if(url->name!=NULL)
348       free(url->name);
349    if(url->params!=NULL)
350       free(url->params);
351    if(url->frag!=NULL)
352       free(url->frag);
353 
354    free(url);
355 }
356 
/*
 * Append a string to the end of another (heap allocated) string, growing the
 * allocation as needed.
 *
 * str:    address of the string to extend. "*str" is either NULL (treated as
 *         an empty string, i.e. a new string is created) or a malloc()ed,
 *         NUL-terminated string. On return "*str" points to the extended
 *         string, which may have been moved.
 * append: NUL-terminated text to add at the end
 *
 * Terminates the program with exit status 2 if memory allocation fails.
 */
static void str_append(str, append)
char		**str;
const char	*append;
{
   const size_t	old_len=(*str!=NULL) ? strlen(*str) : 0;
   const size_t	add_len=strlen(append);
   char		*grown;

   /* realloc() with a NULL pointer behaves like malloc(), which covers the "no string yet" case;
    * the result goes to a temporary so the old pointer is only replaced once the call succeeded */
   grown=realloc(*str, old_len+add_len+1);
   if(grown==NULL) {
      fprintf(stderr, "memory allocation error while creating URL (in function str_append)\n");
      exit(2);
   }

   memcpy(grown+old_len, append, add_len+1);    /* copy includes the terminating '\0' */
   *str=grown;
}
379 
380 /*
381  * Merges two URLs to create a new one.
382  *
383  * The "url_string" is split into components, and "base_url" is used to
384  * complete the URL: Components are taken from "base_url" (or defaults, if no
385  * "base_url" supplied) until the first field is specified in the main URL;
386  * afterwards, all components are taken from the main URL (or defaults, if none
387  * supplied). If the main URL path is relative, it is merged with the base
388  * path.
389  *
390  * The "form" parameter gives an optional form data string which is to be
391  * submitted with the URL. (Method GET.) NULL means nothing to submit.
392  * Otherwise, the form data is attached as part of the merged URL.
393  */
394 
merge_urls(base,url_string,form)395 struct Url *merge_urls(base, url_string, form)
396 const struct Url	*base;    /* default values/base path */
397 const char		*url_string;    /* URL to be split and completed by "base_url" */
398 const char		*form;
399 {
400    struct Url	*url;    /* merged URL */
401    struct Url	*base_url;
402    struct Url	*main_url;
403 
404    base_url=(struct Url *)base;
405 #ifdef DEBUG
406    if(cfg.debug) {
407       if(base_url!=NULL) {
408 	 debug_printf("Base URL components:\n");
409 	 dump_url(base_url);
410       } else
411 	 debug_printf("No base URL.\n");
412    }
413 #endif
414 
415    main_url=split_url(url_string);
416    if(main_url->proto.type==PT_INTERNAL)    /* splitting failed */
417       fprintf(stderr, "can't get absolute target URL\n");
418 
419    DMSG(("Merging...\n"));
420 
421    /* alloc struct for merged URL */
422    url=malloc(sizeof(struct Url));
423    if(url==NULL) {
424       fprintf(stderr, "memory allocation error while parsing URL\n");
425       exit(1);
426    }
427 
428    /* merge with fallback values from "base_url" */
429 
430    if(main_url->proto.type!=PT_UNKNOWN) {    /* some protocol set in "main_url" -> take it */
431       base_url=NULL;    /* take following components from "main_url" also (use defaults if not supplied) */
432       url->proto.type=main_url->proto.type;
433       url->proto.str=strdup(main_url->proto.str);
434    } else {    /* no protocol in "main_url" */
435       if(base_url!=NULL) {    /* has "base_url" -> take fallback from it */
436 	 url->proto.type=base_url->proto.type;
437 	 url->proto.str=strdup(base_url->proto.str);
438       } else {    /* no "base_url" -> use default */
439 	 url->proto.type=PT_UNKNOWN;
440 	 url->proto.str=strdup("");
441       }
442    }
443 
444    if(base_url==NULL)    /* absolute URL (no "base_url" was given, or "main_url" contained a protocol specification, so "base_url" isn't used) */
445       url->absolute=1;
446    else
447       url->absolute=0;
448 
449    if(main_url->host!=NULL) {
450       base_url=NULL;
451       url->host=strdup(main_url->host);
452       url->port=main_url->port;    /* port always set with host */
453    } else {
454       if(base_url!=NULL) {    /* has "base_url" (and no components from "main_url" yet) */
455 	 url->host=strdup(base_url->host);
456 	 url->port=base_url->port;
457       } else {
458 	 url->host=strdup("");
459 	 url->port=80;
460       }
461    }
462 
463    if(main_url->dir!=NULL) {
464       if(main_url->dir[0]=='/' || base_url==NULL)    /* dir in "main_url" is absolute (or only one supplied) -> take it */
465 	 url->dir=strdup(main_url->dir);
466       else {    /* dir in "main_url" is relative -> merge with "base_url" */
467 	 char	merged_dir[strlen(base_url->dir)+strlen(main_url->dir)+1];
468 	 char	*merge_part=main_url->dir;
469 
470 	 strcpy(merged_dir, base_url->dir);
471 
472 	 if(!strncmp(merge_part, "./", 2))    /* skip "./" */
473 	    merge_part+=2;
474 
475 	 /* handle ".." */
476 	 for(; *merged_dir && !strncmp(merge_part, "../", 3); merge_part+=3) {    /* relative path starts with "..", and there is anything left in original path -> remove last component */
477 	    char	*trunc_pos;
478 
479 	    /* find beginning of last component */
480 	    for(trunc_pos=strchr(merged_dir, 0)-1; trunc_pos>merged_dir && *(trunc_pos-1)!='/'; --trunc_pos);    /* look for '/' ending previous component, or string start (skip last character, which is the '/' terminating last component) */
481 	    *trunc_pos=0;
482 	 }    /* for all ".." */
483 
484 	 strcat(merged_dir, merge_part);    /* concatenate the part of the relative path remaining after skipping all the "./" and "../" */
485 	 url->dir=strdup(merged_dir);
486       }
487       base_url=NULL;
488    } else {
489       if(base_url!=NULL)
490 	 url->dir=strdup(base_url->dir);
491       else
492 	 url->dir=strdup(url->proto.type==PT_FILE || url->proto.type==PT_UNKNOWN?"":"/");
493    }
494 
495    if(main_url->name!=NULL) {
496       base_url=NULL;
497       url->name=strdup(main_url->name);
498    } else {
499       if(base_url!=NULL)
500 	 url->name=strdup(base_url->name);
501       else
502 	 url->name=strdup("");
503    }
504 
505 
506    if(form!=NULL) {    /* submit form data -> store in place of any other CGI paramters */
507       base_url=NULL;    /* handle as if the data was passed as part of main URL (force loading new document even if URL otherwise identical; discard old fragment identifier) */
508       url->params=strdup(form);
509    } else if(main_url->params!=NULL) {    /* no form to submit, but paramters given in main URL -> take them */
510       base_url=NULL;
511       url->params=strdup(main_url->params);
512    } else {
513       if(base_url!=NULL)
514 	 url->params=strdup(base_url->params);
515       else
516 	 url->params=strdup("");
517    }
518 
519    if(base_url!=NULL)    /* same document as "base", only fragment identifier may differ (nothing taken from "main_url" yet) */
520       url->local=1;
521    else
522       url->local=0;
523 
524    if(main_url->frag!=NULL) {
525       base_url=NULL;
526       url->frag=strdup(main_url->frag);
527    } else {
528       if(base_url!=NULL)
529 	 url->frag=strdup(base_url->frag);
530       else
531 	 url->frag=strdup("");
532    }
533 
534    if(url->proto.str==NULL
535       || url->host==NULL
536       || url->dir==NULL
537       || url->name==NULL
538       || url->params==NULL
539       || url->frag==NULL
540    ) {
541       fprintf(stderr, "memory allocation error while creating URL (in function merge_urls)\n");
542       exit(1);
543    }
544 
545    free_url(main_url);
546 
547    /* create "full_url" string (and set "path" pointer) */
548    {
549       int	path_pos;
550 
551       DMSG(("creating new URL...\n"));
552       url->full_url=NULL;
553 
554       if(url->proto.type==PT_HTTP || url->proto.type==PT_FTP) {
555 	 str_append(&url->full_url, url->proto.str);
556 	 str_append(&url->full_url, "://");
557       }
558 
559       str_append(&url->full_url, url->host);
560 
561       if(url->port!=80) {
562 	 char	port_str[7];
563 	 snprintf(port_str, sizeof(port_str), ":%d", url->port);
564 	 str_append(&url->full_url, port_str);
565       }
566 
567       path_pos=strlen(url->full_url);    /* path starts where "dir" and following components will be appended (present string end) */
568 
569       str_append(&url->full_url, url->dir);
570 
571       str_append(&url->full_url, url->name);
572 
573       if(*url->params) {
574 	 str_append(&url->full_url, "?");
575 	 str_append(&url->full_url, url->params);
576       }
577 
578       url->path=url->full_url+path_pos;    /* store pointer to full path (can store pointer now, as full_url won't be moved anymore) */
579    }
580 
581 #ifdef DEBUG
582    if(cfg.debug) {
583       debug_printf("merged URL components:\n");
584       dump_url(url);
585    }
586 #endif
587 
588    return url;
589 }
590 
/*
 * Returns a string with %xx URL escape codes from the source string replaced
 * by the respective characters. The result is newly allocated (the caller has
 * to free() it); the source remains untouched.
 *
 * Only a '%' followed by exactly two hexadecimal digits is treated as an
 * escape code. A '%' followed by anything else (including the end of the
 * string) is copied literally, so malformed or truncated escapes can never
 * make the scan run past the end of the source string. An escape decoding to
 * the NUL character ("%00") ends the result at that position.
 *
 * escaped: NUL-terminated string to unescape; must not be NULL
 *
 * Terminates the program with exit status 1 if memory allocation fails.
 */
char *url_unescape(escaped)
const char *escaped;
{
   const char	*rpos=escaped;    /* read position in source string */
   char		*wpos;    /* write position in result */
   char		*result;
   char		chr;

   result=malloc(strlen(escaped)+1);    /* unescaping never makes the string longer */
   if(result==NULL) {
      fprintf(stderr, "memory allocation error while unescaping path (in function url_unescape)\n");
      exit(1);
   }

   wpos=result;
   do {
      /* the tests short-circuit, so rpos[2] is only looked at if rpos[1] isn't the string end */
      if(rpos[0]=='%' && isxdigit((unsigned char)rpos[1]) && isxdigit((unsigned char)rpos[2])) {
	 const char	hex[3]={rpos[1], rpos[2], '\0'};

	 chr=(char)(unsigned char)strtol(hex, NULL, 16);
	 rpos+=3;    /* skip '%' and both digits */
      } else
	 chr=*rpos++;
   } while((*wpos++=chr)!='\0');    /* the terminating '\0' is copied too, and ends the loop */

   return result;
}
617