1 /*
2 netrik -- The ANTRIK Internet Viewer
3 Copyright (C) Olaf D. Buddenhagen AKA antrik, et al (see AUTHORS)
4 Published under the GNU GPL; see LICENSE for details.
5 */
6 /*
7 * url.c -- URL handling functions
8 *
9 * (C) 2001, 2002 antrik
10 * 2002 Patrice Neff
11 */
12 #include <ctype.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16
17 #include "debug.h"
18 #include "url.h"
19
20 static void dump_url(const struct Url *url); /* print components of a split URL */
21 static void store_component(const char *start, const char *end, char **component); /* store a URL field extracted by split_url() */
22 static void str_append(char **str, const char *append); /* append string to end of other string, reallocating memory */
23
24 /* print components of a split URL */
dump_url(url)25 static void dump_url(url)
26 const struct Url *url;
27 {
28 debug_printf(" full url: %s\n", url->full_url);
29 debug_printf(" full path: %s\n", url->path);
30 debug_printf(" protocol: %s\n", url->proto.str);
31 debug_printf(" host: %s\n", url->host);
32 debug_printf(" port: %d\n", url->port);
33 debug_printf(" directory: %s\n", url->dir);
34 debug_printf(" file name: %s\n", url->name);
35 debug_printf(" parameters: %s\n", url->params);
36 debug_printf(" fragment identifier: %s\n", url->frag);
37 }
38
/*
 * Store a section representing one URL field extracted by split_url() to the
 * respective string.
 *
 * start:     first character of the section
 * end:       one past the last character of the section
 * component: where to store; receives a newly allocated, NUL-terminated copy
 *            of the section, or NULL if the section is empty (end<=start)
 *
 * The section must not contain a NUL character. The caller owns the stored
 * string and has to free() it. Terminates the program with exit status 1 if
 * no memory is available.
 */
static void store_component(const char *start, const char *end, char **component)
{
	size_t len;

	if(end<=start) {    /* empty (or inverted) section -> field not present */
		*component=NULL;
		return;
	}
	len=(size_t)(end-start);

	*component=malloc(len+1);
	if(*component==NULL) {    /* test the allocation result, not the address of the output slot */
		fprintf(stderr, "memory allocation error while parsing URL (in function store_component)\n");
		exit(1);
	}

	memcpy(*component, start, len);
	(*component)[len]='\0';
}
60
61 /* parse URL (split URL string into fields) */
split_url(url)62 struct Url *split_url(url)
63 const char *url;
64 {
65 struct Url *components;
66
67 char *url_char; /* current position in URL string */
68
69 char *word_start; /* starting position of currently parsed url field */
70 char *name_start; /* start of file name (component of path after last '/') */
71
72 /* mode to parse next char in (<<8 to allow simple combining with "dispatch_char") */
73 enum {
74 PM_START=0x000,
75 PM_SLASH=0x100, /* '/' at beginning of URL */
76 PM_PROTO=0x200, /* inside protocoll specification */
77 PM_HOST_START1=0x300, /* first '/' of "//" introcucing host */
78 PM_HOST_START2=0x400,
79 PM_HOST=0x500, /* inside host name */
80 PM_PORT=0x600, /* inside port number */
81 PM_PATH=0x700, /* inside directory/name string */
82 PM_PARA=0x800, /* inside CGI parameter list */
83 PM_FRAG=0x900, /* inside fragment identifier */
84 PM_END=0xa00 /* '\0' ending URL string */
85 } parse_mode=PM_START;
86
87 char escaped_url[3*strlen(url)+1]; /* enough for worst case of all characters needing escaping */
88
89
90 DMSG(("parsing URL...\n"));
91
92 components=malloc(sizeof(struct Url));
93 if(components==NULL) {
94 fprintf(stderr, "memory allocation error while parsing URL\n");
95 exit(1);
96 }
97 /* set default values */
98 {
99 const struct Url def={NULL, NULL, {NULL, PT_UNKNOWN}, NULL, 0, NULL, NULL, NULL, NULL, 0, 0};
100
101 memcpy(components, &def, sizeof(def));
102 components->proto.str=strdup(""); /* must be present when setting PT_INTERNAL due to error */
103 if(components->proto.str==NULL) {
104 fprintf(stderr, "memory allocation error while splitting URL\n");
105 exit(1);
106 }
107 }
108
109 /* Escape invalid chars in URL. */
110 {
111 char *src_pos;
112 char *write_pos;
113
114 for(src_pos=(char *)url, write_pos=escaped_url; *src_pos; ++src_pos) { /* all chars in original URL */
115 const unsigned char chr=*src_pos;
116
117 if(isascii(chr) && isgraph(chr) && chr!='\'') /* normal char (see hacking-load.* for explanation!) */
118 write_pos+=snprintf(write_pos, sizeof(escaped_url) - (write_pos-escaped_url), "%c", chr); /* just store char */
119 else /* needs to be escaped */
120 write_pos+=snprintf(write_pos, sizeof(escaped_url) - (write_pos-escaped_url), "%%%.2x", (int)chr); /* store %hh */
121 }
122 *write_pos=0;
123 }
124
125 for(name_start=word_start=url_char=escaped_url; parse_mode!=PM_END; ++url_char) { /* all chars in URL string */
126 int dispatch_char; /* input character after modification for dispatch */
127 int dispatch; /* combined state variable used to determine action in main dispatcher */
128 int recycle=0; /* need additional dispatch pass (sometimes necessary after switching parsing mode */
129
130 DMSG(("%c", *url_char));
131
132 if(strchr(":/?#", *url_char)==NULL) /* normal character */
133 dispatch_char='$'; /* indicated by '$' (shared dispatch case) */
134 else /* characters which may have special function */
135 dispatch_char=*url_char; /* handled each seperately in dispatcher */
136
137 do { /* while recycle (additional pass necessary) */
138 recycle=0;
139 dispatch=dispatch_char|parse_mode; /* dispatch on new char and current state */
140 switch(dispatch) {
141
142 /* URL starts with normal char -> assume it is a protocol specification */
143 case '$'|PM_START:
144 parse_mode=PM_PROTO;
145
146 /* normal protocol string char -> go ahead */
147 case '$'|PM_PROTO:
148 break;
149
150 /* URL starts with '/' */
151 case '/'|PM_START:
152 name_start=url_char+1; /* might be path start... (see '/'|PM_PATH below) */
153 parse_mode=PM_SLASH;
154 break;
155
156 /* URL start with host ("//") */
157 case '/'|PM_SLASH:
158 parse_mode=PM_HOST_START2; /* host will follow */
159 recycle=1;
160 break;
161
162 /* URL starts with path (with single '/') -> skip to path parsing */
163 case '$'|PM_SLASH:
164 case '?'|PM_SLASH:
165 case '#'|PM_SLASH:
166 case '\0'|PM_SLASH:
167
168 /* URL starts neither with '/' nor with word -> also skip to path parsing (which will skip to next component) */
169 case '?'|PM_START:
170 case '#'|PM_START:
171 case '\0'|PM_START:
172
173 /* URL starts with word, but first word doesn't end with ':' => it was not the protocol specification -> also skip to path parsing */
174 case '/'|PM_PROTO:
175 case '?'|PM_PROTO:
176 case '#'|PM_PROTO:
177 case '\0'|PM_PROTO:
178 parse_mode=PM_PATH;
179 recycle=1;
180 break;
181
182 /* protocol specification end */
183 case ':'|PM_PROTO:
184 free(components->proto.str);
185 store_component(word_start, url_char, &components->proto.str);
186
187 /* test for known protocols */
188 if(components->proto.str!=NULL) {
189 if(strcasecmp(components->proto.str, "http")==0)
190 components->proto.type=PT_HTTP;
191 else if(strcasecmp(components->proto.str, "ftp")==0)
192 components->proto.type=PT_FTP;
193 else if(strcasecmp(components->proto.str, "file")==0)
194 components->proto.type=PT_FILE;
195 else {
196 fprintf(stderr, "unknown protocol %s\n", components->proto.str);
197 components->proto.type=PT_INTERNAL;
198 return components;
199 }
200 } else
201 components->proto.type=PT_UNKNOWN;
202
203 parse_mode=PM_HOST_START1;
204 break;
205
206 /* first '/' of "//" introducing host specification */
207 case '/'|PM_HOST_START1:
208 parse_mode=PM_HOST_START2;
209 break;
210
211 /* second '/' */
212 case '/'|PM_HOST_START2:
213 parse_mode=PM_HOST;
214 word_start=url_char+1; /* host name starts after the "//" */
215 break;
216
217 /* host name char */
218 case '$'|PM_HOST:
219 break;
220
221 /* host name end */
222 case ':'|PM_HOST:
223 case '/'|PM_HOST:
224 case '?'|PM_HOST:
225 case '#'|PM_HOST:
226 case '\0'|PM_HOST:
227 store_component(word_start, url_char, &components->host);
228
229 parse_mode=PM_PORT;
230 if(*url_char==':') /* port number follows host name */
231 word_start=url_char+1;
232 else { /* something else than port number follows -> needs to take another look at the current char */
233 word_start=url_char;
234 recycle=1;
235 }
236 break;
237
238 /* port number char */
239 case '$'|PM_PORT:
240 break;
241
242 /* port number end */
243 case '/'|PM_PORT:
244 case '?'|PM_PORT:
245 case '#'|PM_PORT:
246 case '\0'|PM_PORT:
247 components->port=atoi(word_start);
248 if(components->port==0) /* no port specified -> use default port */
249 components->port=80;
250
251 name_start=word_start=url_char;
252 parse_mode=PM_PATH;
253 recycle=1; /* path start always needs additional pass (for setting "name_start" properly) */
254 break;
255
256 /* normal path char */
257 case '$'|PM_PATH:
258 case ':'|PM_PATH:
259 break;
260
261 /* '/' in path -> previous word wasn't file name, but the following one might be */
262 case '/'|PM_PATH:
263 name_start=url_char+1; /* file name starts after last '/' */
264 break;
265
266 /* path end */
267 case '?'|PM_PATH:
268 case '#'|PM_PATH:
269 case '\0'|PM_PATH:
270 if((url_char-name_start==1 && !strncmp(name_start, ".", 1)) || (url_char-name_start==2 && !strncmp(name_start, "..", 2))) { /* path ends in "." or ".." => this is *not* the file name, but a dir component! */
271 char save_char=*url_char;
272 *url_char='/'; /* let it end with a slash like a dir should... */
273 store_component(word_start, url_char+1, &components->dir); /* whole path, and the appended '/', are directory */
274 *url_char=save_char; /* need to restore the original char at this position, as it is important for further processing */
275 } else {
276 store_component(word_start, name_start, &components->dir); /* path ends at last '/' */
277 store_component(name_start, url_char, &components->name); /* file name is path component after last '/' */
278 }
279
280 word_start=url_char+1;
281 parse_mode=PM_PARA;
282 if(*url_char!='?')
283 recycle=1;
284 break;
285
286 /* parameter list char */
287 case '$'|PM_PARA:
288 case ':'|PM_PARA:
289 case '/'|PM_PARA: /* '/' and '?' treated as normal chars in parameter list */
290 case '?'|PM_PARA:
291 break;
292
293 /* parameter list end */
294 case '#'|PM_PARA:
295 case '\0'|PM_PARA:
296 store_component(word_start, url_char, &components->params);
297
298 word_start=url_char+1;
299 parse_mode=PM_FRAG;
300 if(*url_char!='#')
301 recycle=1;
302 break;
303
304 /* fragment identifier char */
305 case '$'|PM_FRAG:
306 break;
307
308 /* fragment identifier end == url end */
309 case '\0'|PM_FRAG:
310 store_component(word_start, url_char, &components->frag);
311
312 parse_mode=PM_END;
313 break;
314
315 default:
316 fprintf(stderr, "\nURL parse error (unexpected character)\n");
317 components->proto.type=PT_INTERNAL;
318 return components;
319 } /* switch(dispatch) */
320 } while(recycle); /* additional dispatch pass after parsing mode change */
321 } /* for all chars in URL string */
322
323 #ifdef DEBUG
324 if(cfg.debug) {
325 debug_printf("\nextracted URL components:\n");
326 dump_url(components);
327 }
328 #endif
329
330 return components;
331 }
332
333 /* free memory used by a parsed URL struct */
free_url(url)334 void free_url(url)
335 struct Url *url;
336 {
337 DMSG(("freeing url components...\n"));
338
339 if(url->full_url!=NULL)
340 free(url->full_url);
341 if(url->proto.str!=NULL)
342 free(url->proto.str);
343 if(url->host!=NULL)
344 free(url->host);
345 if(url->dir!=NULL)
346 free(url->dir);
347 if(url->name!=NULL)
348 free(url->name);
349 if(url->params!=NULL)
350 free(url->params);
351 if(url->frag!=NULL)
352 free(url->frag);
353
354 free(url);
355 }
356
/*
 * Append a string to the end of another string, reallocating memory.
 *
 * str:    address of the (heap allocated) string to extend; if it holds NULL,
 *         a new string is created. Updated to the (possibly moved) result.
 * append: text to add; NULL is treated like the empty string
 *
 * Terminates the program with exit status 2 if no memory is available.
 */
static void str_append(char **str, const char *append)
{
	const size_t old_len= *str!=NULL ? strlen(*str) : 0;
	const size_t add_len= append!=NULL ? strlen(append) : 0;
	char *grown;

	grown=realloc(*str, old_len+add_len+1);    /* acts like malloc() when there is no string yet */
	if(grown==NULL) {
		fprintf(stderr, "memory allocation error while creating URL (in function str_append)\n");
		exit(2);
	}

	if(add_len>0)
		memcpy(grown+old_len, append, add_len);
	grown[old_len+add_len]='\0';    /* also terminates a freshly created (empty) string */

	*str=grown;
}
379
380 /*
381 * Merges two URLs to create a new one.
382 *
383 * The "url_string" is split into components, and "base_url" is used to
384 * complete the URL: Components are taken from "base_url" (or defaults, if no
385 * "base_url" supplied) until the first field is specified in the main URL;
386 * afterwards, all components are taken from the main URL (or defaults, if none
387 * supplied). If the main URL path is relative, it is merged with the base
388 * path.
389 *
390 * The "form" parameter gives an optional form data string which is to be
391 * submitted with the URL. (Method GET.) NULL means nothing to submit.
392 * Otherwise, the form data is attached as part of the merged URL.
393 */
394
merge_urls(base,url_string,form)395 struct Url *merge_urls(base, url_string, form)
396 const struct Url *base; /* default values/base path */
397 const char *url_string; /* URL to be split and completed by "base_url" */
398 const char *form;
399 {
400 struct Url *url; /* merged URL */
401 struct Url *base_url;
402 struct Url *main_url;
403
404 base_url=(struct Url *)base;
405 #ifdef DEBUG
406 if(cfg.debug) {
407 if(base_url!=NULL) {
408 debug_printf("Base URL components:\n");
409 dump_url(base_url);
410 } else
411 debug_printf("No base URL.\n");
412 }
413 #endif
414
415 main_url=split_url(url_string);
416 if(main_url->proto.type==PT_INTERNAL) /* splitting failed */
417 fprintf(stderr, "can't get absolute target URL\n");
418
419 DMSG(("Merging...\n"));
420
421 /* alloc struct for merged URL */
422 url=malloc(sizeof(struct Url));
423 if(url==NULL) {
424 fprintf(stderr, "memory allocation error while parsing URL\n");
425 exit(1);
426 }
427
428 /* merge with fallback values from "base_url" */
429
430 if(main_url->proto.type!=PT_UNKNOWN) { /* some protocol set in "main_url" -> take it */
431 base_url=NULL; /* take following components from "main_url" also (use defaults if not supplied) */
432 url->proto.type=main_url->proto.type;
433 url->proto.str=strdup(main_url->proto.str);
434 } else { /* no protocol in "main_url" */
435 if(base_url!=NULL) { /* has "base_url" -> take fallback from it */
436 url->proto.type=base_url->proto.type;
437 url->proto.str=strdup(base_url->proto.str);
438 } else { /* no "base_url" -> use default */
439 url->proto.type=PT_UNKNOWN;
440 url->proto.str=strdup("");
441 }
442 }
443
444 if(base_url==NULL) /* absolute URL (no "base_url" was given, or "main_url" contained a protocol specification, so "base_url" isn't used) */
445 url->absolute=1;
446 else
447 url->absolute=0;
448
449 if(main_url->host!=NULL) {
450 base_url=NULL;
451 url->host=strdup(main_url->host);
452 url->port=main_url->port; /* port always set with host */
453 } else {
454 if(base_url!=NULL) { /* has "base_url" (and no components from "main_url" yet) */
455 url->host=strdup(base_url->host);
456 url->port=base_url->port;
457 } else {
458 url->host=strdup("");
459 url->port=80;
460 }
461 }
462
463 if(main_url->dir!=NULL) {
464 if(main_url->dir[0]=='/' || base_url==NULL) /* dir in "main_url" is absolute (or only one supplied) -> take it */
465 url->dir=strdup(main_url->dir);
466 else { /* dir in "main_url" is relative -> merge with "base_url" */
467 char merged_dir[strlen(base_url->dir)+strlen(main_url->dir)+1];
468 char *merge_part=main_url->dir;
469
470 strcpy(merged_dir, base_url->dir);
471
472 if(!strncmp(merge_part, "./", 2)) /* skip "./" */
473 merge_part+=2;
474
475 /* handle ".." */
476 for(; *merged_dir && !strncmp(merge_part, "../", 3); merge_part+=3) { /* relative path starts with "..", and there is anything left in original path -> remove last component */
477 char *trunc_pos;
478
479 /* find beginning of last component */
480 for(trunc_pos=strchr(merged_dir, 0)-1; trunc_pos>merged_dir && *(trunc_pos-1)!='/'; --trunc_pos); /* look for '/' ending previous component, or string start (skip last character, which is the '/' terminating last component) */
481 *trunc_pos=0;
482 } /* for all ".." */
483
484 strcat(merged_dir, merge_part); /* concatenate the part of the relative path remaining after skipping all the "./" and "../" */
485 url->dir=strdup(merged_dir);
486 }
487 base_url=NULL;
488 } else {
489 if(base_url!=NULL)
490 url->dir=strdup(base_url->dir);
491 else
492 url->dir=strdup(url->proto.type==PT_FILE || url->proto.type==PT_UNKNOWN?"":"/");
493 }
494
495 if(main_url->name!=NULL) {
496 base_url=NULL;
497 url->name=strdup(main_url->name);
498 } else {
499 if(base_url!=NULL)
500 url->name=strdup(base_url->name);
501 else
502 url->name=strdup("");
503 }
504
505
506 if(form!=NULL) { /* submit form data -> store in place of any other CGI paramters */
507 base_url=NULL; /* handle as if the data was passed as part of main URL (force loading new document even if URL otherwise identical; discard old fragment identifier) */
508 url->params=strdup(form);
509 } else if(main_url->params!=NULL) { /* no form to submit, but paramters given in main URL -> take them */
510 base_url=NULL;
511 url->params=strdup(main_url->params);
512 } else {
513 if(base_url!=NULL)
514 url->params=strdup(base_url->params);
515 else
516 url->params=strdup("");
517 }
518
519 if(base_url!=NULL) /* same document as "base", only fragment identifier may differ (nothing taken from "main_url" yet) */
520 url->local=1;
521 else
522 url->local=0;
523
524 if(main_url->frag!=NULL) {
525 base_url=NULL;
526 url->frag=strdup(main_url->frag);
527 } else {
528 if(base_url!=NULL)
529 url->frag=strdup(base_url->frag);
530 else
531 url->frag=strdup("");
532 }
533
534 if(url->proto.str==NULL
535 || url->host==NULL
536 || url->dir==NULL
537 || url->name==NULL
538 || url->params==NULL
539 || url->frag==NULL
540 ) {
541 fprintf(stderr, "memory allocation error while creating URL (in function merge_urls)\n");
542 exit(1);
543 }
544
545 free_url(main_url);
546
547 /* create "full_url" string (and set "path" pointer) */
548 {
549 int path_pos;
550
551 DMSG(("creating new URL...\n"));
552 url->full_url=NULL;
553
554 if(url->proto.type==PT_HTTP || url->proto.type==PT_FTP) {
555 str_append(&url->full_url, url->proto.str);
556 str_append(&url->full_url, "://");
557 }
558
559 str_append(&url->full_url, url->host);
560
561 if(url->port!=80) {
562 char port_str[7];
563 snprintf(port_str, sizeof(port_str), ":%d", url->port);
564 str_append(&url->full_url, port_str);
565 }
566
567 path_pos=strlen(url->full_url); /* path starts where "dir" and following components will be appended (present string end) */
568
569 str_append(&url->full_url, url->dir);
570
571 str_append(&url->full_url, url->name);
572
573 if(*url->params) {
574 str_append(&url->full_url, "?");
575 str_append(&url->full_url, url->params);
576 }
577
578 url->path=url->full_url+path_pos; /* store pointer to full path (can store pointer now, as full_url won't be moved anymore) */
579 }
580
581 #ifdef DEBUG
582 if(cfg.debug) {
583 debug_printf("merged URL components:\n");
584 dump_url(url);
585 }
586 #endif
587
588 return url;
589 }
590
/*
 * Returns a string with %xx URL escape codes from the source string replaced
 * by the respective characters. The result is newly allocated (the caller has
 * to free() it); the source remains untouched.
 *
 * A '%' which is not followed by two hexadecimal digits is no valid escape
 * code, and is copied literally, like any other character. (This also keeps
 * the scan from running past the end of the string on a trailing '%'.)
 *
 * An escape code decoding to '\0' ("%00") ends the result at that position.
 *
 * Terminates the program with exit status 1 if no memory is available.
 */
char *url_unescape(const char *escaped)
{
	const char *rpos=escaped;
	char *result;
	char *wpos;
	char chr;

	result=malloc(strlen(escaped)+1);    /* unescaping never makes the string longer */
	if(result==NULL) {
		fprintf(stderr, "memory allocation error while unescaping path (in function url_unescape)\n");
		exit(1);
	}

	wpos=result;
	do {
		if(rpos[0]=='%' && isxdigit((unsigned char)rpos[1]) && isxdigit((unsigned char)rpos[2])) {    /* complete escape code (the checks short-circuit, so nothing behind the terminating '\0' is ever read) */
			const char hex[3]={rpos[1], rpos[2], '\0'};

			chr=(char)(unsigned char)strtoul(hex, NULL, 16);
			rpos+=3;
		} else
			chr=*rpos++;
	} while((*wpos++=chr)!='\0');    /* the terminating '\0' is copied too */

	return result;
}
617