1 /*
2    netrik -- The ANTRIK Internet Viewer
3    Copyright (C) Olaf D. Buddenhagen AKA antrik, et al (see AUTHORS)
4    Published under the GNU GPL; see LICENSE for details.
5 */
/*
 * load.c -- load file from resource
 *
 * (C) 2001, 2002 antrik
 *     2001, 2002 Patrice Neff
 *     2003 Witold Filipczyk
 *
 * Open a file, HTTP connection etc., and then read data blockwise.
 */
15 #include <setjmp.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 #include <unistd.h>
20 
21 #include "config.h"
22 
23 #include "cfg.h"
24 #include "debug.h"
25 #include "forms.h"    /* for url_encode() */
26 #include "http.h"
27 #include "interrupt.h"
28 #include "load.h"
29 
/* size (in bytes) of the input buffer filled by each load() call:
 * 4096 normally, only 16 when DEBUG is defined
 * (presumably to exercise buffer refills in debug builds -- confirm) */
#ifndef DEBUG
   #define BUF_SIZE 4096
#else
   #define BUF_SIZE 16
#endif
35 
36 static void init_wget(struct Resource *res);    /* prepare reading HTTP resource via wget */
37 
38 /* prepare reading HTTP resource via wget;
39  * calls wget in background, using its standard output as our input stream */
init_wget(res)40 static void init_wget(res)
41 struct Resource	*res;
42 {
43    char	cmd[strlen(res->url->full_url)+sizeof(WGET_CMD)];
44 
45    res->type=RES_PIPE;
46 
47    sprintf(cmd, WGET_CMD, res->url->full_url);    /* create command line for wget */
48    res->handle.stream=popen(cmd, "r");    /* call in background, get output as pipe */
49    if(res->handle.stream==NULL) {
50       fprintf(stderr, "error executing wget\n");
51       if(res->url->proto.type==PT_HTTP)
52 	 fprintf(stderr, "(try using --builtin-http)\n");
53       exit(1);
54    }
55 }
56 
57 /*
58  * build new URL, and prepare addressed resource for reading data
59  * (create descriptor, open file/HTTP connection/wget pipe, alloc input buffer)
60  */
init_load(base,url,form_data)61 struct Resource *init_load(base, url, form_data)
62 const struct Url	*base;    /* main URL is relative to this one */
63 const char		*url;    /* main URL, to be merged with "base" */
64 const struct Item	*form_data;    /* form item of a form to submit */
65 {
66    struct Resource	*res;    /* opened ressource */
67    struct Url		*base_url=(struct Url *)base;    /* avoid warnings about modifying "base" */
68    char			*main_url=(char *)url;    /* avoid warnings about modifying "url" */
69    struct Item		*form=(struct Item *)form_data;    /* avoid warnings about modifying "form_data" */
70 
71    int			redir;    /* number of HTTP redirections */
72    struct Header	*redir_header;    /* HTTP header causing redirection, if any */
73 
74    hold_int();    /* enable SIGINT during whole loading process, but do not handle until read()/fread() called */
75 
76    for(redir_header=NULL, redir=0; !redir || (redir_header!=NULL && redir<=10); ++redir) {    /* repeat HTTP load for up to 10 redirections */
77 
78       if(redir_header) {
79 	 fprintf(stderr, "redirect to: %s\n", redir_header->value);
80 
81 	 base_url=res->url;    /* redirect URL is relative to original target URL */
82 	 res->url=NULL;    /* remove original link (prevent URL string from being freed) */
83 
84 	 main_url=redir_header->value;
85 	 redir_header->value=NULL;
86 
87 	 form=NULL;    /* redirect is always a simple GET */
88 
89 	 uninit_load(res);    /* redirect => don't need original connection anymore */
90 
91 	 redir_header=NULL;    /* this redirection is accounted for */
92       }    /* redirect */
93 
94       /* alloc resource descriptor */
95       res=malloc(sizeof(struct Resource));
96       if(res==NULL) {
97 	 fprintf(stderr, "memory allocation failure while opening resource\n");
98 	 exit(1);
99       }
100 
101       res->user_break=0;
102 
103       /* alloc input buffer */
104       res->buf=malloc(sizeof(char[BUF_SIZE]));
105       if(res->buf==NULL) {
106 	 fprintf(stderr, "memory allocation failure while opening resource\n");
107 	 exit(1);
108       }
109       res->buf_ptr=res->buf_end=res->buf;    /* empty */
110 
111       /* open file/connection/pipe */
112 
113       if(main_url!=NULL) {    /* not already a split/merged URL (from history) */
114 	 char	*form_data=NULL;
115 
116 	 if(form!=NULL && form->data.form->method==METHOD_GET) {
117 	    struct Data_string	form_data_string=url_encode(form);
118 	    if(form_data_string.size < 0) {    /* error in retrieving form data */
119 	       res->url->proto.type=PT_INTERNAL;
120 	       res->type=RES_FAIL;
121 	       return res;
122 	    }
123 	    form_data=form_data_string.data;    /* just get the data (URL encoded strings can be handled as normal C strings) */
124 	 }
125 
126 	 res->url=merge_urls(base_url, main_url, form_data);
127 	 free(form_data);
128 	 if(res->url->proto.type==PT_INTERNAL) {    /* couldn't merge URLs */
129 	    res->type=RES_FAIL;
130 	    return res;
131 	 }
132 
133 	 if(strcmp(main_url, "-")==0)    /* load from stdin */
134 	    res->url->proto.type=PT_INTERNAL;
135 
136 	 if(redir && res->url->proto.type!=PT_HTTP) {
137 	    fprintf(stderr, "illegal redirect (not to HTTP resource)\n");
138 	    res->type=RES_FAIL;
139 	    return res;
140 	 }
141       } else    /* already split URL */
142 	 res->url=base_url;
143 
144       switch(res->url->proto.type) {
145 	 case PT_INTERNAL:    /* use stdin */
146 	    if(!res->url->absolute) {
147 	       fprintf(stderr, "error: can't follow relative link from stdin\n");
148 	       exit(2);
149 	    }
150 
151 	    /* reopen stdin; use for data input */
152 	    res->handle.stream=fdopen(dup(0), "r");
153 	    if(res->handle.stream==NULL) {
154 	       fprintf(stderr, "can't read stdin (reopen failed)\n");
155 	       exit(1);
156 	    }
157 
158 	    /* use stderr as stdin */
159 	    if(dup2(2, 0)<0) {
160 	       fprintf(stderr, "can't reopen stderr for reading; don't know how to get interactive user input\n");
161 	       exit(1);
162 	    }
163 
164 	    res->type=RES_STDIN;
165 	    break;
166 
167 	 case PT_FILE:
168 	 case PT_UNKNOWN: {    /* assume local file first */
169 	    char	*path=url_unescape(res->url->path);
170 	    char	real_path[strlen(path)+sizeof(".bz2")];    /* real file name of (possibly compressed) local file */
171 
172 	    if(res->url->proto.type==PT_UNKNOWN)
173 	       DMSG(("trying file: %s\n", path));
174 
175 	    /* get "real_path" */
176 	    {
177 	       static const char	*ext[]={
178 		  "",
179 		  ".gz",
180 		  ".bz2",
181 		  NULL
182 	       };
183 
184 	       int	try;
185 
186 	       for(try=0; ext[try]!=NULL; ++try) {    /* all possible extensions */
187 		  snprintf(real_path, sizeof(real_path), "%s%s", path, ext[try]);
188 		  DMSG(("   trying %s...", real_path));
189 		  if(access(real_path, F_OK)==0) {    /* file exists -> keep this one */
190 		     DMSG(("OK\n"));
191 		     break;
192 		  } else
193 		     DMSG(("no\n"));
194 	       }
195 	       if(ext[try]==NULL)    /* none matched -> no file */
196 		  *real_path=0;
197 
198 	       free(path);    /* no longer needed */
199 	    }    /* get "real_path" */
200 
201 	    if(*real_path==0 && res->url->proto.type==PT_UNKNOWN && res->url->path[0]!='/') {    /* no local file of that name, but may be inclomplete HTTP URL */
202 	       char	x_url[sizeof("http://")+strlen(main_url)];    /* URL string with prepended protocol specification */
203 
204 	       DMSG(("can't open local file\ntrying HTTP...\n"));
205 	       sprintf(x_url, "http://%s", main_url);
206 
207 	       free_url(res->url);
208 	       res->url=merge_urls(base_url, x_url, NULL);    /* need to parse again (can't have form data -- if submitting form, always have some base URL!) */
209 	       if(res->url->proto.type==PT_INTERNAL) {    /* couldn't merge URLs */
210 		  res->type=RES_FAIL;
211 		  return res;
212 	       }
213 	       /* fallthrough (to PT_HTTP) */
214 	    } else {    /* must be local file */
215 	       res->handle.stream=NULL;    /* assume failed open (important if "*real_path==0"!) */
216 	       if(*real_path!=0) {    /* file exists */
217 /* Witold, antrik --> */
218 		  char	*ext=strrchr(real_path, '.');
219 		  char	*slash=strrchr(real_path, '/');
220 
221 		  res->type=RES_FILE;    /* assume normal file */
222 
223 		  if(ext!=NULL && (slash==NULL || slash<ext)) {    /* file name has extension => may be compressed */
224 		     const struct {
225 			char	*ext;
226 			char	*cmd;
227 		     } compress[]={
228 			{".gz", "gunzip -c '%s'"},
229 			{".bz2", "bunzip2 -c '%s'"},
230 			{NULL, NULL}
231 		     };
232 
233 		     int	try;
234 
235 		     DMSG(("has file extension (%s), maybe compressed\n", ext));
236 		     for(try=0; compress[try].ext!=NULL; ++try) {    /* all possible compression types */
237 			DMSG(("   testing for %s ...", compress[try].ext));
238 			if(!strcmp(compress[try].ext, ext)) {    /* match -> open using this compression */
239 			   char	cmd[strlen(compress[try].cmd)+strlen(real_path)];
240 
241 			   /* security workaround: prevent escaping shell quote */
242 			   {
243 			      char *chr;
244 			      for(chr=real_path; *chr; ++chr)
245 				 if(*chr=='\'')
246 				    *chr=' ';
247 			   }
248 
249 			   snprintf(cmd, sizeof(cmd), compress[try].cmd, real_path);
250 
251 			   DMSG(("OK\nopening compressed file with command:\n%s\n", cmd));
252 			   res->handle.stream = popen(cmd, "r");
253 			   res->type=RES_PIPE;
254 			   break;    /* don't search further */
255 			} else    /* compress type doesn't match */
256 			   DMSG(("no\n"));
257 		     }    /* for all possible compression types */
258 		  }    /* has extension */
259 /* <-- Witold, antrik */
260 
261 		  if(res->type==RES_FILE) {    /* not compressed -> open normal file */
262 		     res->handle.stream=fopen(real_path, "r");
263 		  }
264 	       }    /* file exists */
265 
266 	       if(res->handle.stream==NULL) {
267 		  fprintf(stderr, "can't open file %s\n", main_url);
268 		  res->type=RES_FAIL;
269 		  res->url->proto.type=PT_INTERNAL;    /* don't keep in history */
270 		  return res;
271 	       }
272 
273 	       if(!cfg.dump)
274 		  fprintf(stderr, "loading file: %s\n\n", real_path);
275 	       else
276 		  DMSG(("loading file: %s\n", real_path));
277 
278 	       res->url->proto.type=PT_FILE;
279 	       break;
280 	    }    /* must be local file */
281 	 }    /* PT_FILE */
282 
283 	 case PT_HTTP:
284 	    if(cfg.wget)
285 	       init_wget(res);
286 	    else {
287 	       int	cur_header;    /* currently scanned header */
288 
289 	       http_init_load(res, (form!=NULL && form->data.form->method!=METHOD_GET) ? form : NULL);    /* open connection and load HTTP head */
290 
291 	       if(redir) {    /* not original URLs (function arguments) -> don't keep */
292 		  free_url(base_url);
293 		  free(main_url);
294 	       }
295 
296 	       /* check for redirect */
297 	       for(cur_header=0; cur_header<res->handle.http->headers.count; ++cur_header) {    /* all headers */
298 		  if(strcmp(res->handle.http->headers.header[cur_header].name, "location")==0) {    /* found */
299 		     redir_header=&res->handle.http->headers.header[cur_header];
300 		     break;    /* don't search further */
301 		  }
302 	       }
303 	    }    /* not wget */
304 	    break;
305 
306 	 case PT_FTP:
307 	    init_wget(res);
308       }    /* switch proto.type */
309 
310    }    /* while redirect, up to 10 times */
311    if(redir>10) {    /* still wants to redirect after last iteration */
312       fprintf(stderr, "Too many redirections.\n");
313       res->type=RES_FAIL;
314       res->url->proto.type=PT_INTERNAL;    /* don't keep in history */
315    }
316 
317    return res;
318 }
319 
320 /* read data block from ressource into buffer */
load(res)321 void load(res)
322 struct Resource	*res;
323 {
324    int chars_read;
325 
326    if(setjmp(label_int)) {    /* longjmp from SIGINT handler */
327       res->user_break=1;
328       chars_read=0;
329    } else {    /* normal action */
330       enable_int();    /* allow SIGINT during read()/fread() */
331 
332       if(res->type==RES_FAIL || res->user_break)    /* can't/shouldn't read anything */
333 	 chars_read=0;
334 /* Patrice, antrik --> */
335       else if(res->type == RES_HTTP) {
336 	 chars_read=read(res->handle.http->socket, res->buf, BUF_SIZE);
337 	 if(chars_read<0) {
338 	    fprintf(stderr, "error while reading data from HTTP connection\n");
339 	    res->type=RES_FAIL;
340 	    chars_read=0;    /* return EOF */
341 	 }
342       } else {    /* RES_FILE, RES_PIPE */
343 /* <-- Patrice, antrik */
344 	 chars_read=fread(res->buf, sizeof(char), BUF_SIZE, res->handle.stream);
345 	 if(ferror(res->handle.stream)) {
346 	    fprintf(stderr, "error while reading from file\n");
347 	    res->type=RES_FAIL;
348 	    chars_read=0;    /* return EOF */
349 	 }
350       }    /* RES_FILE, RES_PIPE */
351    }    /* normal action (not longjmp return) */
352 
353    hold_int();    /* put SIGINT on hold again (till next call of load()) */
354 
355    res->buf_end=res->buf+chars_read;
356 }
357 
358 /* tidy up after loading a file;
359  * closes stream and then frees memory allocated for the buffer and the resource
360  * handle */
uninit_load(res)361 void uninit_load(res)
362 struct Resource *res;
363 {
364    if(res->type==RES_PIPE) {
365       if(pclose(res->handle.stream)!=0 && !res->user_break) {    /* care about wget error only if not user break (which always causes broken pipe...) */
366 	 fprintf(stderr, "error loading HTTP page\n");
367 	 exit(1);
368       }
369    } else if(res->type==RES_HTTP) {
370       close(res->handle.http->socket);
371 
372       {
373 	 int	header;
374 	 for(header=0; header < res->handle.http->headers.count; ++header) {
375 	    free(res->handle.http->headers.header[header].name);
376 	    free(res->handle.http->headers.header[header].value);
377 	 }
378 	 free(res->handle.http->headers.header);
379       }
380 
381       free(res->handle.http);
382    } else if(res->type==RES_FILE || res->type==RES_STDIN)
383       fclose(res->handle.stream);
384 
385    disable_int();    /* ignore ^C outside of file/HTTP loads */
386 
387    free(res->buf);
388    free(res);
389 }
390