1 /*
2 netrik -- The ANTRIK Internet Viewer
3 Copyright (C) Olaf D. Buddenhagen AKA antrik, et al (see AUTHORS)
4 Published under the GNU GPL; see LICENSE for details.
5 */
/*
 * load.c -- load file from resource
 *
 * (C) 2001, 2002 antrik
 *     2001, 2002 Patrice Neff
 *     2003 Witold Filipczyk
 *
 * Open a file, HTTP connection etc, and then read data blockwise.
 */
15 #include <setjmp.h>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 #include <unistd.h>
20
21 #include "config.h"
22
23 #include "cfg.h"
24 #include "debug.h"
25 #include "forms.h" /* for url_encode() */
26 #include "http.h"
27 #include "interrupt.h"
28 #include "load.h"
29
/* size (in bytes) of the input buffer allocated for every resource;
 * debug builds use a very small buffer */
#ifndef DEBUG
#define BUF_SIZE 4096
#else
#define BUF_SIZE 16
#endif
35
36 static void init_wget(struct Resource *res); /* prepare reading HTTP resource via wget */
37
38 /* prepare reading HTTP resource via wget;
39 * calls wget in background, using its standard output as our input stream */
init_wget(res)40 static void init_wget(res)
41 struct Resource *res;
42 {
43 char cmd[strlen(res->url->full_url)+sizeof(WGET_CMD)];
44
45 res->type=RES_PIPE;
46
47 sprintf(cmd, WGET_CMD, res->url->full_url); /* create command line for wget */
48 res->handle.stream=popen(cmd, "r"); /* call in background, get output as pipe */
49 if(res->handle.stream==NULL) {
50 fprintf(stderr, "error executing wget\n");
51 if(res->url->proto.type==PT_HTTP)
52 fprintf(stderr, "(try using --builtin-http)\n");
53 exit(1);
54 }
55 }
56
57 /*
58 * build new URL, and prepare addressed resource for reading data
59 * (create descriptor, open file/HTTP connection/wget pipe, alloc input buffer)
60 */
init_load(base,url,form_data)61 struct Resource *init_load(base, url, form_data)
62 const struct Url *base; /* main URL is relative to this one */
63 const char *url; /* main URL, to be merged with "base" */
64 const struct Item *form_data; /* form item of a form to submit */
65 {
66 struct Resource *res; /* opened ressource */
67 struct Url *base_url=(struct Url *)base; /* avoid warnings about modifying "base" */
68 char *main_url=(char *)url; /* avoid warnings about modifying "url" */
69 struct Item *form=(struct Item *)form_data; /* avoid warnings about modifying "form_data" */
70
71 int redir; /* number of HTTP redirections */
72 struct Header *redir_header; /* HTTP header causing redirection, if any */
73
74 hold_int(); /* enable SIGINT during whole loading process, but do not handle until read()/fread() called */
75
76 for(redir_header=NULL, redir=0; !redir || (redir_header!=NULL && redir<=10); ++redir) { /* repeat HTTP load for up to 10 redirections */
77
78 if(redir_header) {
79 fprintf(stderr, "redirect to: %s\n", redir_header->value);
80
81 base_url=res->url; /* redirect URL is relative to original target URL */
82 res->url=NULL; /* remove original link (prevent URL string from being freed) */
83
84 main_url=redir_header->value;
85 redir_header->value=NULL;
86
87 form=NULL; /* redirect is always a simple GET */
88
89 uninit_load(res); /* redirect => don't need original connection anymore */
90
91 redir_header=NULL; /* this redirection is accounted for */
92 } /* redirect */
93
94 /* alloc resource descriptor */
95 res=malloc(sizeof(struct Resource));
96 if(res==NULL) {
97 fprintf(stderr, "memory allocation failure while opening resource\n");
98 exit(1);
99 }
100
101 res->user_break=0;
102
103 /* alloc input buffer */
104 res->buf=malloc(sizeof(char[BUF_SIZE]));
105 if(res->buf==NULL) {
106 fprintf(stderr, "memory allocation failure while opening resource\n");
107 exit(1);
108 }
109 res->buf_ptr=res->buf_end=res->buf; /* empty */
110
111 /* open file/connection/pipe */
112
113 if(main_url!=NULL) { /* not already a split/merged URL (from history) */
114 char *form_data=NULL;
115
116 if(form!=NULL && form->data.form->method==METHOD_GET) {
117 struct Data_string form_data_string=url_encode(form);
118 if(form_data_string.size < 0) { /* error in retrieving form data */
119 res->url->proto.type=PT_INTERNAL;
120 res->type=RES_FAIL;
121 return res;
122 }
123 form_data=form_data_string.data; /* just get the data (URL encoded strings can be handled as normal C strings) */
124 }
125
126 res->url=merge_urls(base_url, main_url, form_data);
127 free(form_data);
128 if(res->url->proto.type==PT_INTERNAL) { /* couldn't merge URLs */
129 res->type=RES_FAIL;
130 return res;
131 }
132
133 if(strcmp(main_url, "-")==0) /* load from stdin */
134 res->url->proto.type=PT_INTERNAL;
135
136 if(redir && res->url->proto.type!=PT_HTTP) {
137 fprintf(stderr, "illegal redirect (not to HTTP resource)\n");
138 res->type=RES_FAIL;
139 return res;
140 }
141 } else /* already split URL */
142 res->url=base_url;
143
144 switch(res->url->proto.type) {
145 case PT_INTERNAL: /* use stdin */
146 if(!res->url->absolute) {
147 fprintf(stderr, "error: can't follow relative link from stdin\n");
148 exit(2);
149 }
150
151 /* reopen stdin; use for data input */
152 res->handle.stream=fdopen(dup(0), "r");
153 if(res->handle.stream==NULL) {
154 fprintf(stderr, "can't read stdin (reopen failed)\n");
155 exit(1);
156 }
157
158 /* use stderr as stdin */
159 if(dup2(2, 0)<0) {
160 fprintf(stderr, "can't reopen stderr for reading; don't know how to get interactive user input\n");
161 exit(1);
162 }
163
164 res->type=RES_STDIN;
165 break;
166
167 case PT_FILE:
168 case PT_UNKNOWN: { /* assume local file first */
169 char *path=url_unescape(res->url->path);
170 char real_path[strlen(path)+sizeof(".bz2")]; /* real file name of (possibly compressed) local file */
171
172 if(res->url->proto.type==PT_UNKNOWN)
173 DMSG(("trying file: %s\n", path));
174
175 /* get "real_path" */
176 {
177 static const char *ext[]={
178 "",
179 ".gz",
180 ".bz2",
181 NULL
182 };
183
184 int try;
185
186 for(try=0; ext[try]!=NULL; ++try) { /* all possible extensions */
187 snprintf(real_path, sizeof(real_path), "%s%s", path, ext[try]);
188 DMSG((" trying %s...", real_path));
189 if(access(real_path, F_OK)==0) { /* file exists -> keep this one */
190 DMSG(("OK\n"));
191 break;
192 } else
193 DMSG(("no\n"));
194 }
195 if(ext[try]==NULL) /* none matched -> no file */
196 *real_path=0;
197
198 free(path); /* no longer needed */
199 } /* get "real_path" */
200
201 if(*real_path==0 && res->url->proto.type==PT_UNKNOWN && res->url->path[0]!='/') { /* no local file of that name, but may be inclomplete HTTP URL */
202 char x_url[sizeof("http://")+strlen(main_url)]; /* URL string with prepended protocol specification */
203
204 DMSG(("can't open local file\ntrying HTTP...\n"));
205 sprintf(x_url, "http://%s", main_url);
206
207 free_url(res->url);
208 res->url=merge_urls(base_url, x_url, NULL); /* need to parse again (can't have form data -- if submitting form, always have some base URL!) */
209 if(res->url->proto.type==PT_INTERNAL) { /* couldn't merge URLs */
210 res->type=RES_FAIL;
211 return res;
212 }
213 /* fallthrough (to PT_HTTP) */
214 } else { /* must be local file */
215 res->handle.stream=NULL; /* assume failed open (important if "*real_path==0"!) */
216 if(*real_path!=0) { /* file exists */
217 /* Witold, antrik --> */
218 char *ext=strrchr(real_path, '.');
219 char *slash=strrchr(real_path, '/');
220
221 res->type=RES_FILE; /* assume normal file */
222
223 if(ext!=NULL && (slash==NULL || slash<ext)) { /* file name has extension => may be compressed */
224 const struct {
225 char *ext;
226 char *cmd;
227 } compress[]={
228 {".gz", "gunzip -c '%s'"},
229 {".bz2", "bunzip2 -c '%s'"},
230 {NULL, NULL}
231 };
232
233 int try;
234
235 DMSG(("has file extension (%s), maybe compressed\n", ext));
236 for(try=0; compress[try].ext!=NULL; ++try) { /* all possible compression types */
237 DMSG((" testing for %s ...", compress[try].ext));
238 if(!strcmp(compress[try].ext, ext)) { /* match -> open using this compression */
239 char cmd[strlen(compress[try].cmd)+strlen(real_path)];
240
241 /* security workaround: prevent escaping shell quote */
242 {
243 char *chr;
244 for(chr=real_path; *chr; ++chr)
245 if(*chr=='\'')
246 *chr=' ';
247 }
248
249 snprintf(cmd, sizeof(cmd), compress[try].cmd, real_path);
250
251 DMSG(("OK\nopening compressed file with command:\n%s\n", cmd));
252 res->handle.stream = popen(cmd, "r");
253 res->type=RES_PIPE;
254 break; /* don't search further */
255 } else /* compress type doesn't match */
256 DMSG(("no\n"));
257 } /* for all possible compression types */
258 } /* has extension */
259 /* <-- Witold, antrik */
260
261 if(res->type==RES_FILE) { /* not compressed -> open normal file */
262 res->handle.stream=fopen(real_path, "r");
263 }
264 } /* file exists */
265
266 if(res->handle.stream==NULL) {
267 fprintf(stderr, "can't open file %s\n", main_url);
268 res->type=RES_FAIL;
269 res->url->proto.type=PT_INTERNAL; /* don't keep in history */
270 return res;
271 }
272
273 if(!cfg.dump)
274 fprintf(stderr, "loading file: %s\n\n", real_path);
275 else
276 DMSG(("loading file: %s\n", real_path));
277
278 res->url->proto.type=PT_FILE;
279 break;
280 } /* must be local file */
281 } /* PT_FILE */
282
283 case PT_HTTP:
284 if(cfg.wget)
285 init_wget(res);
286 else {
287 int cur_header; /* currently scanned header */
288
289 http_init_load(res, (form!=NULL && form->data.form->method!=METHOD_GET) ? form : NULL); /* open connection and load HTTP head */
290
291 if(redir) { /* not original URLs (function arguments) -> don't keep */
292 free_url(base_url);
293 free(main_url);
294 }
295
296 /* check for redirect */
297 for(cur_header=0; cur_header<res->handle.http->headers.count; ++cur_header) { /* all headers */
298 if(strcmp(res->handle.http->headers.header[cur_header].name, "location")==0) { /* found */
299 redir_header=&res->handle.http->headers.header[cur_header];
300 break; /* don't search further */
301 }
302 }
303 } /* not wget */
304 break;
305
306 case PT_FTP:
307 init_wget(res);
308 } /* switch proto.type */
309
310 } /* while redirect, up to 10 times */
311 if(redir>10) { /* still wants to redirect after last iteration */
312 fprintf(stderr, "Too many redirections.\n");
313 res->type=RES_FAIL;
314 res->url->proto.type=PT_INTERNAL; /* don't keep in history */
315 }
316
317 return res;
318 }
319
320 /* read data block from ressource into buffer */
load(res)321 void load(res)
322 struct Resource *res;
323 {
324 int chars_read;
325
326 if(setjmp(label_int)) { /* longjmp from SIGINT handler */
327 res->user_break=1;
328 chars_read=0;
329 } else { /* normal action */
330 enable_int(); /* allow SIGINT during read()/fread() */
331
332 if(res->type==RES_FAIL || res->user_break) /* can't/shouldn't read anything */
333 chars_read=0;
334 /* Patrice, antrik --> */
335 else if(res->type == RES_HTTP) {
336 chars_read=read(res->handle.http->socket, res->buf, BUF_SIZE);
337 if(chars_read<0) {
338 fprintf(stderr, "error while reading data from HTTP connection\n");
339 res->type=RES_FAIL;
340 chars_read=0; /* return EOF */
341 }
342 } else { /* RES_FILE, RES_PIPE */
343 /* <-- Patrice, antrik */
344 chars_read=fread(res->buf, sizeof(char), BUF_SIZE, res->handle.stream);
345 if(ferror(res->handle.stream)) {
346 fprintf(stderr, "error while reading from file\n");
347 res->type=RES_FAIL;
348 chars_read=0; /* return EOF */
349 }
350 } /* RES_FILE, RES_PIPE */
351 } /* normal action (not longjmp return) */
352
353 hold_int(); /* put SIGINT on hold again (till next call of load()) */
354
355 res->buf_end=res->buf+chars_read;
356 }
357
358 /* tidy up after loading a file;
359 * closes stream and then frees memory allocated for the buffer and the resource
360 * handle */
uninit_load(res)361 void uninit_load(res)
362 struct Resource *res;
363 {
364 if(res->type==RES_PIPE) {
365 if(pclose(res->handle.stream)!=0 && !res->user_break) { /* care about wget error only if not user break (which always causes broken pipe...) */
366 fprintf(stderr, "error loading HTTP page\n");
367 exit(1);
368 }
369 } else if(res->type==RES_HTTP) {
370 close(res->handle.http->socket);
371
372 {
373 int header;
374 for(header=0; header < res->handle.http->headers.count; ++header) {
375 free(res->handle.http->headers.header[header].name);
376 free(res->handle.http->headers.header[header].value);
377 }
378 free(res->handle.http->headers.header);
379 }
380
381 free(res->handle.http);
382 } else if(res->type==RES_FILE || res->type==RES_STDIN)
383 fclose(res->handle.stream);
384
385 disable_int(); /* ignore ^C outside of file/HTTP loads */
386
387 free(res->buf);
388 free(res);
389 }
390