1 /*
2  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by Niels Provos.
16  * 4. The name of the author may not be used to endorse or promote products
17  *    derived from this software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/time.h>
33 #include <sys/stat.h>
34 #include <sys/queue.h>
35 #include <fcntl.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <string.h>
39 #include <err.h>
40 #include <unistd.h>
41 #include <regex.h>
42 
43 #include "config.h"
44 
45 #include <event.h>
46 
47 #include "tree.h"
48 #include "http.h"
49 #include "html.h"
50 #include "crawldb.h"
51 #include "util.h"
52 #include "dns.h"
53 #include "cfg.h"
54 
/* Program name and version; together they form the default User-Agent */
#define CRAWL		"crawl"
#define CRAWL_VERSION	VERSION

/* Helpers and callbacks that are referenced before their definition */
void	usage(void);
ssize_t	atomicio(ssize_t (*f)(), int fd, void *buf, size_t len);

int	url_permitted(char *url);
int	img_permitted(char *url);
int	image_saver(struct uri *uri);

/* argv[0], printed by usage() */
char *progname;

/* The single HTML parser instance shared by all fetched documents */
struct html_parse *parser;

/*
 * Compiled regular expressions that decide whether a URL (or a media
 * URL) is included in or excluded from the crawl.
 */
regex_t reurlinc, reurlexcl;
regex_t reimginc, reimgexcl;

/* Path of the directory in which saved images are stored */
char *saveimgdir;

/* Timeout value; WAIT_DEFAULT seconds unless overridden with -t */
struct timeval tv_timeout;

/* Maximum link depth; -1 disables the depth check */
int maxdepth = 0;

#define WAIT_DEFAULT	5

/* Default size window for media; -1 disables the respective bound */
#define MEDIA_MINLEN	20000
#define MEDIA_MAXLEN	400000

int media_minlen = MEDIA_MINLEN;
int media_maxlen = MEDIA_MAXLEN;

/*
 * Pipe descriptors to and from an external filter that does special
 * processing; both stay -1 while no filter is running.
 */
int extmatchw = -1;
int extmatchr = -1;
92 
/* Verbosity level; settable with the -v command line option */
int debug = 1;

/*
 * Print a diagnostic if the verbosity level is at least x.  y is a
 * complete, parenthesized fprintf() argument list, for example
 *	DFPRINTF(2, (stderr, "Max depth reached: %s\n", url));
 * The do/while wrapper makes the expansion a single statement, so a
 * following "else" can not attach to the macro's internal "if".
 */
#define DFPRINTF(x,y)	do { \
				if (debug >= (x)) \
					fprintf y; \
			} while (0)
96 
97 void
usage(void)98 usage(void)
99 {
100 	fprintf(stderr,
101 		"%s: [-u urlincl] [-e urlexcl] [-i imgincl] [-d dir] [-m depth] <url> ...\n"
102 		"\t for documentation of all options consult the man page\n",
103 		progname);
104 }
105 
106 int
url_permitted(char * url)107 url_permitted(char *url)
108 {
109 	if (strncasecmp(url, HTTP_PREFIX, strlen(HTTP_PREFIX)))
110 		return (0);
111 
112 	if (regexec(&reurlinc, url, 0, NULL, 0) != 0)
113 		return (0);
114 
115 	if (regexec(&reurlexcl, url, 0, NULL, 0) == 0)
116 		return (0);
117 
118 	/* Use an external filter */
119 	if (extmatchw != -1) {
120 		char ch;
121 
122 		if (atomicio(write, extmatchw, url, strlen(url)) == -1)
123 			err(1, "write");
124 		if (atomicio(write, extmatchw, "\n", 1) == -1)
125 			err(1, "write");
126 
127 		if (atomicio(read, extmatchr, &ch, 1) == -1)
128 			err(1, "read");
129 
130 		if (ch != 'y')
131 			return (0);
132 	}
133 
134 	return (1);
135 }
136 
137 int
img_permitted(char * url)138 img_permitted(char *url)
139 {
140 	if (strncasecmp(url, HTTP_PREFIX, strlen(HTTP_PREFIX)))
141 		return (0);
142 
143 	if (regexec(&reimginc, url, 0, NULL, 0) != 0)
144 		return (0);
145 
146 	if (regexec(&reimgexcl, url, 0, NULL, 0) == 0)
147 		return (0);
148 
149 	return (1);
150 }
151 
152 int
image_saver(struct uri * uri)153 image_saver(struct uri *uri)
154 {
155 	struct stat sb;
156 	char *path;
157 	char tmp[128], *p = "";
158 	char *url;
159 
160 	url = http_make_url(&uri->url);
161 
162 	/*
163 	 * Sometimes we download something that should have been
164 	 * html, but is media instead.
165 	 */
166 	if (uri->save_fd == -1 && !img_permitted(url))
167 		return (-1);
168 
169 	if (uri->length != -1 && uri->bdlen != uri->length) {
170 		snprintf(tmp, sizeof(tmp), " (%4.1f%%/%d)",
171 		    (float)uri->bdlen/uri->length*100, uri->length);
172 		p = tmp;
173 	}
174 	fprintf(stdout, "%s %s%s\n",
175 		uri->flags & HTTP_REQUEST_GET ? "GET" : "HEAD",
176 		url, uri->flags & HTTP_REQUEST_GET ? p : "");
177 
178 	if (uri->flags & HTTP_REQUEST_HEAD) {
179 		int minlen, maxlen;
180 
181 		if (uri->format == NULL) {
182 			minlen = media_minlen;
183 			maxlen = media_maxlen;
184 		} else {
185 			/* Get lengths depending on mime types */
186 			minlen = conf_get_num(uri->format, "Min-Length",
187 			    media_minlen);
188 			maxlen = conf_get_num(uri->format, "Max-Length",
189 			    media_maxlen);
190 		}
191 
192 		/* See if it meets our extra constraints */
193 		if ((minlen != -1 && uri->length < minlen) ||
194 		    (maxlen != -1 && uri->length > maxlen))
195 			return (-1);
196 
197 		/* Re-add request as GET */
198 		http_add(HTTP_REQUEST_GET, url, uri->depth);
199 		return (0);
200 	}
201 
202 	if ((path = construct_path(url, 1)) == NULL)
203 		return (-1);
204 	if (stat(path, &sb) != -1) {
205 		if (sb.st_size >= uri->length)
206 			return (-1);
207 	}
208 	if (uri->save_fd == -1) {
209 		uri->save_fd = open(path, O_WRONLY|O_CREAT|O_TRUNC, 0644);
210 		if (uri->save_fd == -1)
211 			return (-1);
212 	}
213 
214 	if (atomicio(write, uri->save_fd, uri->body, uri->bdread) == -1)
215 		return (-1);
216 
217 	uri->bdread = 0;
218 
219 	return (0);
220 }
221 
222 void
http_dealimage(char * url,u_short depth)223 http_dealimage(char *url, u_short depth)
224 {
225 	char *path;
226 
227 	if (!img_permitted(url)) {
228 		DFPRINTF(3, (stderr, "Reject: %s\n", url));
229 		return;
230 	}
231 
232 	/* Check if the file exists */
233 	if ((path = construct_path(url, 0)) == NULL)
234 		return;
235 	if (access(path, F_OK) != -1)
236 		return;
237 
238 	if (http_mark_seen(url) != 0)
239 		return;
240 
241 	http_add(HTTP_REQUEST_HEAD, url, depth);
242 }
243 
244 void
html_foundimage(void * arg,char * el,char ** attr)245 html_foundimage(void *arg, char *el, char **attr)
246 {
247 	struct uri *uri = arg;
248 	char *base, *normal;
249 
250 	if (!strcasecmp(el, "img"))
251 		attr = html_attr_find(attr, "src");
252 	else if (!strcasecmp(el, "body"))
253 		attr = html_attr_find(attr, "background");
254 
255 	if (*attr == NULL)
256 		return;
257 
258 	base = http_basename(uri);
259 	if (base == NULL)
260 		return;
261 
262 	normal = http_make_uri(base, attr[1]);
263 	if (normal == NULL) {
264 		DFPRINTF(3, (stderr, "Failed on %s and %s\n", base, attr[1]));
265 		return;
266 	}
267 
268 	http_dealimage(normal, uri->depth + 1);
269 }
270 
271 void
html_foundanchor(void * arg,char * el,char ** attr)272 html_foundanchor(void *arg, char *el, char **attr)
273 {
274 	struct uri *uri = arg;
275 	char *base, *normal;
276 
277 	if (!strcasecmp(el, "a"))
278 		attr = html_attr_find(attr, "href");
279 	else if (!strcasecmp(el, "frame"))
280 		attr = html_attr_find(attr, "src");
281 	else if (!strcasecmp(el, "iframe"))
282 		attr = html_attr_find(attr, "src");
283 	else if (!strcasecmp(el, "area"))
284 		attr = html_attr_find(attr, "href");
285 	else if (!strcasecmp(el, "base"))
286 		attr = html_attr_find(attr, "link");
287 	else if (!strcasecmp(el, "link")) {
288 		char **p;
289 		p = html_attr_find(attr, "href");
290 		if (*p != NULL)
291 			attr = p;
292 		else
293 			attr = html_attr_find(attr, "rel");
294 	} else
295 		return;
296 
297 	if (*attr == NULL)
298 		return;
299 
300 	base = http_basename(uri);
301 	if (base == NULL)
302 		return;
303 
304 	normal = http_make_uri(base, attr[1]);
305 	if (normal == NULL) {
306 		DFPRINTF(3, (stderr, "Failed on %s and %s\n", base, attr[1]));
307 		return;
308 	}
309 
310 	/* If this is an image, do to the image dealer */
311 	if (isMedia(normal))
312 		return (http_dealimage(normal, uri->depth + 1));
313 
314 	if (maxdepth != -1 && uri->depth + 1 > maxdepth) {
315 		DFPRINTF(2, (stderr, "Max depth reached: %s\n", normal));
316 		return;
317 	}
318 
319 	/* fprintf(stdout, "Anchor: %s\n", normal); */
320 	if (!url_permitted(normal)) {
321 		DFPRINTF(3, (stderr, "Reject: %s\n", normal));
322 		return;
323 	}
324 
325 	if (http_mark_seen(normal) != 0)
326 		return;
327 
328 	http_add(HTTP_REQUEST_GET, normal, uri->depth + 1);
329 }
330 
331 int
html_follower(struct uri * uri)332 html_follower(struct uri *uri)
333 {
334 	fprintf(stdout, "%s http://%s%s\n",
335 		uri->flags & HTTP_REQUEST_GET ? "GET" : "HEAD",
336 		uri->url_host, uri->url_file);
337 
338 	/* We should never get a head request here */
339 	if (uri->flags & HTTP_REQUEST_HEAD)
340 		return (0);
341 
342 	html_parse_setdata(parser, uri);
343 	html_parser(parser, uri->body, uri->bdlen);
344 
345 	return (0);
346 }
347 
348 /* Deal with relocations */
349 
350 void
http_moved(struct uri * uri,char * location)351 http_moved(struct uri *uri, char *location)
352 {
353 	if (!url_permitted(location)) {
354 		DFPRINTF(3, (stderr, "Reject: %s\n", location));
355 		return;
356 	}
357 
358 	if (http_mark_seen(location) != 0)
359 		return;
360 
361 	http_add(uri->flags & HTTP_REQUEST_GET ?
362 	    HTTP_REQUEST_GET : HTTP_REQUEST_HEAD,
363 	    location, uri->depth);
364 }
365 
366 void
external_filter(char * program)367 external_filter(char *program)
368 {
369 	int toext[2], fromext[2];
370 	int res;
371 
372 	if (pipe(toext) == -1)
373 		err(1, "pipe");
374 	if (pipe(fromext) == -1)
375 		err(1, "pipe");
376 
377 	res = fork();
378 	if (res == -1)
379 		err(1, "fork");
380 	if (res == 0) {
381 		/* Child */
382 		dup2(toext[0], fileno(stdin));
383 		close(toext[1]);
384 		dup2(fromext[1], fileno(stdout));
385 		close(fromext[0]);
386 
387 		if (execlp(program, program, NULL) == -1)
388 			err(1, "execlp");
389 		exit(0);
390 	}
391 
392 	/* Parent */
393 	extmatchw = toext[1];
394 	extmatchr = fromext[0];
395 	close(toext[0]);
396 	close(fromext[1]);
397 }
398 
/*
 * Assign the string f to the variable w, but only if f is not NULL, so
 * that a missing configuration value leaves the default in place.
 * f is evaluated exactly once.  The temporary has a name that is
 * unlikely to clash with the variable passed as w; a clash would turn
 * the assignment into a silent no-op.
 */
#define CONF_SAVE(w,f)	do { \
				char *conf_save_val_ = (f); \
				if (conf_save_val_ != NULL) \
					(w) = conf_save_val_; \
			} while (0)
404 
405 int
main(int argc,char ** argv)406 main(int argc, char **argv)
407 {
408 	extern char *optarg;
409 	extern int optind;
410 	char *urlinclude = "http://.*\\.citi\\.umich\\.edu";
411 	char *urlexclude = "\\.(ps|gz|c|h|tar|exe|doc|pdf|ppt|txt|diff)$";
412 	char *imginclude = "\\.(jpg|jpeg)";
413 	char *imgexclude = "thumbs\\.";
414 	char *agent = CRAWL"/"CRAWL_VERSION;
415 	extern int use_robots, http_maxconnects;
416 	char *state = "crawl.state";
417 	char *external = NULL;
418 	char *resume = NULL;
419 	int ch;
420 
421 	progname = argv[0];
422 	saveimgdir = ".";
423 	timerclear(&tv_timeout);
424 	tv_timeout.tv_sec = WAIT_DEFAULT;
425 
426 	conf_init();
427 	CONF_SAVE(urlinclude, conf_get_str("General", "Url-Include"));
428 	CONF_SAVE(urlexclude, conf_get_str("General", "Url-Exclude"));
429 	CONF_SAVE(imginclude, conf_get_str("General", "Img-Include"));
430 	CONF_SAVE(imgexclude, conf_get_str("General", "Img-Exclude"));
431 	CONF_SAVE(saveimgdir, conf_get_str("General", "Img-Directory"));
432 	CONF_SAVE(external, conf_get_str("General", "External-Filter"));
433 	CONF_SAVE(agent, conf_get_str("HTTP", "Agent"));
434 	CONF_SAVE(state, conf_get_str("HTTP", "State-File"));
435 	use_robots = conf_get_num("HTTP", "Use-Robots", 1);
436 	http_maxconnects = conf_get_num("HTTP", "Connections", HTTP_MAXCONNECTS);
437 	maxdepth = conf_get_num("General", "Max-Depth", 0);
438 	media_minlen = conf_get_num("General", "Min-Length", MEDIA_MINLEN);
439 	media_maxlen = conf_get_num("General", "Max-Length", MEDIA_MAXLEN);
440 
441 	while ((ch = getopt(argc, argv, "t:v:u:e:i:d:m:RA:E:I:c:")) != -1)
442 		switch(ch) {
443 		case 't': {
444 			char *p;
445 			float val;
446 			val = strtod(optarg, &p);
447 			if (p == NULL && *p != '\0') {
448 				usage();
449 				exit(1);
450 			}
451 
452 			tv_timeout.tv_sec = val;
453 			tv_timeout.tv_usec = (val - (int)val) * 1000000L;
454 			break;
455 		}
456 		case 'v':
457 			debug = atoi(optarg);
458 			break;
459 		case 'c':
460 			resume = optarg;
461 			break;
462 		case 'u':
463 			urlinclude = optarg;
464 			break;
465 		case 'e':
466 			urlexclude = optarg;
467 			break;
468 		case 'i':
469 			imginclude = optarg;
470 			break;
471 		case 'I':
472 			imgexclude = optarg;
473 			break;
474 		case 'd':
475 			saveimgdir = optarg;
476 			break;
477 		case 'm':
478 			maxdepth = atoi(optarg);
479 			break;
480 		case 'A':
481 			agent = optarg;
482 			break;
483 		case 'R':
484 			use_robots = 0;
485 			break;
486 		case 'E':
487 			external = optarg;
488 			break;
489 		default:
490 			usage();
491 			exit(1);
492 		}
493 
494 	argc -= optind;
495 	argv += optind;
496 
497 	if (resume == NULL && argc < 1) {
498 		usage();
499 		exit(1);
500 	}
501 
502 	if (regcomp(&reurlinc, urlinclude, REG_EXTENDED|REG_ICASE) == -1)
503 		errx(1, "Error compiling regexp: '%s'\n", urlinclude);
504 	if (regcomp(&reurlexcl, urlexclude, REG_EXTENDED|REG_ICASE) == -1)
505 		errx(1, "Error compiling regexp: '%s'\n", urlinclude);
506 	if (regcomp(&reimginc, imginclude, REG_EXTENDED|REG_ICASE) == -1)
507 		errx(1, "Error compiling regexp: '%s'\n", imginclude);
508 	if (regcomp(&reimgexcl, imgexclude, REG_EXTENDED|REG_ICASE) == -1)
509 		errx(1, "Error compiling regexp: '%s'\n", imgexclude);
510 
511 	if (external != NULL)
512 		external_filter(external);
513 
514 	db_setup("crawl.db");
515 
516 	event_init();
517 	/* dns_init has to go before http_init because of signal handlers */
518 	dns_init();
519 	http_init(state);
520 
521 	if (resume == NULL) {
522 		while (argc) {
523 			if (http_add(HTTP_REQUEST_GET, argv[0], 0) != -1)
524 				http_mark_seen(argv[0]);
525 
526 			argc--;
527 			argv++;
528 		}
529 	} else
530 		if (http_restore_state(resume) == -1)
531 			exit(1);
532 
533 	/* Schedule connections */
534 	while (http_postevent() != -1)
535 		;
536 
537 	if (strlen(agent))
538 		http_setuseragent(agent);
539 
540 	http_register_dispatch("text/html", html_follower);
541 	http_register_dispatch("image/", image_saver);
542 	http_register_dispatch("audio/", image_saver);
543 	http_register_dispatch("video/", image_saver);
544 
545 	/* XXX - bad cludge */
546 	http_setcallback(http_movecb, http_moved);
547 
548 	if ((parser = html_newparser()) == NULL)
549 		return (-1);
550 
551 	html_register_cb(parser, "img", html_foundimage);
552 	html_register_cb(parser, "body", html_foundimage);
553 	html_register_cb(parser, "a", html_foundanchor);
554 	html_register_cb(parser, "frame", html_foundanchor);
555 	html_register_cb(parser, "area", html_foundanchor);
556 	html_register_cb(parser, "base", html_foundanchor);
557 	html_register_cb(parser, "link", html_foundanchor);
558 
559 	setvbuf(stdout, NULL, _IONBF, 0);
560 
561 	event_dispatch();
562 
563 	/* Print some informative stats */
564 	http_print_stats();
565 	dns_print_stats();
566 	dns_end();
567 
568 	/* Done with everything */
569 	html_freeparser(parser);
570 	db_close();
571 
572 	exit (0);
573 }
574