1 /*
2 * Copyright 2001 Niels Provos <provos@citi.umich.edu>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by Niels Provos.
16 * 4. The name of the author may not be used to endorse or promote products
17 * derived from this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include <sys/types.h>
32 #include <sys/time.h>
33 #include <sys/stat.h>
34 #include <sys/queue.h>
35 #include <fcntl.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <string.h>
39 #include <err.h>
40 #include <unistd.h>
41 #include <regex.h>
42
43 #include "config.h"
44
45 #include <event.h>
46
47 #include "tree.h"
48 #include "http.h"
49 #include "html.h"
50 #include "crawldb.h"
51 #include "util.h"
52 #include "dns.h"
53 #include "cfg.h"
54
55 #define CRAWL "crawl"
56 #define CRAWL_VERSION VERSION
57
/* Forward declarations for functions used before their definition. */
void usage(void);
ssize_t atomicio(ssize_t (*f)(), int, void *, size_t);

int url_permitted(char *);
int img_permitted(char *);
int image_saver(struct uri *);

/* Program name as invoked; set from argv[0] in main() and used by usage(). */
char *progname;

/* HTML parser instance shared by the dispatch callbacks. */
struct html_parse *parser;

/*
 * Compiled regular expressions that decide whether a URL (reurl*) or a
 * media URL (reimg*) is included in or excluded from the crawl.
 */
regex_t reurlinc;
regex_t reurlexcl;
regex_t reimginc;
regex_t reimgexcl;

/* Path to the directory where saved images go. */
char *saveimgdir;

struct timeval tv_timeout;
int maxdepth = 0;

/* Default value, in seconds, that main() stores in tv_timeout. */
#define WAIT_DEFAULT 5

/* Default lower and upper bounds on the length of media worth fetching. */
#define MEDIA_MINLEN 20000
#define MEDIA_MAXLEN 400000

int media_minlen = MEDIA_MINLEN;
int media_maxlen = MEDIA_MAXLEN;

/*
 * File descriptors of the pipes to (extmatchw) and from (extmatchr) an
 * external filter that does special processing; -1 when no filter is used.
 */
int extmatchw = -1;
int extmatchr = -1;

/* Verbosity level; DFPRINTF prints only when debug >= the message level. */
int debug = 1;

/*
 * DFPRINTF(level, (stream, fmt, ...)): print via fprintf when the debug
 * level is at least `level`.  The second argument is a parenthesized
 * fprintf argument list.  The do/while wrapper makes the expansion a
 * single statement, so it is safe in an unbraced if/else.
 */
#define DFPRINTF(x,y) do { if (debug >= (x)) fprintf y; } while (0)
96
97 void
usage(void)98 usage(void)
99 {
100 fprintf(stderr,
101 "%s: [-u urlincl] [-e urlexcl] [-i imgincl] [-d dir] [-m depth] <url> ...\n"
102 "\t for documentation of all options consult the man page\n",
103 progname);
104 }
105
106 int
url_permitted(char * url)107 url_permitted(char *url)
108 {
109 if (strncasecmp(url, HTTP_PREFIX, strlen(HTTP_PREFIX)))
110 return (0);
111
112 if (regexec(&reurlinc, url, 0, NULL, 0) != 0)
113 return (0);
114
115 if (regexec(&reurlexcl, url, 0, NULL, 0) == 0)
116 return (0);
117
118 /* Use an external filter */
119 if (extmatchw != -1) {
120 char ch;
121
122 if (atomicio(write, extmatchw, url, strlen(url)) == -1)
123 err(1, "write");
124 if (atomicio(write, extmatchw, "\n", 1) == -1)
125 err(1, "write");
126
127 if (atomicio(read, extmatchr, &ch, 1) == -1)
128 err(1, "read");
129
130 if (ch != 'y')
131 return (0);
132 }
133
134 return (1);
135 }
136
137 int
img_permitted(char * url)138 img_permitted(char *url)
139 {
140 if (strncasecmp(url, HTTP_PREFIX, strlen(HTTP_PREFIX)))
141 return (0);
142
143 if (regexec(&reimginc, url, 0, NULL, 0) != 0)
144 return (0);
145
146 if (regexec(&reimgexcl, url, 0, NULL, 0) == 0)
147 return (0);
148
149 return (1);
150 }
151
152 int
image_saver(struct uri * uri)153 image_saver(struct uri *uri)
154 {
155 struct stat sb;
156 char *path;
157 char tmp[128], *p = "";
158 char *url;
159
160 url = http_make_url(&uri->url);
161
162 /*
163 * Sometimes we download something that should have been
164 * html, but is media instead.
165 */
166 if (uri->save_fd == -1 && !img_permitted(url))
167 return (-1);
168
169 if (uri->length != -1 && uri->bdlen != uri->length) {
170 snprintf(tmp, sizeof(tmp), " (%4.1f%%/%d)",
171 (float)uri->bdlen/uri->length*100, uri->length);
172 p = tmp;
173 }
174 fprintf(stdout, "%s %s%s\n",
175 uri->flags & HTTP_REQUEST_GET ? "GET" : "HEAD",
176 url, uri->flags & HTTP_REQUEST_GET ? p : "");
177
178 if (uri->flags & HTTP_REQUEST_HEAD) {
179 int minlen, maxlen;
180
181 if (uri->format == NULL) {
182 minlen = media_minlen;
183 maxlen = media_maxlen;
184 } else {
185 /* Get lengths depending on mime types */
186 minlen = conf_get_num(uri->format, "Min-Length",
187 media_minlen);
188 maxlen = conf_get_num(uri->format, "Max-Length",
189 media_maxlen);
190 }
191
192 /* See if it meets our extra constraints */
193 if ((minlen != -1 && uri->length < minlen) ||
194 (maxlen != -1 && uri->length > maxlen))
195 return (-1);
196
197 /* Re-add request as GET */
198 http_add(HTTP_REQUEST_GET, url, uri->depth);
199 return (0);
200 }
201
202 if ((path = construct_path(url, 1)) == NULL)
203 return (-1);
204 if (stat(path, &sb) != -1) {
205 if (sb.st_size >= uri->length)
206 return (-1);
207 }
208 if (uri->save_fd == -1) {
209 uri->save_fd = open(path, O_WRONLY|O_CREAT|O_TRUNC, 0644);
210 if (uri->save_fd == -1)
211 return (-1);
212 }
213
214 if (atomicio(write, uri->save_fd, uri->body, uri->bdread) == -1)
215 return (-1);
216
217 uri->bdread = 0;
218
219 return (0);
220 }
221
222 void
http_dealimage(char * url,u_short depth)223 http_dealimage(char *url, u_short depth)
224 {
225 char *path;
226
227 if (!img_permitted(url)) {
228 DFPRINTF(3, (stderr, "Reject: %s\n", url));
229 return;
230 }
231
232 /* Check if the file exists */
233 if ((path = construct_path(url, 0)) == NULL)
234 return;
235 if (access(path, F_OK) != -1)
236 return;
237
238 if (http_mark_seen(url) != 0)
239 return;
240
241 http_add(HTTP_REQUEST_HEAD, url, depth);
242 }
243
244 void
html_foundimage(void * arg,char * el,char ** attr)245 html_foundimage(void *arg, char *el, char **attr)
246 {
247 struct uri *uri = arg;
248 char *base, *normal;
249
250 if (!strcasecmp(el, "img"))
251 attr = html_attr_find(attr, "src");
252 else if (!strcasecmp(el, "body"))
253 attr = html_attr_find(attr, "background");
254
255 if (*attr == NULL)
256 return;
257
258 base = http_basename(uri);
259 if (base == NULL)
260 return;
261
262 normal = http_make_uri(base, attr[1]);
263 if (normal == NULL) {
264 DFPRINTF(3, (stderr, "Failed on %s and %s\n", base, attr[1]));
265 return;
266 }
267
268 http_dealimage(normal, uri->depth + 1);
269 }
270
271 void
html_foundanchor(void * arg,char * el,char ** attr)272 html_foundanchor(void *arg, char *el, char **attr)
273 {
274 struct uri *uri = arg;
275 char *base, *normal;
276
277 if (!strcasecmp(el, "a"))
278 attr = html_attr_find(attr, "href");
279 else if (!strcasecmp(el, "frame"))
280 attr = html_attr_find(attr, "src");
281 else if (!strcasecmp(el, "iframe"))
282 attr = html_attr_find(attr, "src");
283 else if (!strcasecmp(el, "area"))
284 attr = html_attr_find(attr, "href");
285 else if (!strcasecmp(el, "base"))
286 attr = html_attr_find(attr, "link");
287 else if (!strcasecmp(el, "link")) {
288 char **p;
289 p = html_attr_find(attr, "href");
290 if (*p != NULL)
291 attr = p;
292 else
293 attr = html_attr_find(attr, "rel");
294 } else
295 return;
296
297 if (*attr == NULL)
298 return;
299
300 base = http_basename(uri);
301 if (base == NULL)
302 return;
303
304 normal = http_make_uri(base, attr[1]);
305 if (normal == NULL) {
306 DFPRINTF(3, (stderr, "Failed on %s and %s\n", base, attr[1]));
307 return;
308 }
309
310 /* If this is an image, do to the image dealer */
311 if (isMedia(normal))
312 return (http_dealimage(normal, uri->depth + 1));
313
314 if (maxdepth != -1 && uri->depth + 1 > maxdepth) {
315 DFPRINTF(2, (stderr, "Max depth reached: %s\n", normal));
316 return;
317 }
318
319 /* fprintf(stdout, "Anchor: %s\n", normal); */
320 if (!url_permitted(normal)) {
321 DFPRINTF(3, (stderr, "Reject: %s\n", normal));
322 return;
323 }
324
325 if (http_mark_seen(normal) != 0)
326 return;
327
328 http_add(HTTP_REQUEST_GET, normal, uri->depth + 1);
329 }
330
331 int
html_follower(struct uri * uri)332 html_follower(struct uri *uri)
333 {
334 fprintf(stdout, "%s http://%s%s\n",
335 uri->flags & HTTP_REQUEST_GET ? "GET" : "HEAD",
336 uri->url_host, uri->url_file);
337
338 /* We should never get a head request here */
339 if (uri->flags & HTTP_REQUEST_HEAD)
340 return (0);
341
342 html_parse_setdata(parser, uri);
343 html_parser(parser, uri->body, uri->bdlen);
344
345 return (0);
346 }
347
348 /* Deal with relocations */
349
350 void
http_moved(struct uri * uri,char * location)351 http_moved(struct uri *uri, char *location)
352 {
353 if (!url_permitted(location)) {
354 DFPRINTF(3, (stderr, "Reject: %s\n", location));
355 return;
356 }
357
358 if (http_mark_seen(location) != 0)
359 return;
360
361 http_add(uri->flags & HTTP_REQUEST_GET ?
362 HTTP_REQUEST_GET : HTTP_REQUEST_HEAD,
363 location, uri->depth);
364 }
365
366 void
external_filter(char * program)367 external_filter(char *program)
368 {
369 int toext[2], fromext[2];
370 int res;
371
372 if (pipe(toext) == -1)
373 err(1, "pipe");
374 if (pipe(fromext) == -1)
375 err(1, "pipe");
376
377 res = fork();
378 if (res == -1)
379 err(1, "fork");
380 if (res == 0) {
381 /* Child */
382 dup2(toext[0], fileno(stdin));
383 close(toext[1]);
384 dup2(fromext[1], fileno(stdout));
385 close(fromext[0]);
386
387 if (execlp(program, program, NULL) == -1)
388 err(1, "execlp");
389 exit(0);
390 }
391
392 /* Parent */
393 extmatchw = toext[1];
394 extmatchr = fromext[0];
395 close(toext[0]);
396 close(fromext[1]);
397 }
398
/*
 * CONF_SAVE(w, f): evaluate f once and, if the result is not NULL, store
 * it in w; otherwise w keeps its current value.  The temporary has a
 * name unlikely to clash with the caller's variables, so passing a
 * variable called `p` as w works as expected.
 */
#define CONF_SAVE(w,f) do {						\
	char *conf_save_val_ = (f);					\
	if (conf_save_val_ != NULL)					\
		(w) = conf_save_val_;					\
} while (0)
404
405 int
main(int argc,char ** argv)406 main(int argc, char **argv)
407 {
408 extern char *optarg;
409 extern int optind;
410 char *urlinclude = "http://.*\\.citi\\.umich\\.edu";
411 char *urlexclude = "\\.(ps|gz|c|h|tar|exe|doc|pdf|ppt|txt|diff)$";
412 char *imginclude = "\\.(jpg|jpeg)";
413 char *imgexclude = "thumbs\\.";
414 char *agent = CRAWL"/"CRAWL_VERSION;
415 extern int use_robots, http_maxconnects;
416 char *state = "crawl.state";
417 char *external = NULL;
418 char *resume = NULL;
419 int ch;
420
421 progname = argv[0];
422 saveimgdir = ".";
423 timerclear(&tv_timeout);
424 tv_timeout.tv_sec = WAIT_DEFAULT;
425
426 conf_init();
427 CONF_SAVE(urlinclude, conf_get_str("General", "Url-Include"));
428 CONF_SAVE(urlexclude, conf_get_str("General", "Url-Exclude"));
429 CONF_SAVE(imginclude, conf_get_str("General", "Img-Include"));
430 CONF_SAVE(imgexclude, conf_get_str("General", "Img-Exclude"));
431 CONF_SAVE(saveimgdir, conf_get_str("General", "Img-Directory"));
432 CONF_SAVE(external, conf_get_str("General", "External-Filter"));
433 CONF_SAVE(agent, conf_get_str("HTTP", "Agent"));
434 CONF_SAVE(state, conf_get_str("HTTP", "State-File"));
435 use_robots = conf_get_num("HTTP", "Use-Robots", 1);
436 http_maxconnects = conf_get_num("HTTP", "Connections", HTTP_MAXCONNECTS);
437 maxdepth = conf_get_num("General", "Max-Depth", 0);
438 media_minlen = conf_get_num("General", "Min-Length", MEDIA_MINLEN);
439 media_maxlen = conf_get_num("General", "Max-Length", MEDIA_MAXLEN);
440
441 while ((ch = getopt(argc, argv, "t:v:u:e:i:d:m:RA:E:I:c:")) != -1)
442 switch(ch) {
443 case 't': {
444 char *p;
445 float val;
446 val = strtod(optarg, &p);
447 if (p == NULL && *p != '\0') {
448 usage();
449 exit(1);
450 }
451
452 tv_timeout.tv_sec = val;
453 tv_timeout.tv_usec = (val - (int)val) * 1000000L;
454 break;
455 }
456 case 'v':
457 debug = atoi(optarg);
458 break;
459 case 'c':
460 resume = optarg;
461 break;
462 case 'u':
463 urlinclude = optarg;
464 break;
465 case 'e':
466 urlexclude = optarg;
467 break;
468 case 'i':
469 imginclude = optarg;
470 break;
471 case 'I':
472 imgexclude = optarg;
473 break;
474 case 'd':
475 saveimgdir = optarg;
476 break;
477 case 'm':
478 maxdepth = atoi(optarg);
479 break;
480 case 'A':
481 agent = optarg;
482 break;
483 case 'R':
484 use_robots = 0;
485 break;
486 case 'E':
487 external = optarg;
488 break;
489 default:
490 usage();
491 exit(1);
492 }
493
494 argc -= optind;
495 argv += optind;
496
497 if (resume == NULL && argc < 1) {
498 usage();
499 exit(1);
500 }
501
502 if (regcomp(&reurlinc, urlinclude, REG_EXTENDED|REG_ICASE) == -1)
503 errx(1, "Error compiling regexp: '%s'\n", urlinclude);
504 if (regcomp(&reurlexcl, urlexclude, REG_EXTENDED|REG_ICASE) == -1)
505 errx(1, "Error compiling regexp: '%s'\n", urlinclude);
506 if (regcomp(&reimginc, imginclude, REG_EXTENDED|REG_ICASE) == -1)
507 errx(1, "Error compiling regexp: '%s'\n", imginclude);
508 if (regcomp(&reimgexcl, imgexclude, REG_EXTENDED|REG_ICASE) == -1)
509 errx(1, "Error compiling regexp: '%s'\n", imgexclude);
510
511 if (external != NULL)
512 external_filter(external);
513
514 db_setup("crawl.db");
515
516 event_init();
517 /* dns_init has to go before http_init because of signal handlers */
518 dns_init();
519 http_init(state);
520
521 if (resume == NULL) {
522 while (argc) {
523 if (http_add(HTTP_REQUEST_GET, argv[0], 0) != -1)
524 http_mark_seen(argv[0]);
525
526 argc--;
527 argv++;
528 }
529 } else
530 if (http_restore_state(resume) == -1)
531 exit(1);
532
533 /* Schedule connections */
534 while (http_postevent() != -1)
535 ;
536
537 if (strlen(agent))
538 http_setuseragent(agent);
539
540 http_register_dispatch("text/html", html_follower);
541 http_register_dispatch("image/", image_saver);
542 http_register_dispatch("audio/", image_saver);
543 http_register_dispatch("video/", image_saver);
544
545 /* XXX - bad cludge */
546 http_setcallback(http_movecb, http_moved);
547
548 if ((parser = html_newparser()) == NULL)
549 return (-1);
550
551 html_register_cb(parser, "img", html_foundimage);
552 html_register_cb(parser, "body", html_foundimage);
553 html_register_cb(parser, "a", html_foundanchor);
554 html_register_cb(parser, "frame", html_foundanchor);
555 html_register_cb(parser, "area", html_foundanchor);
556 html_register_cb(parser, "base", html_foundanchor);
557 html_register_cb(parser, "link", html_foundanchor);
558
559 setvbuf(stdout, NULL, _IONBF, 0);
560
561 event_dispatch();
562
563 /* Print some informative stats */
564 http_print_stats();
565 dns_print_stats();
566 dns_end();
567
568 /* Done with everything */
569 html_freeparser(parser);
570 db_close();
571
572 exit (0);
573 }
574