1 /***************************************************************************/
2 /* This code is part of WWW grabber called pavuk */
3 /* Copyright (c) 1997 - 2001 Stefan Ondrejicka */
4 /* Distributed under GPL 2 or later */
5 /***************************************************************************/
6
7 #include "config.h"
8
9 #include <assert.h>
10 #include <string.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <errno.h>
14 #include <netdb.h>
15 #include <errno.h>
16 #include <unistd.h>
17 #include <sys/socket.h>
18 #include <sys/types.h>
19 #include <sys/stat.h>
20 #include <dirent.h>
21 #include <limits.h>
22 #include <time.h>
23 #include <signal.h>
24
25 #include "url.h"
26 #include "doc.h"
27 #include "tools.h"
28 #include "html.h"
29 #include "http.h"
30 #include "ftp.h"
31 #include "myssl.h"
32 #include "abstract.h"
33 #include "recurse.h"
34 #include "mime.h"
35 #include "robots.h"
36 #include "mode.h"
37 #include "times.h"
38 #include "stats.h"
39 #include "errcode.h"
40 #include "cookie.h"
41 #include "log.h"
42 #include "gui_api.h"
43 #include "form.h"
44 #include "ainterface.h"
45 #include "gcinfo.h"
46
47 static void dump_ftp_list(dllist *);
48 static void dump_urls_list(dllist *);
49
/*
 * Leave process_document(): call doc_cleanup() on the current document,
 * free the URL string built for this pass and return the document's
 * error code.
 *
 * Expects `docu` (doc *) and `pstr` (char *) to be in scope at the point
 * of use.  The body is wrapped in do { } while(0) so that `SETNEXTURL;`
 * is a single statement and stays correct under an unbraced if/else.
 */
#define SETNEXTURL \
  do \
  { \
    doc_cleanup(docu); \
    _free(pstr); \
    return docu->errcode; \
  } while(0)
53
54 #ifdef HAVE_MT
_sigintthr(int nr)55 static void _sigintthr(int nr)
56 {
57 #ifdef I_FACE
58 if(!cfg.processing)
59 {
60 exit(0);
61 }
62 #endif
63
64 errno = EINTR;
65 cfg.stop = TRUE;
66 cfg.rbreak = TRUE;
67 }
68
/*
 * SIGQUIT handler: terminates the thread that received the signal.
 *
 * nr - signal number (not used)
 */
static void _sigquitthr(int nr)
{
  (void) nr;

  pthread_exit(NULL);
}
73 #endif
74
/*
 * Put a URL back at the tail of the global URL stack so that it gets
 * processed again later (used when its document is currently locked).
 *
 * The stack, its semaphore (multithreaded builds only) and the total URL
 * counter are all updated while the URL stack lock is held.
 *
 * urlp - URL to queue again
 */
static void reschedule_url(url * urlp)
{
  DEBUG_MISC(gettext("Rescheduling locked URL as no. %d\n"), cfg.total_cnt);

  LOCK_CFG_URLSTACK;

  cfg.urlstack = dllist_append(cfg.urlstack, (dllist_t) urlp);
#ifdef HAVE_MT
  mt_semaphore_up(&cfg.urlstack_sem);
#endif
  cfg.total_cnt++;

  UNLOCK_CFG_URLSTACK;
}
86
/*
 * Return a newly malloc()ed copy of `str` that can be placed between
 * single quotes on a shell command line: every ' is rewritten as '\''
 * (close the quote, emit an escaped quote, reopen the quote).
 *
 * Returns NULL when `str` is NULL or when the allocation fails.
 * The caller releases the result with free().
 */
static char *post_cmd_squote_escape(const char *str)
{
  const char *src;
  char *res, *dst;
  size_t extra = 0;

  if(!str)
    return NULL;

  for(src = str; *src; src++)
  {
    if(*src == '\'')
      extra += 3;
  }

  res = malloc(strlen(str) + extra + 1);
  if(!res)
    return NULL;

  for(src = str, dst = res; *src; src++)
  {
    if(*src == '\'')
    {
      memcpy(dst, "'\\''", 4);
      dst += 4;
    }
    else
    {
      *dst++ = *src;
    }
  }
  *dst = '\0';

  return res;
}

/*
 * Run the user supplied post-processing command for a document.
 *
 * The command line has the form
 *
 *   <post_cmd> '<local filename>' <1|0> '<URL>'
 *
 * where the middle argument is 1 for parsable documents and 0 otherwise.
 *
 * The filename and the URL originate from remote data, so single quotes
 * inside them are escaped before they are put between the quotes;
 * otherwise such a character would end the quoting and the rest of the
 * string would be interpreted by the shell.  For strings without a single
 * quote the resulting command is identical to the unescaped one.
 *
 * When either string is unavailable the command is not run at all: a NULL
 * in the middle of the tl_str_concat() argument list would cut the
 * command short.
 *
 * docp - document that has just been processed
 */
static void run_post_command(doc * docp)
{
  char *urlstr;
  char *quoted_url;
  char *quoted_file;

  DEBUG_MISC(gettext("Running post-processing command\n"));

  urlstr = url_to_urlstr(docp->doc_url, TRUE);
  quoted_url = post_cmd_squote_escape(urlstr);
  _free(urlstr);

  /* url_to_filename() result is not freed here, as in the rest of the file */
  quoted_file = post_cmd_squote_escape(url_to_filename(docp->doc_url, FALSE));

  if(quoted_url && quoted_file)
  {
    char *cmd;

    cmd = tl_str_concat(NULL, priv_cfg.post_cmd, " \'",
      quoted_file,
      docp->is_parsable ? "\' 1 \'" : "\' 0 \'", quoted_url, "\'", NULL);

    tl_system(cmd);

    _free(cmd);
  }

  free(quoted_file);
  free(quoted_url);
}
105
/*
 * Build a form-submission request for the nform-th HTML form of a
 * document and append it to the starting URLs.
 *
 * The field list of the request consists of duplicates of the fields
 * given in `ui` plus the default "successful" fields of the HTML form
 * whose names are not already present in that list.
 *
 * docp  - document containing the form
 * nform - index of the form inside the document
 * ui    - user supplied form description (fields, local name)
 *
 * Nothing happens when the form text cannot be located or parsed.
 */
static void add_matching_form(doc * docp, int nform, url_info * ui)
{
  int text_len;
  char *text;
  form_info *parsed;
  dllist *node;
  dllist *merged = NULL;
  dllist *defaults = NULL;
  url_info *request;

  text = form_get_text(nform, docp->contents, docp->size, &text_len);
  if(!text)
    return;

  parsed = form_parse(text, text_len);
  if(!parsed)
    return;

  /* start with private copies of the fields supplied by the user */
  for(node = ui->fields; node; node = node->next)
  {
    form_field *copy = form_field_duplicate((form_field *) node->data);

    merged = dllist_prepend(merged, (dllist_t) copy);
  }

  /* add defaults from the HTML form unless the user overrides them */
  form_get_default_successful(NULL, parsed->infos, &defaults);

  while(defaults)
  {
    form_field *ff = (form_field *) defaults->data;

    if(!dllist_find2(merged, (dllist_t) ff, form_field_compare_name))
    {
      /* ownership of the field moves to the merged list */
      merged = dllist_prepend(merged, (dllist_t) ff);
    }
    else
    {
      _free(ff->name);
      _free(ff->value);
      _free(ff);
    }

    defaults = dllist_remove_entry(defaults, defaults);
  }

  request = url_info_new(parsed->action);
  request->type = URLI_FORM;
  request->fields = merged;
  request->encoding = parsed->encoding;
  request->method = parsed->method;
  request->localname = tl_strdup(ui->localname);

  form_free(parsed);

  append_starting_url(request, docp->doc_url);

  url_info_free(request);
}
165
/*
 * Walk over the action URLs of all forms found in a document and, for
 * every HTTP/HTTPS action that equals the URL of a user supplied form
 * description (priv_cfg.formdata), queue the corresponding request.
 *
 * docp     - document the forms were found in
 * formlist - list of action URL strings, one entry per form, in the
 *            order used for form indexing
 */
static void add_matching_forms(doc * docp, dllist * formlist)
{
  dllist *fptr;
  int nform = 0;

  for(fptr = formlist; fptr; fptr = fptr->next, nform++)
  {
    char *action = (char *) fptr->data;
    dllist *uptr;
    url *urlp;
    int is_http;

    /* the parsed URL is needed only to learn the protocol */
    urlp = url_parse(action);
    assert(urlp->type != URLT_FROMPARENT);

    is_http = (urlp->type == URLT_HTTP) || (urlp->type == URLT_HTTPS);

    free_deep_url(urlp);
    _free(urlp);

    if(!is_http)
      continue;

    for(uptr = priv_cfg.formdata; uptr; uptr = uptr->next)
    {
      url_info *ui = (url_info *) uptr->data;

      if(strcmp(ui->urlstr, action) == 0)
        add_matching_form(docp, nform, ui);
    }
  }
}
199
/*
 * Download and process a single document, including retries, regets of
 * truncated transfers and following of redirections.
 *
 * docu      - document descriptor; docu->doc_url is the URL to fetch
 * check_lim - when nonzero, and the URL has a parent URL, the limiting
 *             rules (already processed, user disabled, protocol support,
 *             URL conditions, robots.txt) are checked before download
 *
 * Returns docu->errcode once processing of the document has finished
 * (successfully or not), or ERR_UNKNOWN when the loop was left because
 * cfg.stop or cfg.rbreak was raised.
 *
 * Every exit through SETNEXTURL calls doc_cleanup() on the document and
 * frees the URL string built for the current pass.
 */
int process_document(doc * docu, int check_lim)
{
  url *urlr;
  int nreget = 0, nredir = 0, pokus = 0;   /* pokus = number of retries */
  time_t atm;
  char cpom[64];
  char *pstr = NULL;
  int store_stat;
  struct stat estat;

  urlr = docu->doc_url;

  docu->check_limits = check_lim;

  _Xt_Serve;

  /* limits apply only to URLs which were reached from another URL */
  if(docu->check_limits)
    docu->check_limits = (urlr->parent_url != NULL);

  while(!cfg.stop && !cfg.rbreak)
  {
    /* reset the per-attempt state of the document */
    _free(docu->ftp_pasv_host);
    docu->errcode = ERR_NOERROR;
    docu->mime = NULL;
    docu->type_str = NULL;
    docu->doc_url = urlr;
    docu->dtime = 0L;
    docu->contents = NULL;
    docu->is_chunked = FALSE;
    docu->read_chunksize = FALSE;
    docu->read_trailer = FALSE;
    docu->ftp_fatal_err = FALSE;
    pstr = url_to_urlstr(urlr, FALSE);

    if(pokus)
      xprintf(1, gettext("retry no. %d\n"), pokus);

#ifdef HAVE_MT
    xprintf(1, gettext("URL[%2d]: %5d(%d) of %5d %s\n"), docu->threadnr + 1,
      docu->doc_nr, cfg.fail_cnt, cfg.total_cnt, pstr);
#else
    xprintf(1, gettext("URL: %5d(%d) of %5d %s\n"), docu->doc_nr,
      cfg.fail_cnt, cfg.total_cnt, pstr);
#endif

#ifdef I_FACE
    if(cfg.xi_face)
    {
      gui_set_doccounter();

      gui_set_url(pstr);

      gui_set_status(gettext("Starting download"));
    }
#endif
    /*** to be able to revisit moved documents ***/
    /*** especially for authorization purposes ***/
    if((urlr->status & URL_PROCESSED) && urlr->moved_to && nredir)
    {
      urlr->status &= ~URL_PROCESSED;
    }

    if(docu->check_limits)
    {
      cond_info_t condp;

      condp.level = 2;
      condp.urlnr = docu->doc_nr;
      condp.size = 0;
      condp.time = 0L;
      condp.mimet = NULL;
      condp.full_tag = NULL;
      condp.params = NULL;
      condp.html_doc = NULL;
      condp.html_doc_offset = 0;
      condp.tag = NULL;
      condp.attrib = NULL;

      if(urlr->status & URL_PROCESSED)
      {
        xprintf(1, gettext("Already processed\n"));
        docu->errcode = ERR_PROCESSED;
        SETNEXTURL;
      }

      if(urlr->status & URL_USER_DISABLED)
      {
        xprintf(1, gettext("Disallowed by user\n"));
        docu->errcode = ERR_UDISABLED;
        SETNEXTURL;
      }

      /* unsupported protocol, FTP directory while FTP directories are
         not allowed, or URL rejected by the limiting conditions */
      if(!prottable[urlr->type].supported || (urlr->parent_url
          && (urlr->type == URLT_FTP || urlr->type == URLT_FTPS)
          && urlr->p.ftp.dir && !cfg.condition.ftpdir)
        || (urlr->parent_url && !url_append_condition(urlr, &condp)))
      {
        xprintf(1, gettext("Disallowed by rules\n"));

        urlr->status |= URL_REJECTED;
        docu->errcode = ERR_RDISABLED;
        SETNEXTURL;
      }

      gui_set_status(gettext("Checking \"robots.txt\""));

      if(!robots_check(urlr))
      {
        xprintf(1, gettext("Disallowed by \"robots.txt\"\n"));
        urlr->status |= URL_REJECTED;
        docu->errcode = ERR_RDISABLED;
        SETNEXTURL;
      }
    }

    if(cfg.mode == MODE_FTPDIR &&
      (urlr->type != URLT_FTP && urlr->type != URLT_FTPS))
    {
      xprintf(1,
        gettext("This URL type is not supported with ftpdir mode\n"));

      urlr->status |= URL_REJECTED;
      docu->errcode = ERR_RDISABLED;
      SETNEXTURL;
    }

    _Xt_Serve;

    if(cfg.mode == MODE_SYNC)
    {
      char *pp = url_to_filename(urlr, TRUE);

      if(!stat(pp, &estat) && !S_ISDIR(estat.st_mode))
      {
        atm = time(NULL) - 86400 * cfg.ddays;
        /*
           pro: We do not want the message
           "No transfer - file not expired"
           if the server's clock is ahead of our clock.
           If no parameter cfg.ddays is given, then
           we do not compare the file modification times.
         */
        if(cfg.ddays == 0 || estat.st_mtime < atm)
          docu->dtime = estat.st_mtime;
        else
        {
          xprintf(1, gettext("No transfer - file not expired\n"));
          urlr->status |= URL_REJECTED;
          docu->errcode = ERR_RDISABLED;
          SETNEXTURL;
        }
        urlr->status |= URL_ISLOCAL;
        docu->origsize = estat.st_size;
      }
    }

    if(cfg.show_time)
    {
      atm = time(NULL);
      LOCK_TIME;
      strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm));
      UNLOCK_TIME;
      xprintf(1, gettext("Starting time :  %s\n"), cpom);
    }

#ifdef I_FACE
    if(cfg.stop || cfg.rbreak)
    {
      _free(pstr);
      break;
    }
#endif

    _Xt_Serve;

    /* symbolic link known from an FTP listing; this also has to cover
       FTPS URLs (the condition used to test URLT_FTP twice) */
    if((urlr->type == URLT_FTP || urlr->type == URLT_FTPS)
      && urlr->extension &&
      ((ftp_url_extension *) urlr->extension)->type == FTP_TYPE_L &&
      ((ftp_url_extension *) urlr->extension)->slink)
    {
      if(cfg.retrieve_slink)
      {
        /** need to kill extension, because we must **/
        /** guess the file type beside the symlink  **/
        ftp_url_ext_free(urlr->extension);
        urlr->extension = NULL;
      }
      else
      {
        ftp_make_symlink(urlr);
        urlr->status |= URL_PROCESSED;
        docu->errcode = ERR_NOERROR;
        SETNEXTURL;
      }
    }

    gui_set_status(gettext("Starting download"));

    if(doc_download(docu, FALSE, FALSE))
    {
      /* ----- download failed ----- */
      if(cfg.show_time)
      {
        atm = time(NULL);
        LOCK_TIME;
        strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm));
        UNLOCK_TIME;
        xprintf(1, gettext("Ending time :  %s\n"), cpom);
      }

      _Xt_Serve;
      doc_remove_lock(docu);
      _free(docu->contents);

      report_error(docu, gettext("download"));
      /* the format contains %d, so the error code has to be passed */
      DEBUG_USER("Error status code - (%d)\n", docu->errcode);

      /*
       * Errors which restart the loop: regets (while nreget allows),
       * redirections (while nredir allows) and authorization failures.
       * NOTE(review): nreget is incremented below only for the two
       * *_TRUNC errors, so the other reget errors look unbounded while
       * nreget < cfg.nreget - confirm that this is intended.
       */
      if((nreget < cfg.nreget &&
          (docu->errcode == ERR_HTTP_TRUNC ||
            docu->errcode == ERR_FTP_TRUNC ||
            docu->errcode == ERR_LOW_TRANSFER_RATE ||
            docu->errcode == ERR_HTTP_FAILREGET ||
            docu->errcode == ERR_HTTP_TIMEOUT ||
            docu->errcode == ERR_HTTP_GW_TIMEOUT)) ||
        (nredir < cfg.nredir &&
          docu->errcode == ERR_HTTP_REDIR) ||
        (docu->errcode == ERR_HTTP_AUTH) ||
        (docu->errcode == ERR_HTTP_PROXY_AUTH))
      {
        if(docu->errcode == ERR_HTTP_REDIR)
        {
          urlr->status |= URL_PROCESSED;
          if((urlr->moved_to->status & URL_PROCESSED) &&
            (!urlr->moved_to->moved_to))
          {
            /* redirection target was handled already */
            SETNEXTURL;
          }
          else
          {
#ifdef I_FACE
            if(cfg.xi_face)
              gui_tree_set_icon_for_doc(docu);
#endif
            /* continue with the redirection target */
            urlr = urlr->moved_to;
          }
        }

        if(docu->errcode == ERR_HTTP_TRUNC)
        {
          urlr->status |= URL_TRUNCATED;
          _free(docu->etag);

          /* remember a validator of the partial content for the reget */
          docu->etag = get_mime_param_val_str("ETag:", docu->mime);
          if(!docu->etag)
            docu->etag =
              get_mime_param_val_str("Content-Location:", docu->mime);
          if(!docu->etag)
            docu->etag = get_mime_param_val_str("Last-Modified", docu->mime);
        }

        if(docu->errcode == ERR_HTTP_AUTH)
        {
          docu->doc_url->status |= URL_PROCESSED;
          docu->doc_url->status |= URL_ERR_REC;
          SETNEXTURL;
        }

        if(docu->errcode == ERR_HTTP_PROXY_AUTH)
        {
          docu->doc_url->status |= URL_PROCESSED;
          docu->doc_url->status |= URL_ERR_REC;
          SETNEXTURL;
        }

        _free(docu->mime);
        _free(docu->type_str);

        nreget += (docu->errcode == ERR_HTTP_TRUNC ||
          docu->errcode == ERR_FTP_TRUNC) && cfg.mode != MODE_SREGET;
        nredir += (docu->errcode == ERR_HTTP_REDIR);
        _free(pstr);
        continue;
      }

      /* errors which are retried up to cfg.nretry times */
      if(docu->errcode == ERR_FTP_UNKNOWN ||
        docu->errcode == ERR_FTP_CONNECT ||
        docu->errcode == ERR_FTP_DATACON ||
        docu->errcode == ERR_FTPS_CONNECT ||
        docu->errcode == ERR_FTPS_DATASSLCONNECT ||
        docu->errcode == ERR_HTTP_UNKNOWN ||
        docu->errcode == ERR_HTTP_CONNECT ||
        docu->errcode == ERR_HTTP_SNDREQ ||
        docu->errcode == ERR_HTTP_SNDREQDATA ||
        docu->errcode == ERR_HTTP_RCVRESP ||
        docu->errcode == ERR_HTTP_SERV ||
        docu->errcode == ERR_HTTP_TIMEOUT ||
        docu->errcode == ERR_HTTP_PROXY_CONN ||
        docu->errcode == ERR_HTTPS_CONNECT ||
        docu->errcode == ERR_READ ||
        docu->errcode == ERR_ZERO_SIZE ||
        docu->errcode == ERR_GOPHER_CONNECT ||
        docu->errcode == ERR_PROXY_CONNECT)
      {
        urlr->status |= URL_ERR_REC;
        pokus++;
        /*** retry only when allowed ***/
        if(pokus >= cfg.nretry)
        {
          urlr->status |= URL_PROCESSED;
          SETNEXTURL;
        }
        _free(pstr);
        _free(docu->mime);
        _free(docu->type_str);
        continue;
      }
      else if(docu->errcode == ERR_LOCKED)
      {
        /* somebody else holds the document - try it again later */
        if(!cfg.urlstack)
        {
          xprintf(1,
            gettext("last document locked -> sleeping for 5 seconds\n"));
          tl_sleep(5);
        }
        reschedule_url(urlr);
        SETNEXTURL;
      }
      else if(docu->errcode == ERR_BIGGER ||
        docu->errcode == ERR_SMALLER ||
        docu->errcode == ERR_NOMIMET ||
        docu->errcode == ERR_OUTTIME || docu->errcode == ERR_SCRIPT_DISABLED)
      {
        urlr->status |= URL_PROCESSED;
        urlr->status |= URL_ERR_REC;
        SETNEXTURL;
      }
      else
      {
        /*** remove improper documents if required ***/
        if((cfg.remove_old &&
            (cfg.mode == MODE_SYNC ||
              cfg.mode == MODE_MIRROR)) &&
          (((docu->errcode == ERR_FTP_GET ||
                docu->errcode == ERR_FTP_BDIR ||
                docu->errcode == ERR_FTP_NODIR) &&
              docu->ftp_respc == 550) ||
            docu->errcode == ERR_HTTP_NFOUND ||
            docu->errcode == ERR_HTTP_GONE))
        {
          doc_remove(docu->doc_url);
        }

        urlr->status |= URL_ERR_UNREC;
        urlr->status |= URL_PROCESSED;

        SETNEXTURL;
      }
    }

    /* ----- download succeeded ----- */
    _Xt_Serve;

    if(urlr->status & URL_TRUNCATED)
      urlr->status &= ~URL_TRUNCATED;

    if(urlr->status & URL_ERR_REC)
      urlr->status &= ~URL_ERR_REC;

    if(cfg.show_time)
    {
      atm = time(NULL);
      LOCK_TIME;
      strftime(cpom, sizeof(cpom), "%H:%M:%S", localtime(&atm));
      UNLOCK_TIME;
      xprintf(1, gettext("Ending time :  %s\n"), cpom);
    }

    report_error(docu, gettext("download"));

    _Xt_Serve;

    if(docu->contents)
    {
      if(docu->is_parsable)
      {
        dllist *formlist = NULL;
        dllist *urls;

        gui_set_status(gettext("Relocating and scanning HTML document"));

        /* form actions are collected only when form data was supplied */
        urls =
          html_process_document(docu, priv_cfg.formdata ? &formlist : NULL);

        _Xt_Serve;

        if(urls && cfg.dump_urlfd >= 0)
        {
          dump_urls_list(urls);
        }

        if(priv_cfg.formdata && formlist)
        {
          add_matching_forms(docu, formlist);
          while(formlist)
          {
            free((void *) formlist->data);
            formlist = dllist_remove_entry(formlist, formlist);
          }
        }

        if(cfg.mode != MODE_SREGET &&
          cfg.mode != MODE_FTPDIR && !(docu->doc_url->status & URL_NORECURSE))
        {
          gui_tree_add_start();
          cat_links_to_url_list(urls);
          gui_tree_add_end();
        }
        else if(cfg.mode == MODE_FTPDIR)
        {
          dump_ftp_list(urls);
        }
        else
        {
          /* links are not followed - just dispose of them */
          for(; urls; urls = dllist_remove_entry(urls, urls))
          {
            free_deep_url((url *) urls->data);
            free((url *) urls->data);
          }
        }

        _Xt_Serve;
      }

      store_stat = 0;

      if(cfg.dumpfd >= 0 && cfg.dump_after)
      {
        bufio *fd;

        gui_set_status(gettext("Dumping processed document"));
        LOCK_DUMPFD;
        fd = bufio_dupfd(cfg.dumpfd);

        if(docu->mime && cfg.dump_resp)
          bufio_write(fd, docu->mime, strlen(docu->mime));

        bufio_write(fd, docu->contents, docu->size);

        bufio_close(fd);
        UNLOCK_DUMPFD;
      }
      else if((docu->doc_url->type != URLT_FILE) &&
        !(docu->doc_url->status & URL_REDIRECT) &&
        (docu->errcode != ERR_HTTP_ACTUAL) &&
        (docu->errcode != ERR_FTP_ACTUAL) &&
        (cfg.mode != MODE_NOSTORE) &&
        (cfg.dumpfd < 0) && (cfg.mode != MODE_FTPDIR))
      {
        gui_set_status(gettext("Storing document"));

        store_stat = doc_store(docu, TRUE);

        if(store_stat)
        {
          xprintf(1, gettext("Store failed\n"));
          urlr->status &= ~URL_ERR_REC;
        }
      }

      _Xt_Serve;

      if(priv_cfg.post_cmd)
        run_post_command(docu);

      doc_remove_lock(docu);

      doc_update_parent_links(docu);
    }
    else
    {
      if(priv_cfg.post_cmd)
        run_post_command(docu);

      doc_remove_lock(docu);

      doc_update_parent_links(docu);
    }

    urlr->status |= URL_DOWNLOADED;
    urlr->status |= URL_PROCESSED;
    SETNEXTURL;
  }
  return ERR_UNKNOWN;
}
692
693 #ifdef I_FACE
/*
 * Download one single document on request of the interactive interface.
 *
 * The URL is processed with process_document() without checking of the
 * limiting rules.  In multithreaded builds the calling thread first gets
 * its own copy of the private configuration, SIGINT/SIGQUIT handlers and
 * the thread specific keys, and is registered as thread number 0.
 *
 * urlp - URL of the document to download
 *
 * Returns the value returned by process_document().
 */
int download_single_doc(url * urlp)
{
  int rv;
  doc docu;
  global_connection_info con_info;
#if defined(HAVE_MT) && defined(I_FACE)
  _config_struct_priv_t privcfg;

#if defined (__OSF__) || defined (__osf__)
#define __builtin_try
#define __builtin_finally
#endif

  /* thread private configuration, released by the cleanup handler */
  privcfg_make_copy(&privcfg);
  pthread_setspecific(cfg.privcfg_key, (void *) (&privcfg));
  pthread_cleanup_push((void *) privcfg_free, (void *) (&privcfg));
#endif

  gui_start_download(FALSE);

#ifdef HAVE_MT
  {
    sigset_t sigs;

    sigemptyset(&sigs);
    sigaddset(&sigs, SIGINT);
    sigaddset(&sigs, SIGQUIT);
    pthread_sigmask(SIG_UNBLOCK, &sigs, NULL);

    signal(SIGINT, _sigintthr);
    signal(SIGQUIT, _sigquitthr);
  }

  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
  pthread_setspecific(cfg.thrnr_key, (void *) 0);
  DEBUG_MTTHR("starting thread(%ld) %d\n", pthread_self(), 0);

  cfg.allthreadsnr = 0;
  gui_mt_thread_start(cfg.allthreadsnr);
#endif

  /* start with clean break flags and mark processing as running */
  cfg.rbreak = FALSE;
  cfg.stop = FALSE;
  cfg.processing = TRUE;

  doc_init(&docu, urlp);
#ifdef HAVE_MT
  docu.threadnr = 0;
  pthread_setspecific(cfg.currdoc_key, (void *) NULL);
  pthread_setspecific(cfg.herrno_key, (void *) (&(docu.__herrno)));
#endif
  rv = process_document(&docu, FALSE);

  /* take over the connection data of the document and dispose of it */
  init_global_connection_data(&con_info);
  save_global_connection_data(&con_info, &docu);
  kill_global_connection_data(&con_info);

  cfg.processing = FALSE;

  cfg.rbreak = FALSE;
  cfg.stop = FALSE;

#if defined(HAVE_MT) && defined(I_FACE)
  pthread_cleanup_pop(TRUE);
#endif
#ifdef HAVE_MT
  doc_finish_processing(&docu);
  cfg.allthreadsnr = 0;
  gui_mt_thread_end(0);
#endif
  gui_beep();
  gui_set_msg(gettext("Done"), 0);

  return rv;
}
769 #endif
770
/*********************************************/
/* recursive traversal of the document tree  */
/*********************************************/
775 #ifdef HAVE_MT
_recurse(int thnr)776 static void _recurse(int thnr)
777 #else
778 void recurse(int thnr)
779 #endif
780 {
781 bool_t rbreaksave, stopsave;
782
783 global_connection_info con_info;
784
785 if(cfg.urlstack == NULL)
786 return;
787
788 init_global_connection_data(&con_info);
789
790 /**** obsluzenie vsetkych URL v zozname ****/
791 /**** FIXME: Translate me! ****/
792 while(cfg.urlstack && !cfg.stop)
793 {
794 doc docu;
795 url *urlp;
796
797 LOCK_CFG_URLSTACK;
798 if(cfg.urlstack)
799 {
800 urlp = (url *) cfg.urlstack->data;
801 cfg.urlstack = dllist_remove_entry(cfg.urlstack, cfg.urlstack);
802 #ifdef HAVE_MT
803 mt_semaphore_decrement(&cfg.urlstack_sem);
804 #endif
805 UNLOCK_CFG_URLSTACK;
806 }
807 else
808 {
809 UNLOCK_CFG_URLSTACK;
810 break;
811 }
812
813 doc_init(&docu, urlp);
814
815 #ifdef HAVE_MT
816 docu.threadnr = thnr;
817 pthread_setspecific(cfg.currdoc_key, (void *) (&docu));
818 pthread_setspecific(cfg.herrno_key, (void *) (&(docu.__herrno)));
819 #endif
820
821 LOCK_DCNT;
822 cfg.docnr++;
823 docu.doc_nr = cfg.docnr;
824 UNLOCK_DCNT;
825
826 restore_global_connection_data(&con_info, &docu);
827
828 process_document(&docu, TRUE);
829
830 save_global_connection_data(&con_info, &docu);
831
832 #ifdef HAVE_MT
833 doc_finish_processing(&docu);
834 #endif
835
836 if(docu.errcode == ERR_QUOTA_FS ||
837 docu.errcode == ERR_QUOTA_TRANS ||
838 docu.errcode == ERR_QUOTA_TIME || cfg.rbreak)
839 {
840 LOCK_CFG_URLSTACK;
841 cfg.docnr--;
842 cfg.urlstack = dllist_prepend(cfg.urlstack, (dllist_t) urlp);
843 #ifdef HAVE_MT
844 mt_semaphore_up(&cfg.urlstack_sem);
845 #endif
846 UNLOCK_CFG_URLSTACK;
847 break;
848 }
849 }
850 #if defined(I_FACE) && !defined(HAVE_MT)
851 if(cfg.xi_face)
852 {
853 gui_set_status(gettext("Done"));
854 }
855 #endif
856
857 #ifdef I_FACE
858 if(cfg.xi_face)
859 gui_set_doccounter();
860 #endif
861
862 stopsave = cfg.stop;
863 rbreaksave = cfg.rbreak;
864 cfg.stop = FALSE;
865 cfg.rbreak = FALSE;
866
867 kill_global_connection_data(&con_info);
868
869 if(cfg.update_cookies)
870 {
871 cookie_update_file(TRUE);
872 }
873
874 cfg.stop = stopsave;
875 cfg.rbreak = rbreaksave;
876
877 if(!cfg.rbreak && !cfg.stop && cfg.stats_file)
878 {
879 stats_fill_spage(cfg.stats_file, NULL);
880 }
881 }
882
883 #ifdef HAVE_MT
_recurse_thrd(int thrnr)884 static void _recurse_thrd(int thrnr)
885 {
886 bool_t init = (thrnr == 0);
887 #ifdef I_FACE
888 _config_struct_priv_t privcfg;
889 #endif
890
891 {
892 sigset_t smask;
893
894 sigemptyset(&smask);
895 sigaddset(&smask, SIGINT);
896 sigaddset(&smask, SIGQUIT);
897 pthread_sigmask(SIG_UNBLOCK, &smask, NULL);
898
899 signal(SIGINT, _sigintthr);
900 signal(SIGQUIT, _sigquitthr);
901 }
902
903 pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
904 pthread_setspecific(cfg.currdoc_key, (void *) NULL);
905 pthread_setspecific(cfg.thrnr_key, (void *) thrnr);
906 DEBUG_MTTHR("starting thread(%ld) %d\n", pthread_self(), thrnr);
907
908 #ifdef I_FACE
909 privcfg_make_copy(&privcfg);
910 pthread_setspecific(cfg.privcfg_key, (void *) (&privcfg));
911 pthread_cleanup_push((void *) privcfg_free, (void *) (&privcfg));
912 #endif
913
914 for(; !cfg.rbreak && !cfg.stop;)
915 {
916 int v;
917
918 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
919 DEBUG_MTTHR("thread %d awaking\n", thrnr);
920
921 _recurse(thrnr);
922 init = FALSE;
923 gui_clear_status();
924 DEBUG_MTTHR("thread %d sleeping\n", thrnr);
925 gui_set_status(gettext("Sleeping ..."));
926 pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
927 mt_semaphore_up(&cfg.nrunning_sem);
928 /* UN-critical section */
929
930 while(!cfg.stop && !cfg.rbreak &&
931 (v = mt_semaphore_timed_wait(&cfg.urlstack_sem, 400)) < 0);
932
933 mt_semaphore_decrement(&cfg.nrunning_sem);
934 }
935 #ifdef I_FACE
936 pthread_cleanup_pop(TRUE);
937 #endif
938 DEBUG_MTTHR("thread %d exiting\n", thrnr);
939 gui_set_status(gettext("Exiting ..."));
940 pthread_exit(NULL);
941 }
942
recurse(int dumb)943 void recurse(int dumb)
944 {
945 pthread_attr_t thrdattr;
946 int i;
947 int num = cfg.nthr;
948 sigset_t smask;
949
950 sigemptyset(&smask);
951 sigaddset(&smask, SIGINT);
952 sigaddset(&smask, SIGQUIT);
953 pthread_sigmask(SIG_UNBLOCK, &smask, NULL);
954
955 signal(SIGQUIT, _sigquitthr);
956
957 pthread_attr_init(&thrdattr);
958 pthread_attr_setscope(&thrdattr, PTHREAD_SCOPE_SYSTEM);
959 pthread_attr_setstacksize(&thrdattr, MT_STACK_SIZE);
960 mt_semaphore_init(&cfg.nrunning_sem);
961
962 if(num <= 0)
963 num = 1;
964
965 cfg.allthreadsnr = 0;
966 cfg.allthreads = _malloc(num * sizeof(pthread_t));
967
968 mt_semaphore_decrement(&cfg.urlstack_sem);
969
970 for(i = 0; i < num; i++)
971 {
972 if(!pthread_create(&(cfg.allthreads[cfg.allthreadsnr]),
973 &thrdattr, (void *) _recurse_thrd, (void *) cfg.allthreadsnr))
974 {
975 cfg.allthreadsnr++;
976 gui_mt_thread_start(cfg.allthreadsnr);
977 mt_semaphore_decrement(&cfg.nrunning_sem);
978 }
979 else
980 {
981 char pom[100];
982 sprintf(pom, "Create downloading thread %d", i);
983 xperror(pom);
984 }
985 if(cfg.rbreak || cfg.stop)
986 break;
987 }
988
989 while(!cfg.stop && !cfg.rbreak &&
990 mt_semaphore_timed_down(&cfg.nrunning_sem, 500) < 0);
991
992 cfg.stop = TRUE;
993
994 tl_msleep(300);
995
996 for(i = 0; i < cfg.allthreadsnr; i++)
997 {
998 /*
999 pthread_cancel(cfg.allthreads[i]);
1000 pthread_kill(cfg.allthreads[i], SIGQUIT);
1001 */
1002 pthread_join(cfg.allthreads[i], NULL);
1003 }
1004
1005 mt_semaphore_destroy(&cfg.nrunning_sem);
1006 _free(cfg.allthreads);
1007 cfg.allthreadsnr = 0;
1008 gui_mt_thread_end(0);
1009 }
1010 #endif
1011
/*
 * Print the entries of an FTP directory listing and dispose of the list.
 *
 * For every FTP/FTPS URL that is not an inline object and has no
 * duplicate later in the list, the last path component is printed
 * together with the size (regular files), the link target (symbolic
 * links) or a trailing slash (directories) when that information is
 * attached to the URL.
 *
 * All URLs and list entries are freed, so the list must not be used by
 * the caller afterwards.
 *
 * urllst - list of url structures found in the listing
 */
static void dump_ftp_list(dllist * urllst)
{
  dllist *ptr = urllst;

  while(ptr)
  {
    url *urlp = (url *) ptr->data;
    void *dupl;

    dupl = dllist_find2(ptr->next, (dllist_t) urlp, dllist_url_compare);

    if(!dupl && !(urlp->status & URL_INLINE_OBJ) &&
      (urlp->type == URLT_FTP || urlp->type == URLT_FTPS))
    {
      char *p, *pp;

      p = url_get_path(urlp);
      pp = strrchr(p, '/');
      if(pp)
      {
        pp++;
        if(!*pp)
        {
          /*
           * The path ends with '/': step back to the beginning of the
           * last component without ever moving in front of the string
           * (which happened for the path "/").
           */
          pp--;
          while(pp > p && *(pp - 1) != '/')
            pp--;
        }
        if(urlp->extension)
        {
          ftp_url_extension *fe = urlp->extension;

          if(fe->type == FTP_TYPE_F)
            xprintf(1, gettext("\t%s    (%d bytes)\n"), pp, fe->size);
          else if(fe->type == FTP_TYPE_L)
            xprintf(1, "\t%s    -> %s\n", pp, fe->slink);
          else if(fe->type == FTP_TYPE_D)
            xprintf(1, "\t%s/\n", pp);
        }
        else
          xprintf(1, "\t%s\n", pp);
      }
    }
    free_deep_url(urlp);
    free(urlp);
    ptr = dllist_remove_entry(ptr, ptr);
  }
}
1060
/*
 * Write the URLs of a list to the descriptor cfg.dump_urlfd, one URL per
 * line.  A URL that occurs again later in the list is skipped at its
 * earlier position, so every URL is written once.
 *
 * The list itself is left untouched.  The whole output is produced while
 * the URL dump lock is held.
 *
 * urls - list of url structures
 */
static void dump_urls_list(dllist * urls)
{
  dllist *node;

  LOCK_DUMPURLS;
  for(node = urls; node; node = node->next)
  {
    char *ustr;

    /* a later duplicate will take care of this URL */
    if(dllist_find2(node->next, (dllist_t) node->data, dllist_url_compare))
      continue;

    ustr = url_to_urlstr((url *) node->data, FALSE);
    if(!ustr)
      continue;

    write(cfg.dump_urlfd, ustr, strlen(ustr));
    write(cfg.dump_urlfd, "\n", 1);
    free(ustr);
  }
  UNLOCK_DUMPURLS;
}
1086
get_urls_to_resume(char * dirname)1087 void get_urls_to_resume(char *dirname)
1088 {
1089 DIR *dir;
1090 struct dirent *dent;
1091 char next_dir[PATH_MAX];
1092 struct stat estat;
1093 url *purl;
1094
1095 if(!(dir = opendir(dirname)))
1096 {
1097 xperror(dirname);
1098 return;
1099 }
1100
1101 gui_set_msg(gettext("Searching for files to resume"), 0);
1102
1103 while((dent = readdir(dir)))
1104 {
1105 _Xt_Serve;
1106
1107 snprintf(next_dir, sizeof(next_dir), "%s/%s", dirname, dent->d_name);
1108 if(!strcmp(dent->d_name, "."))
1109 continue;
1110 if(!strcmp(dent->d_name, ".."))
1111 continue;
1112 if(lstat(next_dir, &estat))
1113 {
1114 xperror(next_dir);
1115 continue;
1116 }
1117
1118 if(S_ISDIR(estat.st_mode))
1119 {
1120 if(!strcmp(dent->d_name, ".pavuk_info") && cfg.enable_info)
1121 continue;
1122
1123 get_urls_to_resume(next_dir);
1124 }
1125 else if(!strncmp(".in_", dent->d_name, 4))
1126 {
1127 snprintf(next_dir, sizeof(next_dir), "%s/%s", dirname, dent->d_name + 4);
1128 if((purl = filename_to_url(next_dir)))
1129 {
1130 if(cfg.mode != MODE_MIRROR)
1131 {
1132 xprintf(1, gettext("Adding %s to resume list\n"), next_dir);
1133 }
1134 purl->status |= URL_ISSTARTING;
1135 url_set_filename(purl, tl_strdup(next_dir));
1136 append_url_to_list(purl);
1137 }
1138 }
1139
1140 #ifdef I_FACE
1141 if(cfg.xi_face && (cfg.rbreak || cfg.stop))
1142 break;
1143 #endif
1144 }
1145
1146 closedir(dir);
1147 }
1148
/*
 * Search a directory tree for local copies of documents and collect the
 * URLs of these documents for synchronization with the remote servers.
 *
 * - Directories are searched recursively (".pavuk_info" is skipped when
 *   cfg.enable_info is set).  A directory which maps to an FTP URL is
 *   itself added as an FTP directory when cfg.store_index is not set.
 * - ".lock" files are ignored when cfg.enable_info is set.
 * - Files named ".in_<name>" (interrupted transfers) are added under the
 *   name "<name>".
 * - All other files are added under their own name.
 *
 * Path names that do not fit into the PATH_MAX sized work buffer are
 * reported with ENAMETOOLONG and skipped instead of being truncated or
 * overflowing the buffer.
 *
 * dirname - directory to search
 * list    - in/out: list to which the found URLs are prepended
 */
void get_urls_to_synchronize(char *dirname, dllist ** list)
{
  DIR *dir;
  struct dirent *dent;
  char next_dir[PATH_MAX];
  struct stat estat;
  url *purl;


  if(!(dir = opendir(dirname)))
  {
    xperror(dirname);
    return;
  }

  gui_set_msg(gettext("Searching for documents to synchronize"), 0);

  while((dent = readdir(dir)))
  {
    int plen;

    _Xt_Serve;

    if(!strcmp(dent->d_name, "."))
      continue;
    if(!strcmp(dent->d_name, ".."))
      continue;

    plen = snprintf(next_dir, sizeof(next_dir), "%s/%s", dirname,
      dent->d_name);
    if(plen < 0 || (size_t) plen >= sizeof(next_dir))
    {
      /* a truncated path would refer to a different file */
      errno = ENAMETOOLONG;
      xperror(next_dir);
      continue;
    }

    if(lstat(next_dir, &estat))
    {
      xperror(next_dir);
      continue;
    }

    if(S_ISDIR(estat.st_mode))
    {
      size_t dlen;

      if(!strcmp(dent->d_name, ".pavuk_info") && cfg.enable_info)
        continue;

      /* room is needed for the trailing '/' and the terminating NUL */
      dlen = strlen(next_dir);
      if(dlen + 2 > sizeof(next_dir))
      {
        errno = ENAMETOOLONG;
        xperror(next_dir);
        continue;
      }
      next_dir[dlen] = '/';
      next_dir[dlen + 1] = '\0';

      if((purl = filename_to_url(next_dir)) &&
        purl->type == URLT_FTP && !cfg.store_index)
      {
        purl->status |= URL_ISSTARTING;
        purl->extension = ftp_url_ext_new(FTP_TYPE_D, -1, -1, NULL, 0);
        url_set_filename(purl,
          tl_str_concat(NULL, next_dir, priv_cfg.index_name, NULL));
        *list = dllist_prepend(*list, (dllist_t) purl);
      }
      else if(purl)
      {
        free_deep_url(purl);
        _free(purl);
      }

      /* strip the trailing '/' again before descending */
      next_dir[dlen] = '\0';

      get_urls_to_synchronize(next_dir, list);
    }
    else if(cfg.enable_info && !strcmp(dent->d_name, ".lock"))
    {
      /* do nothing */
      continue;
    }
    else if(!strncmp(".in_", dent->d_name, 4))
    {
      /* shorter than the path checked above, so it always fits */
      snprintf(next_dir, sizeof(next_dir), "%s/%s", dirname, dent->d_name + 4);
      if((purl = filename_to_url(next_dir)))
      {
        char *ustr;

        ustr = url_to_urlstr(purl, FALSE);
        if(cfg.mode != MODE_MIRROR)
        {
          xprintf(1, gettext("Adding file %s to sync list as URL %s\n"),
            next_dir, ustr);
        }
        _free(ustr);
        if(purl->type == URLT_FTP)
        {
          int tp;

          if(purl->p.ftp.dir)
            tp = FTP_TYPE_D;
          else
            tp = FTP_TYPE_F;
          purl->extension = ftp_url_ext_new(tp, -1, -1, NULL, 0);
        }
        purl->status |= URL_ISSTARTING;

        url_set_filename(purl, tl_strdup(next_dir));
        *list = dllist_prepend(*list, (dllist_t) purl);
      }
    }
    else
    {
      if((purl = filename_to_url(next_dir)))
      {
        char *ustr;

        ustr = url_to_urlstr(purl, FALSE);
        if(cfg.mode != MODE_MIRROR)
        {
          xprintf(1, gettext("Adding file %s to sync list as URL %s\n"),
            next_dir, ustr);
        }
        _free(ustr);

        if(purl->type == URLT_FTP)
        {
          int tp;

          if(purl->p.ftp.dir)
            tp = FTP_TYPE_D;
#ifdef S_ISLNK
          else if(S_ISLNK(estat.st_mode))
            tp = FTP_TYPE_L;
#endif
          else
            tp = FTP_TYPE_F;
          purl->extension = ftp_url_ext_new(tp, -1, -1, NULL, 0);
        }

        purl->status |= URL_ISSTARTING;
        url_set_filename(purl, tl_strdup(next_dir));
        *list = dllist_prepend(*list, (dllist_t) purl);
      }
    }

#ifdef I_FACE
    if(cfg.xi_face && (cfg.rbreak || cfg.stop))
      break;
#endif
  }

  closedir(dir);
}
1285