1 /***************************************************************************/
2 /* This code is part of WWW grabber called pavuk */
3 /* Copyright (c) 1997 - 2001 Stefan Ondrejicka */
4 /* Distributed under GPL 2 or later */
5 /***************************************************************************/
6
7 #include "config.h"
8
9 #include <assert.h>
10 #include <stdio.h>
11 #include <unistd.h>
12 #include <sys/types.h>
13 #include <sys/stat.h>
14 #include <fcntl.h>
15 #include <string.h>
16 #include <stdlib.h>
17 #include <utime.h>
18
19 #include "url.h"
20 #include "doc.h"
21 #include "html.h"
22 #include "htmlparser.h"
23 #include "gui_api.h"
24 #include "mime.h"
25 #include "errcode.h"
26 #include "uexit.h"
27
28 /*****************************************/
29 /* get requested attribute from HTML tag */
30 /*****************************************/
html_get_attrib_from_tag(char * tag,char * link_attrib)31 char *html_get_attrib_from_tag(char *tag, char *link_attrib)
32 {
33 char *p;
34 char *retval = NULL;
35 char *attrstart = NULL;
36 char *attrend = NULL;
37 int llen = strlen(link_attrib);
38 bool_t was_sep = TRUE;
39
40 for(p = tag; *p; p++)
41 {
42 if(was_sep && !attrstart && !strncasecmp(link_attrib, p, llen) &&
43 (tl_ascii_isspace(*(p + llen)) || (*(p + llen) == '=')))
44 {
45 attrstart = p + llen;
46
47 while(*attrstart)
48 {
49 if(tl_ascii_isspace(*attrstart) || (*attrstart == '='))
50 attrstart++;
51 else
52 break;
53 }
54 if(*attrstart == '\"' || *attrstart == '\'')
55 {
56 if(!(attrend = strchr(attrstart + 1, *attrstart)))
57 attrend = attrstart + strcspn(attrstart, " \t\r\n>");
58 attrstart++;
59 }
60 else
61 {
62 attrend = attrstart + strcspn(attrstart, " \t\r\n\"\'>");
63 }
64 break;
65 }
66 was_sep = tl_ascii_isspace(*p) != 0;
67 if(!attrend && !was_sep)
68 was_sep = (*p == ';');
69 if(*p == '\"' || *p == '\'')
70 {
71 if(!(p = strchr(p + 1, *p)))
72 break;
73 }
74 }
75 if(attrstart)
76 {
77 /* strip leading/trailing spaces */
78 while(tl_ascii_isspace(*attrstart))
79 attrstart++;
80 while(attrend > attrstart && tl_ascii_isspace(*(attrend - 1)))
81 attrend--;
82
83 /* to workaround broken tags which are missing closing */
84 /* quotes and contain leading space characters */
85 if(attrstart > attrend)
86 attrend = attrstart + strcspn(attrstart, "> ");
87
88 retval = tl_strndup(attrstart, attrend - attrstart);
89 omit_chars(retval, "\t\n\r");
90 }
91 return retval;
92 }
93
94 /********************************************************/
95 /* overwrite content of specified attribute in HTML tag */
96 /********************************************************/
html_replace_url_in_stack(char * tag,char * link_attrib,char * urlin,int pare)97 void html_replace_url_in_stack(char *tag, char *link_attrib, char *urlin,
98 int pare)
99 {
100 char *pom;
101 char *p;
102 char *attrstart = NULL;
103 char *pattrstart = NULL;
104 char *attrend = NULL;
105 int llen = strlen(link_attrib);
106 bool_t was_sep = TRUE;
107
108 for(p = tag; *p; p++)
109 {
110 if(was_sep && !attrstart && !strncasecmp(link_attrib, p, llen) &&
111 (tl_ascii_isspace(*(p + llen)) || (*(p + llen) == '=')))
112 {
113 pattrstart = attrstart = p + llen;
114
115 while(*attrstart)
116 {
117 if(tl_ascii_isspace(*attrstart) || (*attrstart == '='))
118 attrstart++;
119 else
120 break;
121 }
122 if(*attrstart == '\"' || *attrstart == '\'')
123 {
124 if(!(attrend = strchr(attrstart + 1, *attrstart)))
125 attrend = attrstart + strcspn(attrstart, " \r\n\t>");
126 attrstart++;
127 }
128 else
129 {
130 attrend = attrstart + strcspn(attrstart, " \t\r\n\"\'>");
131 }
132 break;
133 }
134 was_sep = tl_ascii_isspace(*p) != 0;
135 if(*p == '\"' || *p == '\'')
136 {
137 if(!(p = strchr(p + 1, *p)))
138 break;
139 }
140 }
141 if(attrstart)
142 {
143 /* to workaround broken tags which are missing closing */
144 /* quotes and contain leading space characters */
145 if(attrstart > attrend)
146 attrend = attrstart + strcspn(attrstart, "> ");
147
148 pom = (*attrend == '\'' || *attrend == '\"') ?
149 tl_strdup(attrend + 1) : tl_strdup(attrend);
150
151 if(!pare)
152 strcpy(pattrstart, "=\"");
153 else
154 strcpy(pattrstart, "=");
155 strcat(pattrstart, urlin);
156 if(!pare)
157 strcat(pattrstart, "\"");
158
159 strcat(pattrstart, pom);
160
161 _free(pom);
162 }
163 return;
164 }
165
166 /******************************************/
167 /* look if tag contains specified element */
168 /******************************************/
html_tag_co_elem(char * tag,char * elem)169 int html_tag_co_elem(char *tag, char *elem)
170 {
171 char *p;
172 int llen = strlen(elem);
173 bool_t was_sep = TRUE;
174
175 for(p = tag; *p; p++)
176 {
177 if(was_sep && !strncasecmp(elem, p, llen) &&
178 (tl_ascii_isspace(*(p + llen)) ||
179 (*(p + llen) == '=') || (!*(p + llen))))
180 {
181 return TRUE;
182 }
183 was_sep = tl_ascii_isspace(*p) != 0;
184 if(!was_sep)
185 was_sep = (*p == ';');
186 if(*p == '\"' || *p == '\'')
187 {
188 if(!(p = strchr(p + 1, *p)))
189 break;
190 }
191 }
192 return FALSE;
193 }
194
195 /**********************************************************/
196 /* determine base URL for document looking at request URL */
197 /**********************************************************/
/*
 * Derive the two base strings of a document from its URL.
 *
 *   *baset  - string form of `urlp` (url_to_urlstr()) cut at the last '#'
 *   *base   - separate copy of the same string cut at the last '?' and,
 *             unless tl_is_dirname() accepts it, truncated right after
 *             the last '/'
 *
 * The copy for *base is taken before the '#' cut, so that cut applies
 * to *baset only.  Both results are handed to the caller through the
 * out parameters.
 */
static void html_get_init_base_url(url * urlp, char **base, char **baset)
{
  char *cut;

  *baset = url_to_urlstr(urlp, FALSE);
  *base = tl_strdup(*baset);

  cut = strrchr(*baset, '#');
  if(cut)
    *cut = '\0';

  DEBUG_HTML("BASE URL - %s\n", *base);

  cut = strrchr(*base, '?');
  if(cut)
    *cut = '\0';

  if(tl_is_dirname(*base))
    return;

  /* not a directory name - keep only the part up to the last '/' */
  cut = strrchr(*base, '/');
  if(cut)
    cut[1] = '\0';
}
217
218 /********************************************************************/
219 /* determine base URL for document looking on request URL && server */
220 /* response header fields Content-Location: & Content-Base: & Base: */
221 /********************************************************************/
/*
 * Determine the base URL strings for a document.
 *
 * The initial values come from the document's own URL (see
 * html_get_init_base_url()).  When the response header of the document
 * carries one of the fields Content-Location:, Content-Base: or Base:
 * (checked in this order, first hit wins), its value is made absolute
 * and *base is replaced:
 *   - for a protocol marked as supported in prottable[] by the base
 *     derived from the parsed header URL,
 *   - otherwise by a plain copy of the header value, after a warning.
 * *baset always stays the one derived from the document URL.
 *
 * The header value obtained from get_mime_param_val_str() is released
 * here on every path; previously it was released only for supported
 * protocols and leaked for unsupported ones.
 */
static void html_get_base_url(doc * docp, char **base, char **baset)
{
  char *hdr;
  char *abs_url;
  char *unused_baset = NULL;
  url *urlp;

  html_get_init_base_url(docp->doc_url, base, baset);

  /* get possible base URL from server response header */
  if(!docp->mime)
    return;

  hdr = get_mime_param_val_str("Content-Location:", docp->mime);
  if(!hdr)
    hdr = get_mime_param_val_str("Content-Base:", docp->mime);
  if(!hdr)
    hdr = get_mime_param_val_str("Base:", docp->mime);
  if(!hdr)
    return;

  abs_url = url_to_absolute_url(*base, *baset, docp->doc_url, hdr);
  urlp = url_parse(abs_url);
  assert(urlp->type != URLT_FROMPARENT);

  if(!prottable[urlp->type].supported)
  {
    xprintf(1,
      gettext("Unsupported BASE URL - %s (probably bad handled)\n"), hdr);
    _free(*base);
    *base = tl_strdup(hdr);
  }
  else
  {
    _free(*base);
    /* only the new *base is wanted; the second result is dropped */
    html_get_init_base_url(urlp, base, &unused_baset);
    _free(unused_baset);
  }

  _free(hdr);
  free_deep_url(urlp);
  _free(urlp);
  _free(abs_url);
}
260
261 /*******************************************************/
262 /* parse HTML document and extract URLs from it and if */
263 /* requested, also adjust content of document */
264 /*******************************************************/
/*
 * Run the HTML parser over a downloaded document.
 *
 * A parser is configured with a set of tag, attribute, style and script
 * callbacks chosen from the current configuration and then run over
 * html_doc->contents.  The order in which the callbacks are registered
 * below is kept exactly as it has always been.
 *
 *  - when link rewriting is enabled (cfg.rewrite_links and not FTP
 *    directory mode) the document contents and size in `html_doc` are
 *    replaced by the parser's output;
 *  - the -follow_cmd user script, when configured, can veto URL
 *    collection by returning 0;
 *  - `formlist` is passed to the form processing callback;
 *  - when robots handling is allowed and the META robots callback
 *    cleared the follow flag, all collected URLs are released.
 *
 * Returns the list found in the extract info after parsing (NULL when
 * nothing was collected or the list was dropped).
 */
dllist *html_process_document(doc * html_doc, dllist ** formlist)
{
  html_extract_info_t extract;
  html_rewrite_info_t relocate;
  html_robots_info_t robots;
  html_parser_t *parser;
  char *base, *baset;
  int is_style, is_script;
  int rewrite, localize;
  int follow = TRUE;

  /** call the -follow_cmd script **/
  if(priv_cfg.condition.follow_cmd)
  {
    const int rv = uexit_follow_cmd(html_doc);

    follow = (rv == 0) ? FALSE : TRUE;
  }

  is_style = (html_doc->doc_url->status & URL_STYLE);
  is_script = (html_doc->doc_url->status & URL_ISSCRIPT);
  rewrite = cfg.rewrite_links && cfg.mode != MODE_FTPDIR;

  /* state shared with the URL extraction callbacks */
  extract.urls = NULL;
  extract.prev_a = NULL;
  extract.enable_js = cfg.enable_js;
  extract.only_inline = (cfg.mode == MODE_SINGLE) || cfg.singlepage;
  extract.no_limits = (cfg.mode == MODE_FTPDIR) || (cfg.dump_urlfd >= 0);

  /* state shared with the link rewriting callbacks */
  relocate.einfo = &extract;
  relocate.all_to_remote = cfg.all_to_remote;
  relocate.selected_to_local = cfg.sel_to_local;
  relocate.all_to_local = cfg.all_to_local;

  /* robots META defaults: everything permitted */
  robots.images = TRUE;
  robots.follow = TRUE;
  robots.index = TRUE;

  parser = html_parser_init(html_link_tags, html_link_tags_num(),
    rewrite, is_style, is_script);

  /** urls in script are relative to HTML document    **/
  /** where it is called not relative to script itself **/
  if(is_script && html_doc->doc_url->parent_url)
  {
    html_get_init_base_url((url *) html_doc->doc_url->parent_url->data,
      &base, &baset);
  }
  else
  {
    html_get_base_url(html_doc, &base, &baset);
  }

  html_parser_set_base(parser, base, baset);
  html_parser_set_document(parser, html_doc->doc_url,
    html_doc->contents, html_doc->size);

  /* rewriting to local names happens here only without post_update */
  localize = rewrite && !cfg.post_update;

  /* --- tag level callbacks --- */
  html_parser_add_tag_func(parser, html_parser_parse_tag, NULL);
  html_parser_add_tag_func(parser,
    (html_parser_func_t) html_parser_parse_tag_slash_a, &extract);
  html_parser_add_tag_func(parser, html_parser_parse_tag_meta_refresh, NULL);

  if(cfg.condition.allow_robots)
  {
    html_parser_add_tag_func(parser,
      (html_parser_func_t) html_parser_parse_tag_meta_robots, &robots);
  }

  /* --- attribute level callbacks --- */
  html_parser_add_attrib_func(parser, html_parser_url_to_absolute_url, NULL);
#ifdef HAVE_REGEX
  if(rewrite && cfg.remove_adv && priv_cfg.advert_res)
    html_parser_add_attrib_func(parser, html_parser_remove_advertisement,
      NULL);
#endif
  html_parser_add_attrib_func(parser, html_parser_process_base, NULL);
  html_parser_add_attrib_func(parser,
    (html_parser_func_t) html_parser_process_form, formlist);

  if(follow)
  {
    html_parser_add_attrib_func(parser,
      (html_parser_func_t) html_parser_get_url, &extract);
  }

  if(localize)
  {
    html_parser_add_attrib_func(parser,
      (html_parser_func_t) html_parser_url_to_local, &relocate);
  }

  /* --- style callbacks --- */
  html_parser_add_style_func(parser, html_parser_style_to_absolute_urls,
    NULL);

  if(follow)
  {
    html_parser_add_style_func(parser,
      (html_parser_func_t) html_parser_get_style_urls, &extract);
  }

  if(localize)
  {
    html_parser_add_style_func(parser,
      (html_parser_func_t) html_parser_style_to_local_urls, &relocate);
  }

  /* --- script callbacks --- */
  if(cfg.enable_js)
  {
    html_parser_add_script_func(parser, html_parser_parse_jspatterns, NULL);
    html_parser_add_script_func(parser, html_parser_parse_body_jspatterns,
      NULL);

#ifdef HAVE_REGEX
    if(priv_cfg.js_transform)
    {
      html_parser_add_tag_func(parser, html_parser_parse_tag_jstransform,
        NULL);
      html_parser_add_script_func(parser, html_parser_parse_body_jstransform,
        NULL);
    }
#endif
  }

  html_parser_parse(parser);

  /* with rewriting on, the parser output replaces the old contents */
  if(rewrite)
  {
    _free(html_doc->contents);
    html_parser_take_document(parser, &html_doc->contents, &html_doc->size);
  }

  html_parser_kill(parser);

  /*** support for robots limits in META only ***/
  /*** nofollow supported, rest doesn't have  ***/
  /*** any real meaning in pavuk              ***/
  if(!robots.follow)
  {
    DEBUG_HTML("NOFOLLOW attribute in meta data found\n");
    while(extract.urls)
    {
      url *dropped = (url *) extract.urls->data;

      free_deep_url(dropped);
      free(dropped);
      extract.urls = dllist_remove_entry(extract.urls, extract.urls);
    }
  }

  return extract.urls;
}
398
399 /*****************************************/
400 /* adjust URLs inside document to point */
401 /* to present local documents */
402 /*****************************************/
/*
 * Re-run the parser over an already stored (parent) document so that
 * references to `url_old` inside it can be adjusted.
 *
 * Nothing is done when any of cfg.all_to_local, cfg.sel_to_local or
 * cfg.all_to_remote is set.
 *
 * The replacement text for `url_old` is
 *   - `url_new` when the caller supplies one,
 *   - otherwise, with cfg.post_update set, the relative path from this
 *     document's file name to the file name of `url_old`,
 *   - otherwise none, and the change-url callbacks are not registered.
 * Without cfg.post_update the url-to-local callbacks are registered as
 * well.
 *
 * On return the contents and size stored in `html_doc` have been
 * replaced by the parser's output.
 */
void html_process_parent_document(doc * html_doc, url * url_old,
  char *url_new)
{
  html_extract_info_t extract;
  html_rewrite_info_t relocate;
  html_change_info_t change;
  html_parser_t *parser;
  char *base, *baset;
  char *relfn = NULL;
  int is_style, is_script;

  is_style = (html_doc->doc_url->status & URL_STYLE);
  is_script = (html_doc->doc_url->status & URL_ISSCRIPT);

  if(cfg.all_to_local || cfg.sel_to_local || cfg.all_to_remote)
    return;

  extract.urls = NULL;
  extract.prev_a = NULL;
  extract.enable_js = cfg.enable_js;
  extract.only_inline = FALSE;
  extract.no_limits = FALSE;

  relocate.einfo = &extract;
  relocate.all_to_remote = cfg.all_to_remote;
  relocate.selected_to_local = cfg.sel_to_local;
  relocate.all_to_local = cfg.all_to_local;

  /* decide what, if anything, url_old gets replaced with */
  change.url_old = url_old;
  change.url_new = NULL;
  if(url_new)
  {
    change.url_new = url_new;
  }
  else if(cfg.post_update)
  {
    relfn = get_relative_path(url_to_filename(html_doc->doc_url, FALSE),
      url_to_filename(url_old, FALSE));
    change.url_new = relfn;
  }

  parser = html_parser_init(html_link_tags, html_link_tags_num(),
    TRUE, is_style, is_script);
  html_get_base_url(html_doc, &base, &baset);
  html_parser_set_base(parser, base, baset);
  html_parser_set_document(parser, html_doc->doc_url,
    html_doc->contents, html_doc->size);

  /* --- tag level callbacks --- */
  html_parser_add_tag_func(parser, html_parser_parse_tag, NULL);
  html_parser_add_tag_func(parser, html_parser_parse_tag_meta_refresh, NULL);

  /* --- attribute level callbacks --- */
  if(change.url_new)
  {
    html_parser_add_attrib_func(parser,
      (html_parser_func_t) html_parser_change_url, &change);
  }
  if(!cfg.post_update)
  {
    html_parser_add_attrib_func(parser,
      (html_parser_func_t) html_parser_url_to_local, &relocate);
  }

  /* --- style callbacks --- */
  if(change.url_new)
  {
    html_parser_add_style_func(parser,
      (html_parser_func_t) html_parser_style_change_url, &change);
  }
  if(!cfg.post_update)
  {
    html_parser_add_style_func(parser,
      (html_parser_func_t) html_parser_style_to_local_urls, &relocate);
  }

  /* --- script callbacks --- */
  if(cfg.enable_js)
  {
    html_parser_add_script_func(parser, html_parser_parse_jspatterns, NULL);
    html_parser_add_script_func(parser, html_parser_parse_body_jspatterns,
      NULL);
  }

  html_parser_parse(parser);

  /* the parser output always replaces the old contents here */
  _free(html_doc->contents);
  html_parser_take_document(parser, &html_doc->contents, &html_doc->size);

  html_parser_kill(parser);

  _free(relfn);
}
488
489 /*************************************************/
490 /* load parent document adjust it and store back */
491 /* with locking and modification time preserving */
492 /*************************************************/
/*
 * Load the locally stored copy of `parent_url`, adjust the links in it
 * that refer to `doc_url` and write the result back under the same file
 * name, restoring the file's access/modification times and mode.
 *
 * doc_url     - document whose links inside the parent are adjusted
 * parent_url  - parent document; ignored when NULL, when it is not
 *               marked URL_DOWNLOADED, or when it is a directory index
 *               and cfg.store_index is off
 * dst_name    - when non-NULL and naming a readable non-directory, the
 *               child was moved there and links are pointed at it
 *
 * While the new contents are written, the old file is kept under a
 * temporary name "_*<pid>" in the same directory; it is renamed back if
 * opening or writing the new file fails and removed on success.
 *
 * Fix over the previous version: for a file name without any '/', the
 * temporary name was built with snprintf() using the same buffer as
 * destination and as "%s" argument (undefined behaviour) and named a
 * path below the file itself.  It is now simply "_*<pid>" in the
 * current directory, which is where such a file lives.
 */
void rewrite_one_parent_links(url * doc_url, url * parent_url, char *dst_name)
{
  char pom[PATH_MAX];           /* name of the temporary backup file */
  char *fnamep;
  char *rfn = NULL;
  char *p;
  int fd;
  doc pdoc;
  struct stat estat;
  struct utimbuf ut;
  int perm;
  url dum;

  DEBUG_PROCS("rewrite_one_parent_links()");
  if(!parent_url || !(parent_url->status & URL_DOWNLOADED))
    return;

  /*** parent document was not stored ***/
  if(!cfg.store_index && url_is_dir_index(parent_url))
    return;

  fnamep = url_to_filename(parent_url, FALSE);
  if(stat(fnamep, &estat) != 0)
  {
    xperror("stat");
    return;
  }
  if(S_ISDIR(estat.st_mode))
  {
    xprintf(1, gettext("Can't work on directory\n"));
    return;
  }

  /* remember what has to be restored after the file is rewritten; */
  /* estat itself is reused further down for dst_name               */
  perm = estat.st_mode;
  ut.actime = estat.st_atime;
  ut.modtime = estat.st_mtime;

  /* describe the stored parent as a local file URL so that it can */
  /* be loaded through doc_download()                               */
  memset(&dum, 0, sizeof(url));
  dum.type = URLT_FILE;
  dum.p.file.filename = fnamep;
  dum.local_name = fnamep;
  dum.status = parent_url->status & URL_STYLE;
  dum.status &= ~URL_REDIRECT;
  doc_init(&pdoc, &dum);
  pdoc.report_size = FALSE;

  if(doc_download(&pdoc, TRUE, TRUE))
  {
    doc_remove_lock(&pdoc);
    if(pdoc.errcode)
      report_error(&pdoc, gettext("rewrite parent"));
    return;
  }

  if(pdoc.errcode)
    report_error(&pdoc, gettext("rewrite parent"));

  _free(pdoc.mime);

  /* dst_name != NULL means child document was moved */
  if(dst_name &&
    !access(dst_name, R_OK) &&
    !stat(dst_name, &estat) && !S_ISDIR(estat.st_mode))
  {
    rfn = get_relative_path(fnamep, dst_name);
  }

  html_process_parent_document(&pdoc, doc_url, rfn);
  _free(rfn);

  /* build the backup name: directory part of fnamep + "_*<pid>";  */
  /* 20 bytes of the buffer are held back for the generated suffix */
  snprintf(pom, sizeof(pom) - 20, "%s", fnamep);
  p = strrchr(pom, '/');
  if(p)
  {
    snprintf(p + 1, sizeof(pom) - (size_t) (p + 1 - pom), "_*%d",
      (int) getpid());
  }
  else
  {
    snprintf(pom, sizeof(pom), "_*%d", (int) getpid());
  }

  /* move the old file out of the way; it is the fallback on failure */
  rename(fnamep, pom);

  if((fd =
      open(fnamep, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY,
        S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH)) < 0)
  {
    xperror(fnamep);
    rename(pom, fnamep);
    doc_remove_lock(&pdoc);
    free(pdoc.contents);
    return;
  }
  if(write(fd, pdoc.contents, pdoc.size) != pdoc.size)
  {
    xperror(fnamep);
    close(fd);
    rename(pom, fnamep);
    doc_remove_lock(&pdoc);
    free(pdoc.contents);
    return;
  }
  close(fd);
  doc_remove_lock(&pdoc);

  /* give the rewritten file the times and mode of the old one */
  utime(fnamep, &ut);
  chmod(fnamep, perm);
  unlink(pom);
  free(pdoc.contents);
  DEBUG_PROCE("rewrite_one_parent_links()");
}
608
609 /*************************************************/
610 /* take all parent documents and adjust inside */
611 /* all URLs, recurse up when document was moved */
612 /*************************************************/
/*
 * Adjust the links to `doc_url` in every document listed in
 * doc_url->parent_url.
 *
 * A parent flagged URL_MOVED is not rewritten itself; instead the
 * function recurses into that parent's own parents, passing on
 * `dst_name` or, when that is NULL, the local file name of `doc_url`.
 * Every other parent is handled by rewrite_one_parent_links().
 *
 * Nothing is done for a document flagged URL_MOVED when no `dst_name`
 * is given.  The walk stops early as soon as cfg.rbreak is set.  The
 * whole walk runs between LOCK_URL() and UNLOCK_URL() on `doc_url`.
 */
void rewrite_parents_links(url * doc_url, char *dst_name)
{
  dllist *node;

  if((doc_url->status & URL_MOVED) && !dst_name)
    return;

  LOCK_URL(doc_url);

  node = doc_url->parent_url;
  while(node && !cfg.rbreak)
  {
    url *parent_url = (url *) node->data;

    if(parent_url->status & URL_MOVED)
    {
      rewrite_parents_links(parent_url,
        dst_name ? dst_name : url_to_filename(doc_url, FALSE));
    }
    else
    {
      rewrite_one_parent_links(doc_url, parent_url, dst_name);
    }

    node = node->next;
  }

  UNLOCK_URL(doc_url);
}
642