1 /***************************************************************************/
2 /*    This code is part of WWW grabber called pavuk                        */
3 /*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
4 /*    Distributed under GPL 2 or later                                     */
5 /***************************************************************************/
6 
7 #include "config.h"
8 
9 #include <assert.h>
10 #include <stdio.h>
11 #include <unistd.h>
12 #include <sys/types.h>
13 #include <sys/stat.h>
14 #include <fcntl.h>
15 #include <string.h>
16 #include <stdlib.h>
17 #include <utime.h>
18 
19 #include "url.h"
20 #include "doc.h"
21 #include "html.h"
22 #include "htmlparser.h"
23 #include "gui_api.h"
24 #include "mime.h"
25 #include "errcode.h"
26 #include "uexit.h"
27 
28 /*****************************************/
29 /* get requested attribute from HTML tag */
30 /*****************************************/
html_get_attrib_from_tag(char * tag,char * link_attrib)31 char *html_get_attrib_from_tag(char *tag, char *link_attrib)
32 {
33   char *p;
34   char *retval = NULL;
35   char *attrstart = NULL;
36   char *attrend = NULL;
37   int llen = strlen(link_attrib);
38   bool_t was_sep = TRUE;
39 
40   for(p = tag; *p; p++)
41   {
42     if(was_sep && !attrstart && !strncasecmp(link_attrib, p, llen) &&
43       (tl_ascii_isspace(*(p + llen)) || (*(p + llen) == '=')))
44     {
45       attrstart = p + llen;
46 
47       while(*attrstart)
48       {
49         if(tl_ascii_isspace(*attrstart) || (*attrstart == '='))
50           attrstart++;
51         else
52           break;
53       }
54       if(*attrstart == '\"' || *attrstart == '\'')
55       {
56         if(!(attrend = strchr(attrstart + 1, *attrstart)))
57           attrend = attrstart + strcspn(attrstart, " \t\r\n>");
58         attrstart++;
59       }
60       else
61       {
62         attrend = attrstart + strcspn(attrstart, " \t\r\n\"\'>");
63       }
64       break;
65     }
66     was_sep = tl_ascii_isspace(*p) != 0;
67     if(!attrend && !was_sep)
68       was_sep = (*p == ';');
69     if(*p == '\"' || *p == '\'')
70     {
71       if(!(p = strchr(p + 1, *p)))
72         break;
73     }
74   }
75   if(attrstart)
76   {
77     /* strip leading/trailing spaces */
78     while(tl_ascii_isspace(*attrstart))
79       attrstart++;
80     while(attrend > attrstart && tl_ascii_isspace(*(attrend - 1)))
81       attrend--;
82 
83     /* to workaround broken tags which are missing closing */
84     /* quotes and contain leading space characters        */
85     if(attrstart > attrend)
86       attrend = attrstart + strcspn(attrstart, "> ");
87 
88     retval = tl_strndup(attrstart, attrend - attrstart);
89     omit_chars(retval, "\t\n\r");
90   }
91   return retval;
92 }
93 
94 /********************************************************/
95 /* overwrite content of specified attribute in HTML tag */
96 /********************************************************/
html_replace_url_in_stack(char * tag,char * link_attrib,char * urlin,int pare)97 void html_replace_url_in_stack(char *tag, char *link_attrib, char *urlin,
98   int pare)
99 {
100   char *pom;
101   char *p;
102   char *attrstart = NULL;
103   char *pattrstart = NULL;
104   char *attrend = NULL;
105   int llen = strlen(link_attrib);
106   bool_t was_sep = TRUE;
107 
108   for(p = tag; *p; p++)
109   {
110     if(was_sep && !attrstart && !strncasecmp(link_attrib, p, llen) &&
111       (tl_ascii_isspace(*(p + llen)) || (*(p + llen) == '=')))
112     {
113       pattrstart = attrstart = p + llen;
114 
115       while(*attrstart)
116       {
117         if(tl_ascii_isspace(*attrstart) || (*attrstart == '='))
118           attrstart++;
119         else
120           break;
121       }
122       if(*attrstart == '\"' || *attrstart == '\'')
123       {
124         if(!(attrend = strchr(attrstart + 1, *attrstart)))
125           attrend = attrstart + strcspn(attrstart, " \r\n\t>");
126         attrstart++;
127       }
128       else
129       {
130         attrend = attrstart + strcspn(attrstart, " \t\r\n\"\'>");
131       }
132       break;
133     }
134     was_sep = tl_ascii_isspace(*p) != 0;
135     if(*p == '\"' || *p == '\'')
136     {
137       if(!(p = strchr(p + 1, *p)))
138         break;
139     }
140   }
141   if(attrstart)
142   {
143     /* to workaround broken tags which are missing closing */
144     /* quotes and contain leading space characters        */
145     if(attrstart > attrend)
146       attrend = attrstart + strcspn(attrstart, "> ");
147 
148     pom = (*attrend == '\'' || *attrend == '\"') ?
149       tl_strdup(attrend + 1) : tl_strdup(attrend);
150 
151     if(!pare)
152       strcpy(pattrstart, "=\"");
153     else
154       strcpy(pattrstart, "=");
155     strcat(pattrstart, urlin);
156     if(!pare)
157       strcat(pattrstart, "\"");
158 
159     strcat(pattrstart, pom);
160 
161     _free(pom);
162   }
163   return;
164 }
165 
166 /******************************************/
167 /* look if tag contains specified element */
168 /******************************************/
html_tag_co_elem(char * tag,char * elem)169 int html_tag_co_elem(char *tag, char *elem)
170 {
171   char *p;
172   int llen = strlen(elem);
173   bool_t was_sep = TRUE;
174 
175   for(p = tag; *p; p++)
176   {
177     if(was_sep && !strncasecmp(elem, p, llen) &&
178       (tl_ascii_isspace(*(p + llen)) ||
179         (*(p + llen) == '=') || (!*(p + llen))))
180     {
181       return TRUE;
182     }
183     was_sep = tl_ascii_isspace(*p) != 0;
184     if(!was_sep)
185       was_sep = (*p == ';');
186     if(*p == '\"' || *p == '\'')
187     {
188       if(!(p = strchr(p + 1, *p)))
189         break;
190     }
191   }
192   return FALSE;
193 }
194 
195 /**********************************************************/
196 /* determine base URL for document looking at request URL */
197 /**********************************************************/
html_get_init_base_url(url * urlp,char ** base,char ** baset)198 static void html_get_init_base_url(url * urlp, char **base, char **baset)
199 {
200   char *p;
201 
202   *baset = url_to_urlstr(urlp, FALSE);
203   *base = tl_strdup(*baset);
204   if((p = strrchr(*baset, '#')))
205     *p = '\0';
206   DEBUG_HTML("BASE URL - %s\n", *base);
207 
208   if((p = strrchr(*base, '?')))
209     *p = '\0';
210   if(!tl_is_dirname(*base))
211   {
212     p = strrchr(*base, '/');
213     if(p)
214       *(p + 1) = '\0';
215   }
216 }
217 
218 /********************************************************************/
219 /* determine base URL for document looking on request URL && server */
220 /* response header fields Content-Location: & Content-Base: & Base: */
221 /********************************************************************/
html_get_base_url(doc * docp,char ** base,char ** baset)222 static void html_get_base_url(doc * docp, char **base, char **baset)
223 {
224   char *p;
225 
226   html_get_init_base_url(docp->doc_url, base, baset);
227 
228   /* get possible base URL from server response header */
229   if(docp->mime &&
230     ((p = get_mime_param_val_str("Content-Location:", docp->mime)) ||
231       (p = get_mime_param_val_str("Content-Base:", docp->mime)) ||
232       (p = get_mime_param_val_str("Base:", docp->mime))) && p)
233   {
234     char *p2;
235     url *urlp;
236 
237     p2 = url_to_absolute_url(*base, *baset, docp->doc_url, p);
238     urlp = url_parse(p2);
239     assert(urlp->type != URLT_FROMPARENT);
240 
241     if(!prottable[urlp->type].supported)
242     {
243       xprintf(1,
244         gettext("Unsupported BASE URL -  %s (probably bad handled)\n"), p);
245       _free(*base);
246       *base = tl_strdup(p);
247     }
248     else
249     {
250       _free(p);
251       _free(*base);
252       html_get_init_base_url(urlp, base, &p);
253       _free(p);
254     }
255     free_deep_url(urlp);
256     _free(urlp);
257     _free(p2);
258   }
259 }
260 
261 /*******************************************************/
262 /* parse HTML document and extract URLs from it and if */
263 /* requested, also adjust content of document          */
264 /*******************************************************/
html_process_document(doc * html_doc,dllist ** formlist)265 dllist *html_process_document(doc * html_doc, dllist ** formlist)
266 {
267   char *base, *baset;
268   html_parser_t *hp;
269   html_extract_info_t einfo;
270   html_rewrite_info_t rinfo;
271   html_robots_info_t oinfo;
272   int rewrite;
273   int purestyle;
274   int purescript;
275   int follow = TRUE;
276 
277   /** call the -follow_cmd script **/
278   if(priv_cfg.condition.follow_cmd)
279   {
280     int rv = uexit_follow_cmd(html_doc);
281 
282     if(rv == 0)
283       follow = FALSE;
284   }
285 
286   purestyle = (html_doc->doc_url->status & URL_STYLE);
287   purescript = (html_doc->doc_url->status & URL_ISSCRIPT);
288 
289   einfo.prev_a = NULL;
290   einfo.urls = NULL;
291   einfo.no_limits = (cfg.mode == MODE_FTPDIR) || (cfg.dump_urlfd >= 0);
292   einfo.only_inline = (cfg.mode == MODE_SINGLE) || cfg.singlepage;
293   einfo.enable_js = cfg.enable_js;
294 
295   rinfo.einfo = &einfo;
296   rinfo.all_to_local = cfg.all_to_local;
297   rinfo.selected_to_local = cfg.sel_to_local;
298   rinfo.all_to_remote = cfg.all_to_remote;
299 
300   oinfo.index = TRUE;
301   oinfo.follow = TRUE;
302   oinfo.images = TRUE;
303 
304   rewrite = cfg.rewrite_links && cfg.mode != MODE_FTPDIR;
305 
306   hp = html_parser_init(html_link_tags, html_link_tags_num(),
307     rewrite, purestyle, purescript);
308 
309   /** urls in script are relative to HTML document     **/
310   /** where it is called not relative to script itself **/
311   if(purescript && html_doc->doc_url->parent_url)
312     html_get_init_base_url((url *) html_doc->doc_url->parent_url->data, &base,
313       &baset);
314   else
315     html_get_base_url(html_doc, &base, &baset);
316 
317   html_parser_set_base(hp, base, baset);
318   html_parser_set_document(hp, html_doc->doc_url,
319     html_doc->contents, html_doc->size);
320 
321   html_parser_add_tag_func(hp, html_parser_parse_tag, NULL);
322   html_parser_add_tag_func(hp,
323     (html_parser_func_t) html_parser_parse_tag_slash_a, &einfo);
324   html_parser_add_tag_func(hp, html_parser_parse_tag_meta_refresh, NULL);
325 
326   if(cfg.condition.allow_robots)
327     html_parser_add_tag_func(hp,
328       (html_parser_func_t) html_parser_parse_tag_meta_robots, &oinfo);
329 
330   html_parser_add_attrib_func(hp, html_parser_url_to_absolute_url, NULL);
331 #ifdef HAVE_REGEX
332   if(rewrite && cfg.remove_adv && priv_cfg.advert_res)
333     html_parser_add_attrib_func(hp, html_parser_remove_advertisement, NULL);
334 #endif
335   html_parser_add_attrib_func(hp, html_parser_process_base, NULL);
336   html_parser_add_attrib_func(hp,
337     (html_parser_func_t) html_parser_process_form, formlist);
338 
339   if(follow)
340     html_parser_add_attrib_func(hp,
341       (html_parser_func_t) html_parser_get_url, &einfo);
342 
343   if(rewrite && !cfg.post_update)
344     html_parser_add_attrib_func(hp,
345       (html_parser_func_t) html_parser_url_to_local, &rinfo);
346 
347   html_parser_add_style_func(hp, html_parser_style_to_absolute_urls, NULL);
348   if(follow)
349     html_parser_add_style_func(hp,
350       (html_parser_func_t) html_parser_get_style_urls, &einfo);
351 
352   if(rewrite && !cfg.post_update)
353     html_parser_add_style_func(hp,
354       (html_parser_func_t) html_parser_style_to_local_urls, &rinfo);
355 
356 
357   if(cfg.enable_js)
358   {
359     html_parser_add_script_func(hp, html_parser_parse_jspatterns, NULL);
360     html_parser_add_script_func(hp, html_parser_parse_body_jspatterns, NULL);
361 
362 #ifdef HAVE_REGEX
363     if(priv_cfg.js_transform)
364     {
365       html_parser_add_tag_func(hp, html_parser_parse_tag_jstransform, NULL);
366       html_parser_add_script_func(hp, html_parser_parse_body_jstransform,
367         NULL);
368     }
369 #endif
370   }
371 
372   html_parser_parse(hp);
373 
374   if(rewrite)
375   {
376     _free(html_doc->contents);
377     html_parser_take_document(hp, &html_doc->contents, &html_doc->size);
378   }
379 
380   html_parser_kill(hp);
381 
382   /*** support for robots limits in META only ***/
383   /*** nofollow supported, rest doesn't have  ***/
384   /*** any real meaning in pavuk              ***/
385   if(!oinfo.follow)
386   {
387     DEBUG_HTML("NOFOLLOW attribute in meta data found\n");
388     while(einfo.urls)
389     {
390       free_deep_url((url *) einfo.urls->data);
391       free((url *)einfo.urls->data);
392       einfo.urls = dllist_remove_entry(einfo.urls, einfo.urls);
393     }
394   }
395 
396   return einfo.urls;
397 }
398 
399 /*****************************************/
400 /* adjust URLs inside document to point  */
401 /* to present local documents            */
402 /*****************************************/
html_process_parent_document(doc * html_doc,url * url_old,char * url_new)403 void html_process_parent_document(doc * html_doc, url * url_old,
404   char *url_new)
405 {
406   char *base, *baset;
407   html_parser_t *hp;
408   html_extract_info_t einfo;
409   html_rewrite_info_t rinfo;
410   html_change_info_t chinfo;
411   int purestyle;
412   int purescript;
413   char *relfn = NULL;
414 
415   purestyle = (html_doc->doc_url->status & URL_STYLE);
416   purescript = (html_doc->doc_url->status & URL_ISSCRIPT);
417 
418   if(cfg.all_to_local || cfg.sel_to_local || cfg.all_to_remote)
419     return;
420 
421   einfo.prev_a = NULL;
422   einfo.urls = NULL;
423   einfo.no_limits = FALSE;
424   einfo.only_inline = FALSE;
425   einfo.enable_js = cfg.enable_js;
426 
427   rinfo.einfo = &einfo;
428   rinfo.all_to_local = cfg.all_to_local;
429   rinfo.selected_to_local = cfg.sel_to_local;
430   rinfo.all_to_remote = cfg.all_to_remote;
431 
432   chinfo.url_old = url_old;
433 
434   if(url_new)
435     chinfo.url_new = url_new;
436   else if(cfg.post_update)
437   {
438     relfn = get_relative_path(url_to_filename(html_doc->doc_url, FALSE),
439       url_to_filename(url_old, FALSE));
440 
441     chinfo.url_new = relfn;
442   }
443   else
444     chinfo.url_new = NULL;
445 
446   hp = html_parser_init(html_link_tags, html_link_tags_num(),
447     TRUE, purestyle, purescript);
448   html_get_base_url(html_doc, &base, &baset);
449   html_parser_set_base(hp, base, baset);
450   html_parser_set_document(hp, html_doc->doc_url,
451     html_doc->contents, html_doc->size);
452 
453   html_parser_add_tag_func(hp, html_parser_parse_tag, NULL);
454   html_parser_add_tag_func(hp, html_parser_parse_tag_meta_refresh, NULL);
455 
456   if(chinfo.url_new)
457     html_parser_add_attrib_func(hp,
458       (html_parser_func_t) html_parser_change_url, &chinfo);
459 
460   if(!cfg.post_update)
461     html_parser_add_attrib_func(hp,
462       (html_parser_func_t) html_parser_url_to_local, &rinfo);
463 
464   if(chinfo.url_new)
465     html_parser_add_style_func(hp,
466       (html_parser_func_t) html_parser_style_change_url, &chinfo);
467 
468   if(!cfg.post_update)
469     html_parser_add_style_func(hp,
470       (html_parser_func_t) html_parser_style_to_local_urls, &rinfo);
471 
472 
473   if(cfg.enable_js)
474   {
475     html_parser_add_script_func(hp, html_parser_parse_jspatterns, NULL);
476     html_parser_add_script_func(hp, html_parser_parse_body_jspatterns, NULL);
477   }
478 
479   html_parser_parse(hp);
480 
481   _free(html_doc->contents);
482   html_parser_take_document(hp, &html_doc->contents, &html_doc->size);
483 
484   html_parser_kill(hp);
485 
486   _free(relfn);
487 }
488 
489 /*************************************************/
490 /* load parent document adjust it and store back */
491 /* with locking and modification time preserving */
492 /*************************************************/
rewrite_one_parent_links(url * doc_url,url * parent_url,char * dst_name)493 void rewrite_one_parent_links(url * doc_url, url * parent_url, char *dst_name)
494 {
495   char pom[PATH_MAX];
496   char *fnamep;
497   char *rfn = NULL;
498   char *savetmp, *p;
499   int fd;
500   doc pdoc;
501   struct stat estat;
502   struct utimbuf ut;
503   int perm;
504   url dum;
505 
506   DEBUG_PROCS("rewrite_one_parent_links()");
507   if(!parent_url || !(parent_url->status & URL_DOWNLOADED))
508     return;
509 
510   /*** parent document was not stored ***/
511   if(!cfg.store_index && url_is_dir_index(parent_url))
512     return;
513 
514   fnamep = url_to_filename(parent_url, FALSE);
515   if(stat(fnamep, &estat) == 0)
516   {
517     if(S_ISDIR(estat.st_mode))
518     {
519       xprintf(1, gettext("Can't work on directory\n"));
520       return;
521     }
522   }
523   else
524   {
525     xperror("stat");
526     return;
527   }
528 
529   perm = estat.st_mode;
530   ut.actime = estat.st_atime;
531   ut.modtime = estat.st_mtime;
532 
533   memset(&dum, 0, sizeof(url));
534   dum.type = URLT_FILE;
535   dum.p.file.filename = fnamep;
536   dum.local_name = fnamep;
537   dum.status = parent_url->status & URL_STYLE;
538   dum.status &= ~URL_REDIRECT;
539   doc_init(&pdoc, &dum);
540   pdoc.report_size = FALSE;
541 
542   if(doc_download(&pdoc, TRUE, TRUE))
543   {
544     doc_remove_lock(&pdoc);
545     if(pdoc.errcode)
546       report_error(&pdoc, gettext("rewrite parent"));
547     return;
548   }
549 
550   if(pdoc.errcode)
551     report_error(&pdoc, gettext("rewrite parent"));
552 
553   _free(pdoc.mime);
554 
555   /* dst_name != NULL means child document was moved */
556   if(dst_name &&
557     !access(dst_name, R_OK) &&
558     !stat(dst_name, &estat) && !S_ISDIR(estat.st_mode))
559   {
560     rfn = get_relative_path(fnamep, dst_name);
561   }
562 
563   html_process_parent_document(&pdoc, doc_url, rfn);
564   _free(rfn);
565 
566   strncpy(pom, fnamep, sizeof(pom) - 20);
567   pom[sizeof(pom) - 21] = '\0';
568   p = strrchr(pom, '/');
569   if(p)
570     sprintf(p + 1, "_*%d", (int) getpid());
571   else
572     snprintf(pom, sizeof(pom), "%s/_*%d", pom, (int) getpid());
573 
574   savetmp = tl_strdup(pom);
575   rename(fnamep, savetmp);
576 
577 
578   if((fd =
579       open(fnamep, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY,
580         S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH)) < 0)
581   {
582     xperror(fnamep);
583     rename(savetmp, fnamep);
584     doc_remove_lock(&pdoc);
585     free(savetmp);
586     free(pdoc.contents);
587     return;
588   }
589   if(write(fd, pdoc.contents, pdoc.size) != pdoc.size)
590   {
591     xperror(fnamep);
592     close(fd);
593     rename(savetmp, fnamep);
594     doc_remove_lock(&pdoc);
595     free(savetmp);
596     free(pdoc.contents);
597     return;
598   }
599   close(fd);
600   doc_remove_lock(&pdoc);
601   utime(fnamep, &ut);
602   chmod(fnamep, perm);
603   unlink(savetmp);
604   free(savetmp);
605   free(pdoc.contents);
606   DEBUG_PROCE("rewrite_one_parent_links()");
607 }
608 
609 /*************************************************/
610 /* take all parent documents and adjust inside   */
611 /* all URLs, recurse up when document was moved  */
612 /*************************************************/
rewrite_parents_links(url * doc_url,char * dst_name)613 void rewrite_parents_links(url * doc_url, char *dst_name)
614 {
615   char *fn = NULL;
616   dllist *ptr;
617 
618   if((doc_url->status & URL_MOVED) && !dst_name)
619     return;
620 
621   LOCK_URL(doc_url);
622   for(ptr = doc_url->parent_url; ptr; ptr = ptr->next)
623   {
624     url *parent_url = (url *) ptr->data;
625 
626     if(cfg.rbreak)
627       break;
628 
629     if(parent_url->status & URL_MOVED)
630     {
631       fn = dst_name ? dst_name : url_to_filename(doc_url, FALSE);
632 
633       rewrite_parents_links(parent_url, fn);
634     }
635     else
636     {
637       rewrite_one_parent_links(doc_url, parent_url, dst_name);
638     }
639   }
640   UNLOCK_URL(doc_url);
641 }
642