1 /***************************************************************************/
2 /*    This code is part of WWW grabber called pavuk                        */
3 /*    Copyright (c) 1997 - 2001 Stefan Ondrejicka                          */
4 /*    Distributed under GPL 2 or later                                     */
5 /***************************************************************************/
6 
7 #include "config.h"
8 
9 #include <assert.h>
10 #include <sys/types.h>
11 #include <sys/stat.h>
12 #include <unistd.h>
13 #include <string.h>
14 
15 #include "htmlparser.h"
16 
17 #include "tools.h"
18 #include "css.h"
19 #include "re.h"
20 #include "ftp.h"
21 #include "jstrans.h"
22 
23 static dlhash *html_parser_tag_hash = NULL;
24 
25 #define COMMENT_PREFIX "<!-- "
26 #define COMMENT_SUFFIX " -->"
27 #define ADVERT_PREFIX "adv<!-- Removed by pavuk "
28 #define ADVERT_SUFFIX " -->"
29 
/* Equality callback of the tag hash. Both keys are NUL terminated */
/* tag names which are compared without regard to letter case.    */
/* Returns nonzero when the names are equal, zero otherwise.      */
static int html_parser_tag_comp_func(dllist_t key1, dllist_t key2)
{
  const char *lhs = (const char *) key1;
  const char *rhs = (const char *) key2;

  return strcasecmp(lhs, rhs) == 0;
}
34 
/* Hash callback of the tag hash. Sums the lower-cased bytes of the */
/* NUL terminated key, reducing modulo size after every byte, so    */
/* names differing only in letter case land in the same bucket.     */
/* size must be nonzero because it is used as the modulus.          */
static unsigned int html_parser_tag_hash_func(unsigned int size, dllist_t key)
{
  const unsigned char *name = (const unsigned char *) key;
  unsigned int bucket = 0;

  for(; *name; name++)
    bucket = (bucket + tl_ascii_tolower(*name)) % size;

  return bucket;
}
48 
/* Key extraction callback of the tag hash: the key of a stored */
/* html_tag_t entry is its tag name.                            */
static dllist_t html_parser_tag_key_func(dllist_t data)
{
  html_tag_t *entry = (html_tag_t *) data;

  return (dllist_t) entry->tag;
}
53 
html_parser_init(html_tag_t * tags,int ntags,int with_tag_rewriting,int purestyle,int purescript)54 html_parser_t *html_parser_init(html_tag_t * tags, int ntags,
55   int with_tag_rewriting, int purestyle, int purescript)
56 {
57   html_parser_t *rv;
58   int i;
59 
60   rv = _malloc(sizeof(html_parser_t));
61 
62   rv->rewrite = with_tag_rewriting;
63   rv->purestyle = purestyle;
64   rv->purescript = purescript;
65   rv->in_content = NULL;
66   rv->out_content = NULL;
67   rv->in_size = 0;
68   rv->aout_size = 0;
69   rv->out_offset = 0;
70   rv->in_offset = 0;
71 
72   rv->stack = NULL;
73   rv->stack_size = 0;
74   rv->stack_offset = 0;
75 
76   rv->base = NULL;
77   rv->baset = NULL;
78 
79   rv->tag_attrib = NULL;
80 
81   rv->tag_funcs = NULL;
82   rv->attrib_funcs = NULL;
83   rv->style_funcs = NULL;
84   rv->script_funcs = NULL;
85 
86   rv->current_tag = NULL;
87   rv->current_attrib = NULL;
88 
89   LOCK_TAG_HASH;
90   if(html_parser_tag_hash)
91     rv->tag_hash = html_parser_tag_hash;
92   else
93   {
94 
95     rv->tag_hash = dlhash_new(20,
96       html_parser_tag_key_func,
97       html_parser_tag_hash_func, html_parser_tag_comp_func);
98 
99     for(i = 0; i < ntags; i++)
100       dlhash_insert(rv->tag_hash, (dllist_t) &tags[i]);
101 
102     html_parser_tag_hash = rv->tag_hash;
103 
104   }
105   UNLOCK_TAG_HASH;
106 
107   return rv;
108 }
109 
html_parser_do_cleanup(void)110 void html_parser_do_cleanup(void)
111 {
112   if(html_parser_tag_hash)
113     dlhash_free(html_parser_tag_hash);
114 }
115 
html_parser_kill(html_parser_t * hpinfo)116 void html_parser_kill(html_parser_t * hpinfo)
117 {
118 #define KILL_FUNC_CHAIN(chain) \
119   while (chain) \
120   { \
121     if(chain->data) free((void *)chain->data);\
122     chain = dllist_remove_entry(chain, chain);\
123   }
124 
125   KILL_FUNC_CHAIN(hpinfo->tag_funcs);
126   KILL_FUNC_CHAIN(hpinfo->attrib_funcs);
127   KILL_FUNC_CHAIN(hpinfo->style_funcs);
128   KILL_FUNC_CHAIN(hpinfo->script_funcs);
129 
130   _free(hpinfo->stack);
131   _free(hpinfo->out_content);
132 
133   _free(hpinfo->base);
134   _free(hpinfo->baset);
135   _free(hpinfo);
136 }
137 
html_parser_add_tag_func(html_parser_t * hpinfo,html_parser_func_t func,void * data)138 void html_parser_add_tag_func(html_parser_t * hpinfo, html_parser_func_t func,
139   void *data)
140 {
141   html_parser_func_info_t *nfunc;
142 
143   nfunc = _malloc(sizeof(html_parser_func_info_t));
144   nfunc->func = func;
145   nfunc->data = data;
146   hpinfo->tag_funcs = dllist_append(hpinfo->tag_funcs, (dllist_t) nfunc);
147 }
148 
html_parser_add_attrib_func(html_parser_t * hpinfo,html_parser_func_t func,void * data)149 void html_parser_add_attrib_func(html_parser_t * hpinfo,
150   html_parser_func_t func, void *data)
151 {
152   html_parser_func_info_t *nfunc;
153 
154   nfunc = _malloc(sizeof(html_parser_func_info_t));
155   nfunc->func = func;
156   nfunc->data = data;
157   hpinfo->attrib_funcs = dllist_append(hpinfo->attrib_funcs, (dllist_t) nfunc);
158 }
159 
html_parser_add_style_func(html_parser_t * hpinfo,html_parser_func_t func,void * data)160 void html_parser_add_style_func(html_parser_t * hpinfo,
161   html_parser_func_t func, void *data)
162 {
163   html_parser_func_info_t *nfunc;
164 
165   nfunc = _malloc(sizeof(html_parser_func_info_t));
166   nfunc->func = func;
167   nfunc->data = data;
168   hpinfo->style_funcs = dllist_append(hpinfo->style_funcs, (dllist_t) nfunc);
169 }
170 
html_parser_add_script_func(html_parser_t * hpinfo,html_parser_func_t func,void * data)171 void html_parser_add_script_func(html_parser_t * hpinfo,
172   html_parser_func_t func, void *data)
173 {
174   html_parser_func_info_t *nfunc;
175 
176   nfunc = _malloc(sizeof(html_parser_func_info_t));
177   nfunc->func = func;
178   nfunc->data = data;
179   hpinfo->script_funcs = dllist_append(hpinfo->script_funcs, (dllist_t) nfunc);
180 }
181 
/* Attach the input document to the parser.                        */
/*   doc_url - URL belonging to the document                       */
/*   content - document text, size bytes long                      */
/* Only the pointers are stored; html_parser_kill() frees neither  */
/* the content nor the URL.                                        */
void html_parser_set_document(html_parser_t * hpinfo, url * doc_url,
  char *content, ssize_t size)
{
  hpinfo->in_size = size;
  hpinfo->in_content = content;
  hpinfo->doc_url = doc_url;
}
189 
/* Hand the generated output document over to the caller.          */
/*   *out_content - receives the output buffer (may be NULL)       */
/*   *out_size    - receives the number of bytes written into it   */
/* The parser forgets the buffer afterwards, so html_parser_kill() */
/* will no longer free it.                                         */
void html_parser_take_document(html_parser_t * hpinfo, char **out_content,
  ssize_t * out_size)
{
  /* publish the buffer and its used length ... */
  *out_content = hpinfo->out_content;
  *out_size = hpinfo->out_offset;

  /* ... and detach it from the parser */
  hpinfo->aout_size = 0;
  hpinfo->out_offset = 0;
  hpinfo->out_content = NULL;
}
200 
/* Replace the base strings of the parser.                         */
/* A NULL argument leaves the corresponding value untouched. A     */
/* non-NULL argument replaces the old value, which is freed; the   */
/* parser keeps the passed pointer and frees it later itself.      */
void html_parser_set_base(html_parser_t * hpinfo, char *base, char *baset)
{
  if(base != NULL)
  {
    _free(hpinfo->base);
    hpinfo->base = base;
  }

  if(baset != NULL)
  {
    _free(hpinfo->baset);
    hpinfo->baset = baset;
  }
}
215 
/* Install a new base URL taken from the value of a BASE tag.      */
/*                                                                 */
/* For a supported protocol the value is made absolute, a query    */
/* part is cut off and, unless the result already names a          */
/* directory, everything behind the last '/' is removed. For an    */
/* unsupported protocol the value is stored unchanged after a      */
/* warning. When no absolute URL can be built the old base stays.  */
static void html_parser_process_new_base_url(html_parser_t * hpinfo,
  char *baseattr)
{
  url *purl;
  char *newbase;

  purl = url_parse(baseattr);
  assert(purl->type != URLT_FROMPARENT);

  if(!prottable[purl->type].supported)
  {
    xprintf(1, gettext("Unsupported BASE URL -  %s (probably bad handled)\n"),
      baseattr);
    newbase = tl_strdup(baseattr);
  }
  else
  {
    char *idx;

    newbase =
      url_to_absolute_url(hpinfo->base, hpinfo->baset, hpinfo->doc_url,
      baseattr);

    if(!newbase)
    {
      /* collect base="" and ignore it; the parsed URL must */
      /* still be released, it was leaked here before       */
      free_deep_url(purl);
      _free(purl);
      return;
    }

    /* drop the query part */
    if((idx = strrchr(newbase, '?')))
      *idx = '\0';

    /* keep only the directory part */
    if(!tl_is_dirname(newbase))
    {
      idx = strrchr(newbase, '/');
      if(idx)
        *(idx + 1) = '\0';
    }
  }
  DEBUG_HTML("NEW BASE URL - %s\n", newbase);

  free_deep_url(purl);
  _free(purl);
  _free(hpinfo->base);
  hpinfo->base = newbase;
}
257 
/* Run all callbacks of one chain in registration order. Every   */
/* callback gets the parser, the current tag stack and the data  */
/* pointer it was registered with.                               */
static void html_parser_call_funcs(html_parser_t * hpinfo, dllist * funcs)
{
  dllist *node = funcs;

  while(node)
  {
    html_parser_func_info_t *info = (html_parser_func_info_t *) node->data;

    info->func(hpinfo, hpinfo->stack, info->data);
    node = node->next;
  }
}
269 
/* Append the NUL terminated content of the tag stack to the      */
/* output document and mark the stack as empty.                   */
/* Does nothing at all (the stack offset included) when the       */
/* parser was created without rewriting.                          */
static void html_parser_flush_stack_to_output(html_parser_t * hpinfo)
{
  if(hpinfo->rewrite)
  {
    int len = strlen(hpinfo->stack);

    /* make room in the output buffer first */
    html_parser_MEXPAND(hpinfo, len);
    memcpy(hpinfo->out_content + hpinfo->out_offset, hpinfo->stack, len);
    hpinfo->out_offset += len;
    hpinfo->stack_offset = 0;
  }
}
283 
/* Look up the tag whose name starts at tagstart (the character   */
/* right behind '<') and remember it in hpinfo->current_tag.      */
/* The name is the leading run of ASCII letters; it is accepted   */
/* only when followed by blank, tab, CR, LF, '>' or the end of    */
/* the string (strchr() also matches the terminating NUL).        */
/* Returns nonzero when a known tag was found.                    */
static int html_parser_check_tag(html_parser_t * hpinfo, char *tagstart)
{
  int namelen = 0;

  hpinfo->current_tag = NULL;

  while(tl_ascii_isalpha(tagstart[namelen]))
    namelen++;

  if(strchr(" \t\r\n>", tagstart[namelen]) != NULL)
  {
    char *name = tl_strndup(tagstart, namelen);

    hpinfo->current_tag = (html_tag_t *) dlhash_find_by_key(hpinfo->tag_hash,
      (dllist_t) name);
    _free(name);
  }

  return hpinfo->current_tag != NULL;
}
303 
/* Prepare the parser buffers for one run of html_parser_parse(). */
/* The output buffer (rewriting only) starts with the input size  */
/* plus html_parser_FENDER bytes, the tag stack with twice        */
/* html_parser_FENDER bytes.                                      */
/* Buffers left over from an earlier run on the same parser are   */
/* released first; previously they were overwritten and leaked.   */
static void html_parser_parse_init(html_parser_t * hpinfo)
{
  hpinfo->in_offset = 0;
  if(hpinfo->rewrite)
  {
    /* only non-NULL when the previous output was never taken */
    _free(hpinfo->out_content);
    hpinfo->aout_size = hpinfo->in_size + html_parser_FENDER;
    hpinfo->out_content = _malloc(hpinfo->aout_size);
    hpinfo->out_offset = 0;
  }

  _free(hpinfo->stack);
  hpinfo->stack_size = 2 * html_parser_FENDER;
  hpinfo->stack = _malloc(hpinfo->stack_size);
  hpinfo->stack_offset = 0;
}
318 
/* Return nonzero when the avail readable bytes at p start with    */
/* token. The length check keeps the comparison inside the input   */
/* buffer even when that buffer is not NUL terminated.             */
static int html_parser_input_has_prefix(const char *p, size_t avail,
  const char *token, int ignore_case)
{
  size_t len = strlen(token);

  if(avail < len)
    return 0;

  if(ignore_case)
    return strncasecmp(p, token, len) == 0;

  return strncmp(p, token, len) == 0;
}

/* Parse the document attached with html_parser_set_document().    */
/*                                                                 */
/* The input is scanned byte by byte. Text of the construct being  */
/* collected (tag, style block, script block) is gathered on       */
/* hpinfo->stack; when the construct is complete the matching      */
/* callback chain is run on the stack and, with rewriting enabled, */
/* the possibly modified stack is appended to the output document. */
/* Everything else, comments included, is copied to the output     */
/* unchanged. With rewriting enabled the output is NUL terminated  */
/* at the end.                                                     */
/*                                                                 */
/* Differences to the former implementation:                       */
/*  - '<' and '>' inside a quoted attribute value are kept in the  */
/*    tag instead of being dropped                                 */
/*  - the byte behind the end of the input is no longer read       */
/*  - keyword look-ahead never reads behind the end of the input   */
/*  - an unterminated script block at the end of the input is      */
/*    written to the output like unterminated tags and styles      */
void html_parser_parse(html_parser_t * hpinfo)
{
  int tagstart = FALSE;
  int scriptstart = FALSE;
  int commentstart = FALSE;
  int stylestart = FALSE;
  int singlequoteintag = FALSE;
  int doublequoteintag = FALSE;
  char *p;

  html_parser_parse_init(hpinfo);

  if(hpinfo->purestyle)
    stylestart = TRUE;

  if(hpinfo->purescript)
    scriptstart = TRUE;

  for(p = hpinfo->in_content; (p - hpinfo->in_content) < hpinfo->in_size;
    p++, hpinfo->in_offset++)
  {
    /* number of input bytes readable at p, always >= 1 here */
    size_t avail = (size_t) (hpinfo->in_size - (p - hpinfo->in_content));
    int inquote;

    if(stylestart)
    {
      if(html_parser_input_has_prefix(p, avail, "</STYLE", 1))
      {
        stylestart = FALSE;

        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack[hpinfo->stack_offset + 1] = '\0';

        html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
        html_parser_flush_stack_to_output(hpinfo);
      }
      else
      {
        html_parser_SEXPAND(hpinfo, 1);
        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack_offset++;
      }

      continue;
    }

    if(scriptstart)
    {
      /* look one byte ahead: *p is the last byte of the script, */
      /* the closing tag itself is parsed as an ordinary tag     */
      if(html_parser_input_has_prefix(p + 1, avail - 1, "</SCRIPT", 1))
      {
        scriptstart = FALSE;

        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack[hpinfo->stack_offset + 1] = '\0';

        html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
        html_parser_flush_stack_to_output(hpinfo);
      }
      else
      {
        html_parser_SEXPAND(hpinfo, 1);
        hpinfo->stack[hpinfo->stack_offset] = *p;
        hpinfo->stack_offset++;
      }

      continue;
    }

    if(commentstart)
    {
      if(html_parser_input_has_prefix(p, avail, "-->", 0))
        commentstart = FALSE;

      if(hpinfo->rewrite)
      {
        hpinfo->out_content[hpinfo->out_offset] = *p;
        hpinfo->out_offset++;
      }
      continue;
    }

    /* quote flags are only ever set while inside a tag */
    inquote = singlequoteintag || doublequoteintag;

    if((*p == '\"') && tagstart && !singlequoteintag)
    {
      doublequoteintag = doublequoteintag ? FALSE : TRUE;
    }
    else if((*p == '\'') && tagstart && !doublequoteintag)
    {
      singlequoteintag = singlequoteintag ? FALSE : TRUE;
    }
    else if((*p == '<') && !inquote)
    {
      if(tagstart)
      {
        /* previous tag was never closed - write it out as it is */
        hpinfo->stack[hpinfo->stack_offset] = '\0';
        html_parser_flush_stack_to_output(hpinfo);
      }
      tagstart = FALSE;

      if(html_parser_input_has_prefix(p, avail, "<STYLE", 1))
      {
        stylestart = TRUE;
        hpinfo->stack_offset = 0;
      }
      else if(html_parser_input_has_prefix(p, avail, "<!--", 0))
      {
        commentstart = TRUE;
      }
      else
      {
        hpinfo->stack_offset = 0;
        tagstart = TRUE;
        singlequoteintag = FALSE;
        doublequoteintag = FALSE;
      }
    }
    else if((*p == '>') && tagstart && !inquote)
    {
      hpinfo->stack[hpinfo->stack_offset] = *p;
      hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
      html_parser_call_funcs(hpinfo, hpinfo->tag_funcs);
      html_parser_flush_stack_to_output(hpinfo);

      /* inline script follows a SCRIPT tag without SRC */
      if(hpinfo->current_tag &&
        hpinfo->current_tag->type == HTML_TAG_SCRIPT &&
        !html_tag_co_elem(hpinfo->stack, "SRC"))
      {
        scriptstart = TRUE;
      }

      tagstart = FALSE;
      singlequoteintag = FALSE;
      doublequoteintag = FALSE;
      continue;
    }

    /* ordinary byte (also '<' and '>' inside quoted attribute */
    /* values): collect it or copy it straight to the output   */
    if(tagstart || stylestart || scriptstart)
    {
      hpinfo->stack[hpinfo->stack_offset] = *p;
      hpinfo->stack_offset++;
      html_parser_SEXPAND(hpinfo, 1);
    }
    else
    {
      if(hpinfo->rewrite)
      {
        hpinfo->out_content[hpinfo->out_offset] = *p;
        hpinfo->out_offset++;
      }
    }
  }

  /* pure style don't need to end with </STYLE>   */
  /* so we must parse CSS also at end of document */
  if(stylestart && hpinfo->purestyle)
  {
    stylestart = FALSE;

    /* p points behind the input here, so just terminate */
    hpinfo->stack[hpinfo->stack_offset] = '\0';

    html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
    html_parser_flush_stack_to_output(hpinfo);
  }

  /* pure script don't need to end with </SCRIPT> so we */
  /* must parse script patterns also at end of document */
  if(scriptstart && hpinfo->purescript)
  {
    scriptstart = FALSE;

    /* p points behind the input here, so just terminate */
    hpinfo->stack[hpinfo->stack_offset] = '\0';

    html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
    html_parser_flush_stack_to_output(hpinfo);
  }

  /* unterminated construct at end of document: keep its text */
  if(tagstart || stylestart || scriptstart)
  {
    hpinfo->stack[hpinfo->stack_offset] = '\0';
    html_parser_flush_stack_to_output(hpinfo);
  }

  if(hpinfo->rewrite)
    hpinfo->out_content[hpinfo->out_offset] = '\0';
}
524 
525 /********************************************/
526 /* functions for processing whole HTML tags */
527 /********************************************/
/* Tag callback: walk over all known attributes of the tag on the  */
/* stack and run the matching callback chain for each value.       */
/*                                                                 */
/* Unknown tags and META tags are skipped, as are attributes       */
/* marked LINK_DISABLED and values or tags matched by one of the   */
/* dont_touch patterns. Values starting with "javascript:" and     */
/* LINK_JS attributes go to the script callbacks, LINK_STYLE       */
/* attributes to the style callbacks, the rest to the attribute    */
/* callbacks. With rewriting enabled the resulting value is put    */
/* back into the tag on the stack.                                 */
/* The stack and data parameters are not used; the parser's own    */
/* stack is processed.                                             */
void html_parser_parse_tag(html_parser_t * hpinfo, char *stack, void *data)
{
  int idx = 0;
#ifdef HAVE_REGEX
  dllist *pat;
#endif

  if(!html_parser_check_tag(hpinfo, hpinfo->stack + 1))
    return;

  if(hpinfo->current_tag->type == HTML_TAG_META)
    return;

  while(hpinfo->current_tag->attribs[idx].attrib)
  {
    hpinfo->current_attrib = &hpinfo->current_tag->attribs[idx];
    idx++;

    if(hpinfo->current_attrib->stat & LINK_DISABLED)
      continue;

    hpinfo->tag_attrib = html_get_attrib_from_tag(hpinfo->stack,
      hpinfo->current_attrib->attrib);

    /*** -dont_touch_url_pattern support ***/
    if(hpinfo->tag_attrib && cfg.dont_touch_url_pattern &&
      is_in_pattern_list(hpinfo->tag_attrib, cfg.dont_touch_url_pattern))
    {
      _free(hpinfo->tag_attrib);
    }

#ifdef HAVE_REGEX
    /*** -dont_touch_url_rpattern support ***/
    pat = cfg.dont_touch_url_rpattern;
    while(pat && hpinfo->tag_attrib)
    {
      if(re_pmatch((re_entry *) pat->data, hpinfo->tag_attrib))
      {
        _free(hpinfo->tag_attrib);
      }
      pat = pat->next;
    }

    /*** -dont_touch_tag_rpattern support ***/
    pat = cfg.dont_touch_tag_rpattern;
    while(pat && hpinfo->tag_attrib)
    {
      if(re_pmatch((re_entry *) pat->data, hpinfo->stack))
      {
        _free(hpinfo->tag_attrib);
      }
      pat = pat->next;
    }
#endif

    if(hpinfo->tag_attrib)
    {
      if(!strncasecmp(hpinfo->tag_attrib, "javascript:", 11))
      {
        /* javascript:... URLs are supported inside any attribute: */
        /* the callbacks see only the part behind the prefix       */
        char *whole_value = hpinfo->tag_attrib;

        hpinfo->tag_attrib = tl_strdup(whole_value + 11);
        html_parser_call_funcs(hpinfo, hpinfo->script_funcs);

        if(!hpinfo->rewrite)
        {
          _free(whole_value);
        }
        else
        {
          /* glue the prefix and the processed script together */
          int script_len = strlen(hpinfo->tag_attrib);

          whole_value = _realloc(whole_value, 12 + script_len);
          memcpy(whole_value + 11, hpinfo->tag_attrib, script_len + 1);
          _free(hpinfo->tag_attrib);
          hpinfo->tag_attrib = whole_value;
        }
      }
      else if(hpinfo->current_attrib->stat & LINK_STYLE)
      {
        html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
      }
      else if(hpinfo->current_attrib->stat & LINK_JS)
      {
        html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
      }
      else
      {
        html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);
      }
    }

    /* write the processed value back into the tag */
    if(hpinfo->rewrite && hpinfo->tag_attrib)
    {
      int value_len = strlen(hpinfo->tag_attrib);

      html_parser_SEND(hpinfo);
      html_parser_SEXPAND(hpinfo, value_len);
      html_replace_url_in_stack(hpinfo->stack,
        hpinfo->current_attrib->attrib, hpinfo->tag_attrib, FALSE);
    }

    _free(hpinfo->tag_attrib);
  }
}
620 
/* Tag callback: forget the remembered anchor URL as soon as the */
/* closing tag of the anchor (exactly "</A>", any letter case)   */
/* is seen. The stack parameter is not used.                     */
void html_parser_parse_tag_slash_a(html_parser_t * hpinfo, char *stack,
  html_extract_info_t * einfo)
{
  if(!einfo->prev_a)
    return;

  if(strcasecmp(hpinfo->stack, "</A>") == 0)
    einfo->prev_a = NULL;
}
629 
/* Tag callback for <META HTTP-EQUIV="Refresh" CONTENT="...URL=..."> */
/*                                                                   */
/* The URL inside the CONTENT attribute is passed through the        */
/* attribute callbacks. With rewriting enabled the processed URL is  */
/* put back into the CONTENT value, which then replaces the first    */
/* attribute known for the META tag inside the tag on the stack.     */
/* Does nothing for other tags or other kinds of META tags.          */
/* The stack and data parameters are not used.                       */
void html_parser_parse_tag_meta_refresh(html_parser_t * hpinfo, char *stack,
  void *data)
{
  char *equiv;
  char *content = (char *) 0;
  int is_refresh;

  if(!hpinfo->current_tag || hpinfo->current_tag->type != HTML_TAG_META)
    return;

  hpinfo->current_attrib = &hpinfo->current_tag->attribs[0];

  equiv = html_get_attrib_from_tag(hpinfo->stack, "HTTP-EQUIV");
  is_refresh = (equiv != NULL) && (strcasecmp(equiv, "Refresh") == 0);
  _free(equiv);

  if(!is_refresh)
    return;

  content = html_get_attrib_from_tag(hpinfo->stack, "CONTENT");

  if(!content)
    return;

  hpinfo->tag_attrib = html_get_attrib_from_tag(content, "URL");

  if(hpinfo->tag_attrib)
  {
    html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);

    if(!hpinfo->rewrite)
    {
      _free(hpinfo->tag_attrib);
    }
    else
    {
      /* little hack to prevent writing    */
      /* outside of allocated memory chunk */
      content = _realloc(content,
        strlen(content) + strlen(hpinfo->tag_attrib) + 4);

      html_replace_url_in_stack(content, "URL", hpinfo->tag_attrib, TRUE);
      _free(hpinfo->tag_attrib);

      /* from now on the whole CONTENT value is the attribute value */
      hpinfo->tag_attrib = content;

      if(hpinfo->tag_attrib)
      {
        int content_len = strlen(hpinfo->tag_attrib);

        html_parser_SEND(hpinfo);
        html_parser_SEXPAND(hpinfo, content_len);
        html_replace_url_in_stack(hpinfo->stack,
          hpinfo->current_attrib->attrib, hpinfo->tag_attrib, FALSE);

        /* the buffer is released through content below */
        hpinfo->tag_attrib = 0;
      }
    }
  }

  _free(content);
}
692 
/* Tag callback for <META NAME="Robots" CONTENT="...">.            */
/*                                                                 */
/* CONTENT is a comma separated list of directives. Recognised     */
/* are all, none, index, noindex, follow, nofollow and             */
/* noimageindex (any letter case); they switch the index, follow   */
/* and images members of oinfo. Unknown directives are ignored.    */
/* White space around a directive is ignored, so the common form   */
/* "noindex, nofollow" is understood completely.                   */
/* Does nothing for other tags or other kinds of META tags.        */
/* The stack parameter is not used.                                */
void html_parser_parse_tag_meta_robots(html_parser_t * hpinfo, char *stack,
  html_robots_info_t * oinfo)
{
  char *meta_type;
  char *content;
  char **flags;
  int i;

  if(!hpinfo->current_tag || hpinfo->current_tag->type != HTML_TAG_META)
    return;

  meta_type = html_get_attrib_from_tag(hpinfo->stack, "NAME");

  if(!meta_type || strcasecmp(meta_type, "Robots"))
  {
    _free(meta_type);
    return;
  }

  _free(meta_type);

  content = html_get_attrib_from_tag(hpinfo->stack, "CONTENT");

  if(!content)
    return;

  flags = tl_str_split(content, ",");
  _free(content);

  for(i = 0; flags && flags[i]; i++)
  {
    /* skip leading white space and cut the directive at the   */
    /* first white space behind it; the element is freed below */
    /* through its original pointer                            */
    char *flag = flags[i] + strspn(flags[i], " \t\r\n");

    flag[strcspn(flag, " \t\r\n")] = '\0';

    if(!strcasecmp(flag, "all"))
    {
      oinfo->index = TRUE;
      oinfo->follow = TRUE;
      oinfo->images = TRUE;
    }
    else if(!strcasecmp(flag, "none"))
    {
      oinfo->index = FALSE;
      oinfo->follow = FALSE;
      oinfo->images = FALSE;
    }
    else if(!strcasecmp(flag, "index"))
      oinfo->index = TRUE;
    else if(!strcasecmp(flag, "follow"))
      oinfo->follow = TRUE;
    else if(!strcasecmp(flag, "noimageindex"))
      oinfo->images = FALSE;
    else if(!strcasecmp(flag, "noindex"))
      oinfo->index = FALSE;
    else if(!strcasecmp(flag, "nofollow"))
      oinfo->follow = FALSE;

    _free(flags[i]);
  }
  _free(flags);
}
750 
/* Tag callback applying the configured JavaScript transformations. */
/*                                                                  */
/* For every transformation whose tag pattern matches the tag on    */
/* the stack, the configured attribute is fetched, matched against  */
/* the transformation's regular expression and transformed. The     */
/* result is passed to the attribute callbacks. With rewriting      */
/* enabled, a transformation of type 1 and at least one reported    */
/* subexpression, the matched part of the attribute is replaced by  */
/* the processed result inside the tag.                             */
/* current_tag and current_attrib are reset to NULL after a match.  */
/* Without HAVE_REGEX the function does nothing.                    */
/* The stack and data parameters are not used.                      */
void html_parser_parse_tag_jstransform(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  dllist *ptr;
  html_tag_t t = { HTML_TAG_HACK, "HACK",
    {{HTML_ATTRIB_HACK, "HACK", LINK_INLINE | LINK_DOWNLD},
      {HTML_ATTRIB_NULL, NULL, 0}}
  };

  for(ptr = priv_cfg.js_transform; ptr; ptr = ptr->next)
  {
    js_transform_t *jt = (js_transform_t *) ptr->data;

    if(js_transform_match_tag(jt, hpinfo->stack))
    {
      int nsub, *subs;
      char *attr = html_get_attrib_from_tag(hpinfo->stack,
        jt->attrib);

      if(!attr)
        continue;

      if(!re_pmatch_subs(jt->re, attr, &nsub, &subs))
      {
        _free(attr);
        continue;
      }

      hpinfo->tag_attrib = js_transform_apply(jt, attr, nsub, subs);

      /*****************************************/
      /* quite dirty hack to make happy attrib */
      /* parsing funcs which require valid     */
      /* current_tag & current_attrib          */
      /*****************************************/
      hpinfo->current_tag = &t;
      hpinfo->current_attrib = &(t.attribs[0]);

      if(hpinfo->tag_attrib)
        html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);

      /* tag_attrib may be NULL here (no result of the transform), */
      /* so it has to be checked before strlen() is used on it     */
      if(hpinfo->rewrite && jt->type == 1 && nsub && hpinfo->tag_attrib)
      {
        int l = strlen(hpinfo->tag_attrib);

        /* replace attr[subs[2]..subs[3]) by the processed value */
        attr = _realloc(attr, strlen(attr) + l + 1);
        memmove(attr + l + subs[2], attr + subs[3],
          strlen(attr + subs[3]) + 1);
        memcpy(attr + subs[2], hpinfo->tag_attrib, l);

        l = strlen(attr);
        html_parser_SEND(hpinfo);
        html_parser_SEXPAND(hpinfo, l);
        html_replace_url_in_stack(hpinfo->stack, jt->attrib, attr, FALSE);
      }

      _free(subs);
      _free(attr);

      /* :-) unhack */
      hpinfo->current_tag = NULL;
      hpinfo->current_attrib = NULL;

      _free(hpinfo->tag_attrib);
    }
  }
#endif
}
820 
821 /********************************************************/
822 /* functions for processing URL attributes of HTML tags */
823 /********************************************************/
/* Attribute callback: replace the current attribute value by its */
/* absolute form, computed from the base strings of the parser    */
/* and the document URL.                                          */
/* The value is left untouched when no absolute URL could be      */
/* built or when the result is empty; an empty result is freed    */
/* here instead of being leaked.                                  */
/* The stack and data parameters are not used.                    */
void html_parser_url_to_absolute_url(html_parser_t * hpinfo, char *stack,
  void *data)
{
  char *ustr;

  ustr = url_to_absolute_url(hpinfo->base, hpinfo->baset,
    hpinfo->doc_url, hpinfo->tag_attrib);

  if(ustr && *ustr)
  {
    DEBUG_HTML("Rewriting URL (to abs) - %s -> %s\n", hpinfo->tag_attrib,
      ustr);
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = ustr;
  }
  else
  {
    /* unused result (NULL or empty string) */
    _free(ustr);
  }
}
847 
/* Attribute callback for the HREF attribute of a BASE tag.        */
/* The value becomes the new base URL of the parser and the whole  */
/* tag on the stack is wrapped into an HTML comment, because       */
/* pavuk itself rewrites the URLs according to this tag.           */
/* Other tags and attributes are ignored.                          */
/* The stack and data parameters are not used.                     */
void html_parser_process_base(html_parser_t * hpinfo, char *stack, void *data)
{
  int prefix_len, suffix_len, tag_len;

  if(hpinfo->current_tag->type != HTML_TAG_BASE ||
    hpinfo->current_attrib->type != HTML_ATTRIB_HREF)
    return;

  html_parser_process_new_base_url(hpinfo, hpinfo->tag_attrib);

  prefix_len = strlen(COMMENT_PREFIX);
  suffix_len = strlen(COMMENT_SUFFIX);

  /* make room for both comment markers */
  html_parser_SEND(hpinfo);
  html_parser_SEXPAND(hpinfo, (prefix_len + suffix_len));

  /* shift the tag to the right, then put the markers around it */
  tag_len = strlen(hpinfo->stack);
  memmove(hpinfo->stack + prefix_len, hpinfo->stack, tag_len + 1);
  memcpy(hpinfo->stack, COMMENT_PREFIX, prefix_len);
  memcpy(hpinfo->stack + prefix_len + tag_len, COMMENT_SUFFIX,
    suffix_len + 1);
}
870 
/* Attribute callback for the ACTION attribute of forms.           */
/* Marks the document URL as containing forms and, when a list is  */
/* supplied, appends a copy of the attribute value to *formlist.   */
/* Other attributes are ignored. The stack parameter is not used.  */
void html_parser_process_form(html_parser_t * hpinfo, char *stack,
  dllist ** formlist)
{
  if(!(hpinfo->current_attrib->stat & LINK_FORM) ||
    hpinfo->current_attrib->type != HTML_ATTRIB_ACTION)
    return;

  hpinfo->doc_url->status |= URL_HAVE_FORMS;

  if(formlist && hpinfo->tag_attrib)
  {
    char *action = tl_strdup(hpinfo->tag_attrib);

    *formlist = dllist_append(*formlist, (dllist_t) action);
  }
}
886 
/* Attribute callback collecting the URLs of a document.           */
/*                                                                 */
/* The current attribute value is considered when it is not empty, */
/* the attribute is marked LINK_DOWNLD, it is an inline object if  */
/* einfo->only_inline is set, and it is not a script link unless   */
/* einfo->enable_js is set. The value is parsed into an URL        */
/* object, which is appended to einfo->urls when einfo->no_limits  */
/* is set or url_append_condition() accepts it. Otherwise the      */
/* object is destroyed and cfg.reject_cnt is incremented.          */
/* Links pointing only into the current document ("#...") are      */
/* always rejected; the URL object built for them is now released  */
/* as well, it was leaked before.                                  */
/* einfo->prev_a tracks the list entry of the last accepted        */
/* <A HREF> so that it can be removed again when an ISMAP image    */
/* follows inside the anchor.                                      */
void html_parser_get_url(html_parser_t * hpinfo, char *stack,
  html_extract_info_t * einfo)
{
  url *purl = (url *) 0;
  cond_info_t condp;

  /* Never follow "" */
  if(!*hpinfo->tag_attrib)
    return;

  if(!(hpinfo->current_attrib->stat & LINK_DOWNLD))
    return;

  if(einfo->only_inline && !(hpinfo->current_attrib->stat & LINK_INLINE))
    return;

  if((hpinfo->current_attrib->stat & LINK_SCRIPT) && !einfo->enable_js)
    return;

  condp.level = 0;
  condp.urlnr = 0;
  condp.size = 0;
  condp.time = 0L;
  condp.mimet = NULL;
  condp.full_tag = stack;
  condp.params = NULL;
  condp.html_doc = hpinfo->in_content;
  condp.html_doc_offset = hpinfo->in_offset;
  condp.tag = hpinfo->current_tag ? hpinfo->current_tag->tag : NULL;
  condp.attrib = hpinfo->current_attrib ?
    hpinfo->current_attrib->attrib : NULL;

  purl = url_parse(hpinfo->tag_attrib);
  assert(purl->type != URLT_FROMPARENT);
  url_path_abs(purl);

  if(hpinfo->current_attrib->stat & LINK_INLINE)
    purl->status |= URL_INLINE_OBJ;

  if(hpinfo->current_attrib->stat & LINK_SCRIPT)
    purl->status |= URL_ISSCRIPT;

  purl->level = hpinfo->doc_url->level + 1;
  purl->parent_url = dllist_append(purl->parent_url,
    (dllist_t) hpinfo->doc_url);

  /*****************************************************/
  /* if we are in SYNC/MIRROR mode try to get original */
  /* URL rather than processing it as file             */
  /* (mandatory thing to get working SYNC/MIRROR mode) */
  /*****************************************************/
  if((cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR) &&
    cfg.request && (purl->type == URLT_FILE))
  {
    url *pomurl = filename_to_url(purl->p.file.filename);

    if(pomurl)
    {
      free_deep_url(purl);
      _free(purl);
      purl = pomurl;
    }
  }

  /**********************************/
  /* remove last anchor URL because */
  /* it is server side image map    */
  /**********************************/
  if(einfo->prev_a &&
    hpinfo->current_tag->type == HTML_TAG_IMG &&
    hpinfo->current_attrib->type == HTML_ATTRIB_SRC &&
    html_tag_co_elem(hpinfo->stack, "ISMAP"))
  {
    DEBUG_HTML("Removing server image map\n");
    free_deep_url((url *) einfo->prev_a->data);
    free((url *) einfo->prev_a->data);
    einfo->urls = dllist_remove_entry(einfo->urls, einfo->prev_a);
    einfo->prev_a = NULL;
  }

  if(hpinfo->current_tag->type == HTML_TAG_A &&
    hpinfo->current_attrib->type == HTML_ATTRIB_HREF)
  {
    einfo->prev_a = NULL;
  }

  /* Do not accept links, which only link inside the already loaded document
     like <a href="#top">. This is a local relative reference, so remove it.
   */
  if((hpinfo->current_attrib->type == HTML_ATTRIB_USEMAP ||
      hpinfo->current_attrib->type == HTML_ATTRIB_HREF) &&
    hpinfo->tag_attrib[0] == '#')
  {
    LOCK_REJCNT;
    cfg.reject_cnt++;
    UNLOCK_REJCNT;

    DEBUG_HTML("Rejecting local anchor URL - %s\n", hpinfo->tag_attrib);

    /* the URL object is not stored anywhere - release it */
    free_deep_url(purl);
    _free(purl);
  }
  else if(einfo->no_limits || url_append_condition(purl, &condp))
  {
    DEBUG_HTML("Accepting URL - %s\n", hpinfo->tag_attrib);

    /***************************************/
    /* process special add-on tag PAVUKEXT */
    /* where are stored some additional    */
    /* informations about FTP URLs         */
    /***************************************/
    if(purl->type == URLT_FTP || purl->type == URLT_FTPS)
    {
      char *pext;

      pext = html_get_attrib_from_tag(hpinfo->stack, "PAVUKEXT");
      if(pext)
      {
        ftp_url_extension *uext;

        uext = ftp_parse_ftpinf_ext(pext);
        purl->extension = uext;

        if(uext->type == FTP_TYPE_D)
          purl->p.ftp.dir = TRUE;
      }
      _free(pext);
    }

    einfo->urls = dllist_append(einfo->urls, (dllist_t) purl);

    /* remember the entry in case an ISMAP image follows */
    if(hpinfo->current_tag->type == HTML_TAG_A &&
      hpinfo->current_attrib->type == HTML_ATTRIB_HREF)
    {
      einfo->prev_a = dllist_last(einfo->urls);
    }
  }
  else
  {
    LOCK_REJCNT;
    cfg.reject_cnt++;
    UNLOCK_REJCNT;

    DEBUG_HTML("Rejecting URL - %s\n", hpinfo->tag_attrib);
    free_deep_url(purl);
    _free(purl);
  }
}
1028 
/*
 * Rewrite the URL held in hpinfo->tag_attrib into a path relative to the
 * local file name of the current document (hpinfo->doc_url).
 *
 * Nothing happens when hpinfo->rewrite is zero, when rinfo->all_to_remote
 * is set, or when the parsed URL is of type URLT_FILE or its protocol is
 * not flagged as supported in prottable[].
 *
 * The rewrite is performed when at least one of these holds:
 *   - the local file name computed for the URL is readable,
 *   - rinfo->all_to_local is set,
 *   - rinfo->selected_to_local is set and an earlier occurrence of the
 *     URL was found (via url_was_befor() or in rinfo->einfo->urls),
 *   - url_compare() reports a match with hpinfo->doc_url.
 *
 * On rewrite the old hpinfo->tag_attrib is freed and replaced by a newly
 * built string.  The stack parameter is not used here.
 */
void html_parser_url_to_local(html_parser_t * hpinfo, char *stack,
  html_rewrite_info_t * rinfo)
{
  url *target, *seen;
  char *fragment, *path;
  int on_disk, do_rewrite;

  if(!hpinfo->rewrite || rinfo->all_to_remote)
    return;

  target = url_parse(hpinfo->tag_attrib);
  assert(target->type != URLT_FROMPARENT);

  if(target->type == URLT_FILE || !prottable[target->type].supported)
  {
    free_deep_url(target);
    _free(target);
    return;
  }

  fragment = url_get_anchor_name(target);

  /* Prefer the file name generated for an earlier occurrence of this */
  /* URL (better performance with info files); look first at what     */
  /* url_was_befor() knows, then at the URLs extracted so far.        */
  seen = url_was_befor(target);
  if(!seen)
  {
    dllist *hit = dllist_find2(rinfo->einfo->urls, (dllist_t) target,
      dllist_url_compare);

    if(hit)
      seen = (url *) hit->data;
  }

  path = seen ? url_to_filename(seen, TRUE) : url_to_filename(target, FALSE);

  on_disk = (access(path, R_OK) == 0);

  do_rewrite = on_disk || rinfo->all_to_local ||
    (rinfo->selected_to_local && seen) ||
    url_compare(target, hpinfo->doc_url);

  if(do_rewrite)
  {
    struct stat st;
    char *docpath, *local, *rel;
    int is_dir;

    /* A readable directory is addressed through its index file.     */
    /* Either way a private copy of the name is taken before the     */
    /* next url_to_filename() call (presumably the returned storage  */
    /* is not owned by the caller - confirm).                        */
    is_dir = on_disk && stat(path, &st) == 0 && S_ISDIR(st.st_mode);
    if(is_dir)
      local = tl_str_concat(NULL, path, "/", priv_cfg.index_name, NULL);
    else
      local = tl_strdup(path);

    docpath = url_to_filename(hpinfo->doc_url, FALSE);

    /* Lynx and Netscape seem to treat empty HREFs differently, so  */
    /* the empty path is used only when the link points into the    */
    /* current document AND carries an anchor (#xxx).               */
    if(fragment && strcmp(docpath, local) == 0)
      rel = tl_strdup("");
    else
      rel = get_relative_path(docpath, local);

    _free(local);

    /* workaround for -sel_to_local && -nostore_index: drop a       */
    /* trailing index file name from the relative path              */
    if(rinfo->selected_to_local && !rinfo->store_index)
    {
      char *leaf = strrchr(rel, '/');

      leaf = leaf ? leaf + 1 : rel;

      if(strcmp(leaf, priv_cfg.index_name) == 0)
        *leaf = '\0';
    }

    if(fragment)
      rel = tl_str_concat(rel, "#", fragment, NULL);

    DEBUG_HTML("Rewriting URL (to loc) - %s -> %s\n", hpinfo->tag_attrib,
      rel);

    /* hand the new string over to the parser */
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = rel;
  }

  free_deep_url(target);
  _free(target);
}
1126 
/*
 * Wrap the current tag text in hpinfo->stack between ADVERT_PREFIX and
 * ADVERT_SUFFIX when the SRC attribute of an IMG tag matches one of the
 * regular expressions in priv_cfg.advert_res.
 *
 * Only active when compiled with HAVE_REGEX and cfg.remove_adv is set;
 * otherwise the function does nothing.  The stack and data parameters
 * are not used.
 */
void html_parser_remove_advertisement(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  dllist *node;
  int prefix_len, suffix_len;
  int matched = FALSE;

  /* only <IMG SRC=...> is considered */
  if(!(hpinfo->current_tag->type == HTML_TAG_IMG &&
      hpinfo->current_attrib->type == HTML_ATTRIB_SRC))
    return;

  if(cfg.remove_adv)
  {
    for(node = priv_cfg.advert_res; node; node = node->next)
    {
      if(re_pmatch((re_entry *) node->data, hpinfo->tag_attrib))
      {
        DEBUG_HTML("Removing advert URL - %s\n", hpinfo->tag_attrib);
        matched = TRUE;
        break;
      }
    }
  }

  if(!matched)
    return;

  prefix_len = strlen(ADVERT_PREFIX);
  suffix_len = strlen(ADVERT_SUFFIX);

  /* presumably these terminate the stack string and make room for */
  /* the extra bytes - confirm against the macro definitions       */
  html_parser_SEND(hpinfo);
  html_parser_SEXPAND(hpinfo, (prefix_len + suffix_len));

  /* shift the existing text (with its terminator) right, then put */
  /* the prefix in front and the suffix behind it                  */
  memmove(hpinfo->stack + prefix_len, hpinfo->stack,
    strlen(hpinfo->stack) + 1);
  memcpy(hpinfo->stack, ADVERT_PREFIX, prefix_len);
  strcat(hpinfo->stack, ADVERT_SUFFIX);
#endif
}
1167 
/*
 * Replace the URL held in hpinfo->tag_attrib by chinfo->url_new when
 * url_compare() reports that it matches chinfo->url_old.
 *
 * URLs of type URLT_FILE and URLs whose protocol is not flagged as
 * supported in prottable[] are left untouched.  On replacement the old
 * attribute string is freed and a copy of chinfo->url_new is stored in
 * its place.  The stack parameter is not used.
 */
void html_parser_change_url(html_parser_t * hpinfo, char *stack,
  html_change_info_t * chinfo)
{
  url *urlp;

  urlp = url_parse(hpinfo->tag_attrib);
  assert(urlp->type != URLT_FROMPARENT);

  if(urlp->type == URLT_FILE || !prottable[urlp->type].supported)
  {
    free_deep_url(urlp);
    _free(urlp);
    return;
  }

  if(url_compare(urlp, chinfo->url_old))
  {
    /* log the attribute as found in the document on the left side, */
    /* the replacement on the right (must happen before the free)   */
    DEBUG_HTML("Rewriting URL (change) - %s -> %s\n",
      hpinfo->tag_attrib, chinfo->url_new);

    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = tl_strdup(chinfo->url_new);
  }

  free_deep_url(urlp);
  _free(urlp);
}
1195 
1196 /********************************************************/
1197 /* functions for processing CSS parts of HTML documents */
1198 /********************************************************/
/*
 * Run css_to_absolute_links() over the CSS text currently being parsed
 * and store the result back into the parser.
 *
 * When hpinfo->tag_attrib is set, the CSS text is taken from it and the
 * attribute string is replaced by the converted one.  Otherwise the CSS
 * text is taken from hpinfo->stack and the converted text is copied back
 * into the stack buffer.  Does nothing when hpinfo->rewrite is zero.
 * The stack and data parameters are not used.
 */
void html_parser_style_to_absolute_urls(html_parser_t * hpinfo, char *stack,
  void *data)
{
  char *converted;
  int newlen;

  if(!hpinfo->rewrite)
    return;

  if(hpinfo->tag_attrib)
  {
    /* CSS held in an attribute value: swap the strings */
    converted = css_to_absolute_links(hpinfo->doc_url,
      hpinfo->tag_attrib, hpinfo->base, hpinfo->baset);
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = converted;
    return;
  }

  /* CSS held in the parser stack: overwrite the stack contents */
  converted = css_to_absolute_links(hpinfo->doc_url,
    hpinfo->stack, hpinfo->base, hpinfo->baset);

  newlen = strlen(converted);
  if(hpinfo->stack_offset < newlen)
  {
    /* result is longer than the current contents: reset the fill */
    /* level and let html_parser_SEXPAND provide the room         */
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, newlen);
  }

  /* copy including the terminating NUL */
  memcpy(hpinfo->stack, converted, newlen + 1);
  hpinfo->stack_offset = newlen;
  _free(converted);
}
1232 
/*
 * Collect the links found by css_get_all_links() in the current CSS text
 * and append them to einfo->urls.
 *
 * The CSS text is hpinfo->tag_attrib when that is set, hpinfo->stack
 * otherwise.  Nothing is collected when cfg.read_css is off.  The stack
 * parameter is not used.
 */
void html_parser_get_style_urls(html_parser_t * hpinfo, char *stack,
  html_extract_info_t * einfo)
{
  dllist *found;
  char *css;

  /* don't fetch from css if not wanted */
  if(!cfg.read_css)
    return;

  css = hpinfo->tag_attrib ? hpinfo->tag_attrib : hpinfo->stack;

  found = css_get_all_links(hpinfo->doc_url, css,
    hpinfo->base, hpinfo->baset, einfo->no_limits);

  einfo->urls = dllist_concat(einfo->urls, found);
}
1250 
/*
 * Run css_remote_to_local_links() over the CSS text currently being
 * parsed and store the result back into the parser.
 *
 * When hpinfo->tag_attrib is set, the CSS text is taken from it and the
 * attribute string is replaced by the converted one.  Otherwise the CSS
 * text is taken from hpinfo->stack and the converted text is copied back
 * into the stack buffer.  Does nothing when hpinfo->rewrite is zero or
 * rinfo->all_to_remote is set.  The stack parameter is not used.
 */
void html_parser_style_to_local_urls(html_parser_t * hpinfo, char *stack,
  html_rewrite_info_t * rinfo)
{
  char *converted;
  int newlen;

  if(!hpinfo->rewrite || rinfo->all_to_remote)
    return;

  if(hpinfo->tag_attrib)
  {
    /* CSS held in an attribute value: swap the strings */
    converted = css_remote_to_local_links(hpinfo->doc_url,
      hpinfo->tag_attrib, rinfo->all_to_local,
      rinfo->selected_to_local, hpinfo->base, hpinfo->baset);
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = converted;
    return;
  }

  /* CSS held in the parser stack: overwrite the stack contents */
  converted = css_remote_to_local_links(hpinfo->doc_url,
    hpinfo->stack, rinfo->all_to_local,
    rinfo->selected_to_local, hpinfo->base, hpinfo->baset);

  newlen = strlen(converted);
  if(hpinfo->stack_offset < newlen)
  {
    /* result is longer than the current contents: reset the fill */
    /* level and let html_parser_SEXPAND provide the room         */
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, newlen);
  }

  /* copy including the terminating NUL */
  memcpy(hpinfo->stack, converted, newlen + 1);
  hpinfo->stack_offset = newlen;
  _free(converted);
}
1286 
/*
 * Run css_change_url() (with chinfo->url_old and chinfo->url_new) over
 * the CSS text currently being parsed and store the result back into
 * the parser.
 *
 * When hpinfo->tag_attrib is set, the CSS text is taken from it and the
 * attribute string is replaced by the converted one.  Otherwise the CSS
 * text is taken from hpinfo->stack and the converted text is copied back
 * into the stack buffer.  Does nothing when hpinfo->rewrite is zero.
 * The stack parameter is not used.
 */
void html_parser_style_change_url(html_parser_t * hpinfo, char *stack,
  html_change_info_t * chinfo)
{
  char *converted;
  int newlen;

  if(!hpinfo->rewrite)
    return;

  if(hpinfo->tag_attrib)
  {
    /* CSS held in an attribute value: swap the strings */
    converted = css_change_url(hpinfo->doc_url, hpinfo->tag_attrib,
      chinfo->url_old, chinfo->url_new);
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = converted;
    return;
  }

  /* CSS held in the parser stack: overwrite the stack contents */
  converted = css_change_url(hpinfo->doc_url, hpinfo->stack,
    chinfo->url_old, chinfo->url_new);

  newlen = strlen(converted);
  if(hpinfo->stack_offset < newlen)
  {
    /* result is longer than the current contents: reset the fill */
    /* level and let html_parser_SEXPAND provide the room         */
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, newlen);
  }

  /* copy including the terminating NUL */
  memcpy(hpinfo->stack, converted, newlen + 1);
  hpinfo->stack_offset = newlen;
  _free(converted);
}
1320 
1321 /***********************************************************/
1322 /* functions for processing SCRIPTs part of HTML documents */
1323 /***********************************************************/
/*
 * Look for a URL embedded in hpinfo->tag_attrib using the regular
 * expressions in priv_cfg.js_patterns.
 *
 * For the first pattern that matches, the span reported by
 * re_pmatch_sub() (called with index 1) is cut out, temporarily
 * installed as hpinfo->tag_attrib and passed through
 * hpinfo->attrib_funcs.  Whatever those functions leave in
 * hpinfo->tag_attrib is then spliced back into the original attribute
 * text in place of the cut-out span.
 *
 * Compiled in only with HAVE_REGEX.  Does nothing when
 * hpinfo->tag_attrib is NULL.  The stack and data parameters are not
 * used.
 */
void html_parser_parse_jspatterns(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  dllist *node;
  char *orig, *merged;
  size_t repl_len, tail_len;
  int sub_start = -1;
  int sub_end = -1;

  if(!hpinfo->tag_attrib)
    return;

  /* stop at the first matching pattern; node stays non-NULL then */
  for(node = priv_cfg.js_patterns; node; node = node->next)
  {
    if(re_pmatch_sub((re_entry *) node->data, hpinfo->tag_attrib,
        1, &sub_start, &sub_end))
      break;
  }

  if(!node || sub_start < 0)
    return;

  /* process just the matched span as if it were the attribute */
  orig = hpinfo->tag_attrib;
  hpinfo->tag_attrib = tl_strndup(orig + sub_start, sub_end - sub_start);
  html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);

  /* rebuild: head of original + processed span + tail of original */
  repl_len = strlen(hpinfo->tag_attrib);
  tail_len = strlen(orig + sub_end);

  merged = _malloc(sub_start + repl_len + tail_len + 1);

  memcpy(merged, orig, sub_start);
  memcpy(merged + sub_start, hpinfo->tag_attrib, repl_len);
  memcpy(merged + sub_start + repl_len, orig + sub_end, tail_len + 1);

  _free(hpinfo->tag_attrib);
  _free(orig);

  hpinfo->tag_attrib = merged;
#endif
}
1371 
/*
 * Apply html_parser_parse_jspatterns() to every line of the text held
 * in hpinfo->stack.
 *
 * The stack text is split at runs of CR/LF characters.  Each line is
 * installed as hpinfo->tag_attrib, together with a dummy "HACK"
 * tag/attribute as current_tag/current_attrib, and handed to
 * html_parser_parse_jspatterns().  When hpinfo->rewrite is set, the
 * (possibly modified) lines are joined again, each followed by a single
 * "\n", and the result replaces the stack contents.
 *
 * Compiled in only with HAVE_REGEX.  Does nothing when
 * hpinfo->tag_attrib is already set on entry.  On return current_tag,
 * current_attrib and tag_attrib are reset to NULL.
 */
void html_parser_parse_body_jspatterns(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  char *stackc = NULL;
  char *p;
  int ilen;

  html_tag_t t = { HTML_TAG_HACK, "HACK",
    {{HTML_ATTRIB_HACK, "HACK", LINK_INLINE | LINK_DOWNLD},
      {HTML_ATTRIB_NULL, NULL, 0}}
  };

  if(hpinfo->tag_attrib)
    return;

  /*****************************************/
  /* quite dirty hack to make happy attrib */
  /* parsing funcs which require valid     */
  /* current_tag & current_attrib          */
  /*****************************************/
  hpinfo->current_tag = &t;
  hpinfo->current_attrib = &(t.attribs[0]);

  p = hpinfo->stack;

  while(*p)
  {
    /* length of the current line, without its line terminator */
    ilen = strcspn(p, "\r\n");

    hpinfo->tag_attrib = tl_strndup(p, ilen);

    html_parser_parse_jspatterns(hpinfo, stack, data);

    /* collect the processed line for the rewritten stack */
    if(hpinfo->rewrite)
      stackc = tl_str_concat(stackc, hpinfo->tag_attrib, "\n", NULL);
    _free(hpinfo->tag_attrib);

    /* skip the line and the whole run of CR/LF following it */
    p += ilen;
    p += strspn(p, "\n\r");
  }

  /* stackc is still NULL when the stack was empty and the loop never */
  /* ran; there is nothing to write back then (and strlen(NULL) must  */
  /* be avoided)                                                      */
  if(hpinfo->rewrite && stackc)
  {
    ilen = strlen(stackc);
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, ilen);
    memcpy(hpinfo->stack, stackc, ilen + 1);
    hpinfo->stack_offset = ilen;
    _free(stackc);
  }

  /* :-) unhack - t goes out of scope, do not leave pointers to it */
  hpinfo->current_tag = NULL;
  hpinfo->current_attrib = NULL;

  hpinfo->tag_attrib = NULL;
#endif
}
1431 
/*
 * Apply the tag-less entries of priv_cfg.js_transform to every line of
 * the text held in hpinfo->stack.
 *
 * The stack text is split at runs of CR/LF characters.  For each line,
 * every transform whose tag name is empty and whose regular expression
 * matches is applied with js_transform_apply(); a non-NULL result is
 * installed as hpinfo->tag_attrib (with a dummy "HACK" tag/attribute as
 * current_tag/current_attrib) and passed through hpinfo->attrib_funcs.
 * When hpinfo->rewrite is set and the transform has type 1, the span
 * subs[2]..subs[3] of the line is replaced by the resulting attribute
 * text; later transforms then see the modified line.
 *
 * When hpinfo->rewrite is set, the lines are joined again, each followed
 * by a single "\n", and the result replaces the stack contents.
 *
 * Compiled in only with HAVE_REGEX.  Does nothing when
 * hpinfo->tag_attrib is already set on entry.  On return current_tag,
 * current_attrib and tag_attrib are reset to NULL.
 */
void html_parser_parse_body_jstransform(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  char *p;
  int ilen;
  char *stackc = NULL;

  html_tag_t t = { HTML_TAG_HACK, "HACK",
    {{HTML_ATTRIB_HACK, "HACK", LINK_INLINE | LINK_DOWNLD},
      {HTML_ATTRIB_NULL, NULL, 0}}
  };

  if(hpinfo->tag_attrib)
    return;

  /*****************************************/
  /* quite dirty hack to make happy attrib */
  /* parsing funcs which require valid     */
  /* current_tag & current_attrib          */
  /*****************************************/
  hpinfo->current_tag = &t;
  hpinfo->current_attrib = &(t.attribs[0]);

  p = hpinfo->stack;

  while(*p)
  {
    dllist *ptr;
    char *ln;

    /* length of the current line, without its line terminator */
    ilen = strcspn(p, "\r\n");

    ln = tl_strndup(p, ilen);

    for(ptr = priv_cfg.js_transform; ptr; ptr = ptr->next)
    {
      int nsub, *subs;
      js_transform_t *jt = (js_transform_t *) ptr->data;

      /* transforms bound to a tag name are not handled here */
      if(jt->tag[0])
        continue;

      if(!re_pmatch_subs(jt->re, ln, &nsub, &subs))
      {
        continue;
      }

      hpinfo->tag_attrib = js_transform_apply(jt, ln, nsub, subs);

      /*****************************************/
      /* quite dirty hack to make happy attrib */
      /* parsing funcs which require valid     */
      /* current_tag & current_attrib          */
      /* (reset at the end of each iteration)  */
      /*****************************************/
      hpinfo->current_tag = &t;
      hpinfo->current_attrib = &(t.attribs[0]);

      if(hpinfo->tag_attrib)
        html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);

      /* Splice the processed text into the line in place of the     */
      /* span subs[2]..subs[3].  Skipped when there is no attribute  */
      /* text to insert (strlen(NULL) must be avoided) or when the   */
      /* span start is negative - presumably the marker for a        */
      /* subexpression that did not take part in the match; confirm  */
      /* against re_pmatch_subs().                                   */
      if(hpinfo->rewrite && jt->type == 1 && nsub &&
        hpinfo->tag_attrib && subs[2] >= 0)
      {
        int l = strlen(hpinfo->tag_attrib);

        /* old length + l is an upper bound for the new length */
        ln = _realloc(ln, strlen(ln) + l + 1);
        /* move the tail (with terminator) to its final position */
        memmove(ln + l + subs[2], ln + subs[3], strlen(ln + subs[3]) + 1);
        memcpy(ln + subs[2], hpinfo->tag_attrib, l);
      }

      _free(subs);

      /* :-) unhack */
      hpinfo->current_tag = NULL;
      hpinfo->current_attrib = NULL;

      _free(hpinfo->tag_attrib);
    }

    /* collect the processed line for the rewritten stack */
    if(hpinfo->rewrite)
      stackc = tl_str_concat(stackc, ln, "\n", NULL);

    _free(ln);
    /* skip the line and the whole run of CR/LF following it */
    p += ilen;
    p += strspn(p, "\n\r");
  }

  /* stackc is still NULL when the stack was empty and the loop never */
  /* ran; there is nothing to write back then (and strlen(NULL) must  */
  /* be avoided)                                                      */
  if(hpinfo->rewrite && stackc)
  {
    ilen = strlen(stackc);
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, ilen);
    memcpy(hpinfo->stack, stackc, ilen + 1);
    hpinfo->stack_offset = ilen;
    _free(stackc);
  }

  /* :-) unhack - t goes out of scope, do not leave pointers to it */
  hpinfo->current_tag = NULL;
  hpinfo->current_attrib = NULL;

  hpinfo->tag_attrib = NULL;
#endif
}
1536