1 /***************************************************************************/
2 /* This code is part of WWW grabber called pavuk */
3 /* Copyright (c) 1997 - 2001 Stefan Ondrejicka */
4 /* Distributed under GPL 2 or later */
5 /***************************************************************************/
6
7 #include "config.h"
8
9 #include <assert.h>
10 #include <sys/types.h>
11 #include <sys/stat.h>
12 #include <unistd.h>
13 #include <string.h>
14
15 #include "htmlparser.h"
16
17 #include "tools.h"
18 #include "css.h"
19 #include "re.h"
20 #include "ftp.h"
21 #include "jstrans.h"
22
/* Tag lookup table created by the first html_parser_init() call and then */
/* handed to every later parser; released by html_parser_do_cleanup() */
23 static dlhash *html_parser_tag_hash = NULL;
24
/* Text put around a BASE tag by html_parser_process_base() */
25 #define COMMENT_PREFIX "<!-- "
26 #define COMMENT_SUFFIX " -->"
/* Text put around a matching IMG tag by html_parser_remove_advertisement() */
/* NOTE(review): the leading "adv" is outside of the HTML comment, so it */
/* ends up as visible text in the rewritten page - confirm this is intended */
27 #define ADVERT_PREFIX "adv<!-- Removed by pavuk "
28 #define ADVERT_SUFFIX " -->"
29
/* Key comparison callback of the tag hash: both keys are treated as */
/* C strings and the result is non-zero when they are equal ignoring case */
static int html_parser_tag_comp_func(dllist_t key1, dllist_t key2)
{
  const char *lhs = (const char *) key1;
  const char *rhs = (const char *) key2;

  return strcasecmp(lhs, rhs) == 0;
}
34
/* Hash callback of the tag hash: sums the lower-cased bytes of the key */
/* string, reducing modulo size after every byte, so that names which */
/* differ only in letter case land in the same bucket */
static unsigned int html_parser_tag_hash_func(unsigned int size, dllist_t key)
{
  const unsigned char *ch;
  unsigned int bucket = 0;

  for(ch = (const unsigned char *) key; *ch; ch++)
    bucket = (bucket + tl_ascii_tolower(*ch)) % size;

  return bucket;
}
48
/* Key extraction callback of the tag hash: the stored entries are */
/* html_tag_t records and their key is the tag name */
static dllist_t html_parser_tag_key_func(dllist_t data)
{
  html_tag_t *entry = (html_tag_t *) data;

  return (dllist_t) entry->tag;
}
53
html_parser_init(html_tag_t * tags,int ntags,int with_tag_rewriting,int purestyle,int purescript)54 html_parser_t *html_parser_init(html_tag_t * tags, int ntags,
55 int with_tag_rewriting, int purestyle, int purescript)
56 {
57 html_parser_t *rv;
58 int i;
59
60 rv = _malloc(sizeof(html_parser_t));
61
62 rv->rewrite = with_tag_rewriting;
63 rv->purestyle = purestyle;
64 rv->purescript = purescript;
65 rv->in_content = NULL;
66 rv->out_content = NULL;
67 rv->in_size = 0;
68 rv->aout_size = 0;
69 rv->out_offset = 0;
70 rv->in_offset = 0;
71
72 rv->stack = NULL;
73 rv->stack_size = 0;
74 rv->stack_offset = 0;
75
76 rv->base = NULL;
77 rv->baset = NULL;
78
79 rv->tag_attrib = NULL;
80
81 rv->tag_funcs = NULL;
82 rv->attrib_funcs = NULL;
83 rv->style_funcs = NULL;
84 rv->script_funcs = NULL;
85
86 rv->current_tag = NULL;
87 rv->current_attrib = NULL;
88
89 LOCK_TAG_HASH;
90 if(html_parser_tag_hash)
91 rv->tag_hash = html_parser_tag_hash;
92 else
93 {
94
95 rv->tag_hash = dlhash_new(20,
96 html_parser_tag_key_func,
97 html_parser_tag_hash_func, html_parser_tag_comp_func);
98
99 for(i = 0; i < ntags; i++)
100 dlhash_insert(rv->tag_hash, (dllist_t) &tags[i]);
101
102 html_parser_tag_hash = rv->tag_hash;
103
104 }
105 UNLOCK_TAG_HASH;
106
107 return rv;
108 }
109
html_parser_do_cleanup(void)110 void html_parser_do_cleanup(void)
111 {
112 if(html_parser_tag_hash)
113 dlhash_free(html_parser_tag_hash);
114 }
115
/* Destroy parser object: drops the four callback lists together with */
/* their entries, the tag stack, the output buffer (unless it was taken */
/* by html_parser_take_document()), both base strings and finally the */
/* parser itself. The input document and doc_url are not touched. */
void html_parser_kill(html_parser_t * hpinfo)
{
/* remove list entries from the head one by one, releasing their payload */
#define KILL_FUNC_CHAIN(chain) \
  while (chain) \
  { \
    free((void *)chain->data);\
    chain = dllist_remove_entry(chain, chain);\
  }

  KILL_FUNC_CHAIN(hpinfo->script_funcs);
  KILL_FUNC_CHAIN(hpinfo->style_funcs);
  KILL_FUNC_CHAIN(hpinfo->attrib_funcs);
  KILL_FUNC_CHAIN(hpinfo->tag_funcs);

  _free(hpinfo->baset);
  _free(hpinfo->base);

  _free(hpinfo->out_content);
  _free(hpinfo->stack);

  _free(hpinfo);
}
137
/* Register callback which html_parser_parse() runs for every complete */
/* tag; data is passed to func unchanged and is released with free() */
/* by html_parser_kill() */
void html_parser_add_tag_func(html_parser_t * hpinfo, html_parser_func_t func,
  void *data)
{
  html_parser_func_info_t *entry = _malloc(sizeof(html_parser_func_info_t));

  entry->data = data;
  entry->func = func;

  hpinfo->tag_funcs = dllist_append(hpinfo->tag_funcs, (dllist_t) entry);
}
148
/* Register callback which is run for URL attributes found in tags; */
/* data is passed to func unchanged and is released with free() */
/* by html_parser_kill() */
void html_parser_add_attrib_func(html_parser_t * hpinfo,
  html_parser_func_t func, void *data)
{
  html_parser_func_info_t *entry = _malloc(sizeof(html_parser_func_info_t));

  entry->data = data;
  entry->func = func;

  hpinfo->attrib_funcs = dllist_append(hpinfo->attrib_funcs, (dllist_t) entry);
}
159
/* Register callback which is run for CSS content (style sections and */
/* style attributes); data is passed to func unchanged and is released */
/* with free() by html_parser_kill() */
void html_parser_add_style_func(html_parser_t * hpinfo,
  html_parser_func_t func, void *data)
{
  html_parser_func_info_t *entry = _malloc(sizeof(html_parser_func_info_t));

  entry->data = data;
  entry->func = func;

  hpinfo->style_funcs = dllist_append(hpinfo->style_funcs, (dllist_t) entry);
}
170
/* Register callback which is run for script content (script sections, */
/* javascript: URLs and script attributes); data is passed to func */
/* unchanged and is released with free() by html_parser_kill() */
void html_parser_add_script_func(html_parser_t * hpinfo,
  html_parser_func_t func, void *data)
{
  html_parser_func_info_t *entry = _malloc(sizeof(html_parser_func_info_t));

  entry->data = data;
  entry->func = func;

  hpinfo->script_funcs = dllist_append(hpinfo->script_funcs, (dllist_t) entry);
}
181
/* Attach input document to the parser: content is size bytes of text */
/* which belong to doc_url. Only the pointers are stored, nothing is */
/* copied here. */
void html_parser_set_document(html_parser_t * hpinfo, url * doc_url,
  char *content, ssize_t size)
{
  hpinfo->in_size = size;
  hpinfo->in_content = content;
  hpinfo->doc_url = doc_url;
}
189
/* Hand the rewritten document over to the caller: *out_content gets */
/* the output buffer and *out_size the number of bytes stored in it. */
/* The parser forgets the buffer, so html_parser_kill() will not */
/* release it anymore. */
void html_parser_take_document(html_parser_t * hpinfo, char **out_content,
  ssize_t * out_size)
{
  *out_size = hpinfo->out_offset;
  *out_content = hpinfo->out_content;

  hpinfo->aout_size = 0;
  hpinfo->out_offset = 0;
  hpinfo->out_content = NULL;
}
200
/* Replace the base strings of the parser. A NULL argument leaves the */
/* matching value as it is; non-NULL pointers are stored directly (not */
/* copied) after the previous value was released, and are released later */
/* by the parser itself. */
void html_parser_set_base(html_parser_t * hpinfo, char *base, char *baset)
{
  if(base != NULL)
  {
    _free(hpinfo->base);
    hpinfo->base = base;
  }

  if(baset != NULL)
  {
    _free(hpinfo->baset);
    hpinfo->baset = baset;
  }
}
215
/* Compute new base URL of the document from the value of BASE HREF */
/* attribute and store it in hpinfo->base. */
/* */
/* URLs of unsupported type are stored verbatim (with a warning). */
/* Supported ones are made absolute, stripped of the query part and, */
/* when they don't name a directory, cut after the last slash. */
/* When no absolute URL can be built the current base is kept. */
static void html_parser_process_new_base_url(html_parser_t * hpinfo,
  char *baseattr)
{
  url *purl;
  char *newbase;

  purl = url_parse(baseattr);
  assert(purl->type != URLT_FROMPARENT);

  if(!prottable[purl->type].supported)
  {
    xprintf(1, gettext("Unsupported BASE URL - %s (probably bad handled)\n"),
      baseattr);
    newbase = tl_strdup(baseattr);
  }
  else
  {
    char *idx;

    newbase =
      url_to_absolute_url(hpinfo->base, hpinfo->baset, hpinfo->doc_url,
      baseattr);

    if(!newbase)
    {
      /* collect base="" and ignore it; the parsed URL */
      /* must be released on this path too (was leaked) */
      free_deep_url(purl);
      _free(purl);
      return;
    }

    if((idx = strrchr(newbase, '?')))
      *idx = '\0';

    if(!tl_is_dirname(newbase))
    {
      idx = strrchr(newbase, '/');
      if(idx)
        *(idx + 1) = '\0';
    }
  }
  DEBUG_HTML("NEW BASE URL - %s\n", newbase);

  free_deep_url(purl);
  _free(purl);
  _free(hpinfo->base);
  hpinfo->base = newbase;
}
257
/* Run all callbacks of the list in list order. Each callback gets the */
/* parser, the current tag stack and its own registered data; the stack */
/* pointer is read again before every call. */
static void html_parser_call_funcs(html_parser_t * hpinfo, dllist * funcs)
{
  dllist *node = funcs;

  while(node)
  {
    html_parser_func_info_t *cb = (html_parser_func_info_t *) node->data;

    cb->func(hpinfo, hpinfo->stack, cb->data);
    node = node->next;
  }
}
269
/* Append the NUL terminated content of the tag stack to the output */
/* buffer and reset the stack offset. Does nothing at all (the stack */
/* offset included) when the parser is not in rewrite mode. */
static void html_parser_flush_stack_to_output(html_parser_t * hpinfo)
{
  int nbytes;

  if(hpinfo->rewrite)
  {
    nbytes = strlen(hpinfo->stack);
    html_parser_MEXPAND(hpinfo, nbytes)
    memcpy(hpinfo->out_content + hpinfo->out_offset, hpinfo->stack, nbytes);
    hpinfo->out_offset += nbytes;
    hpinfo->stack_offset = 0;
  }
}
283
/* Look up the tag whose name starts at tagstart (the character after */
/* '<') and remember it in hpinfo->current_tag. The name is the leading */
/* run of letters and it must be followed by whitespace, '>' or the end */
/* of string, otherwise no lookup is done. */
/* Returns non-zero when a known tag was found. */
static int html_parser_check_tag(html_parser_t * hpinfo, char *tagstart)
{
  int namelen = 0;

  hpinfo->current_tag = NULL;

  while(tl_ascii_isalpha(tagstart[namelen]))
    namelen++;

  /* strchr() matches the terminating NUL of the set as well */
  if(strchr(" \t\r\n>", tagstart[namelen]) != NULL)
  {
    char *name = tl_strndup(tagstart, namelen);

    hpinfo->current_tag = (html_tag_t *) dlhash_find_by_key(hpinfo->tag_hash,
      (dllist_t) name);
    _free(name);
  }

  return (hpinfo->current_tag != NULL);
}
303
/* Prepare parser buffers for one parsing run: rewinds the input offset, */
/* allocates the output buffer (rewrite mode only) and the tag stack. */
/* Buffers left over from an earlier run on the same parser are released */
/* first, they were leaked before. */
static void html_parser_parse_init(html_parser_t * hpinfo)
{
  hpinfo->in_offset = 0;

  if(hpinfo->rewrite)
  {
    /* stays NULL after html_parser_take_document() */
    _free(hpinfo->out_content);

    hpinfo->aout_size = hpinfo->in_size + html_parser_FENDER;
    hpinfo->out_content = _malloc(hpinfo->aout_size);
    hpinfo->out_offset = 0;
  }

  _free(hpinfo->stack);

  hpinfo->stack_size = 2 * html_parser_FENDER;
  hpinfo->stack = _malloc(hpinfo->stack_size);
  hpinfo->stack_offset = 0;
}
318
html_parser_parse(html_parser_t * hpinfo)319 void html_parser_parse(html_parser_t * hpinfo)
320 {
321 int tagstart = FALSE;
322 int scriptstart = FALSE;
323 int commentstart = FALSE;
324 int stylestart = FALSE;
325 int singlequoteintag = FALSE;
326 int doublequoteintag = FALSE;
327 char *p;
328
329 html_parser_parse_init(hpinfo);
330
331 if(hpinfo->purestyle)
332 stylestart = TRUE;
333
334 if(hpinfo->purescript)
335 scriptstart = TRUE;
336
337 for(p = hpinfo->in_content; (p - hpinfo->in_content) < hpinfo->in_size;
338 p++, hpinfo->in_offset++)
339 {
340 if(stylestart)
341 {
342 if(!strncasecmp(p, "</STYLE", 7))
343 {
344 stylestart = FALSE;
345
346 hpinfo->stack[hpinfo->stack_offset] = *p;
347 hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
348
349 html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
350 html_parser_flush_stack_to_output(hpinfo);
351 }
352 else
353 {
354 html_parser_SEXPAND(hpinfo, 1)
355 hpinfo->stack[hpinfo->stack_offset] = *p;
356 hpinfo->stack_offset++;
357 }
358
359 continue;
360 }
361
362 if(scriptstart)
363 {
364 if(!strncasecmp(p + 1, "</SCRIPT", 8))
365 {
366 scriptstart = FALSE;
367
368 hpinfo->stack[hpinfo->stack_offset] = *p;
369 hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
370
371 html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
372 html_parser_flush_stack_to_output(hpinfo);
373 }
374 else
375 {
376 html_parser_SEXPAND(hpinfo, 1)
377 hpinfo->stack[hpinfo->stack_offset] = *p;
378 hpinfo->stack_offset++;
379 }
380
381 continue;
382 }
383
384 if(commentstart)
385 {
386 if(!strncmp(p, "-->", 3))
387 commentstart = FALSE;
388
389 if(hpinfo->rewrite)
390 {
391 hpinfo->out_content[hpinfo->out_offset] = *p;
392 hpinfo->out_offset++;
393 }
394 continue;
395 }
396
397 if((*p == '\"') && tagstart && !singlequoteintag)
398 {
399 if(doublequoteintag)
400 {
401 doublequoteintag = FALSE;
402 }
403 else
404 {
405 doublequoteintag = TRUE;
406 }
407 }
408 else if((*p == '\'') && tagstart && !doublequoteintag)
409 {
410 if(singlequoteintag)
411 {
412 singlequoteintag = FALSE;
413 }
414 else
415 {
416 singlequoteintag = TRUE;
417 }
418 }
419 else if(*p == '<')
420 {
421 if(singlequoteintag || doublequoteintag)
422 {
423 continue;
424 }
425 if(tagstart)
426 {
427 hpinfo->stack[hpinfo->stack_offset] = '\0';
428 html_parser_flush_stack_to_output(hpinfo);
429 }
430 tagstart = FALSE;
431
432 if(!strncasecmp(p, "<STYLE", 6))
433 {
434 stylestart = TRUE;
435 hpinfo->stack_offset = 0;
436 }
437 else if(!strncmp(p, "<!--", 4))
438 {
439 commentstart = TRUE;
440 }
441 else
442 {
443 hpinfo->stack_offset = 0;
444 tagstart = TRUE;
445 singlequoteintag = FALSE;
446 doublequoteintag = FALSE;
447 }
448 }
449 else if(*p == '>' && tagstart)
450 {
451 if(singlequoteintag || doublequoteintag)
452 {
453 continue;
454 }
455 hpinfo->stack[hpinfo->stack_offset] = *p;
456 hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
457 html_parser_call_funcs(hpinfo, hpinfo->tag_funcs);
458 html_parser_flush_stack_to_output(hpinfo);
459
460 if(hpinfo->current_tag &&
461 hpinfo->current_tag->type == HTML_TAG_SCRIPT &&
462 !html_tag_co_elem(hpinfo->stack, "SRC"))
463 {
464 scriptstart = TRUE;
465 }
466
467 tagstart = FALSE;
468 singlequoteintag = FALSE;
469 doublequoteintag = FALSE;
470 continue;
471 }
472
473 if(tagstart || stylestart || scriptstart)
474 {
475 hpinfo->stack[hpinfo->stack_offset] = *p;
476 hpinfo->stack_offset++;
477 html_parser_SEXPAND(hpinfo, 1);
478 }
479 else
480 {
481 if(hpinfo->rewrite)
482 {
483 hpinfo->out_content[hpinfo->out_offset] = *p;
484 hpinfo->out_offset++;
485 }
486 }
487 }
488
489 /* pure style don't need to end with </STYLE> */
490 /* so we must parse CSS also at end of document */
491 if(stylestart && hpinfo->purestyle)
492 {
493 stylestart = FALSE;
494
495 hpinfo->stack[hpinfo->stack_offset] = *p;
496 hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
497
498 html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
499 html_parser_flush_stack_to_output(hpinfo);
500 }
501
502 /* pure script don't need to end with </SCRIPT> so we */
503 /* must parse script patterns also at end of document */
504 if(scriptstart && hpinfo->purescript)
505 {
506 scriptstart = FALSE;
507
508 hpinfo->stack[hpinfo->stack_offset] = *p;
509 hpinfo->stack[hpinfo->stack_offset + 1] = '\0';
510
511 html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
512 html_parser_flush_stack_to_output(hpinfo);
513 }
514
515 if(tagstart || stylestart)
516 {
517 hpinfo->stack[hpinfo->stack_offset] = '\0';
518 html_parser_flush_stack_to_output(hpinfo);
519 }
520
521 if(hpinfo->rewrite)
522 hpinfo->out_content[hpinfo->out_offset] = '\0';
523 }
524
525 /********************************************/
526 /* functions for processing whole HTML tags */
527 /********************************************/
/* Tag function: find the tag from the stack in the table of known tags */
/* and process all its enabled attributes (META tags are left for the */
/* specialized META functions). For each attribute present in the tag */
/* the value is put into hpinfo->tag_attrib and, unless it is protected */
/* by one of the -dont_touch_* options, passed to script, style or */
/* attribute functions. In rewrite mode the resulting value is written */
/* back into the tag. */
void html_parser_parse_tag(html_parser_t * hpinfo, char *stack, void *data)
{
  int idx;
  dllist *ptr;

  if(!html_parser_check_tag(hpinfo, hpinfo->stack + 1) ||
    hpinfo->current_tag->type == HTML_TAG_META)
  {
    return;
  }

  for(idx = 0; hpinfo->current_tag->attribs[idx].attrib; idx++)
  {
    hpinfo->current_attrib = &hpinfo->current_tag->attribs[idx];

    if(hpinfo->current_attrib->stat & LINK_DISABLED)
      continue;

    hpinfo->tag_attrib = html_get_attrib_from_tag(hpinfo->stack,
      hpinfo->current_attrib->attrib);

    /*** -dont_touch_url_pattern support ***/
    if(hpinfo->tag_attrib && cfg.dont_touch_url_pattern &&
      is_in_pattern_list(hpinfo->tag_attrib, cfg.dont_touch_url_pattern))
    {
      _free(hpinfo->tag_attrib);
    }

#ifdef HAVE_REGEX
    /*** -dont_touch_url_rpattern support ***/
    ptr = cfg.dont_touch_url_rpattern;
    while(ptr && hpinfo->tag_attrib)
    {
      if(re_pmatch((re_entry *) ptr->data, hpinfo->tag_attrib))
      {
        _free(hpinfo->tag_attrib);
      }
      ptr = ptr->next;
    }

    /*** -dont_touch_tag_rpattern support ***/
    ptr = cfg.dont_touch_tag_rpattern;
    while(ptr && hpinfo->tag_attrib)
    {
      if(re_pmatch((re_entry *) ptr->data, hpinfo->stack))
      {
        _free(hpinfo->tag_attrib);
      }
      ptr = ptr->next;
    }
#endif

    /* attribute is missing or must not be touched */
    if(!hpinfo->tag_attrib)
      continue;

    if(!strncasecmp(hpinfo->tag_attrib, "javascript:", 11))
    {
      /* javascript:... URLs are supported inside any attribute; */
      /* script functions see the value without the scheme */
      char *whole_attrib = hpinfo->tag_attrib;

      hpinfo->tag_attrib = tl_strdup(whole_attrib + 11);
      html_parser_call_funcs(hpinfo, hpinfo->script_funcs);

      if(!hpinfo->rewrite)
      {
        _free(whole_attrib);
      }
      else
      {
        /* put the scheme back in front of the processed script */
        int script_len = strlen(hpinfo->tag_attrib);

        whole_attrib = _realloc(whole_attrib, 12 + script_len);
        memcpy(whole_attrib + 11, hpinfo->tag_attrib, script_len + 1);
        _free(hpinfo->tag_attrib);
        hpinfo->tag_attrib = whole_attrib;
      }
    }
    else if(hpinfo->current_attrib->stat & LINK_STYLE)
    {
      html_parser_call_funcs(hpinfo, hpinfo->style_funcs);
    }
    else if(hpinfo->current_attrib->stat & LINK_JS)
    {
      html_parser_call_funcs(hpinfo, hpinfo->script_funcs);
    }
    else
    {
      html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);
    }

    if(hpinfo->rewrite && hpinfo->tag_attrib)
    {
      int value_len = strlen(hpinfo->tag_attrib);

      html_parser_SEND(hpinfo);
      html_parser_SEXPAND(hpinfo, value_len);
      html_replace_url_in_stack(hpinfo->stack,
        hpinfo->current_attrib->attrib, hpinfo->tag_attrib, FALSE);
    }

    _free(hpinfo->tag_attrib);
  }
}
620
/* Tag function: forget the remembered anchor URL (einfo->prev_a) as */
/* soon as the closing </A> tag is seen */
void html_parser_parse_tag_slash_a(html_parser_t * hpinfo, char *stack,
  html_extract_info_t * einfo)
{
  if(!einfo->prev_a)
    return;

  if(strcasecmp(hpinfo->stack, "</A>") == 0)
    einfo->prev_a = NULL;
}
629
/* Tag function for <META HTTP-EQUIV="Refresh" CONTENT="...URL=..."> : */
/* the URL from the CONTENT attribute is passed to attribute functions */
/* and in rewrite mode the CONTENT attribute of the tag in the stack is */
/* replaced by the one with processed URL. */
/* Works only with hpinfo->current_tag set to the META tag. */
void html_parser_parse_tag_meta_refresh(html_parser_t * hpinfo, char *stack,
  void *data)
{
  char *content = (char *) 0;
  char *equiv;
  int is_refresh;

  if(!hpinfo->current_tag || hpinfo->current_tag->type != HTML_TAG_META)
    return;

  hpinfo->current_attrib = &hpinfo->current_tag->attribs[0];

  equiv = html_get_attrib_from_tag(hpinfo->stack, "HTTP-EQUIV");
  is_refresh = (equiv && !strcasecmp(equiv, "Refresh"));
  _free(equiv);

  if(!is_refresh)
    return;

  content = html_get_attrib_from_tag(hpinfo->stack, "CONTENT");

  if(!content)
    return;

  hpinfo->tag_attrib = html_get_attrib_from_tag(content, "URL");

  if(!hpinfo->tag_attrib)
  {
    _free(content);
    return;
  }

  html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);

  if(!hpinfo->rewrite)
  {
    _free(hpinfo->tag_attrib);
  }
  else
  {
    /* little hack to prevent writing outside of allocated */
    /* memory chunk when the new URL is put into the content */
    content = _realloc(content,
      strlen(content) + strlen(hpinfo->tag_attrib) + 4);

    html_replace_url_in_stack(content, "URL", hpinfo->tag_attrib, TRUE);
    _free(hpinfo->tag_attrib);

    /* whole new CONTENT value is borrowed as attribute value */
    /* only for the replacement inside of the tag */
    hpinfo->tag_attrib = content;

    if(hpinfo->tag_attrib)
    {
      int content_len = strlen(hpinfo->tag_attrib);

      html_parser_SEND(hpinfo);
      html_parser_SEXPAND(hpinfo, content_len);
      html_replace_url_in_stack(hpinfo->stack,
        hpinfo->current_attrib->attrib, hpinfo->tag_attrib, FALSE);
      hpinfo->tag_attrib = 0;
    }
  }

  _free(content);
}
692
/* Tag function for <META NAME="Robots" CONTENT="..."> : the comma */
/* separated directives from CONTENT (all, none, index, noindex, follow, */
/* nofollow, noimageindex) are applied to index, follow and images flags */
/* of oinfo; unknown directives are ignored. */
/* Whitespace around directives is skipped, so the usual form */
/* "noindex, nofollow" is understood as well. */
/* Works only with hpinfo->current_tag set to the META tag. */
void html_parser_parse_tag_meta_robots(html_parser_t * hpinfo, char *stack,
  html_robots_info_t * oinfo)
{
  static const char blanks[] = " \t\r\n";
  char *meta_type;
  char *content;
  char **flags;
  int i;

  if(!hpinfo->current_tag || hpinfo->current_tag->type != HTML_TAG_META)
    return;

  meta_type = html_get_attrib_from_tag(hpinfo->stack, "NAME");

  if(!meta_type || strcasecmp(meta_type, "Robots"))
  {
    _free(meta_type);
    return;
  }

  _free(meta_type);

  content = html_get_attrib_from_tag(hpinfo->stack, "CONTENT");

  if(!content)
    return;

  flags = tl_str_split(content, ",");
  _free(content);

  for(i = 0; flags && flags[i]; i++)
  {
    /* trim in place, the string is released right after use */
    char *flag = flags[i] + strspn(flags[i], blanks);
    size_t len = strlen(flag);

    while(len > 0 && strchr(blanks, flag[len - 1]))
      flag[--len] = '\0';

    if(!strcasecmp(flag, "all"))
    {
      oinfo->index = TRUE;
      oinfo->follow = TRUE;
      oinfo->images = TRUE;
    }
    else if(!strcasecmp(flag, "none"))
    {
      oinfo->index = FALSE;
      oinfo->follow = FALSE;
      oinfo->images = FALSE;
    }
    else if(!strcasecmp(flag, "index"))
      oinfo->index = TRUE;
    else if(!strcasecmp(flag, "follow"))
      oinfo->follow = TRUE;
    else if(!strcasecmp(flag, "noimageindex"))
      oinfo->images = FALSE;
    else if(!strcasecmp(flag, "noindex"))
      oinfo->index = FALSE;
    else if(!strcasecmp(flag, "nofollow"))
      oinfo->follow = FALSE;

    _free(flags[i]);
  }
  _free(flags);
}
750
/* Tag function: apply the configured JavaScript transformations */
/* (priv_cfg.js_transform) to the tag in the stack. For each rule which */
/* matches the tag and whose pattern matches the value of the rule's */
/* attribute, the transformed value is passed to attribute functions */
/* and, in rewrite mode for rules of type 1, written back into the */
/* matched part of the attribute. */
/* hpinfo->current_tag and hpinfo->current_attrib are NULL after each */
/* applied rule. Does nothing when compiled without HAVE_REGEX. */
void html_parser_parse_tag_jstransform(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  dllist *ptr;
  html_tag_t t = { HTML_TAG_HACK, "HACK",
    {{HTML_ATTRIB_HACK, "HACK", LINK_INLINE | LINK_DOWNLD},
        {HTML_ATTRIB_NULL, NULL, 0}}
  };

  for(ptr = priv_cfg.js_transform; ptr; ptr = ptr->next)
  {
    js_transform_t *jt = (js_transform_t *) ptr->data;

    if(js_transform_match_tag(jt, hpinfo->stack))
    {
      int nsub, *subs;
      char *attr = html_get_attrib_from_tag(hpinfo->stack,
        jt->attrib);

      if(!attr)
        continue;

      if(!re_pmatch_subs(jt->re, attr, &nsub, &subs))
      {
        _free(attr);
        continue;
      }

      hpinfo->tag_attrib = js_transform_apply(jt, attr, nsub, subs);

      /*****************************************/
      /* quite dirty hack to make happy attrib */
      /* parsing funcs which require valid     */
      /* current_tag & current_attrib          */
      /*****************************************/
      hpinfo->current_tag = &t;
      hpinfo->current_attrib = &(t.attribs[0]);

      if(hpinfo->tag_attrib)
        html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);

      /* nothing can be written back without a value; the */
      /* missing check caused strlen() of NULL pointer here */
      if(hpinfo->rewrite && jt->type == 1 && nsub && hpinfo->tag_attrib)
      {
        int l = strlen(hpinfo->tag_attrib);

        /* replace the part of attr between offsets subs[2] */
        /* and subs[3] by the new value */
        attr = _realloc(attr, strlen(attr) + l + 1);
        memmove(attr + l + subs[2], attr + subs[3],
          strlen(attr + subs[3]) + 1);
        memcpy(attr + subs[2], hpinfo->tag_attrib, l);

        l = strlen(attr);
        html_parser_SEND(hpinfo);
        html_parser_SEXPAND(hpinfo, l);
        html_replace_url_in_stack(hpinfo->stack, jt->attrib, attr, FALSE);
      }

      _free(subs);
      _free(attr);

      /* :-) unhack */
      hpinfo->current_tag = NULL;
      hpinfo->current_attrib = NULL;

      _free(hpinfo->tag_attrib);
    }
  }
#endif
}
820
821 /********************************************************/
822 /* functions for processing URL attributes of HTML tags */
823 /********************************************************/
/* Attribute function: replace the URL in hpinfo->tag_attrib by its */
/* absolute form, computed with respect to the base of the document. */
/* The attribute stays untouched when no (or empty) absolute URL comes */
/* out; an empty result is released, it was leaked before. */
void html_parser_url_to_absolute_url(html_parser_t * hpinfo, char *stack,
  void *data)
{
  char *ustr;

  ustr = url_to_absolute_url(hpinfo->base, hpinfo->baset,
    hpinfo->doc_url, hpinfo->tag_attrib);

  if(ustr && *ustr)
  {
    DEBUG_HTML("Rewriting URL (to abs) - %s -> %s\n", hpinfo->tag_attrib,
      ustr);
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = ustr;
  }
  else
  {
    _free(ustr);
  }
}
847
/* Attribute function for <BASE HREF=...> : the base URL of the parser */
/* is updated from the attribute and the whole tag in the stack is */
/* turned into a HTML comment. Other tags and attributes are ignored. */
void html_parser_process_base(html_parser_t * hpinfo, char *stack, void *data)
{
  int prefix_len, suffix_len;

  if(hpinfo->current_tag->type != HTML_TAG_BASE ||
    hpinfo->current_attrib->type != HTML_ATTRIB_HREF)
  {
    return;
  }

  html_parser_process_new_base_url(hpinfo, hpinfo->tag_attrib);

  /* comment BASE tag because pavuk */
  /* overwrites URLs according to this tag */
  prefix_len = strlen(COMMENT_PREFIX);
  suffix_len = strlen(COMMENT_SUFFIX);

  html_parser_SEND(hpinfo);
  html_parser_SEXPAND(hpinfo, (prefix_len + suffix_len));

  /* make room in front of the tag, then wrap it */
  memmove(hpinfo->stack + prefix_len, hpinfo->stack,
    strlen(hpinfo->stack) + 1);
  memcpy(hpinfo->stack, COMMENT_PREFIX, prefix_len);
  strcat(hpinfo->stack, COMMENT_SUFFIX);
}
870
/* Attribute function for ACTION attribute of forms: marks the document */
/* URL as containing forms and, when formlist is given, appends a copy */
/* of the attribute value to the list. */
void html_parser_process_form(html_parser_t * hpinfo, char *stack,
  dllist ** formlist)
{
  if(!(hpinfo->current_attrib->stat & LINK_FORM))
    return;

  if(hpinfo->current_attrib->type != HTML_ATTRIB_ACTION)
    return;

  hpinfo->doc_url->status |= URL_HAVE_FORMS;

  if(formlist && hpinfo->tag_attrib)
  {
    char *action = tl_strdup(hpinfo->tag_attrib);

    *formlist = dllist_append(*formlist, (dllist_t) action);
  }
}
886
/* Attribute function: turn the URL from hpinfo->tag_attrib into url */
/* structure and append it to einfo->urls. */
/* */
/* Skipped are empty values, attributes which are not marked for */
/* download, non-inline objects when einfo->only_inline is set and */
/* script links unless einfo->enable_js is set. Links pointing only */
/* inside of the current document ("#...") and URLs refused by */
/* url_append_condition() are rejected (and counted); the parsed URL is */
/* released in both cases - local anchors used to leak it. */
void html_parser_get_url(html_parser_t * hpinfo, char *stack,
  html_extract_info_t * einfo)
{
  url *purl = (url *) 0;
  cond_info_t condp;

  if(!*hpinfo->tag_attrib)      /* Never follow "" */
    return;

  if(!(hpinfo->current_attrib->stat & LINK_DOWNLD))
    return;

  if(einfo->only_inline && !(hpinfo->current_attrib->stat & LINK_INLINE))
    return;

  if((hpinfo->current_attrib->stat & LINK_SCRIPT) && !einfo->enable_js)
    return;

  condp.level = 0;
  condp.urlnr = 0;
  condp.size = 0;
  condp.time = 0L;
  condp.mimet = NULL;
  condp.full_tag = stack;
  condp.params = NULL;
  condp.html_doc = hpinfo->in_content;
  condp.html_doc_offset = hpinfo->in_offset;
  condp.tag = hpinfo->current_tag ? hpinfo->current_tag->tag : NULL;
  condp.attrib = hpinfo->current_attrib ?
    hpinfo->current_attrib->attrib : NULL;

  purl = url_parse(hpinfo->tag_attrib);
  assert(purl->type != URLT_FROMPARENT);
  url_path_abs(purl);

  if(hpinfo->current_attrib->stat & LINK_INLINE)
    purl->status |= URL_INLINE_OBJ;

  if(hpinfo->current_attrib->stat & LINK_SCRIPT)
    purl->status |= URL_ISSCRIPT;

  purl->level = hpinfo->doc_url->level + 1;
  purl->parent_url = dllist_append(purl->parent_url,
    (dllist_t) hpinfo->doc_url);

  /*****************************************************/
  /* if we are in SYNC/MIRROR mode try to get original */
  /* URL rather than processing it as file             */
  /* (mandatory thing to get working SYNC/MIRROR mode) */
  /*****************************************************/
  if((cfg.mode == MODE_SYNC || cfg.mode == MODE_MIRROR) &&
    cfg.request && (purl->type == URLT_FILE))
  {
    url *pomurl = filename_to_url(purl->p.file.filename);

    if(pomurl)
    {
      free_deep_url(purl);
      _free(purl);
      purl = pomurl;
    }
  }

  /**********************************/
  /* remove last anchor URL because */
  /* it is server side image map    */
  /**********************************/
  if(einfo->prev_a &&
    hpinfo->current_tag->type == HTML_TAG_IMG &&
    hpinfo->current_attrib->type == HTML_ATTRIB_SRC &&
    html_tag_co_elem(hpinfo->stack, "ISMAP"))
  {
    DEBUG_HTML("Removing server image map\n");
    free_deep_url((url *) einfo->prev_a->data);
    free((url *) einfo->prev_a->data);
    einfo->urls = dllist_remove_entry(einfo->urls, einfo->prev_a);
    einfo->prev_a = NULL;
  }

  if(hpinfo->current_tag->type == HTML_TAG_A &&
    hpinfo->current_attrib->type == HTML_ATTRIB_HREF)
  {
    einfo->prev_a = NULL;
  }

  /* Do not accept links which only point inside of the already */
  /* loaded document, like <a href="#top">. This is a local */
  /* relative reference, so drop it. */
  if((hpinfo->current_attrib->type == HTML_ATTRIB_USEMAP ||
      hpinfo->current_attrib->type == HTML_ATTRIB_HREF) &&
    hpinfo->tag_attrib[0] == '#')
  {
    LOCK_REJCNT;
    cfg.reject_cnt++;
    UNLOCK_REJCNT;

    DEBUG_HTML("Rejecting local anchor URL - %s\n", hpinfo->tag_attrib);

    /* rejected URL is not stored anywhere, release it */
    free_deep_url(purl);
    _free(purl);
  }
  else if(einfo->no_limits || url_append_condition(purl, &condp))
  {
    DEBUG_HTML("Accepting URL - %s\n", hpinfo->tag_attrib);

    /***************************************/
    /* process special add-on tag PAVUKEXT */
    /* where some additional information   */
    /* about FTP URLs is stored            */
    /***************************************/
    if(purl->type == URLT_FTP || purl->type == URLT_FTPS)
    {
      char *pext;

      pext = html_get_attrib_from_tag(hpinfo->stack, "PAVUKEXT");
      if(pext)
      {
        ftp_url_extension *uext;

        uext = ftp_parse_ftpinf_ext(pext);
        purl->extension = uext;

        if(uext->type == FTP_TYPE_D)
          purl->p.ftp.dir = TRUE;
      }
      _free(pext);
    }

    einfo->urls = dllist_append(einfo->urls, (dllist_t) purl);

    /* remember the anchor, image with ISMAP may follow */
    if(hpinfo->current_tag->type == HTML_TAG_A &&
      hpinfo->current_attrib->type == HTML_ATTRIB_HREF)
    {
      einfo->prev_a = dllist_last(einfo->urls);
    }
  }
  else
  {
    LOCK_REJCNT;
    cfg.reject_cnt++;
    UNLOCK_REJCNT;

    DEBUG_HTML("Rejecting URL - %s\n", hpinfo->tag_attrib);
    free_deep_url(purl);
    _free(purl);
  }
}
1028
/* Attribute function: replace the URL in hpinfo->tag_attrib by relative */
/* path to the local copy of the target document. */
/* */
/* Nothing is done outside of rewrite mode, with rinfo->all_to_remote, */
/* for file URLs and for URLs of unsupported type. The URL is rewritten */
/* when the local file is readable, with rinfo->all_to_local, with */
/* rinfo->selected_to_local for URLs seen before, or when the URL is the */
/* current document itself. */
void html_parser_url_to_local(html_parser_t * hpinfo, char *stack,
  html_rewrite_info_t * rinfo)
{
  url *urlp, *before_url;
  char *anchor, *fn;
  int is_local;

  if(!hpinfo->rewrite || rinfo->all_to_remote)
    return;

  urlp = url_parse(hpinfo->tag_attrib);
  assert(urlp->type != URLT_FROMPARENT);

  if(urlp->type == URLT_FILE || !prottable[urlp->type].supported)
  {
    free_deep_url(urlp);
    _free(urlp);
    return;
  }

  anchor = url_get_anchor_name(urlp);

  /*******************************************/
  /* for better performance with info files  */
  /* we should rather use filename generated */
  /* for previous occurrence of this URL     */
  /*******************************************/
  before_url = url_was_befor(urlp);

  if(!before_url)
  {
    dllist *found = dllist_find2(rinfo->einfo->urls, (dllist_t) urlp,
      dllist_url_compare);

    if(found)
      before_url = (url *) found->data;
  }

  fn = before_url ?
    url_to_filename(before_url, TRUE) : url_to_filename(urlp, FALSE);

  is_local = (access(fn, R_OK) == 0);

  if(is_local || rinfo->all_to_local ||
    (rinfo->selected_to_local && before_url) ||
    url_compare(urlp, hpinfo->doc_url))
  {
    char *actname, *relname;
    struct stat estat;

    /* from now on fn is a private copy; */
    /* directories are replaced by their index file */
    if(is_local && !stat(fn, &estat) && S_ISDIR(estat.st_mode))
      fn = tl_str_concat(NULL, fn, "/", priv_cfg.index_name, NULL);
    else
      fn = tl_strdup(fn);

    actname = url_to_filename(hpinfo->doc_url, FALSE);

    /* it seems that lynx and netscape behave differently on   */
    /* empty HREFs, so use it only in case when part of the    */
    /* document (#xxx) is specified                            */
    /* this is URL of current document -> ""                   */
    relname = (anchor && !strcmp(actname, fn)) ?
      tl_strdup("") : get_relative_path(actname, fn);

    _free(fn);

    /* workaround for -sel_to_local && -nostore_index */
    if(rinfo->selected_to_local && !rinfo->store_index)
    {
      char *last = strrchr(relname, '/');

      last = last ? last + 1 : relname;

      if(!strcmp(last, priv_cfg.index_name))
        *last = '\0';
    }

    if(anchor)
      relname = tl_str_concat(relname, "#", anchor, NULL);

    DEBUG_HTML("Rewriting URL (to loc) - %s -> %s\n", hpinfo->tag_attrib,
      relname);

    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = relname;
  }

  free_deep_url(urlp);
  _free(urlp);
}
1126
/* Attribute function for <IMG SRC=...> : when cfg.remove_adv is set */
/* and the URL matches one of the patterns from priv_cfg.advert_res, */
/* the whole tag in the stack is wrapped between ADVERT_PREFIX and */
/* ADVERT_SUFFIX. Does nothing when compiled without HAVE_REGEX. */
void html_parser_remove_advertisement(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  dllist *pattern;
  int prefix_len, suffix_len;

  if(hpinfo->current_tag->type != HTML_TAG_IMG ||
    hpinfo->current_attrib->type != HTML_ATTRIB_SRC)
    return;

  if(!cfg.remove_adv || !priv_cfg.advert_res)
    return;

  /* stop at the first pattern which matches the URL */
  for(pattern = priv_cfg.advert_res; pattern; pattern = pattern->next)
  {
    if(re_pmatch((re_entry *) pattern->data, hpinfo->tag_attrib))
      break;
  }

  if(!pattern)
    return;

  DEBUG_HTML("Removing advert URL - %s\n", hpinfo->tag_attrib);

  prefix_len = strlen(ADVERT_PREFIX);
  suffix_len = strlen(ADVERT_SUFFIX);

  html_parser_SEND(hpinfo);
  html_parser_SEXPAND(hpinfo, (prefix_len + suffix_len));

  /* make room in front of the tag, then wrap it */
  memmove(hpinfo->stack + prefix_len, hpinfo->stack,
    strlen(hpinfo->stack) + 1);
  memcpy(hpinfo->stack, ADVERT_PREFIX, prefix_len);
  strcat(hpinfo->stack, ADVERT_SUFFIX);
#endif
}
1167
/* Attribute function: when the URL in hpinfo->tag_attrib is equal to */
/* chinfo->url_old, it is replaced by a copy of chinfo->url_new. */
/* File URLs and URLs of unsupported type are left alone. */
void html_parser_change_url(html_parser_t * hpinfo, char *stack,
  html_change_info_t * chinfo)
{
  url *urlp;

  urlp = url_parse(hpinfo->tag_attrib);
  assert(urlp->type != URLT_FROMPARENT);

  if(urlp->type == URLT_FILE || !prottable[urlp->type].supported)
  {
    free_deep_url(urlp);
    _free(urlp);
    return;
  }

  if(url_compare(urlp, chinfo->url_old))
  {
    /* report old -> new; the new URL used to be printed twice */
    DEBUG_HTML("Rewriting URL (change) - %s -> %s\n",
      hpinfo->tag_attrib, chinfo->url_new);

    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = tl_strdup(chinfo->url_new);
  }

  free_deep_url(urlp);
  _free(urlp);
}
1195
1196 /********************************************************/
1197 /* functions for processing CSS parts of HTML documents */
1198 /********************************************************/
/*
 * Style hook: run css_to_absolute_links() over the current piece of CSS.
 *
 * Does nothing unless hpinfo->rewrite is set.  When hpinfo->tag_attrib is
 * set, the attribute value is replaced by the converted text.  Otherwise
 * the contents of hpinfo->stack are converted and written back, and
 * hpinfo->stack_offset is set to the length of the new text.
 *
 * The stack and data arguments are not used.
 */
void html_parser_style_to_absolute_urls(html_parser_t * hpinfo, char *stack,
  void *data)
{
  char *converted;
  int len;

  if(!hpinfo->rewrite)
    return;

  if(hpinfo->tag_attrib)
  {
    /* CSS held in an attribute: swap the attribute string */
    converted = css_to_absolute_links(hpinfo->doc_url,
      hpinfo->tag_attrib, hpinfo->base, hpinfo->baset);
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = converted;
    return;
  }

  /* CSS held on the stack: overwrite the stack contents */
  converted = css_to_absolute_links(hpinfo->doc_url,
    hpinfo->stack, hpinfo->base, hpinfo->baset);
  len = strlen(converted);

  if(len > hpinfo->stack_offset)
  {
    /* result is longer than the current text - grow from offset zero */
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, len);
  }

  memcpy(hpinfo->stack, converted, len + 1);
  hpinfo->stack_offset = len;
  _free(converted);
}
1232
/*
 * Style hook: collect the links found in the current piece of CSS and
 * append them to einfo->urls.
 *
 * Does nothing when cfg.read_css is not set.  The CSS text is taken from
 * hpinfo->tag_attrib when it is set, otherwise from hpinfo->stack.
 *
 * The stack argument is not used.
 */
void html_parser_get_style_urls(html_parser_t * hpinfo, char *stack,
  html_extract_info_t * einfo)
{
  char *css_text;
  dllist *found;

  /* don't fetch from css if not wanted */
  if(!cfg.read_css)
    return;

  css_text = hpinfo->tag_attrib ? hpinfo->tag_attrib : hpinfo->stack;

  found = css_get_all_links(hpinfo->doc_url, css_text,
    hpinfo->base, hpinfo->baset, einfo->no_limits);

  einfo->urls = dllist_concat(einfo->urls, found);
}
1250
/*
 * Style hook: run css_remote_to_local_links() over the current piece of
 * CSS.
 *
 * Does nothing unless hpinfo->rewrite is set, or when
 * rinfo->all_to_remote is set.  When hpinfo->tag_attrib is set, the
 * attribute value is replaced by the converted text.  Otherwise the
 * contents of hpinfo->stack are converted and written back, and
 * hpinfo->stack_offset is set to the length of the new text.
 *
 * The stack argument is not used.
 */
void html_parser_style_to_local_urls(html_parser_t * hpinfo, char *stack,
  html_rewrite_info_t * rinfo)
{
  char *localized;
  int len;

  if(!hpinfo->rewrite)
    return;
  if(rinfo->all_to_remote)
    return;

  if(hpinfo->tag_attrib)
  {
    /* CSS held in an attribute: swap the attribute string */
    localized = css_remote_to_local_links(hpinfo->doc_url,
      hpinfo->tag_attrib, rinfo->all_to_local,
      rinfo->selected_to_local, hpinfo->base, hpinfo->baset);
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = localized;
    return;
  }

  /* CSS held on the stack: overwrite the stack contents */
  localized = css_remote_to_local_links(hpinfo->doc_url,
    hpinfo->stack, rinfo->all_to_local,
    rinfo->selected_to_local, hpinfo->base, hpinfo->baset);
  len = strlen(localized);

  if(len > hpinfo->stack_offset)
  {
    /* result is longer than the current text - grow from offset zero */
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, len);
  }

  memcpy(hpinfo->stack, localized, len + 1);
  hpinfo->stack_offset = len;
  _free(localized);
}
1286
/*
 * Style hook: run css_change_url() over the current piece of CSS, passing
 * chinfo->url_old and chinfo->url_new.
 *
 * Does nothing unless hpinfo->rewrite is set.  When hpinfo->tag_attrib is
 * set, the attribute value is replaced by the converted text.  Otherwise
 * the contents of hpinfo->stack are converted and written back, and
 * hpinfo->stack_offset is set to the length of the new text.
 *
 * The stack argument is not used.
 */
void html_parser_style_change_url(html_parser_t * hpinfo, char *stack,
  html_change_info_t * chinfo)
{
  char *changed;
  int len;

  if(!hpinfo->rewrite)
    return;

  if(hpinfo->tag_attrib)
  {
    /* CSS held in an attribute: swap the attribute string */
    changed = css_change_url(hpinfo->doc_url, hpinfo->tag_attrib,
      chinfo->url_old, chinfo->url_new);
    _free(hpinfo->tag_attrib);
    hpinfo->tag_attrib = changed;
    return;
  }

  /* CSS held on the stack: overwrite the stack contents */
  changed = css_change_url(hpinfo->doc_url, hpinfo->stack,
    chinfo->url_old, chinfo->url_new);
  len = strlen(changed);

  if(len > hpinfo->stack_offset)
  {
    /* result is longer than the current text - grow from offset zero */
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, len);
  }

  memcpy(hpinfo->stack, changed, len + 1);
  hpinfo->stack_offset = len;
  _free(changed);
}
1320
1321 /***********************************************************/
1322 /* functions for processing SCRIPTs part of HTML documents */
1323 /***********************************************************/
/*
 * Script hook: find a URL inside the current attribute value using the
 * configured JavaScript patterns and run the attribute hooks on it.
 *
 * The entries of priv_cfg.js_patterns are tried in order against
 * hpinfo->tag_attrib; the first one that matches supplies the offsets of
 * its subexpression 1.  That substring temporarily becomes
 * hpinfo->tag_attrib while hpinfo->attrib_funcs are called, and whatever
 * they leave there is then spliced back into the original text in place
 * of the matched substring.
 *
 * Does nothing when hpinfo->tag_attrib is NULL.  The stack and data
 * arguments are not used.  Without HAVE_REGEX this function does nothing.
 */
void html_parser_parse_jspatterns(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  dllist *node;
  char *whole;
  char *merged;
  int sub_start = -1;
  int sub_end = -1;

  if(!hpinfo->tag_attrib)
    return;

  for(node = priv_cfg.js_patterns; node; node = node->next)
  {
    if(re_pmatch_sub((re_entry *) node->data, hpinfo->tag_attrib,
        1, &sub_start, &sub_end))
    {
      break;
    }
  }

  /* no pattern matched, or the subexpression has no valid offset */
  if(!node || sub_start < 0)
    return;

  whole = hpinfo->tag_attrib;

  /* expose only the matched part to the attribute hooks */
  hpinfo->tag_attrib = tl_strndup(whole + sub_start, sub_end - sub_start);
  html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);

  /* head of original + processed part + tail of original */
  merged = _malloc(sub_start + strlen(whole + sub_end) +
    strlen(hpinfo->tag_attrib) + 1);

  memcpy(merged, whole, sub_start);
  strcpy(merged + sub_start, hpinfo->tag_attrib);
  strcat(merged, whole + sub_end);

  _free(hpinfo->tag_attrib);
  _free(whole);

  hpinfo->tag_attrib = merged;
#endif
}
1371
/*
 * Script hook: apply html_parser_parse_jspatterns() to every line of the
 * text held in hpinfo->stack.
 *
 * Does nothing when hpinfo->tag_attrib is set.  The stack text is split
 * on runs of CR/LF characters; each line is temporarily installed as
 * hpinfo->tag_attrib and handed to html_parser_parse_jspatterns().  When
 * hpinfo->rewrite is set, the processed lines are joined, each followed
 * by a single "\n", and written back to hpinfo->stack, with
 * hpinfo->stack_offset set to the new length.  A run of several line
 * terminators therefore comes back as a single "\n".
 *
 * On return hpinfo->current_tag, hpinfo->current_attrib and
 * hpinfo->tag_attrib are NULL.  Without HAVE_REGEX this function does
 * nothing.
 */
void html_parser_parse_body_jspatterns(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  char *stackc = NULL;
  char *p;
  int ilen;

  html_tag_t t = { HTML_TAG_HACK, "HACK",
    {{HTML_ATTRIB_HACK, "HACK", LINK_INLINE | LINK_DOWNLD},
      {HTML_ATTRIB_NULL, NULL, 0}}
  };

  if(hpinfo->tag_attrib)
    return;

  /*****************************************/
  /* quite dirty hack to make happy attrib */
  /* parsing funcs which require valid     */
  /* current_tag & current_attrib          */
  /*****************************************/
  hpinfo->current_tag = &t;
  hpinfo->current_attrib = &(t.attribs[0]);

  p = hpinfo->stack;

  while(*p)
  {
    /* length of the current line, without its terminator */
    ilen = strcspn(p, "\r\n");

    hpinfo->tag_attrib = tl_strndup(p, ilen);

    html_parser_parse_jspatterns(hpinfo, stack, data);

    if(hpinfo->rewrite)
    {
      stackc = tl_str_concat(stackc, hpinfo->tag_attrib, "\n", NULL);
    }
    _free(hpinfo->tag_attrib);

    /* skip the line and every terminator character that follows it */
    p += ilen;
    p += strspn(p, "\n\r");
  }

  if(hpinfo->rewrite)
  {
    /*
     * stackc is still NULL when the stack text was empty and the loop
     * above never ran; treat that as an empty result instead of passing
     * NULL to strlen().
     */
    const char *rewritten = stackc ? stackc : "";

    ilen = strlen(rewritten);
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, ilen);
    memcpy(hpinfo->stack, rewritten, ilen + 1);
    hpinfo->stack_offset = ilen;
    _free(stackc);
  }

  /* :-) unhack */
  hpinfo->current_tag = NULL;
  hpinfo->current_attrib = NULL;

  hpinfo->tag_attrib = NULL;
#endif
}
1431
/*
 * Script hook: apply the configured JavaScript transformations to every
 * line of the text held in hpinfo->stack.
 *
 * Does nothing when hpinfo->tag_attrib is set.  The stack text is split
 * on runs of CR/LF characters.  For each line, every entry of
 * priv_cfg.js_transform with an empty tag name whose regular expression
 * matches the line is applied with js_transform_apply(); a non-NULL
 * result is installed as hpinfo->tag_attrib and passed through
 * hpinfo->attrib_funcs.  When hpinfo->rewrite is set and the transform
 * has type 1, the resulting string replaces the text of the first
 * subexpression inside the line.
 *
 * When hpinfo->rewrite is set, the processed lines are joined, each
 * followed by a single "\n", and written back to hpinfo->stack, with
 * hpinfo->stack_offset set to the new length.
 *
 * On return hpinfo->current_tag, hpinfo->current_attrib and
 * hpinfo->tag_attrib are NULL.  The stack and data arguments are not
 * used.  Without HAVE_REGEX this function does nothing.
 */
void html_parser_parse_body_jstransform(html_parser_t * hpinfo, char *stack,
  void *data)
{
#ifdef HAVE_REGEX
  char *p;
  int ilen;
  char *stackc = NULL;

  html_tag_t t = { HTML_TAG_HACK, "HACK",
    {{HTML_ATTRIB_HACK, "HACK", LINK_INLINE | LINK_DOWNLD},
      {HTML_ATTRIB_NULL, NULL, 0}}
  };

  if(hpinfo->tag_attrib)
    return;

  /*****************************************/
  /* quite dirty hack to make happy attrib */
  /* parsing funcs which require valid     */
  /* current_tag & current_attrib          */
  /*****************************************/
  hpinfo->current_tag = &t;
  hpinfo->current_attrib = &(t.attribs[0]);

  p = hpinfo->stack;

  while(*p)
  {
    dllist *ptr;
    char *ln;

    /* length of the current line, without its terminator */
    ilen = strcspn(p, "\r\n");

    ln = tl_strndup(p, ilen);

    for(ptr = priv_cfg.js_transform; ptr; ptr = ptr->next)
    {
      int nsub, *subs;
      js_transform_t *jt = (js_transform_t *) ptr->data;

      /* entries bound to a tag name are not handled here */
      if(jt->tag[0])
        continue;

      if(!re_pmatch_subs(jt->re, ln, &nsub, &subs))
      {
        continue;
      }

      hpinfo->tag_attrib = js_transform_apply(jt, ln, nsub, subs);

      /*****************************************/
      /* quite dirty hack to make happy attrib */
      /* parsing funcs which require valid     */
      /* current_tag & current_attrib          */
      /*****************************************/
      hpinfo->current_tag = &t;
      hpinfo->current_attrib = &(t.attribs[0]);

      if(hpinfo->tag_attrib)
      {
        html_parser_call_funcs(hpinfo, hpinfo->attrib_funcs);
      }

      /*
       * Splice the processed string over subexpression 1 (offsets
       * subs[2]..subs[3]).  Skipped when there is no string to splice
       * in - strlen(NULL) would crash - or when the offsets are not a
       * valid range.
       * NOTE(review): negative offsets are presumed to mark a group
       * that did not take part in the match (POSIX regmatch_t
       * convention) - confirm against re_pmatch_subs().
       */
      if(hpinfo->rewrite && jt->type == 1 && nsub && hpinfo->tag_attrib &&
        subs[2] >= 0 && subs[3] >= subs[2])
      {
        int l = strlen(hpinfo->tag_attrib);

        /* old length plus replacement is always enough room */
        ln = _realloc(ln, strlen(ln) + l + 1);
        memmove(ln + l + subs[2], ln + subs[3], strlen(ln + subs[3]) + 1);
        memcpy(ln + subs[2], hpinfo->tag_attrib, l);
      }

      _free(subs);

      /* :-) unhack */
      hpinfo->current_tag = NULL;
      hpinfo->current_attrib = NULL;

      _free(hpinfo->tag_attrib);
    }

    if(hpinfo->rewrite)
    {
      stackc = tl_str_concat(stackc, ln, "\n", NULL);
    }

    _free(ln);

    /* skip the line and every terminator character that follows it */
    p += ilen;
    p += strspn(p, "\n\r");
  }

  if(hpinfo->rewrite)
  {
    /*
     * stackc is still NULL when the stack text was empty and the loop
     * above never ran; treat that as an empty result instead of passing
     * NULL to strlen().
     */
    const char *rewritten = stackc ? stackc : "";

    ilen = strlen(rewritten);
    hpinfo->stack_offset = 0;
    html_parser_SEXPAND(hpinfo, ilen);
    memcpy(hpinfo->stack, rewritten, ilen + 1);
    hpinfo->stack_offset = ilen;
    _free(stackc);
  }

  /* :-) unhack */
  hpinfo->current_tag = NULL;
  hpinfo->current_attrib = NULL;

  hpinfo->tag_attrib = NULL;
#endif
}
1536