1 /* HTML core parser routines */
2
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
6
7 #include <errno.h>
8 #include <stdarg.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12
13 #include "elinks.h"
14
15 #include "document/css/apply.h"
16 #include "document/css/parser.h"
17 #include "document/html/parser/forms.h"
18 #include "document/html/parser/general.h"
19 #include "document/html/parser/link.h"
20 #include "document/html/parser/parse.h"
21 #include "document/html/parser/stack.h"
22 #include "document/html/parser.h"
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memdebug.h"
29 #include "util/memory.h"
30 #include "util/string.h"
31
32 /* Unsafe macros */
33 #include "document/html/internal.h"
34
35
/* True for the two characters that terminate the scan of a tag. */
#define end_of_tag(c) ((c) == '>' || (c) == '<')

/* Tell whether @c may be part of an attribute name (or of an unquoted
 * token): any 7-bit character above space except '=', '<', '>' and DEL. */
static inline int
atchr(register unsigned char c)
{
	/* DEL and every 8-bit byte are rejected. */
	if (c >= 127) return 0;

	/* Everything from '?' up to '~' is accepted. */
	if (c > '>') return 1;

	/* Space and control characters are rejected. */
	if (c <= ' ') return 0;

	/* '!' .. '>': accepted unless it is '=' or a tag delimiter. */
	return c != '=' && !end_of_tag(c);
}
43
/* Scan over one HTML element (tag) without interpreting it.
 *
 * - @e points to the beginning of the element (*e must be '<').
 * - @eof points to the end of the scanned area.
 * - @name (optional) receives the start of the element name. For a closing
 *   tag this still points at the leading '/'.
 * - @namelen (optional, only written when @name is given too) receives the
 *   length of the name, including that leading '/' if there is one.
 * - @attr (optional) receives the position where the attributes start.
 * - @end (optional) receives the first character behind the element: just
 *   past the closing '>', or the '<' that cut the element short.
 *
 * Returns 0 on success. Returns -1 on failure, in which case the values
 * stored through the pointers are not valid; that includes the case where
 * @eof is reached before the element is complete. */
int
parse_element(register unsigned char *e, unsigned char *eof,
	      unsigned char **name, int *namelen,
	      unsigned char **attr, unsigned char **end)
{
/* Step to the next byte; fail the whole parse if that runs into @eof.
 * Wrapped in do/while so that it behaves as a single statement. */
#define next_char() do { if (++e == eof) return -1; } while (0)

	assert(e && eof);
	if (e >= eof || *e != '<') return -1;

	next_char();
	if (name) *name = e;

	if (*e == '/') next_char();
	if (!isident(*e)) return -1;

	while (isident(*e)) next_char();

	/* The name must be followed by something that can legally end it. */
	if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=')
		return -1;

	if (name && namelen) *namelen = e - *name;

	while (isspace(*e) || *e == '/' || *e == ':') next_char();

	/* Skip bad attribute */
	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();

	if (attr) *attr = e;

next_attr:
	while (isspace(*e)) next_char();

	/* Skip bad attribute */
	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();

	if (end_of_tag(*e)) goto end;

	/* Attribute name, optionally followed by '=' and a value. */
	while (atchr(*e)) next_char();
	while (isspace(*e)) next_char();

	if (*e != '=') {
		if (end_of_tag(*e)) goto end;
		goto next_attr;
	}
	next_char();

	while (isspace(*e)) next_char();

	if (isquote(*e)) {
		unsigned char quote = *e;

/* quoted_value: */
		next_char();
		while (*e != quote) next_char();
		next_char();
		/* The following apparently handles the case of <foo
		 * id="a""b">, however that is very rare and probably not
		 * conforming. More frequent (and mishandling it more fatal) is
		 * probably the typo of <foo id="a""> - we can handle it as
		 * long as this is commented out. --pasky */
		/* if (*e == quote) goto quoted_value; */
	} else {
		while (!isspace(*e) && !end_of_tag(*e)) next_char();
	}

	while (isspace(*e)) next_char();

	if (!end_of_tag(*e)) goto next_attr;

end:
	/* Step over a closing '>' but leave a '<' for the next element. */
	if (end) *end = e + (*e == '>');

	return 0;

#undef next_char
}
127
128
129 #define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, unsigned char, 0xFF)
130
131 #define add_chr(s, l, c) \
132 do { \
133 if (!realloc_chrs(&(s), l)) return NULL; \
134 (s)[(l)++] = (c); \
135 } while (0)
136
137 unsigned char *
get_attr_value(register unsigned char * e,unsigned char * name,struct document_options * options,enum html_attr_flags flags)138 get_attr_value(register unsigned char *e, unsigned char *name,
139 struct document_options *options, enum html_attr_flags flags)
140 {
141 unsigned char *n;
142 unsigned char *name_start;
143 unsigned char *attr = NULL;
144 int attrlen = 0;
145 int found;
146
147 next_attr:
148 skip_space(e);
149 if (end_of_tag(*e) || !atchr(*e)) goto parse_error;
150 n = name;
151 name_start = e;
152
153 while (atchr(*n) && atchr(*e) && c_toupper(*e) == c_toupper(*n)) e++, n++;
154 found = !*n && !atchr(*e);
155
156 if (found && (flags & HTML_ATTR_TEST)) return name_start;
157
158 while (atchr(*e)) e++;
159 skip_space(e);
160 if (*e != '=') {
161 if (found) goto found_endattr;
162 goto next_attr;
163 }
164 e++;
165 skip_space(e);
166
167 if (found) {
168 if (!isquote(*e)) {
169 while (!isspace(*e) && !end_of_tag(*e)) {
170 if (!*e) goto parse_error;
171 add_chr(attr, attrlen, *e);
172 e++;
173 }
174 } else {
175 unsigned char quote = *e;
176
177 /* parse_quoted_value: */
178 while (*(++e) != quote) {
179 if (*e == ASCII_CR) continue;
180 if (!*e) goto parse_error;
181 if (*e != ASCII_TAB && *e != ASCII_LF)
182 add_chr(attr, attrlen, *e);
183 else if (!(flags & HTML_ATTR_EAT_NL))
184 add_chr(attr, attrlen, ' ');
185 }
186 e++;
187 /* The following apparently handles the case of <foo
188 * id="a""b">, however that is very rare and probably
189 * not conforming. More frequent (and mishandling it
190 * more fatal) is probably the typo of <foo id="a""> -
191 * we can handle it as long as this is commented out.
192 * --pasky */
193 #if 0
194 if (*e == quote) {
195 add_chr(attr, attrlen, *e);
196 goto parse_quoted_value;
197 }
198 #endif
199 }
200
201 found_endattr:
202 add_chr(attr, attrlen, '\0');
203 attrlen--;
204
205 if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */
206 memchr(attr, '&', attrlen)) {
207 unsigned char *saved_attr = attr;
208
209 attr = convert_string(NULL, saved_attr, attrlen,
210 options->cp, CSM_QUERY,
211 NULL, NULL, NULL);
212 mem_free(saved_attr);
213 }
214
215 set_mem_comment(attr, name, strlen(name));
216 return attr;
217
218 } else {
219 if (!isquote(*e)) {
220 while (!isspace(*e) && !end_of_tag(*e)) {
221 if (!*e) goto parse_error;
222 e++;
223 }
224 } else {
225 unsigned char quote = *e;
226
227 do {
228 while (*(++e) != quote)
229 if (!*e) goto parse_error;
230 e++;
231 } while (/* See above. *e == quote */ 0);
232 }
233 }
234
235 goto next_attr;
236
237 parse_error:
238 mem_free_if(attr);
239 return NULL;
240 }
241
242 #undef add_chr
243
244
/* Read attribute @name from the attributes starting at @a and interpret it
 * as a decimal number.
 * Returns the number when the whole attribute value is a non-negative
 * integer (zero included) that fits in an int, or -1 when the attribute is
 * missing, empty, malformed or out of range. */
int
get_num(unsigned char *a, unsigned char *name, struct document_options *options)
{
	unsigned char *value = get_attr_val(a, name, options);
	unsigned char *tail;
	long parsed;
	int valid;

	if (!value) return -1;

	errno = 0;
	parsed = strtol(value, (char **) &tail, 10);

	/* @tail points into @value, so judge the result before freeing. */
	valid = !errno && *value && !*tail && parsed >= 0 && parsed <= INT_MAX;

	mem_free(value);

	return valid ? (int) parsed : -1;
}
268
269 /* Parse 'width[%],....'-like attribute @name of element @a. If @limited is
270 * set, it will limit the width value to the current usable width. Note that
271 * @limited must be set to be able to parse percentage widths. */
272 /* The function returns width in characters or -1 in case of error. */
273 int
get_width(unsigned char * a,unsigned char * name,int limited,struct html_context * html_context)274 get_width(unsigned char *a, unsigned char *name, int limited,
275 struct html_context *html_context)
276 {
277 unsigned char *value = get_attr_val(a, name, html_context->options);
278 unsigned char *str = value;
279 unsigned char *end;
280 int percentage = 0;
281 int len;
282 long width;
283
284 if (!value) return -1;
285
286 /* Skip spaces at start of string if any. */
287 skip_space(str);
288
289 /* Search for end of string or ',' character (ie. in "100,200") */
290 for (len = 0; str[len] && str[len] != ','; len++);
291
292 /* Go back, and skip spaces after width if any. */
293 while (len && isspace(str[len - 1])) len--;
294 if (!len) { mem_free(value); return -1; } /* Nothing to parse. */
295
296 /* Is this a percentage ? */
297 if (str[len - 1] == '%') len--, percentage = 1;
298
299 /* Skip spaces between width number and percentage if any. */
300 while (len && isspace(str[len - 1])) len--;
301 if (!len) { mem_free(value); return -1; } /* Nothing to parse. */
302
303 /* Shorten the string a bit, so strtoul() will work on useful
304 * part of it. */
305 str[len] = '\0';
306
307 /* Convert to number if possible. */
308 errno = 0;
309 width = strtoul((char *) str, (char **) &end, 10);
310
311 /* @end points into the @value string so check @end position
312 * before freeing @value. */
313 if (errno || *end || width >= INT_MAX) {
314 /* Not a valid number. */
315 mem_free(value);
316 return -1;
317 }
318
319 mem_free(value);
320
321 #define WIDTH_PIXELS2CHARS(width) ((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH;
322
323 if (limited) {
324 int maxwidth = get_html_max_width();
325
326 if (percentage) {
327 /* Value is a percentage. */
328 width = width * maxwidth / 100;
329 } else {
330 /* Value is a number of pixels, makes an approximation. */
331 width = WIDTH_PIXELS2CHARS(width);
332 }
333
334 if (width > maxwidth)
335 width = maxwidth;
336
337 } else {
338 if (percentage) {
339 /* No sense, we need @limited and @maxwidth for percentage. */
340 return -1;
341 } else {
342 /* Value is a number of pixels, makes an approximation,
343 * no limit here */
344 width = WIDTH_PIXELS2CHARS(width);
345 }
346 }
347
348 #undef WIDTH_PIXELS2CHARS
349
350 if (width < 0)
351 width = 0;
352
353 return width;
354 }
355
356
/* Skip a comment or another "<!...>" / "<?...>" construct.
 * @html points at the opening '<' and @eof at the end of the scanned area.
 *
 * - A real comment ("<!--") ends at "--", followed by any number of extra
 *   '-' characters and whitespace, followed by '>'.
 * - Anything else ends at the first '>'.
 *
 * Returns the position just behind the closing '>', or @eof when the
 * construct is not terminated. */
unsigned char *
skip_comment(unsigned char *html, unsigned char *eof)
{
	int is_comment = html + 4 <= eof && html[2] == '-' && html[3] == '-';

	if (!is_comment) {
		/* Declaration or processing instruction. */
		for (html += 2; html < eof; html++)
			if (*html == '>')
				return html + 1;

		return eof;
	}

	html += 4;
	while (html < eof) {
		if (html + 2 > eof || html[0] != '-' || html[1] != '-') {
			html++;
			continue;
		}

		/* Candidate terminator: "--", more dashes, spaces, '>'. */
		html += 2;
		while (html < eof && *html == '-') html++;
		while (html < eof && isspace(*html)) html++;

		if (html >= eof) break;
		if (*html == '>') return html + 1;
		/* False alarm, keep looking from here. */
	}

	return eof;
}
384
385
386
387
/* How an element interacts with the stack of open elements; see
 * start_element() and end_element() for the code acting on these. */
enum element_type {
	/* No special treatment when the element is opened. */
	ELEMENT_TYPE_NESTABLE,
	/* Opening it may first close an already open element of the same
	 * name. */
	ELEMENT_TYPE_NON_NESTABLE,
	/* No stack entry is created for it and its closing tag is ignored. */
	ELEMENT_TYPE_NON_PAIRABLE,
	/* Handled like a non-nestable element when opened, but with its own
	 * stack search; its closing tag is ignored. */
	ELEMENT_TYPE_LI,
};
394
/* Static description of one known HTML element; the entries live in the
 * elements[] table. */
struct element_info {
	/* Element name, uppercase. */
	unsigned char *name;

	/* Element handler. This does the relevant arguments processing and
	 * formatting (by calling renderer hooks). Note that in a few cases,
	 * this is just a placeholder and the element is given special care
	 * in start_element() (which is also where we call these handlers). */
	element_handler_T *func;

	/* How many line-breaks to ensure we have before and after an element.
	 * Value of 1 means the element will be on a line on its own, value
	 * of 2 means that it will also have empty lines before and after.
	 * Note that this does not add up - it just ensures that there is
	 * at least so many linebreaks, but does not add more if that is the
	 * case. Therefore, something like e.g. </pre></p> will add only two
	 * linebreaks, not four. */
	/* In some stack killing logic, we use some weird heuristic based on
	 * whether an element is block or inline. That is determined from
	 * whether this attribute is zero or non-zero. */
	int linebreak;

	/* Nesting behaviour of the element, see enum element_type. */
	enum element_type type;
};
419
/* Table of all known elements: name, handler, linebreak count, type.
 *
 * Keep it sorted by name: when USE_FASTFIND is not defined,
 * process_element() searches it with bsearch() and compar().
 * The last entry, with a NULL name, only marks the end of the table for
 * tags_list_next(); it is not counted in NUMBER_OF_TAGS. */
static struct element_info elements[] = {
	{"A", html_a, 0, ELEMENT_TYPE_NON_NESTABLE},
	{"ABBR", html_italic, 0, ELEMENT_TYPE_NESTABLE },
	{"ADDRESS", html_address, 2, ELEMENT_TYPE_NESTABLE },
	{"APPLET", html_applet, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"B", html_bold, 0, ELEMENT_TYPE_NESTABLE },
	{"BASE", html_base, 0, ELEMENT_TYPE_NON_PAIRABLE},
	{"BASEFONT", html_font, 0, ELEMENT_TYPE_NON_PAIRABLE},
	{"BLOCKQUOTE", html_blockquote, 2, ELEMENT_TYPE_NESTABLE },
	{"BODY", html_body, 0, ELEMENT_TYPE_NESTABLE },
	{"BR", html_br, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"BUTTON", html_button, 0, ELEMENT_TYPE_NESTABLE },
	{"CAPTION", html_center, 1, ELEMENT_TYPE_NESTABLE },
	{"CENTER", html_center, 1, ELEMENT_TYPE_NESTABLE },
	{"CODE", html_fixed, 0, ELEMENT_TYPE_NESTABLE },
	{"DD", html_dd, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"DFN", html_bold, 0, ELEMENT_TYPE_NESTABLE },
	{"DIR", html_ul, 2, ELEMENT_TYPE_NESTABLE },
	{"DIV", html_linebrk, 1, ELEMENT_TYPE_NESTABLE },
	{"DL", html_dl, 2, ELEMENT_TYPE_NESTABLE },
	{"DT", html_dt, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"EM", html_italic, 0, ELEMENT_TYPE_NESTABLE },
	{"EMBED", html_embed, 0, ELEMENT_TYPE_NON_PAIRABLE},
	{"FIXED", html_fixed, 0, ELEMENT_TYPE_NESTABLE },
	{"FONT", html_font, 0, ELEMENT_TYPE_NESTABLE },
	{"FORM", html_form, 1, ELEMENT_TYPE_NESTABLE },
	{"FRAME", html_frame, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"FRAMESET", html_frameset, 1, ELEMENT_TYPE_NESTABLE },
	{"H1", html_h1, 2, ELEMENT_TYPE_NON_NESTABLE},
	{"H2", html_h2, 2, ELEMENT_TYPE_NON_NESTABLE},
	{"H3", html_h3, 2, ELEMENT_TYPE_NON_NESTABLE},
	{"H4", html_h4, 2, ELEMENT_TYPE_NON_NESTABLE},
	{"H5", html_h5, 2, ELEMENT_TYPE_NON_NESTABLE},
	{"H6", html_h6, 2, ELEMENT_TYPE_NON_NESTABLE},
	{"HEAD", html_head, 0, ELEMENT_TYPE_NESTABLE },
	{"HR", html_hr, 2, ELEMENT_TYPE_NON_PAIRABLE},
	{"HTML", html_html, 0, ELEMENT_TYPE_NESTABLE },
	{"I", html_italic, 0, ELEMENT_TYPE_NESTABLE },
	{"IFRAME", html_iframe, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"IMG", html_img, 0, ELEMENT_TYPE_NON_PAIRABLE},
	{"INPUT", html_input, 0, ELEMENT_TYPE_NON_PAIRABLE},
	{"LI", html_li, 1, ELEMENT_TYPE_LI },
	{"LINK", html_link, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"LISTING", html_pre, 2, ELEMENT_TYPE_NESTABLE },
	{"MENU", html_ul, 2, ELEMENT_TYPE_NESTABLE },
	{"META", html_meta, 0, ELEMENT_TYPE_NON_PAIRABLE},
	{"NOFRAMES", html_noframes, 0, ELEMENT_TYPE_NESTABLE },
	{"NOSCRIPT", html_noscript, 0, ELEMENT_TYPE_NESTABLE },
	{"OBJECT", html_object, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"OL", html_ol, 2, ELEMENT_TYPE_NESTABLE },
	{"OPTION", html_option, 1, ELEMENT_TYPE_NON_PAIRABLE},
	{"P", html_p, 2, ELEMENT_TYPE_NON_NESTABLE},
	{"PRE", html_pre, 2, ELEMENT_TYPE_NESTABLE },
	{"Q", html_italic, 0, ELEMENT_TYPE_NESTABLE },
	{"S", html_underline, 0, ELEMENT_TYPE_NESTABLE },
	{"SCRIPT", html_script, 0, ELEMENT_TYPE_NESTABLE },
	{"SELECT", html_select, 0, ELEMENT_TYPE_NESTABLE },
	{"SPAN", html_span, 0, ELEMENT_TYPE_NESTABLE },
	{"STRIKE", html_underline, 0, ELEMENT_TYPE_NESTABLE },
	{"STRONG", html_bold, 0, ELEMENT_TYPE_NESTABLE },
	{"STYLE", html_style, 0, ELEMENT_TYPE_NESTABLE },
	{"SUB", html_subscript, 0, ELEMENT_TYPE_NESTABLE },
	{"SUP", html_superscript, 0, ELEMENT_TYPE_NESTABLE },
	{"TABLE", html_table, 2, ELEMENT_TYPE_NESTABLE },
	{"TD", html_td, 0, ELEMENT_TYPE_NESTABLE },
	{"TEXTAREA", html_textarea, 0, ELEMENT_TYPE_NON_PAIRABLE},
	{"TH", html_th, 0, ELEMENT_TYPE_NESTABLE },
	{"TITLE", html_title, 0, ELEMENT_TYPE_NESTABLE },
	{"TR", html_tr, 1, ELEMENT_TYPE_NESTABLE },
	{"TT", html_tt, 0, ELEMENT_TYPE_NON_NESTABLE},
	{"U", html_underline, 0, ELEMENT_TYPE_NESTABLE },
	{"UL", html_ul, 2, ELEMENT_TYPE_NESTABLE },
	{"XMP", html_xmp, 2, ELEMENT_TYPE_NESTABLE },
	{NULL, NULL, 0, ELEMENT_TYPE_NESTABLE },
};

/* Number of real entries in elements[], i.e. without the terminator. */
#define NUMBER_OF_TAGS (sizeof_array(elements) - 1)
497
498
499 #ifndef USE_FASTFIND
500
501 static int
compar(const void * a,const void * b)502 compar(const void *a, const void *b)
503 {
504 return c_strcasecmp(((struct element_info *) a)->name,
505 ((struct element_info *) b)->name);
506 }
507
508 #else
509
510 static struct element_info *internal_pointer;
511
512 /* Reset internal list pointer */
513 static void
tags_list_reset(void)514 tags_list_reset(void)
515 {
516 internal_pointer = elements;
517 }
518
519 /* Returns a pointer to a struct that contains
520 * current key and data pointers and increment
521 * internal pointer.
522 * It returns NULL when key is NULL. */
523 static struct fastfind_key_value *
tags_list_next(void)524 tags_list_next(void)
525 {
526 static struct fastfind_key_value kv;
527
528 if (!internal_pointer->name) return NULL;
529
530 kv.key = internal_pointer->name;
531 kv.data = internal_pointer;
532
533 internal_pointer++;
534
535 return &kv;
536 }
537
/* Fastfind index over the element table, labelled "tags_lookup". Its keys
 * are supplied through tags_list_reset() and tags_list_next(); it is set up
 * in init_tags_lookup() and released in free_tags_lookup(). */
static struct fastfind_index ff_tags_index
	= INIT_FASTFIND_INDEX("tags_lookup", tags_list_reset, tags_list_next);
540
541 #endif /* USE_FASTFIND */
542
543
/* Prepare the element name lookup. This builds the fastfind index when
 * USE_FASTFIND is defined and does nothing otherwise. */
void
init_tags_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_tags_index, FF_COMPRESS | FF_LOCALE_INDEP);
#endif
}
551
/* Counterpart of init_tags_lookup(): hands the fastfind index to
 * fastfind_done() when USE_FASTFIND is defined and does nothing
 * otherwise. */
void
free_tags_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_tags_index);
#endif
}
559
560
561 static unsigned char *process_element(unsigned char *name, int namelen, int endingtag,
562 unsigned char *html, unsigned char *prev_html,
563 unsigned char *eof, unsigned char *attr,
564 struct html_context *html_context);
565
566 void
parse_html(unsigned char * html,unsigned char * eof,struct part * part,unsigned char * head,struct html_context * html_context)567 parse_html(unsigned char *html, unsigned char *eof,
568 struct part *part, unsigned char *head,
569 struct html_context *html_context)
570 {
571 unsigned char *base_pos = html;
572 int noupdate = 0;
573
574 html_context->putsp = HTML_SPACE_SUPPRESS;
575 html_context->line_breax = html_context->table_level ? 2 : 1;
576 html_context->position = 0;
577 html_context->was_br = 0;
578 html_context->was_li = 0;
579 html_context->was_body = 0;
580 /* html_context->was_body_background = 0; */
581 html_context->part = part;
582 html_context->eoff = eof;
583 if (head) process_head(html_context, head);
584
585 main_loop:
586 while (html < eof) {
587 unsigned char *name, *attr, *end;
588 int namelen, endingtag;
589 int dotcounter = 0;
590
591 if (!noupdate) {
592 html_context->part = part;
593 html_context->eoff = eof;
594 base_pos = html;
595 } else {
596 noupdate = 0;
597 }
598
599 if (isspace(*html) && !html_is_preformatted()) {
600 unsigned char *h = html;
601
602 while (h < eof && isspace(*h))
603 h++;
604 if (h + 1 < eof && h[0] == '<' && h[1] == '/') {
605 if (!parse_element(h, eof, &name, &namelen, &attr, &end)) {
606 put_chrs(html_context, base_pos, html - base_pos);
607 base_pos = html = h;
608 html_context->putsp = HTML_SPACE_ADD;
609 goto element;
610 }
611 }
612 html++;
613 if (!(html_context->position + (html - base_pos - 1)))
614 goto skip_w; /* ??? */
615 if (*(html - 1) == ' ') { /* Do not replace with isspace() ! --Zas */
616 /* BIG performance win; not sure if it doesn't cause any bug */
617 if (html < eof && !isspace(*html)) {
618 noupdate = 1;
619 continue;
620 }
621 put_chrs(html_context, base_pos, html - base_pos);
622 } else {
623 put_chrs(html_context, base_pos, html - base_pos - 1);
624 put_chrs(html_context, " ", 1);
625 }
626
627 skip_w:
628 while (html < eof && isspace(*html))
629 html++;
630 continue;
631 }
632
633 if (html_is_preformatted()) {
634 html_context->putsp = HTML_SPACE_NORMAL;
635 if (*html == ASCII_TAB) {
636 put_chrs(html_context, base_pos, html - base_pos);
637 put_chrs(html_context, " ",
638 8 - (html_context->position % 8));
639 html++;
640 continue;
641
642 } else if (*html == ASCII_CR || *html == ASCII_LF) {
643 put_chrs(html_context, base_pos, html - base_pos);
644 if (html - base_pos == 0 && html_context->line_breax > 0)
645 html_context->line_breax--;
646 next_break:
647 if (*html == ASCII_CR && html < eof - 1
648 && html[1] == ASCII_LF)
649 html++;
650 ln_break(html_context, 1);
651 html++;
652 if (*html == ASCII_CR || *html == ASCII_LF) {
653 html_context->line_breax = 0;
654 goto next_break;
655 }
656 continue;
657
658 } else if (html + 5 < eof && *html == '&') {
659 /* Really nasty hack to make handling in
660 * <pre>-tags lynx-compatible. It works around
661 * the entity handling done in the renderer,
662 * since checking #13 value there would require
663 * something along the lines of NBSP_CHAR or
664 * checking for '\n's in AT_PREFORMATTED text. */
665 /* See bug 52 and 387 for more info. */
666 int length = html - base_pos;
667 int newlines = 0;
668
669 while ((html + 5 < eof && html[0] == '&' && html[1] == '#')
670 && (!memcmp(html + 2, "13;", 3)
671 || (html + 6 < eof && !c_strncasecmp(html + 2, "x0a;", 4)))) {
672 newlines++;
673 html += 5 + (html[4] != ';');
674 }
675
676 if (newlines) {
677 put_chrs(html_context, base_pos, length);
678 ln_break(html_context, newlines);
679 continue;
680 }
681 }
682 }
683
684 while (*html < ' ') {
685 if (html - base_pos)
686 put_chrs(html_context, base_pos, html - base_pos);
687
688 dotcounter++;
689 base_pos = ++html;
690 if (*html >= ' ' || isspace(*html) || html >= eof) {
691 unsigned char *dots = fmem_alloc(dotcounter);
692
693 if (dots) {
694 memset(dots, '.', dotcounter);
695 put_chrs(html_context, dots, dotcounter);
696 fmem_free(dots);
697 }
698 goto main_loop;
699 }
700 }
701
702 if (html + 2 <= eof && html[0] == '<' && (html[1] == '!' || html[1] == '?')
703 && !(html_context->was_xmp || html_context->was_style)) {
704 put_chrs(html_context, base_pos, html - base_pos);
705 html = skip_comment(html, eof);
706 continue;
707 }
708
709 if (*html != '<' || parse_element(html, eof, &name, &namelen, &attr, &end)) {
710 html++;
711 noupdate = 1;
712 continue;
713 }
714
715 element:
716 endingtag = *name == '/'; name += endingtag; namelen -= endingtag;
717 if (!endingtag && html_context->putsp == HTML_SPACE_ADD && !html_top.invisible)
718 put_chrs(html_context, " ", 1);
719 put_chrs(html_context, base_pos, html - base_pos);
720 if (!html_is_preformatted() && !endingtag && html_context->putsp == HTML_SPACE_NORMAL) {
721 unsigned char *ee = end;
722 unsigned char *nm;
723
724 while (!parse_element(ee, eof, &nm, NULL, NULL, &ee))
725 if (*nm == '/')
726 goto ng;
727 if (ee < eof && isspace(*ee)) {
728 put_chrs(html_context, " ", 1);
729 }
730 ng:;
731 }
732
733 html = process_element(name, namelen, endingtag, end, html, eof, attr, html_context);
734 }
735
736 if (noupdate) put_chrs(html_context, base_pos, html - base_pos);
737 ln_break(html_context, 1);
738 /* Restore the part in case the html_context was trashed in the last
739 * iteration so that when destroying the stack in the caller we still
740 * get the right part pointer. */
741 html_context->part = part;
742 html_context->putsp = HTML_SPACE_SUPPRESS;
743 html_context->position = 0;
744 html_context->was_br = 0;
745 }
746
/* Process an opening tag.
 *
 * - @ei describes the element, @name/@namelen is its name in the document.
 * - @html points behind the tag, @eof to the end of the parsed area.
 * - @attr points to the attributes of the tag.
 *
 * Returns the position where parsing should continue. This is @html unless
 * the element handler moved it: the handler is given the address of the
 * local @html as its last argument. */
static unsigned char *
start_element(struct element_info *ei,
	      unsigned char *name, int namelen,
	      unsigned char *html,
	      unsigned char *eof, unsigned char *attr,
	      struct html_context *html_context)
{
/* Ensure the line breaks the element asks for and, if the tag has an "id"
 * attribute, report it through special_f() with SP_TAG. Uses the local
 * variable @a as scratch. */
#define ELEMENT_RENDER_PROLOGUE \
	ln_break(html_context, ei->linebreak); \
	a = get_attr_val(attr, "id", html_context->options); \
	if (a) { \
		html_context->special_f(html_context, SP_TAG, a); \
		mem_free(a); \
	}

	unsigned char *a;
	struct par_attrib old_format;
	int restore_format;
#ifdef CONFIG_CSS
	struct css_selector *selector = NULL;
#endif

	/* A weak element on top of the stack does not survive the start of
	 * another element. */
	if (html_top.type == ELEMENT_WEAK) {
		kill_html_stack_item(html_context, &html_top);
	}

	/* We try to process nested <script> if we didn't process the parent
	 * one. */
	if (html_top.invisible
	    && (ei->func != html_script || html_top.invisible < 2)) {
		ELEMENT_RENDER_PROLOGUE
		return html;
	}

	/* Remember the paragraph format; it is put back at the end when we
	 * started out in preformatted text. */
	restore_format = html_is_preformatted();
	old_format = par_format;

	/* Support for <meta refresh="..."> inside <body>. (bug 700) */
	if (ei->func == html_meta && html_context->was_body) {
		html_handle_body_meta(html_context, name - 1, eof);
		html_context->was_body = 0;
	}

#ifdef CONFIG_CSS
	if (ei->func == html_style && html_context->options->css_enable) {
		css_parse_stylesheet(&html_context->css_styles,
				     html_context->base_href, html, eof);
	}
#endif

	if (ei->type == ELEMENT_TYPE_NON_NESTABLE
	    || ei->type == ELEMENT_TYPE_LI) {
		struct html_element *e;

		/* Search the stack for an element this one should close
		 * implicitly. */
		/* NOTE(review): if either loop runs to its end without a
		 * break, @e presumably no longer refers to a real stack
		 * item when it is examined below - confirm against the
		 * foreach() definition. */
		if (ei->type == ELEMENT_TYPE_NON_NESTABLE) {
			foreach (e, html_context->stack) {
				if (e->type < ELEMENT_KILLABLE) break;
				if (is_block_element(e) || is_inline_element(ei)) break;
			}
		} else foreach (e, html_context->stack) {
			if (is_block_element(e) && is_inline_element(ei)) break;
			if (e->type < ELEMENT_KILLABLE) break;
			if (!c_strlcasecmp(e->name, e->namelen, name, namelen)) break;
		}
		/* Same name: kill everything above the found element, and
		 * the element itself unless its type forbids that. */
		if (!c_strlcasecmp(e->name, e->namelen, name, namelen)) {
			while (e->prev != (void *) &html_context->stack)
				kill_html_stack_item(html_context, e->prev);

			if (e->type > ELEMENT_IMMORTAL)
				kill_html_stack_item(html_context, e);
		}
	}

	/* Elements that can have a closing tag get their own stack entry. */
	if (ei->type != ELEMENT_TYPE_NON_PAIRABLE) {
		html_stack_dup(html_context, ELEMENT_KILLABLE);
		html_top.name = name;
		html_top.namelen = namelen;
		html_top.options = attr;
		html_top.linebreak = ei->linebreak;

#ifdef CONFIG_ECMASCRIPT
		if (has_attr(attr, "onClick", html_context->options)) {
			/* XXX: Put something better to format.link. --pasky */
			mem_free_set(&format.link, stracpy("javascript:void(0);"));
			mem_free_set(&format.target, stracpy(html_context->base_target));
			format.style.fg = format.clink;
			html_top.pseudo_class = ELEMENT_LINK;
			mem_free_set(&format.title, stracpy("onClick placeholder"));
			/* Er. I know. Well, double html_focusable()s shouldn't
			 * really hurt. */
			html_focusable(html_context, attr);
		}
#endif
	}

#ifdef CONFIG_CSS
	if (html_top.options && html_context->options->css_enable) {
		/* XXX: We should apply CSS otherwise as well, but that'll need
		 * some deeper changes in order to have options filled etc.
		 * Probably just applying CSS from more places, since we
		 * usually have type != ELEMENT_TYPE_NESTABLE when we either (1)
		 * rescan on your own from somewhere else (2) html_stack_dup()
		 * in our own way. --pasky */
		/* Call it now to gain some of the stuff which might affect
		 * formatting of some elements. */
		/* FIXME: The caching of the CSS selector is broken, since it
		 * can lead to wrong styles being applied to following
		 * elements, so disabled for now. */
		selector = get_css_selector_for_element(html_context, &html_top,
							&html_context->css_styles,
							&html_context->stack);

		if (selector) {
			apply_css_selector_style(html_context, &html_top, selector);
			done_css_selector(selector);
		}
	}
	/* Now this was the reason for this whole funny ELEMENT_RENDER_PROLOGUE
	 * business. Only now we have the definitive linebreak value, since
	 * that's what the display: property plays with. */
#endif
	ELEMENT_RENDER_PROLOGUE
	if (ei->func) ei->func(html_context, attr, html, eof, &html);
#ifdef CONFIG_CSS
	/* NOTE(review): @selector has already been handed to
	 * done_css_selector() above; here it only serves as a "a selector
	 * was found" flag. If done_css_selector() frees it, a separate flag
	 * would be cleaner - confirm. */
	if (selector && html_top.options) {
		/* Call it now to override default colors of the elements. */
		selector = get_css_selector_for_element(html_context, &html_top,
							&html_context->css_styles,
							&html_context->stack);

		if (selector) {
			apply_css_selector_style(html_context, &html_top, selector);
			done_css_selector(selector);
		}
	}
#endif

	if (ei->func != html_br) html_context->was_br = 0;

	if (restore_format) par_format = old_format;

	return html;
#undef ELEMENT_RENDER_PROLOGUE
}
891
892 static unsigned char *
end_element(struct element_info * ei,unsigned char * name,int namelen,unsigned char * html,unsigned char * eof,unsigned char * attr,struct html_context * html_context)893 end_element(struct element_info *ei,
894 unsigned char *name, int namelen,
895 unsigned char *html,
896 unsigned char *eof, unsigned char *attr,
897 struct html_context *html_context)
898 {
899 struct html_element *e, *elt;
900 int lnb = 0;
901 int kill = 0;
902
903 if (ei->func == html_xmp) html_context->was_xmp = 0;
904 if (ei->func == html_style) html_context->was_style = 0;
905
906 html_context->was_br = 0;
907 if (ei->type == ELEMENT_TYPE_NON_PAIRABLE
908 || ei->type == ELEMENT_TYPE_LI)
909 return html;
910
911 /* Apply background color from the <HTML> element. (bug 696) */
912 if (ei->func == html_html
913 && html_top.type >= ELEMENT_KILLABLE
914 && !html_context->was_body_background)
915 html_apply_canvas_bgcolor(html_context);
916
917 /* dump_html_stack(html_context); */
918 foreach (e, html_context->stack) {
919 if (is_block_element(e) && is_inline_element(ei)) kill = 1;
920 if (c_strlcasecmp(e->name, e->namelen, name, namelen)) {
921 if (e->type < ELEMENT_KILLABLE)
922 break;
923 else
924 continue;
925 }
926 if (kill) {
927 kill_html_stack_item(html_context, e);
928 break;
929 }
930 for (elt = e;
931 elt != (void *) &html_context->stack;
932 elt = elt->prev)
933 if (elt->linebreak > lnb)
934 lnb = elt->linebreak;
935
936 /* This hack forces a line break after a list end. It is needed
937 * when ending a list with the last <li> having no text the
938 * line_breax is 2 so the ending list's linebreak will be
939 * ignored when calling ln_break(). */
940 if (html_context->was_li)
941 html_context->line_breax = 0;
942
943 ln_break(html_context, lnb);
944 while (e->prev != (void *) &html_context->stack)
945 kill_html_stack_item(html_context, e->prev);
946 kill_html_stack_item(html_context, e);
947 break;
948 }
949 /* dump_html_stack(html_context); */
950
951 return html;
952 }
953
954 static unsigned char *
process_element(unsigned char * name,int namelen,int endingtag,unsigned char * html,unsigned char * prev_html,unsigned char * eof,unsigned char * attr,struct html_context * html_context)955 process_element(unsigned char *name, int namelen, int endingtag,
956 unsigned char *html, unsigned char *prev_html,
957 unsigned char *eof, unsigned char *attr,
958 struct html_context *html_context)
959
960 {
961 struct element_info *ei;
962
963 #ifndef USE_FASTFIND
964 {
965 struct element_info elem;
966 unsigned char tmp;
967
968 tmp = name[namelen];
969 name[namelen] = '\0';
970
971 elem.name = name;
972 ei = bsearch(&elem, elements, NUMBER_OF_TAGS, sizeof(elem), compar);
973 name[namelen] = tmp;
974 }
975 #else
976 ei = (struct element_info *) fastfind_search(&ff_tags_index, name, namelen);
977 #endif
978 if (html_context->was_xmp || html_context->was_style) {
979 if (!ei || (ei->func != html_xmp && ei->func != html_style) || !endingtag) {
980 put_chrs(html_context, "<", 1);
981 return prev_html + 1;
982 }
983 }
984
985 if (!ei) return html;
986
987 if (!endingtag) {
988 return start_element(ei, name, namelen, html, eof, attr, html_context);
989 } else {
990 return end_element(ei, name, namelen, html, eof, attr, html_context);
991 }
992 }
993
994 void
scan_http_equiv(unsigned char * s,unsigned char * eof,struct string * head,struct string * title,struct document_options * options)995 scan_http_equiv(unsigned char *s, unsigned char *eof, struct string *head,
996 struct string *title, struct document_options *options)
997 {
998 unsigned char *name, *attr, *he, *c;
999 int namelen;
1000
1001 if (title && !init_string(title)) return;
1002
1003 add_char_to_string(head, '\n');
1004
1005 se:
1006 while (s < eof && *s != '<') {
1007 sp:
1008 s++;
1009 }
1010 if (s >= eof) return;
1011 if (s + 2 <= eof && (s[1] == '!' || s[1] == '?')) {
1012 s = skip_comment(s, eof);
1013 goto se;
1014 }
1015 if (parse_element(s, eof, &name, &namelen, &attr, &s)) goto sp;
1016
1017 ps:
1018 if (!c_strlcasecmp(name, namelen, "HEAD", 4)) goto se;
1019 if (!c_strlcasecmp(name, namelen, "/HEAD", 5)) return;
1020 if (!c_strlcasecmp(name, namelen, "BODY", 4)) return;
1021 if (title && !title->length && !c_strlcasecmp(name, namelen, "TITLE", 5)) {
1022 unsigned char *s1;
1023
1024 xse:
1025 s1 = s;
1026 while (s < eof && *s != '<') {
1027 xsp:
1028 s++;
1029 }
1030 if (s - s1)
1031 add_bytes_to_string(title, s1, s - s1);
1032 if (s >= eof) goto se;
1033 if (s + 2 <= eof && (s[1] == '!' || s[1] == '?')) {
1034 s = skip_comment(s, eof);
1035 goto xse;
1036 }
1037 if (parse_element(s, eof, &name, &namelen, &attr, &s)) {
1038 s1 = s;
1039 goto xsp;
1040 }
1041 clr_spaces(title->source);
1042 goto ps;
1043 }
1044 if (c_strlcasecmp(name, namelen, "META", 4)) goto se;
1045
1046 he = get_attr_val(attr, "charset", options);
1047 if (he) {
1048 add_to_string(head, "Charset: ");
1049 add_to_string(head, he);
1050 mem_free(he);
1051 }
1052
1053 he = get_attr_val(attr, "http-equiv", options);
1054 if (!he) goto se;
1055
1056 add_to_string(head, he);
1057 mem_free(he);
1058
1059 c = get_attr_val(attr, "content", options);
1060 if (c) {
1061 add_to_string(head, ": ");
1062 add_to_string(head, c);
1063 mem_free(c);
1064 }
1065
1066 add_crlf_to_string(head);
1067 goto se;
1068 }
1069