1 /* HTML core parser routines */
2 
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
6 
7 #include <errno.h>
8 #include <stdarg.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 
13 #include "elinks.h"
14 
15 #include "document/css/apply.h"
16 #include "document/css/parser.h"
17 #include "document/html/parser/forms.h"
18 #include "document/html/parser/general.h"
19 #include "document/html/parser/link.h"
20 #include "document/html/parser/parse.h"
21 #include "document/html/parser/stack.h"
22 #include "document/html/parser.h"
23 #include "document/options.h"
24 #include "intl/charsets.h"
25 #include "util/conv.h"
26 #include "util/error.h"
27 #include "util/fastfind.h"
28 #include "util/memdebug.h"
29 #include "util/memory.h"
30 #include "util/string.h"
31 
32 /* Unsafe macros */
33 #include "document/html/internal.h"
34 
35 
#define end_of_tag(c) ((c) == '>' || (c) == '<')

/* Tell whether @c may be part of an attribute name.
 * Returns 1 for every character below 127 that is either above '>' or is
 * above ' ' and is none of '=', '<', '>'; returns 0 otherwise. */
static inline int
atchr(register unsigned char c)
{
	/* DEL and everything in the upper half never belong to a name. */
	if (c >= 127) return 0;

	/* Everything after '>' (letters, '?', '@', ...) is accepted as is. */
	if (c > '>') return 1;

	/* Below that, reject blanks/controls, '=' and the tag delimiters. */
	if (c <= ' ' || c == '=') return 0;

	return !end_of_tag(c);
}
43 
/* This function eats one HTML element (a start or an end tag). */
/* - e is a pointer to the beginning of the element (*e must be '<')
 * - eof is a pointer to the end of the scanned area
 * - the parsed element name is stored in name and its length in namelen;
 *   for a closing tag both include the leading '/' (callers have to strip
 *   it themselves)
 * - attr is set to the position where the first attribute is expected
 * - end points to the first character behind the HTML element, or at the
 *   '<' that terminated it when the closing '>' is missing
 * name, namelen, attr and end may each be NULL when the caller does not
 * need them (namelen is only stored when name is given as well). */
/* It returns -1 when it failed (returned values in pointers are invalid) and
 * 0 for success. Running into eof before the element is finished is a
 * failure. */
int
parse_element(register unsigned char *e, unsigned char *eof,
	      unsigned char **name, int *namelen,
	      unsigned char **attr, unsigned char **end)
{
/* Advance @e by one character and bail out of the whole function when that
 * reaches @eof; this is what keeps every loop below inside the buffer. */
#define next_char() if (++e == eof) return -1;

	assert(e && eof);
	if (e >= eof || *e != '<') return -1;

	next_char();
	/* The name starts right after '<', i.e. before an optional '/'. */
	if (name) *name = e;

	if (*e == '/') next_char();
	if (!isident(*e)) return -1;

	while (isident(*e)) next_char();

	/* The name has to be followed by something that can legally end it. */
	if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=')
		return -1;

	if (name && namelen) *namelen = e - *name;

	while (isspace(*e) || *e == '/' || *e == ':') next_char();

	/* Skip bad attribute */
	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();

	if (attr) *attr = e;

	/* From here on the attributes are only walked over in order to find
	 * the end of the element; nothing about them is stored. */
next_attr:
	while (isspace(*e)) next_char();

	/* Skip bad attribute */
	while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();

	if (end_of_tag(*e)) goto end;

	/* Attribute name. */
	while (atchr(*e)) next_char();
	while (isspace(*e)) next_char();

	if (*e != '=') {
		/* Attribute without a value. */
		if (end_of_tag(*e)) goto end;
		goto next_attr;
	}
	next_char();

	while (isspace(*e)) next_char();

	if (isquote(*e)) {
		unsigned char quote = *e;

/* quoted_value: */
		next_char();
		/* '<' and '>' inside a quoted value do not end the element. */
		while (*e != quote) next_char();
		next_char();
		/* The following apparently handles the case of <foo
		 * id="a""b">, however that is very rare and probably not
		 * conforming. More frequent (and mishandling it more fatal) is
		 * probably the typo of <foo id="a""> - we can handle it as
		 * long as this is commented out. --pasky */
		/* if (*e == quote) goto quoted_value; */
	} else {
		/* Unquoted value: runs up to whitespace or the end of tag. */
		while (!isspace(*e) && !end_of_tag(*e)) next_char();
	}

	while (isspace(*e)) next_char();

	if (!end_of_tag(*e)) goto next_attr;

end:
	/* Step over the closing '>', but stay on a '<' so that the next
	 * element can be parsed from there. */
	if (end) *end = e + (*e == '>');

	return 0;
}
127 
128 
129 #define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, unsigned char, 0xFF)
130 
131 #define add_chr(s, l, c)						\
132 	do {								\
133 		if (!realloc_chrs(&(s), l)) return NULL;		\
134 		(s)[(l)++] = (c);					\
135 	} while (0)
136 
137 unsigned char *
get_attr_value(register unsigned char * e,unsigned char * name,struct document_options * options,enum html_attr_flags flags)138 get_attr_value(register unsigned char *e, unsigned char *name,
139 	       struct document_options *options, enum html_attr_flags flags)
140 {
141 	unsigned char *n;
142 	unsigned char *name_start;
143 	unsigned char *attr = NULL;
144 	int attrlen = 0;
145 	int found;
146 
147 next_attr:
148 	skip_space(e);
149 	if (end_of_tag(*e) || !atchr(*e)) goto parse_error;
150 	n = name;
151 	name_start = e;
152 
153 	while (atchr(*n) && atchr(*e) && c_toupper(*e) == c_toupper(*n)) e++, n++;
154 	found = !*n && !atchr(*e);
155 
156 	if (found && (flags & HTML_ATTR_TEST)) return name_start;
157 
158 	while (atchr(*e)) e++;
159 	skip_space(e);
160 	if (*e != '=') {
161 		if (found) goto found_endattr;
162 		goto next_attr;
163 	}
164 	e++;
165 	skip_space(e);
166 
167 	if (found) {
168 		if (!isquote(*e)) {
169 			while (!isspace(*e) && !end_of_tag(*e)) {
170 				if (!*e) goto parse_error;
171 				add_chr(attr, attrlen, *e);
172 				e++;
173 			}
174 		} else {
175 			unsigned char quote = *e;
176 
177 /* parse_quoted_value: */
178 			while (*(++e) != quote) {
179 				if (*e == ASCII_CR) continue;
180 				if (!*e) goto parse_error;
181 				if (*e != ASCII_TAB && *e != ASCII_LF)
182 					add_chr(attr, attrlen, *e);
183 				else if (!(flags & HTML_ATTR_EAT_NL))
184 					add_chr(attr, attrlen, ' ');
185 			}
186 			e++;
187 			/* The following apparently handles the case of <foo
188 			 * id="a""b">, however that is very rare and probably
189 			 * not conforming. More frequent (and mishandling it
190 			 * more fatal) is probably the typo of <foo id="a""> -
191 			 * we can handle it as long as this is commented out.
192 			 * --pasky */
193 #if 0
194 			if (*e == quote) {
195 				add_chr(attr, attrlen, *e);
196 				goto parse_quoted_value;
197 			}
198 #endif
199 		}
200 
201 found_endattr:
202 		add_chr(attr, attrlen, '\0');
203 		attrlen--;
204 
205 		if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */
206 		    memchr(attr, '&', attrlen)) {
207 			unsigned char *saved_attr = attr;
208 
209 			attr = convert_string(NULL, saved_attr, attrlen,
210 			                      options->cp, CSM_QUERY,
211 			                      NULL, NULL, NULL);
212 			mem_free(saved_attr);
213 		}
214 
215 		set_mem_comment(attr, name, strlen(name));
216 		return attr;
217 
218 	} else {
219 		if (!isquote(*e)) {
220 			while (!isspace(*e) && !end_of_tag(*e)) {
221 				if (!*e) goto parse_error;
222 				e++;
223 			}
224 		} else {
225 			unsigned char quote = *e;
226 
227 			do {
228 				while (*(++e) != quote)
229 					if (!*e) goto parse_error;
230 				e++;
231 			} while (/* See above. *e == quote */ 0);
232 		}
233 	}
234 
235 	goto next_attr;
236 
237 parse_error:
238 	mem_free_if(attr);
239 	return NULL;
240 }
241 
242 #undef add_chr
243 
244 
/* Extract the numerical value of attribute @name of element @a.
 * The attribute has to consist of a decimal number only (strtol() syntax,
 * nothing may follow the digits). Returns the value when it lies within
 * 0..INT_MAX, or -1 when the attribute is missing, empty, malformed or out
 * of range. */
int
get_num(unsigned char *a, unsigned char *name, struct document_options *options)
{
	unsigned char *value = get_attr_val(a, name, options);
	unsigned char *tail;
	long parsed;
	int valid;

	if (!value) return -1;

	errno = 0;
	parsed = strtol(value, (char **) &tail, 10);

	/* @tail points into @value, so judge the result before freeing it. */
	valid = !errno && *value && !*tail && parsed >= 0 && parsed <= INT_MAX;

	mem_free(value);

	return valid ? (int) parsed : -1;
}
268 
269 /* Parse 'width[%],....'-like attribute @name of element @a.  If @limited is
270  * set, it will limit the width value to the current usable width. Note that
271  * @limited must be set to be able to parse percentage widths. */
272 /* The function returns width in characters or -1 in case of error. */
273 int
get_width(unsigned char * a,unsigned char * name,int limited,struct html_context * html_context)274 get_width(unsigned char *a, unsigned char *name, int limited,
275           struct html_context *html_context)
276 {
277 	unsigned char *value = get_attr_val(a, name, html_context->options);
278 	unsigned char *str = value;
279 	unsigned char *end;
280 	int percentage = 0;
281 	int len;
282 	long width;
283 
284 	if (!value) return -1;
285 
286 	/* Skip spaces at start of string if any. */
287 	skip_space(str);
288 
289 	/* Search for end of string or ',' character (ie. in "100,200") */
290 	for (len = 0; str[len] && str[len] != ','; len++);
291 
292 	/* Go back, and skip spaces after width if any. */
293 	while (len && isspace(str[len - 1])) len--;
294 	if (!len) { mem_free(value); return -1; } /* Nothing to parse. */
295 
296 	/* Is this a percentage ? */
297 	if (str[len - 1] == '%') len--, percentage = 1;
298 
299 	/* Skip spaces between width number and percentage if any. */
300 	while (len && isspace(str[len - 1])) len--;
301 	if (!len) { mem_free(value); return -1; } /* Nothing to parse. */
302 
303 	/* Shorten the string a bit, so strtoul() will work on useful
304 	 * part of it. */
305 	str[len] = '\0';
306 
307 	/* Convert to number if possible. */
308 	errno = 0;
309 	width = strtoul((char *) str, (char **) &end, 10);
310 
311 	/* @end points into the @value string so check @end position
312 	 * before freeing @value. */
313 	if (errno || *end || width >= INT_MAX) {
314 		/* Not a valid number. */
315 		mem_free(value);
316 		return -1;
317 	}
318 
319 	mem_free(value);
320 
321 #define WIDTH_PIXELS2CHARS(width) ((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH;
322 
323 	if (limited) {
324 		int maxwidth = get_html_max_width();
325 
326 		if (percentage) {
327 			/* Value is a percentage. */
328 			width = width * maxwidth / 100;
329 		} else {
330 			/* Value is a number of pixels, makes an approximation. */
331 			width = WIDTH_PIXELS2CHARS(width);
332 		}
333 
334 		if (width > maxwidth)
335 			width = maxwidth;
336 
337 	} else {
338 		if (percentage) {
339 			/* No sense, we need @limited and @maxwidth for percentage. */
340 			return -1;
341 		} else {
342 			/* Value is a number of pixels, makes an approximation,
343 			 * no limit here */
344 			width = WIDTH_PIXELS2CHARS(width);
345 		}
346 	}
347 
348 #undef WIDTH_PIXELS2CHARS
349 
350 	if (width < 0)
351 		width = 0;
352 
353 	return width;
354 }
355 
356 
/* Skip a "<!...>" or "<?...>" construct starting at @html.
 * A construct opening with "<!--" is a real comment and ends at "--"
 * followed by any number of '-' and whitespace and then '>'; anything else
 * simply ends at the first '>'.
 * Returns a pointer just behind the closing '>', or @eof when the construct
 * is not terminated inside the scanned area. */
unsigned char *
skip_comment(unsigned char *html, unsigned char *eof)
{
	unsigned char *pos;

	if (html + 4 > eof || html[2] != '-' || html[3] != '-') {
		/* Declaration or processing instruction. */
		for (pos = html + 2; pos < eof; pos++)
			if (*pos == '>')
				return pos + 1;

		return eof;
	}

	pos = html + 4;
	while (pos < eof) {
		if (pos + 2 > eof || pos[0] != '-' || pos[1] != '-') {
			pos++;
			continue;
		}

		/* Candidate terminator: "--", more dashes, blanks, '>'. */
		pos += 2;
		while (pos < eof && *pos == '-') pos++;
		while (pos < eof && isspace(*pos)) pos++;

		if (pos >= eof) break;
		if (*pos == '>') return pos + 1;
		/* False alarm; keep scanning from the current position. */
	}

	return eof;
}
384 
385 
386 
387 
/* How an element interacts with the element stack when its start and end
 * tags are processed (see start_element() and end_element()). */
enum element_type {
	/* No special treatment; the element is simply pushed on the stack. */
	ELEMENT_TYPE_NESTABLE,
	/* Starting the element may first kill an open element of the same
	 * name found on the stack. */
	ELEMENT_TYPE_NON_NESTABLE,
	/* The element is not pushed on the stack and its closing tag is
	 * ignored. */
	ELEMENT_TYPE_NON_PAIRABLE,
	/* Like non-nestable, with its own stack search rules on start; the
	 * closing tag is ignored. */
	ELEMENT_TYPE_LI,
};
394 
/* Static description of one known HTML element; see the elements[] table. */
struct element_info {
	/* Element name, uppercase. */
	unsigned char *name;

	/* Element handler. This does the relevant arguments processing and
	 * formatting (by calling renderer hooks). Note that in a few cases,
	 * this is just a placeholder and the element is given special care
	 * in start_element() (which is also where we call these handlers). */
	element_handler_T *func;

	/* How many line-breaks to ensure we have before and after an element.
	 * Value of 1 means the element will be on a line on its own, value
	 * of 2 means that it will also have empty lines before and after.
	 * Note that this does not add up - it just ensures that there are
	 * at least that many linebreaks, but does not add more if that is
	 * the case. Therefore, something like e.g. </pre></p> will add only
	 * two linebreaks, not four. */
	/* In some stack killing logic, we use some weird heuristic based on
	 * whether an element is block or inline. That is determined from
	 * whether this attribute is zero or non-zero. */
	int linebreak;

	/* Stack behaviour of the element, see enum element_type. */
	enum element_type type;
};
419 
/* Table of all elements the parser knows about: name, handler, linebreak
 * count and stack type. The list is terminated by an entry with a NULL
 * name, which tags_list_next() relies on.
 * NOTE(review): the entries are in alphabetical order; presumably the
 * non-fastfind lookup (see compar()) depends on that - keep it sorted. */
static struct element_info elements[] = {
        {"A",           html_a,           0, ELEMENT_TYPE_NON_NESTABLE},
        {"ABBR",        html_italic,      0, ELEMENT_TYPE_NESTABLE    },
        {"ADDRESS",     html_address,     2, ELEMENT_TYPE_NESTABLE    },
        {"APPLET",      html_applet,      1, ELEMENT_TYPE_NON_PAIRABLE},
        {"B",           html_bold,        0, ELEMENT_TYPE_NESTABLE    },
        {"BASE",        html_base,        0, ELEMENT_TYPE_NON_PAIRABLE},
        {"BASEFONT",    html_font,        0, ELEMENT_TYPE_NON_PAIRABLE},
        {"BLOCKQUOTE",  html_blockquote,  2, ELEMENT_TYPE_NESTABLE    },
        {"BODY",        html_body,        0, ELEMENT_TYPE_NESTABLE    },
        {"BR",          html_br,          1, ELEMENT_TYPE_NON_PAIRABLE},
        {"BUTTON",      html_button,      0, ELEMENT_TYPE_NESTABLE    },
        {"CAPTION",     html_center,      1, ELEMENT_TYPE_NESTABLE    },
        {"CENTER",      html_center,      1, ELEMENT_TYPE_NESTABLE    },
        {"CODE",        html_fixed,       0, ELEMENT_TYPE_NESTABLE    },
        {"DD",          html_dd,          1, ELEMENT_TYPE_NON_PAIRABLE},
        {"DFN",         html_bold,        0, ELEMENT_TYPE_NESTABLE    },
        {"DIR",         html_ul,          2, ELEMENT_TYPE_NESTABLE    },
        {"DIV",         html_linebrk,     1, ELEMENT_TYPE_NESTABLE    },
        {"DL",          html_dl,          2, ELEMENT_TYPE_NESTABLE    },
        {"DT",          html_dt,          1, ELEMENT_TYPE_NON_PAIRABLE},
        {"EM",          html_italic,      0, ELEMENT_TYPE_NESTABLE    },
        {"EMBED",       html_embed,       0, ELEMENT_TYPE_NON_PAIRABLE},
        {"FIXED",       html_fixed,       0, ELEMENT_TYPE_NESTABLE    },
        {"FONT",        html_font,        0, ELEMENT_TYPE_NESTABLE    },
        {"FORM",        html_form,        1, ELEMENT_TYPE_NESTABLE    },
        {"FRAME",       html_frame,       1, ELEMENT_TYPE_NON_PAIRABLE},
        {"FRAMESET",    html_frameset,    1, ELEMENT_TYPE_NESTABLE    },
        {"H1",          html_h1,          2, ELEMENT_TYPE_NON_NESTABLE},
        {"H2",          html_h2,          2, ELEMENT_TYPE_NON_NESTABLE},
        {"H3",          html_h3,          2, ELEMENT_TYPE_NON_NESTABLE},
        {"H4",          html_h4,          2, ELEMENT_TYPE_NON_NESTABLE},
        {"H5",          html_h5,          2, ELEMENT_TYPE_NON_NESTABLE},
        {"H6",          html_h6,          2, ELEMENT_TYPE_NON_NESTABLE},
        {"HEAD",        html_head,        0, ELEMENT_TYPE_NESTABLE    },
        {"HR",          html_hr,          2, ELEMENT_TYPE_NON_PAIRABLE},
        {"HTML",        html_html,        0, ELEMENT_TYPE_NESTABLE    },
        {"I",           html_italic,      0, ELEMENT_TYPE_NESTABLE    },
        {"IFRAME",      html_iframe,      1, ELEMENT_TYPE_NON_PAIRABLE},
        {"IMG",         html_img,         0, ELEMENT_TYPE_NON_PAIRABLE},
        {"INPUT",       html_input,       0, ELEMENT_TYPE_NON_PAIRABLE},
        {"LI",          html_li,          1, ELEMENT_TYPE_LI          },
        {"LINK",        html_link,        1, ELEMENT_TYPE_NON_PAIRABLE},
        {"LISTING",     html_pre,         2, ELEMENT_TYPE_NESTABLE    },
        {"MENU",        html_ul,          2, ELEMENT_TYPE_NESTABLE    },
        {"META",        html_meta,        0, ELEMENT_TYPE_NON_PAIRABLE},
        {"NOFRAMES",    html_noframes,    0, ELEMENT_TYPE_NESTABLE    },
        {"NOSCRIPT",    html_noscript,    0, ELEMENT_TYPE_NESTABLE    },
        {"OBJECT",      html_object,      1, ELEMENT_TYPE_NON_PAIRABLE},
        {"OL",          html_ol,          2, ELEMENT_TYPE_NESTABLE    },
        {"OPTION",      html_option,      1, ELEMENT_TYPE_NON_PAIRABLE},
        {"P",           html_p,           2, ELEMENT_TYPE_NON_NESTABLE},
        {"PRE",         html_pre,         2, ELEMENT_TYPE_NESTABLE    },
        {"Q",           html_italic,      0, ELEMENT_TYPE_NESTABLE    },
        {"S",           html_underline,   0, ELEMENT_TYPE_NESTABLE    },
        {"SCRIPT",      html_script,      0, ELEMENT_TYPE_NESTABLE    },
        {"SELECT",      html_select,      0, ELEMENT_TYPE_NESTABLE    },
        {"SPAN",        html_span,        0, ELEMENT_TYPE_NESTABLE    },
        {"STRIKE",      html_underline,   0, ELEMENT_TYPE_NESTABLE    },
        {"STRONG",      html_bold,        0, ELEMENT_TYPE_NESTABLE    },
        {"STYLE",       html_style,       0, ELEMENT_TYPE_NESTABLE    },
        {"SUB",         html_subscript,   0, ELEMENT_TYPE_NESTABLE    },
        {"SUP",         html_superscript, 0, ELEMENT_TYPE_NESTABLE    },
        {"TABLE",       html_table,       2, ELEMENT_TYPE_NESTABLE    },
        {"TD",          html_td,          0, ELEMENT_TYPE_NESTABLE    },
        {"TEXTAREA",    html_textarea,    0, ELEMENT_TYPE_NON_PAIRABLE},
        {"TH",          html_th,          0, ELEMENT_TYPE_NESTABLE    },
        {"TITLE",       html_title,       0, ELEMENT_TYPE_NESTABLE    },
        {"TR",          html_tr,          1, ELEMENT_TYPE_NESTABLE    },
        {"TT",          html_tt,          0, ELEMENT_TYPE_NON_NESTABLE},
        {"U",           html_underline,   0, ELEMENT_TYPE_NESTABLE    },
        {"UL",          html_ul,          2, ELEMENT_TYPE_NESTABLE    },
        {"XMP",         html_xmp,         2, ELEMENT_TYPE_NESTABLE    },
        {NULL,          NULL,             0, ELEMENT_TYPE_NESTABLE    },
};

/* Number of real entries in elements[], not counting the NULL terminator. */
#define NUMBER_OF_TAGS (sizeof_array(elements) - 1)
497 
498 
499 #ifndef USE_FASTFIND
500 
501 static int
compar(const void * a,const void * b)502 compar(const void *a, const void *b)
503 {
504 	return c_strcasecmp(((struct element_info *) a)->name,
505 			    ((struct element_info *) b)->name);
506 }
507 
508 #else
509 
510 static struct element_info *internal_pointer;
511 
512 /* Reset internal list pointer */
513 static void
tags_list_reset(void)514 tags_list_reset(void)
515 {
516 	internal_pointer = elements;
517 }
518 
519 /* Returns a pointer to a struct that contains
520  * current key and data pointers and increment
521  * internal pointer.
522  * It returns NULL when key is NULL. */
523 static struct fastfind_key_value *
tags_list_next(void)524 tags_list_next(void)
525 {
526 	static struct fastfind_key_value kv;
527 
528 	if (!internal_pointer->name) return NULL;
529 
530 	kv.key = internal_pointer->name;
531 	kv.data = internal_pointer;
532 
533 	internal_pointer++;
534 
535 	return &kv;
536 }
537 
538 static struct fastfind_index ff_tags_index
539 	= INIT_FASTFIND_INDEX("tags_lookup", tags_list_reset, tags_list_next);
540 
541 #endif /* USE_FASTFIND */
542 
543 
/* Prepare the element name lookup. With USE_FASTFIND this builds the
 * fastfind index over elements[]; otherwise there is nothing to do. */
void
init_tags_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_index(&ff_tags_index, FF_COMPRESS | FF_LOCALE_INDEP);
#endif
}
551 
/* Counterpart of init_tags_lookup(): with USE_FASTFIND this calls
 * fastfind_done() on the index; otherwise there is nothing to do. */
void
free_tags_lookup(void)
{
#ifdef USE_FASTFIND
	fastfind_done(&ff_tags_index);
#endif
}
559 
560 
561 static unsigned char *process_element(unsigned char *name, int namelen, int endingtag,
562                 unsigned char *html, unsigned char *prev_html,
563                 unsigned char *eof, unsigned char *attr,
564                 struct html_context *html_context);
565 
566 void
parse_html(unsigned char * html,unsigned char * eof,struct part * part,unsigned char * head,struct html_context * html_context)567 parse_html(unsigned char *html, unsigned char *eof,
568 	   struct part *part, unsigned char *head,
569 	   struct html_context *html_context)
570 {
571 	unsigned char *base_pos = html;
572 	int noupdate = 0;
573 
574 	html_context->putsp = HTML_SPACE_SUPPRESS;
575 	html_context->line_breax = html_context->table_level ? 2 : 1;
576 	html_context->position = 0;
577 	html_context->was_br = 0;
578 	html_context->was_li = 0;
579 	html_context->was_body = 0;
580 /*	html_context->was_body_background = 0; */
581 	html_context->part = part;
582 	html_context->eoff = eof;
583 	if (head) process_head(html_context, head);
584 
585 main_loop:
586 	while (html < eof) {
587 		unsigned char *name, *attr, *end;
588 		int namelen, endingtag;
589 		int dotcounter = 0;
590 
591 		if (!noupdate) {
592 			html_context->part = part;
593 			html_context->eoff = eof;
594 			base_pos = html;
595 		} else {
596 			noupdate = 0;
597 		}
598 
599 		if (isspace(*html) && !html_is_preformatted()) {
600 			unsigned char *h = html;
601 
602 			while (h < eof && isspace(*h))
603 				h++;
604 			if (h + 1 < eof && h[0] == '<' && h[1] == '/') {
605 				if (!parse_element(h, eof, &name, &namelen, &attr, &end)) {
606 					put_chrs(html_context, base_pos, html - base_pos);
607 					base_pos = html = h;
608 					html_context->putsp = HTML_SPACE_ADD;
609 					goto element;
610 				}
611 			}
612 			html++;
613 			if (!(html_context->position + (html - base_pos - 1)))
614 				goto skip_w; /* ??? */
615 			if (*(html - 1) == ' ') {	/* Do not replace with isspace() ! --Zas */
616 				/* BIG performance win; not sure if it doesn't cause any bug */
617 				if (html < eof && !isspace(*html)) {
618 					noupdate = 1;
619 					continue;
620 				}
621 				put_chrs(html_context, base_pos, html - base_pos);
622 			} else {
623 				put_chrs(html_context, base_pos, html - base_pos - 1);
624 				put_chrs(html_context, " ", 1);
625 			}
626 
627 skip_w:
628 			while (html < eof && isspace(*html))
629 				html++;
630 			continue;
631 		}
632 
633 		if (html_is_preformatted()) {
634 			html_context->putsp = HTML_SPACE_NORMAL;
635 			if (*html == ASCII_TAB) {
636 				put_chrs(html_context, base_pos, html - base_pos);
637 				put_chrs(html_context, "        ",
638 				         8 - (html_context->position % 8));
639 				html++;
640 				continue;
641 
642 			} else if (*html == ASCII_CR || *html == ASCII_LF) {
643 				put_chrs(html_context, base_pos, html - base_pos);
644 				if (html - base_pos == 0 && html_context->line_breax > 0)
645 					html_context->line_breax--;
646 next_break:
647 				if (*html == ASCII_CR && html < eof - 1
648 				    && html[1] == ASCII_LF)
649 					html++;
650 				ln_break(html_context, 1);
651 				html++;
652 				if (*html == ASCII_CR || *html == ASCII_LF) {
653 					html_context->line_breax = 0;
654 					goto next_break;
655 				}
656 				continue;
657 
658 			} else if (html + 5 < eof && *html == '&') {
659 				/* Really nasty hack to make &#13; handling in
660 				 * <pre>-tags lynx-compatible. It works around
661 				 * the entity handling done in the renderer,
662 				 * since checking #13 value there would require
663 				 * something along the lines of NBSP_CHAR or
664 				 * checking for '\n's in AT_PREFORMATTED text. */
665 				/* See bug 52 and 387 for more info. */
666 				int length = html - base_pos;
667 				int newlines = 0;
668 
669 				while ((html + 5 < eof && html[0] == '&' && html[1] == '#')
670 				       && (!memcmp(html + 2, "13;", 3)
671 					   || (html + 6 < eof && !c_strncasecmp(html + 2, "x0a;", 4)))) {
672 					newlines++;
673 					html += 5 + (html[4] != ';');
674 				}
675 
676 				if (newlines) {
677 					put_chrs(html_context, base_pos, length);
678 					ln_break(html_context, newlines);
679 					continue;
680 				}
681 			}
682 		}
683 
684 		while (*html < ' ') {
685 			if (html - base_pos)
686 				put_chrs(html_context, base_pos, html - base_pos);
687 
688 			dotcounter++;
689 			base_pos = ++html;
690 			if (*html >= ' ' || isspace(*html) || html >= eof) {
691 				unsigned char *dots = fmem_alloc(dotcounter);
692 
693 				if (dots) {
694 					memset(dots, '.', dotcounter);
695 					put_chrs(html_context, dots, dotcounter);
696 					fmem_free(dots);
697 				}
698 				goto main_loop;
699 			}
700 		}
701 
702 		if (html + 2 <= eof && html[0] == '<' && (html[1] == '!' || html[1] == '?')
703 		    && !(html_context->was_xmp || html_context->was_style)) {
704 			put_chrs(html_context, base_pos, html - base_pos);
705 			html = skip_comment(html, eof);
706 			continue;
707 		}
708 
709 		if (*html != '<' || parse_element(html, eof, &name, &namelen, &attr, &end)) {
710 			html++;
711 			noupdate = 1;
712 			continue;
713 		}
714 
715 element:
716 		endingtag = *name == '/'; name += endingtag; namelen -= endingtag;
717 		if (!endingtag && html_context->putsp == HTML_SPACE_ADD && !html_top.invisible)
718 			put_chrs(html_context, " ", 1);
719 		put_chrs(html_context, base_pos, html - base_pos);
720 		if (!html_is_preformatted() && !endingtag && html_context->putsp == HTML_SPACE_NORMAL) {
721 			unsigned char *ee = end;
722 			unsigned char *nm;
723 
724 			while (!parse_element(ee, eof, &nm, NULL, NULL, &ee))
725 				if (*nm == '/')
726 					goto ng;
727 			if (ee < eof && isspace(*ee)) {
728 				put_chrs(html_context, " ", 1);
729 			}
730 ng:;
731 		}
732 
733 		html = process_element(name, namelen, endingtag, end, html, eof, attr, html_context);
734 	}
735 
736 	if (noupdate) put_chrs(html_context, base_pos, html - base_pos);
737 	ln_break(html_context, 1);
738 	/* Restore the part in case the html_context was trashed in the last
739 	 * iteration so that when destroying the stack in the caller we still
740 	 * get the right part pointer. */
741 	html_context->part = part;
742 	html_context->putsp = HTML_SPACE_SUPPRESS;
743 	html_context->position = 0;
744 	html_context->was_br = 0;
745 }
746 
/* Process the start tag of the element described by @ei.
 * - name/namelen is the element name as found in the document
 * - html points behind the start tag, eof is the end of the document
 * - attr points at the attributes of the tag
 * The function cleans up the element stack as required by the element
 * type, pushes a new stack item (unless the element is non-pairable),
 * applies CSS when enabled and calls the element handler.
 * Returns the position where parsing should continue; the handler gets
 * a pointer to @html and may move it. */
static unsigned char *
start_element(struct element_info *ei,
              unsigned char *name, int namelen,
              unsigned char *html,
              unsigned char *eof, unsigned char *attr,
              struct html_context *html_context)
{
/* Ensure the line breaks the element asks for and, when the tag carries an
 * "id" attribute, report it through special_f() with SP_TAG. */
#define ELEMENT_RENDER_PROLOGUE \
	ln_break(html_context, ei->linebreak); \
	a = get_attr_val(attr, "id", html_context->options); \
	if (a) { \
		html_context->special_f(html_context, SP_TAG, a); \
		mem_free(a); \
	}

	unsigned char *a;
	struct par_attrib old_format;
	int restore_format;
#ifdef CONFIG_CSS
	struct css_selector *selector = NULL;
#endif

	/* A weak element on top of the stack does not survive the start of
	 * another element. */
	if (html_top.type == ELEMENT_WEAK) {
		kill_html_stack_item(html_context, &html_top);
	}

	/* We try to process nested <script> if we didn't process the parent
	 * one. */
	if (html_top.invisible
	    && (ei->func != html_script || html_top.invisible < 2)) {
		ELEMENT_RENDER_PROLOGUE
		return html;
	}

	/* Inside preformatted text the paragraph format is put back once the
	 * element has been handled (see the end of the function). */
	restore_format = html_is_preformatted();
	old_format = par_format;

	/* Support for <meta refresh="..."> inside <body>. (bug 700) */
	if (ei->func == html_meta && html_context->was_body) {
		html_handle_body_meta(html_context, name - 1, eof);
		html_context->was_body = 0;
	}

#ifdef CONFIG_CSS
	if (ei->func == html_style && html_context->options->css_enable) {
		css_parse_stylesheet(&html_context->css_styles,
				     html_context->base_href, html, eof);
	}
#endif

	if (ei->type == ELEMENT_TYPE_NON_NESTABLE
	    || ei->type == ELEMENT_TYPE_LI) {
		struct html_element *e;

		/* Pick the stack item at which the search stops; the two
		 * element types use different stop conditions. */
		if (ei->type == ELEMENT_TYPE_NON_NESTABLE) {
			foreach (e, html_context->stack) {
				if (e->type < ELEMENT_KILLABLE) break;
				if (is_block_element(e) || is_inline_element(ei)) break;
			}
		} else foreach (e, html_context->stack) {
			if (is_block_element(e) && is_inline_element(ei)) break;
			if (e->type < ELEMENT_KILLABLE) break;
			if (!c_strlcasecmp(e->name, e->namelen, name, namelen)) break;
		}
		/* NOTE(review): this assumes one of the loops above stopped
		 * at a real stack item; if a loop ran off the end, @e would
		 * refer to the list head - confirm the stack always holds an
		 * item that triggers a break. */
		if (!c_strlcasecmp(e->name, e->namelen, name, namelen)) {
			/* An element of the same name is open: kill everything
			 * above it and then the element itself, unless it is
			 * immortal. */
			while (e->prev != (void *) &html_context->stack)
				kill_html_stack_item(html_context, e->prev);

			if (e->type > ELEMENT_IMMORTAL)
				kill_html_stack_item(html_context, e);
		}
	}

	if (ei->type != ELEMENT_TYPE_NON_PAIRABLE) {
		/* Push a new stack item for this element. */
		html_stack_dup(html_context, ELEMENT_KILLABLE);
		html_top.name = name;
		html_top.namelen = namelen;
		html_top.options = attr;
		html_top.linebreak = ei->linebreak;

#ifdef CONFIG_ECMASCRIPT
		if (has_attr(attr, "onClick", html_context->options)) {
			/* XXX: Put something better to format.link. --pasky */
			mem_free_set(&format.link, stracpy("javascript:void(0);"));
			mem_free_set(&format.target, stracpy(html_context->base_target));
			format.style.fg = format.clink;
			html_top.pseudo_class = ELEMENT_LINK;
			mem_free_set(&format.title, stracpy("onClick placeholder"));
			/* Er. I know. Well, double html_focusable()s shouldn't
			 * really hurt. */
			html_focusable(html_context, attr);
		}
#endif
	}

#ifdef CONFIG_CSS
	if (html_top.options && html_context->options->css_enable) {
		/* XXX: We should apply CSS otherwise as well, but that'll need
		 * some deeper changes in order to have options filled etc.
		 * Probably just applying CSS from more places, since we
		 * usually have type != ELEMENT_TYPE_NESTABLE when we either (1)
		 * rescan on your own from somewhere else (2) html_stack_dup()
		 * in our own way.  --pasky */
		/* Call it now to gain some of the stuff which might affect
		 * formatting of some elements. */
		/* FIXME: The caching of the CSS selector is broken, since it
		 * can lead to wrong styles being applied to following
		 * elements, so disabled for now. */
		selector = get_css_selector_for_element(html_context, &html_top,
							&html_context->css_styles,
							&html_context->stack);

		if (selector) {
			apply_css_selector_style(html_context, &html_top, selector);
			done_css_selector(selector);
		}
	}
	/* Now this was the reason for this whole funny ELEMENT_RENDER_PROLOGUE
	 * business. Only now we have the definitive linebreak value, since
	 * that's what the display: property plays with. */
#endif
	ELEMENT_RENDER_PROLOGUE
	if (ei->func) ei->func(html_context, attr, html, eof, &html);
#ifdef CONFIG_CSS
	/* NOTE(review): @selector has already been passed to
	 * done_css_selector() above; it is only tested for being non-NULL
	 * here (meaning "a selector was found before the handler ran") and
	 * is looked up again before use. */
	if (selector && html_top.options) {
		/* Call it now to override default colors of the elements. */
		selector = get_css_selector_for_element(html_context, &html_top,
							&html_context->css_styles,
							&html_context->stack);

		if (selector) {
			apply_css_selector_style(html_context, &html_top, selector);
			done_css_selector(selector);
		}
	}
#endif

	if (ei->func != html_br) html_context->was_br = 0;

	if (restore_format) par_format = old_format;

	return html;
#undef ELEMENT_RENDER_PROLOGUE
}
891 
/* Process the end tag of the element described by @ei.
 * - name/namelen is the element name as found in the closing tag
 * - html points behind the closing tag and is returned unchanged
 * - eof and attr are not used here
 * The function looks for the matching open element on the stack and kills
 * it, usually together with everything that was opened after it. Closing
 * tags of non-pairable elements and of <li> are ignored. */
static unsigned char *
end_element(struct element_info *ei,
            unsigned char *name, int namelen,
            unsigned char *html,
            unsigned char *eof, unsigned char *attr,
            struct html_context *html_context)
{
	struct html_element *e, *elt;
	int lnb = 0;
	int kill = 0;

	/* Leaving <xmp>/<style> re-enables comment skipping in parse_html(). */
	if (ei->func == html_xmp) html_context->was_xmp = 0;
	if (ei->func == html_style) html_context->was_style = 0;

	html_context->was_br = 0;
	if (ei->type == ELEMENT_TYPE_NON_PAIRABLE
	    || ei->type == ELEMENT_TYPE_LI)
		return html;

	/* Apply background color from the <HTML> element. (bug 696) */
	if (ei->func == html_html
	    && html_top.type >= ELEMENT_KILLABLE
	    && !html_context->was_body_background)
		html_apply_canvas_bgcolor(html_context);

	/* dump_html_stack(html_context); */
	foreach (e, html_context->stack) {
		/* Once a block element is met while closing an inline one,
		 * only the matching element itself will be killed, not the
		 * items above it. */
		if (is_block_element(e) && is_inline_element(ei)) kill = 1;
		if (c_strlcasecmp(e->name, e->namelen, name, namelen)) {
			/* Not our element: keep looking unless this item
			 * may not be killed, in which case give up. */
			if (e->type < ELEMENT_KILLABLE)
				break;
			else
				continue;
		}
		if (kill) {
			kill_html_stack_item(html_context, e);
			break;
		}
		/* Use the biggest linebreak value among the matching element
		 * and everything above it on the stack. */
		for (elt = e;
		     elt != (void *) &html_context->stack;
		     elt = elt->prev)
			if (elt->linebreak > lnb)
				lnb = elt->linebreak;

		/* This hack forces a line break after a list end. It is needed
		 * when ending a list with the last <li> having no text the
		 * line_breax is 2 so the ending list's linebreak will be
		 * ignored when calling ln_break(). */
		if (html_context->was_li)
			html_context->line_breax = 0;

		ln_break(html_context, lnb);
		/* Kill everything above the matching element, then the
		 * element itself. */
		while (e->prev != (void *) &html_context->stack)
			kill_html_stack_item(html_context, e->prev);
		kill_html_stack_item(html_context, e);
		break;
	}
	/* dump_html_stack(html_context); */

	return html;
}
953 
954 static unsigned char *
process_element(unsigned char * name,int namelen,int endingtag,unsigned char * html,unsigned char * prev_html,unsigned char * eof,unsigned char * attr,struct html_context * html_context)955 process_element(unsigned char *name, int namelen, int endingtag,
956                 unsigned char *html, unsigned char *prev_html,
957                 unsigned char *eof, unsigned char *attr,
958                 struct html_context *html_context)
959 
960 {
961 	struct element_info *ei;
962 
963 #ifndef USE_FASTFIND
964 	{
965 		struct element_info elem;
966 		unsigned char tmp;
967 
968 		tmp = name[namelen];
969 		name[namelen] = '\0';
970 
971 		elem.name = name;
972 		ei = bsearch(&elem, elements, NUMBER_OF_TAGS, sizeof(elem), compar);
973 		name[namelen] = tmp;
974 	}
975 #else
976 	ei = (struct element_info *) fastfind_search(&ff_tags_index, name, namelen);
977 #endif
978 	if (html_context->was_xmp || html_context->was_style) {
979 		if (!ei || (ei->func != html_xmp && ei->func != html_style) || !endingtag) {
980 			put_chrs(html_context, "<", 1);
981 			return prev_html + 1;
982 		}
983 	}
984 
985 	if (!ei) return html;
986 
987 	if (!endingtag) {
988 		return start_element(ei, name, namelen, html, eof, attr, html_context);
989 	} else {
990 		return end_element(ei, name, namelen, html, eof, attr, html_context);
991 	}
992 }
993 
994 void
scan_http_equiv(unsigned char * s,unsigned char * eof,struct string * head,struct string * title,struct document_options * options)995 scan_http_equiv(unsigned char *s, unsigned char *eof, struct string *head,
996 		struct string *title, struct document_options *options)
997 {
998 	unsigned char *name, *attr, *he, *c;
999 	int namelen;
1000 
1001 	if (title && !init_string(title)) return;
1002 
1003 	add_char_to_string(head, '\n');
1004 
1005 se:
1006 	while (s < eof && *s != '<') {
1007 sp:
1008 		s++;
1009 	}
1010 	if (s >= eof) return;
1011 	if (s + 2 <= eof && (s[1] == '!' || s[1] == '?')) {
1012 		s = skip_comment(s, eof);
1013 		goto se;
1014 	}
1015 	if (parse_element(s, eof, &name, &namelen, &attr, &s)) goto sp;
1016 
1017 ps:
1018 	if (!c_strlcasecmp(name, namelen, "HEAD", 4)) goto se;
1019 	if (!c_strlcasecmp(name, namelen, "/HEAD", 5)) return;
1020 	if (!c_strlcasecmp(name, namelen, "BODY", 4)) return;
1021 	if (title && !title->length && !c_strlcasecmp(name, namelen, "TITLE", 5)) {
1022 		unsigned char *s1;
1023 
1024 xse:
1025 		s1 = s;
1026 		while (s < eof && *s != '<') {
1027 xsp:
1028 			s++;
1029 		}
1030 		if (s - s1)
1031 			add_bytes_to_string(title, s1, s - s1);
1032 		if (s >= eof) goto se;
1033 		if (s + 2 <= eof && (s[1] == '!' || s[1] == '?')) {
1034 			s = skip_comment(s, eof);
1035 			goto xse;
1036 		}
1037 		if (parse_element(s, eof, &name, &namelen, &attr, &s)) {
1038 			s1 = s;
1039 			goto xsp;
1040 		}
1041 		clr_spaces(title->source);
1042 		goto ps;
1043 	}
1044 	if (c_strlcasecmp(name, namelen, "META", 4)) goto se;
1045 
1046 	he = get_attr_val(attr, "charset", options);
1047 	if (he) {
1048 		add_to_string(head, "Charset: ");
1049 		add_to_string(head, he);
1050 		mem_free(he);
1051 	}
1052 
1053 	he = get_attr_val(attr, "http-equiv", options);
1054 	if (!he) goto se;
1055 
1056 	add_to_string(head, he);
1057 	mem_free(he);
1058 
1059 	c = get_attr_val(attr, "content", options);
1060 	if (c) {
1061 		add_to_string(head, ": ");
1062 		add_to_string(head, c);
1063 	        mem_free(c);
1064 	}
1065 
1066 	add_crlf_to_string(head);
1067 	goto se;
1068 }
1069