/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "hubbub/errors.h"
#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};
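
/*
 * Illustrative sketch (not compiled): how a table like this is consulted
 * when a numeric character reference falls into the Windows-1252 range
 * 128-159.  The helper name is hypothetical.
 */
#if 0
static uint32_t cp1252_remap(uint32_t cp)
{
	/* Codepoints 128-159 index the table; others pass through */
	if (128 <= cp && cp <= 159)
		return cp1252Table[cp - 128];

	return cp;
}
#endif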

/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		int32_t context;		/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity has
						 * overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */

} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA
					 * sections */
	bool paused;			/**< Whether parsing is currently
					 * paused */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */
	parserutils_buffer *insert_buf; /**< Stream insertion buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */
};

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = malloc(sizeof(hubbub_tokeniser));
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(&tok->buffer);
	if (perror != PARSERUTILS_OK) {
		free(tok);
		return hubbub_error_from_parserutils_error(perror);
	}

	perror = parserutils_buffer_create(&tok->insert_buf);
	if (perror != PARSERUTILS_OK) {
		parserutils_buffer_destroy(tok->buffer);
		free(tok);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->paused = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}
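
/*
 * Minimal usage sketch (not compiled; "stream" is assumed to be an
 * inputstream created elsewhere):
 */
#if 0
	hubbub_tokeniser *tok;
	hubbub_error err;

	err = hubbub_tokeniser_create(stream, &tok);
	if (err != HUBBUB_OK)
		return err;

	/* ... set handlers via hubbub_tokeniser_setopt(), feed data,
	 * call hubbub_tokeniser_run() ... */

	hubbub_tokeniser_destroy(tok);
#endif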

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		free(tokeniser->context.current_tag.attributes);
	}

	parserutils_buffer_destroy(tokeniser->insert_buf);

	parserutils_buffer_destroy(tokeniser->buffer);

	free(tokeniser);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	hubbub_error err = HUBBUB_OK;

	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	case HUBBUB_TOKENISER_PAUSE:
		if (params->pause_parse == true) {
			tokeniser->paused = true;
		} else {
			if (tokeniser->paused == true) {
				tokeniser->paused = false;
				/* When unpausing, if we have had something
				 * akin to document.write() happen while
				 * we were paused, then the insert_buf will
				 * have some content.
				 * In this case, we need to prepend it to
				 * the input buffer before we resume parsing,
				 * discarding the insert_buf as we go.
				 */
				if (tokeniser->insert_buf->length > 0) {
					parserutils_inputstream_insert(
						tokeniser->input,
						tokeniser->insert_buf->data,
						tokeniser->insert_buf->length);
					parserutils_buffer_discard(
						tokeniser->insert_buf, 0,
						tokeniser->insert_buf->length);
				}

				err = hubbub_tokeniser_run(tokeniser);
			}
		}
		break;
	}

	return err;
}
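
/*
 * Sketch of pausing and resuming through the option interface (not
 * compiled; "tok" is hypothetical).  Unpausing triggers the insert_buf
 * handling above and immediately resumes tokenisation.
 */
#if 0
	hubbub_tokeniser_optparams params;

	params.pause_parse = true;
	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_PAUSE, &params);

	/* ... e.g. buffer script output via
	 * hubbub_tokeniser_insert_chunk() ... */

	params.pause_parse = false;
	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_PAUSE, &params);
#endif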

/**
 * Insert a chunk of data into the input stream.
 *
 * Inserts the given data into the input stream ready for parsing but
 * does not cause any additional processing of the input.
 *
 * \param tokeniser  Tokeniser instance
 * \param data       Data to insert (UTF-8 encoded)
 * \param len        Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_insert_chunk(hubbub_tokeniser *tokeniser,
		const uint8_t *data, size_t len)
{
	parserutils_error perror;

	if (tokeniser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	perror = parserutils_buffer_append(tokeniser->insert_buf, data, len);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	return HUBBUB_OK;
}
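
/*
 * Usage sketch (not compiled): injecting script-generated markup, in the
 * manner of document.write(), while the tokeniser is paused.  The data
 * sits in insert_buf until the PAUSE option handling above prepends it
 * to the input stream on resume.
 */
#if 0
	const char *markup = "<p>generated</p>";

	hubbub_tokeniser_insert_chunk(tok,
			(const uint8_t *) markup, strlen(markup));
#endif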

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->paused == true)
		return HUBBUB_PAUSED;

#if 0
#define state(x) \
		case x: \
			printf( #x "\n");
#else
#define state(x) \
		case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}
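
/*
 * Sketch of a typical driving loop (not compiled).  It assumes the
 * caller appends chunks to the inputstream with
 * parserutils_inputstream_append(), and get_next_chunk() is a
 * hypothetical data source.  HUBBUB_NEEDDATA is translated to
 * HUBBUB_OK above, so a clean return simply means "feed me more".
 */
#if 0
	while (get_next_chunk(&buf, &buflen)) {
		parserutils_inputstream_append(stream, buf, buflen);

		error = hubbub_tokeniser_run(tok);
		if (error != HUBBUB_OK)
			break;
	}
#endif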


/**
 * Various macros for manipulating buffers.
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert((str).len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)
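
/*
 * Usage sketch (not compiled): START_BUF begins accumulating a string in
 * tokeniser->buffer and sets its length; COLLECT extends a string that
 * has already been started (and asserts as much).  A typical pairing,
 * as used by the tag name states below:
 */
#if 0
	/* First character of a tag name */
	START_BUF(ctag->name, cptr, len);

	/* ... later, each subsequent character ... */
	COLLECT(ctag->name, cptr, len);
#endif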


/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;
			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
		(tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);

		return HUBBUB_NEEDDATA;
	}

	return hubbub_error_from_parserutils_error(error);
}
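
/*
 * Worked example of the '\r' handling above: for the input "a\r\nb",
 * the pending "a" is emitted, no synthetic newline is emitted (the
 * '\r' is followed by '\n'), and the '\n' is then collected normally,
 * so "\r\n" collapses to "\n".  A bare '\r' is replaced by an emitted
 * "\n".
 */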

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}
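
/*
 * Example of the re-encoding step above: "&euro;" resolves to the
 * codepoint U+20AC, which parserutils_charset_utf8_from_ucs4() writes
 * as the three bytes E2 82 AC; len drops from 6 to 3, so the emitted
 * character token has length sizeof(utf8) - len == 3.
 */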

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}
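
/*
 * Note on the (c + 0x20) fold used above: ASCII 'A'..'Z' sit exactly
 * 0x20 below 'a'..'z', so adding 0x20 lowercases an uppercase letter,
 * e.g. 'D' (0x44) + 0x20 == 'd' (0x64).  This is why the 'A'-'Z' and
 * 'a'-'z' branches differ only in the buffered byte.
 */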

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/** \todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
			tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
			tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}
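
/*
 * Worked example of the matching above: with last_start_tag_name
 * "textarea" in RCDATA, the input "</TeXtArEa>" matches because each
 * byte is compared with bit 0x20 masked off (an ASCII-case-insensitive
 * comparison), and the following '>' is one of the permitted
 * terminators ('\t', '\n', '\f', ' ', '>' or '/').
 */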

/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		attr = realloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute));
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		attr = realloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute));
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state is only ever triggered by an '=' */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '\'';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;
	uint8_t c;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(c == '&' ||
		ctag->attributes[ctag->n_attributes - 1].value.len >= 1);

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else {
		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_tag *ctag = &tokeniser->context.current_tag;
		hubbub_attribute *attr = &ctag->attributes[
				ctag->n_attributes - 1];

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);

			/* +1 for the ampersand */
			tokeniser->context.pending +=
					tokeniser->context.match_entity.length
					+ 1;
		} else {
			size_t len = 0;
			const uint8_t *cptr = NULL;
			parserutils_error error;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}

			/* Insert the ampersand */
			COLLECT_MS(attr->value, cptr, len);
			tokeniser->context.pending += len;
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		/* And back to the previous state */
		tokeniser->state = tokeniser->context.prev_state;
	}

	return HUBBUB_OK;
}

/* always switches state */
hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		/** \todo parse error */
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;

		tokeniser->context.current_tag.self_closing = true;
		return emit_current_tag(tokeniser);
	} else {
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}
1751 
1752 /* this state expects the tokeniser's collect buffer to be empty on first entry */
1753 hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
1754 {
1755 	size_t len;
1756 	const uint8_t *cptr;
1757 	parserutils_error error;
1758 	uint8_t c;
1759 
1760 	error = parserutils_inputstream_peek(tokeniser->input,
1761 			tokeniser->context.pending, &cptr, &len);
1762 
1763 	if (error != PARSERUTILS_OK) {
1764 		if (error == PARSERUTILS_EOF) {
1765 			tokeniser->state = STATE_DATA;
1766 			return emit_current_comment(tokeniser);
1767 		} else {
1768 			return hubbub_error_from_parserutils_error(error);
1769 		}
1770 	}
1771 
1772 	c = *cptr;
1773 
1774 	if (c == '>') {
1775 		tokeniser->context.pending += len;
1776 		tokeniser->state = STATE_DATA;
1777 		return emit_current_comment(tokeniser);
1778 	} else if (c == '\0') {
1779 		error = parserutils_buffer_append(tokeniser->buffer,
1780 				u_fffd, sizeof(u_fffd));
1781 		if (error != PARSERUTILS_OK)
1782 			return hubbub_error_from_parserutils_error(error);
1783 
1784 		tokeniser->context.pending += len;
1785 	} else if (c == '\r') {
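		/* Peek at the character after the CR so that both CRLF
		 * and a lone CR collapse to a single LF in the comment
		 * text. */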
1786 		error = parserutils_inputstream_peek(
1787 				tokeniser->input,
1788 				tokeniser->context.pending + len,
1789 				&cptr,
1790 				&len);
1791 
1792 		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1793 			return hubbub_error_from_parserutils_error(error);
1794 		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1795 			error = parserutils_buffer_append(tokeniser->buffer,
1796 					&lf, sizeof(lf));
1797 			if (error != PARSERUTILS_OK) {
1798 				return hubbub_error_from_parserutils_error(
1799 						error);
1800 			}
1801 		}
1802 		tokeniser->context.pending += 1; /* step over the 1-byte CR */
1803 	} else {
1804 		error = parserutils_buffer_append(tokeniser->buffer,
1805 				(uint8_t *) cptr, len);
1806 		if (error != PARSERUTILS_OK)
1807 			return hubbub_error_from_parserutils_error(error);
1808 
1809 		tokeniser->context.pending += len;
1810 	}
1811 
1812 	return HUBBUB_OK;
1813 }
1814 
1815 /* this state always switches to another state straight away */
1816 hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
1817 		hubbub_tokeniser *tokeniser)
1818 {
1819 	size_t len;
1820 	const uint8_t *cptr;
1821 	parserutils_error error;
1822 	uint8_t c;
1823 
1824 	assert(tokeniser->context.pending == 0);
1825 
1826 	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
1827 
1828 	if (error != PARSERUTILS_OK) {
1829 		if (error == PARSERUTILS_EOF) {
1830 			tokeniser->state = STATE_BOGUS_COMMENT;
1831 			return HUBBUB_OK;
1832 		} else {
1833 			return hubbub_error_from_parserutils_error(error);
1834 		}
1835 	}
1836 
1837 	c = *cptr;
1838 
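	/* Note: (c & ~0x20) clears ASCII bit 5, folding lower-case
	 * letters to upper case, so the 'D' of "DOCTYPE" is matched
	 * case-insensitively below. */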
1839 	if (c == '-') {
1840 		tokeniser->context.pending = len;
1841 		tokeniser->state = STATE_MATCH_COMMENT;
1842 	} else if ((c & ~0x20) == 'D') {
1843 		tokeniser->context.pending = len;
1844 		tokeniser->context.match_doctype.count = len;
1845 		tokeniser->state = STATE_MATCH_DOCTYPE;
1846 	} else if (tokeniser->process_cdata_section == true && c == '[') {
1847 		tokeniser->context.pending = len;
1848 		tokeniser->context.match_cdata.count = len;
1849 		tokeniser->state = STATE_MATCH_CDATA;
1850 	} else {
1851 		tokeniser->state = STATE_BOGUS_COMMENT;
1852 	}
1853 
1854 	return HUBBUB_OK;
1855 }
1856 
1857 
1858 hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
1859 {
1860 	size_t len;
1861 	const uint8_t *cptr;
1862 	parserutils_error error;
1863 
1864 	error = parserutils_inputstream_peek(tokeniser->input,
1865 			tokeniser->context.pending, &cptr, &len);
1866 
1867 	if (error != PARSERUTILS_OK) {
1868 		if (error == PARSERUTILS_EOF) {
1869 			tokeniser->context.pending =
1870 				tokeniser->context.current_comment.len = 0;
1871 			tokeniser->state = STATE_BOGUS_COMMENT;
1872 			return HUBBUB_OK;
1873 		} else {
1874 			return hubbub_error_from_parserutils_error(error);
1875 		}
1876 	}
1877 
1878 	tokeniser->context.pending = tokeniser->context.current_comment.len = 0;
1879 
1880 	if (*cptr == '-') {
1881 		parserutils_inputstream_advance(tokeniser->input, SLEN("--"));
1882 		tokeniser->state = STATE_COMMENT_START;
1883 	} else {
1884 		tokeniser->state = STATE_BOGUS_COMMENT;
1885 	}
1886 
1887 	return HUBBUB_OK;
1888 }
1889 
1890 
1891 hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
1892 {
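	/* Shared handler for the comment start, comment start dash,
	 * comment, comment end dash and comment end states;
	 * tokeniser->state distinguishes them, so runs of '-' and the
	 * closing "-->" can be tracked in one place. */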
1893 	size_t len;
1894 	const uint8_t *cptr;
1895 	parserutils_error error;
1896 	uint8_t c;
1897 
1898 	error = parserutils_inputstream_peek(tokeniser->input,
1899 			tokeniser->context.pending, &cptr, &len);
1900 
1901 	if (error != PARSERUTILS_OK) {
1902 		if (error == PARSERUTILS_EOF) {
1903 			tokeniser->state = STATE_DATA;
1904 			return emit_current_comment(tokeniser);
1905 		} else {
1906 			return hubbub_error_from_parserutils_error(error);
1907 		}
1908 	}
1909 
1910 	c = *cptr;
1911 
1912 	if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
1913 			tokeniser->state == STATE_COMMENT_START ||
1914 			tokeniser->state == STATE_COMMENT_END)) {
1915 		tokeniser->context.pending += len;
1916 
1917 		/** \todo parse error if state != COMMENT_END */
1918 		tokeniser->state = STATE_DATA;
1919 		return emit_current_comment(tokeniser);
1920 	} else if (c == '-') {
1921 		if (tokeniser->state == STATE_COMMENT_START) {
1922 			tokeniser->state = STATE_COMMENT_START_DASH;
1923 		} else if (tokeniser->state == STATE_COMMENT_START_DASH) {
1924 			tokeniser->state = STATE_COMMENT_END;
1925 		} else if (tokeniser->state == STATE_COMMENT) {
1926 			tokeniser->state = STATE_COMMENT_END_DASH;
1927 		} else if (tokeniser->state == STATE_COMMENT_END_DASH) {
1928 			tokeniser->state = STATE_COMMENT_END;
1929 		} else if (tokeniser->state == STATE_COMMENT_END) {
1930 			error = parserutils_buffer_append(tokeniser->buffer,
1931 					(uint8_t *) "-", SLEN("-"));
1932 			if (error != PARSERUTILS_OK) {
1933 				return hubbub_error_from_parserutils_error(
1934 						error);
1935 			}
1936 		}
1937 
1938 		tokeniser->context.pending += len;
1939 	} else {
1940 		if (tokeniser->state == STATE_COMMENT_START_DASH ||
1941 				tokeniser->state == STATE_COMMENT_END_DASH) {
1942 			error = parserutils_buffer_append(tokeniser->buffer,
1943 					(uint8_t *) "-", SLEN("-"));
1944 			if (error != PARSERUTILS_OK) {
1945 				return hubbub_error_from_parserutils_error(
1946 						error);
1947 			}
1948 		} else if (tokeniser->state == STATE_COMMENT_END) {
1949 			error = parserutils_buffer_append(tokeniser->buffer,
1950 					(uint8_t *) "--", SLEN("--"));
1951 			if (error != PARSERUTILS_OK) {
1952 				return hubbub_error_from_parserutils_error(
1953 						error);
1954 			}
1955 		}
1956 
1957 		if (c == '\0') {
1958 			error = parserutils_buffer_append(tokeniser->buffer,
1959 					u_fffd, sizeof(u_fffd));
1960 			if (error != PARSERUTILS_OK) {
1961 				return hubbub_error_from_parserutils_error(
1962 						error);
1963 			}
1964 		} else if (c == '\r') {
1965 			size_t next_len;
1966 			error = parserutils_inputstream_peek(
1967 					tokeniser->input,
1968 					tokeniser->context.pending + len,
1969 					&cptr,
1970 					&next_len);
1971 			if (error != PARSERUTILS_OK &&
1972 					error != PARSERUTILS_EOF) {
1973 				return hubbub_error_from_parserutils_error(
1974 						error);
1975 			} else if (error != PARSERUTILS_EOF && *cptr != '\n') {
1976 				error = parserutils_buffer_append(
1977 						tokeniser->buffer,
1978 						&lf, sizeof(lf));
1979 				if (error != PARSERUTILS_OK) {
1980 					return hubbub_error_from_parserutils_error(
1981 							error);
1982 				}
1983 			}
1984 		} else {
1985 			error = parserutils_buffer_append(tokeniser->buffer,
1986 					cptr, len);
1987 			if (error != PARSERUTILS_OK) {
1988 				return hubbub_error_from_parserutils_error(
1989 						error);
1990 			}
1991 		}
1992 
1993 		tokeniser->context.pending += len;
1994 		tokeniser->state = STATE_COMMENT;
1995 	}
1996 
1997 	return HUBBUB_OK;
1998 }
1999 
2000 
2001 
2002 
2003 #define DOCTYPE		"DOCTYPE"
2004 #define DOCTYPE_LEN	(SLEN(DOCTYPE) - 1)
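/* The "- 1" is because the leading 'D' was already matched (and
 * match_doctype.count initialised to 1) in the markup declaration open
 * state, leaving only "OCTYPE" to check here. */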
2005 
2006 hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
2007 {
2008 	size_t len;
2009 	const uint8_t *cptr;
2010 	parserutils_error error;
2011 	uint8_t c;
2012 
2013 	error = parserutils_inputstream_peek(tokeniser->input,
2014 			tokeniser->context.match_doctype.count, &cptr, &len);
2015 
2016 	if (error != PARSERUTILS_OK) {
2017 		if (error == PARSERUTILS_EOF) {
2018 			tokeniser->context.current_comment.len =
2019 					tokeniser->context.pending = 0;
2020 			tokeniser->state = STATE_BOGUS_COMMENT;
2021 			return HUBBUB_OK;
2022 		} else {
2023 			return hubbub_error_from_parserutils_error(error);
2024 		}
2025 	}
2026 
2027 	c = *cptr;
2028 
2029 	assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);
2030 
2031 	if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2032 		tokeniser->context.current_comment.len =
2033 				tokeniser->context.pending = 0;
2034 		tokeniser->state = STATE_BOGUS_COMMENT;
2035 		return HUBBUB_OK;
2036 	}
2037 
2038 	tokeniser->context.pending += len;
2039 
2040 	if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
2041 		/* Skip over the DOCTYPE bit */
2042 		parserutils_inputstream_advance(tokeniser->input,
2043 				tokeniser->context.pending);
2044 
2045 		memset(&tokeniser->context.current_doctype, 0,
2046 				sizeof tokeniser->context.current_doctype);
2047 		tokeniser->context.current_doctype.public_missing = true;
2048 		tokeniser->context.current_doctype.system_missing = true;
2049 		tokeniser->context.pending = 0;
2050 
2051 		tokeniser->state = STATE_DOCTYPE;
2052 	}
2053 
2054 	tokeniser->context.match_doctype.count++;
2055 
2056 	return HUBBUB_OK;
2057 }
2058 
2059 #undef DOCTYPE
2060 #undef DOCTYPE_LEN
2061 
2062 hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
2063 {
2064 	size_t len;
2065 	const uint8_t *cptr;
2066 	parserutils_error error;
2067 	uint8_t c;
2068 
2069 	error = parserutils_inputstream_peek(tokeniser->input,
2070 			tokeniser->context.pending, &cptr, &len);
2071 
2072 	if (error != PARSERUTILS_OK) {
2073 		if (error == PARSERUTILS_EOF) {
2074 			tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
2075 			return HUBBUB_OK;
2076 		} else {
2077 			return hubbub_error_from_parserutils_error(error);
2078 		}
2079 	}
2080 
2081 	c = *cptr;
2082 
2083 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2084 		tokeniser->context.pending += len;
2085 	}
2086 
2087 	tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
2088 
2089 	return HUBBUB_OK;
2090 }
2091 
2092 hubbub_error hubbub_tokeniser_handle_before_doctype_name(
2093 		hubbub_tokeniser *tokeniser)
2094 {
2095 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2096 	size_t len;
2097 	const uint8_t *cptr;
2098 	parserutils_error error;
2099 	uint8_t c;
2100 
2101 	error = parserutils_inputstream_peek(tokeniser->input,
2102 			tokeniser->context.pending, &cptr, &len);
2103 
2104 	if (error != PARSERUTILS_OK) {
2105 		if (error == PARSERUTILS_EOF) {
2106 			/** \todo parse error */
2107 			/* Emit current doctype, force-quirks on */
2108 			tokeniser->state = STATE_DATA;
2109 			return emit_current_doctype(tokeniser, true);
2110 		} else {
2111 			return hubbub_error_from_parserutils_error(error);
2112 		}
2113 	}
2114 
2115 	c = *cptr;
2116 
2117 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2118 		/* pass over in silence */
2119 		tokeniser->context.pending += len;
2120 	} else if (c == '>') {
2121 		/** \todo parse error */
2122 		tokeniser->context.pending += len;
2123 		tokeniser->state = STATE_DATA;
2124 		return emit_current_doctype(tokeniser, true);
2125 	} else {
2126 		if (c == '\0') {
2127 			START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
2128 		} else if ('A' <= c && c <= 'Z') {
2129 			uint8_t lc = c + 0x20;
2130 
2131 			START_BUF(cdoc->name, &lc, len);
2132 		} else {
2133 			START_BUF(cdoc->name, cptr, len);
2134 		}
2135 
2136 		tokeniser->context.pending += len;
2137 		tokeniser->state = STATE_DOCTYPE_NAME;
2138 	}
2139 
2140 	return HUBBUB_OK;
2141 }
2142 
2143 hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
2144 {
2145 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2146 	size_t len;
2147 	const uint8_t *cptr;
2148 	parserutils_error error;
2149 	uint8_t c;
2150 
2151 	error = parserutils_inputstream_peek(tokeniser->input,
2152 			tokeniser->context.pending, &cptr, &len);
2153 
2154 	if (error != PARSERUTILS_OK) {
2155 		if (error == PARSERUTILS_EOF) {
2156 			tokeniser->state = STATE_DATA;
2157 			return emit_current_doctype(tokeniser, true);
2158 		} else {
2159 			return hubbub_error_from_parserutils_error(error);
2160 		}
2161 	}
2162 
2163 	c = *cptr;
2164 
2165 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2166 		tokeniser->context.pending += len;
2167 		tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
2168 	} else if (c == '>') {
2169 		tokeniser->context.pending += len;
2170 		tokeniser->state = STATE_DATA;
2171 		return emit_current_doctype(tokeniser, false);
2172 	} else if (c == '\0') {
2173 		COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
2174 		tokeniser->context.pending += len;
2175 	} else if ('A' <= c && c <= 'Z') {
2176 		uint8_t lc = c + 0x20;
2177 		COLLECT(cdoc->name, &lc, len);
2178 		tokeniser->context.pending += len;
2179 	} else {
2180 		COLLECT(cdoc->name, cptr, len);
2181 		tokeniser->context.pending += len;
2182 	}
2183 
2184 	return HUBBUB_OK;
2185 }
2186 
2187 hubbub_error hubbub_tokeniser_handle_after_doctype_name(
2188 		hubbub_tokeniser *tokeniser)
2189 {
2190 	size_t len;
2191 	const uint8_t *cptr;
2192 	parserutils_error error;
2193 	uint8_t c;
2194 
2195 	error = parserutils_inputstream_peek(tokeniser->input,
2196 			tokeniser->context.pending, &cptr, &len);
2197 
2198 	if (error != PARSERUTILS_OK) {
2199 		if (error == PARSERUTILS_EOF) {
2200 			tokeniser->state = STATE_DATA;
2201 			return emit_current_doctype(tokeniser, true);
2202 		} else {
2203 			return hubbub_error_from_parserutils_error(error);
2204 		}
2205 	}
2206 
2207 	c = *cptr;
2208 	tokeniser->context.pending += len;
2209 
2210 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2211 		/* pass over in silence */
2212 	} else if (c == '>') {
2213 		tokeniser->state = STATE_DATA;
2214 		return emit_current_doctype(tokeniser, false);
2215 	} else if ((c & ~0x20) == 'P') {
2216 		tokeniser->context.match_doctype.count = 1;
2217 		tokeniser->state = STATE_MATCH_PUBLIC;
2218 	} else if ((c & ~0x20) == 'S') {
2219 		tokeniser->context.match_doctype.count = 1;
2220 		tokeniser->state = STATE_MATCH_SYSTEM;
2221 	} else {
2222 		tokeniser->state = STATE_BOGUS_DOCTYPE;
2223 		tokeniser->context.current_doctype.force_quirks = true;
2224 	}
2225 
2226 	return HUBBUB_OK;
2227 }
2228 
2229 #define PUBLIC		"PUBLIC"
2230 #define PUBLIC_LEN	(SLEN(PUBLIC) - 1)
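/* As with DOCTYPE: the leading 'P' was consumed in the after doctype
 * name state, which set match_doctype.count to 1. */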
2231 
2232 hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
2233 {
2234 	size_t len;
2235 	const uint8_t *cptr;
2236 	parserutils_error error;
2237 	uint8_t c;
2238 
2239 	error = parserutils_inputstream_peek(tokeniser->input,
2240 			tokeniser->context.pending, &cptr, &len);
2241 
2242 	if (error != PARSERUTILS_OK) {
2243 		if (error == PARSERUTILS_EOF) {
2244 			tokeniser->context.current_doctype.force_quirks = true;
2245 			tokeniser->state = STATE_BOGUS_DOCTYPE;
2246 			return HUBBUB_OK;
2247 		} else {
2248 			return hubbub_error_from_parserutils_error(error);
2249 		}
2250 	}
2251 
2252 	c = *cptr;
2253 
2254 	assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);
2255 
2256 	if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2257 		tokeniser->context.current_doctype.force_quirks = true;
2258 		tokeniser->state = STATE_BOGUS_DOCTYPE;
2259 		return HUBBUB_OK;
2260 	}
2261 
2262 	tokeniser->context.pending += len;
2263 
2264 	if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
2265 		tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
2266 	}
2267 
2268 	tokeniser->context.match_doctype.count++;
2269 
2270 	return HUBBUB_OK;
2271 }
2272 
2273 #undef PUBLIC
2274 #undef PUBLIC_LEN
2275 
2276 hubbub_error hubbub_tokeniser_handle_before_doctype_public(
2277 		hubbub_tokeniser *tokeniser)
2278 {
2279 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2280 	size_t len;
2281 	const uint8_t *cptr;
2282 	parserutils_error error;
2283 	uint8_t c;
2284 
2285 	error = parserutils_inputstream_peek(tokeniser->input,
2286 			tokeniser->context.pending, &cptr, &len);
2287 
2288 	if (error != PARSERUTILS_OK) {
2289 		if (error == PARSERUTILS_EOF) {
2290 			tokeniser->state = STATE_DATA;
2291 			return emit_current_doctype(tokeniser, true);
2292 		} else {
2293 			return hubbub_error_from_parserutils_error(error);
2294 		}
2295 	}
2296 
2297 	c = *cptr;
2298 	tokeniser->context.pending += len;
2299 
2300 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2301 		/* pass over in silence */
2302 	} else if (c == '"') {
2303 		cdoc->public_missing = false;
2304 		cdoc->public_id.len = 0;
2305 		tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
2306 	} else if (c == '\'') {
2307 		cdoc->public_missing = false;
2308 		cdoc->public_id.len = 0;
2309 		tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
2310 	} else if (c == '>') {
2311 		tokeniser->state = STATE_DATA;
2312 		return emit_current_doctype(tokeniser, true);
2313 	} else {
2314 		cdoc->force_quirks = true;
2315 		tokeniser->state = STATE_BOGUS_DOCTYPE;
2316 	}
2317 
2318 	return HUBBUB_OK;
2319 }
2320 
2321 hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
2322 		hubbub_tokeniser *tokeniser)
2323 {
2324 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2325 	size_t len;
2326 	const uint8_t *cptr;
2327 	parserutils_error error;
2328 	uint8_t c;
2329 
2330 	error = parserutils_inputstream_peek(tokeniser->input,
2331 			tokeniser->context.pending, &cptr, &len);
2332 
2333 	if (error != PARSERUTILS_OK) {
2334 		if (error == PARSERUTILS_EOF) {
2335 			tokeniser->state = STATE_DATA;
2336 			return emit_current_doctype(tokeniser, true);
2337 		} else {
2338 			return hubbub_error_from_parserutils_error(error);
2339 		}
2340 	}
2341 
2342 	c = *cptr;
2343 
2344 	if (c == '"') {
2345 		tokeniser->context.pending += len;
2346 		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
2347 	} else if (c == '>') {
2348 		tokeniser->context.pending += len;
2349 		tokeniser->state = STATE_DATA;
2350 		return emit_current_doctype(tokeniser, true);
2351 	} else if (c == '\0') {
2352 		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
2353 		tokeniser->context.pending += len;
2354 	} else if (c == '\r') {
2355 		error = parserutils_inputstream_peek(
2356 				tokeniser->input,
2357 				tokeniser->context.pending + len,
2358 				&cptr,
2359 				&len);
2360 
2361 		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2362 			return hubbub_error_from_parserutils_error(error);
2363 		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2364 			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
2365 		}
2366 
2367 		/* Collect '\r' */
2368 		tokeniser->context.pending += 1;
2369 	} else {
2370 		COLLECT_MS(cdoc->public_id, cptr, len);
2371 
2372 		tokeniser->context.pending += len;
2373 	}
2374 
2375 	return HUBBUB_OK;
2376 }
2377 
2378 hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
2379 		hubbub_tokeniser *tokeniser)
2380 {
2381 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2382 	size_t len;
2383 	const uint8_t *cptr;
2384 	parserutils_error error;
2385 	uint8_t c;
2386 
2387 	error = parserutils_inputstream_peek(tokeniser->input,
2388 			tokeniser->context.pending, &cptr, &len);
2389 
2390 	if (error != PARSERUTILS_OK) {
2391 		if (error == PARSERUTILS_EOF) {
2392 			tokeniser->state = STATE_DATA;
2393 			return emit_current_doctype(tokeniser, true);
2394 		} else {
2395 			return hubbub_error_from_parserutils_error(error);
2396 		}
2397 	}
2398 
2399 	c = *cptr;
2400 
2401 	if (c == '\'') {
2402 		tokeniser->context.pending += len;
2403 		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
2404 	} else if (c == '>') {
2405 		tokeniser->context.pending += len;
2406 		tokeniser->state = STATE_DATA;
2407 		return emit_current_doctype(tokeniser, true);
2408 	} else if (c == '\0') {
2409 		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
2410 		tokeniser->context.pending += len;
2411 	} else if (c == '\r') {
2412 		error = parserutils_inputstream_peek(
2413 				tokeniser->input,
2414 				tokeniser->context.pending + len,
2415 				&cptr,
2416 				&len);
2417 
2418 		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2419 			return hubbub_error_from_parserutils_error(error);
2420 		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2421 			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
2422 		}
2423 
2424 		/* Collect '\r' */
2425 		tokeniser->context.pending += 1;
2426 	} else {
2427 		COLLECT_MS(cdoc->public_id, cptr, len);
2428 		tokeniser->context.pending += len;
2429 	}
2430 
2431 	return HUBBUB_OK;
2432 }
2433 
2434 
2435 hubbub_error hubbub_tokeniser_handle_after_doctype_public(
2436 		hubbub_tokeniser *tokeniser)
2437 {
2438 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2439 	size_t len;
2440 	const uint8_t *cptr;
2441 	parserutils_error error;
2442 	uint8_t c;
2443 
2444 	error = parserutils_inputstream_peek(tokeniser->input,
2445 			tokeniser->context.pending, &cptr, &len);
2446 
2447 	if (error != PARSERUTILS_OK) {
2448 		if (error == PARSERUTILS_EOF) {
2449 			tokeniser->state = STATE_DATA;
2450 			return emit_current_doctype(tokeniser, true);
2451 		} else {
2452 			return hubbub_error_from_parserutils_error(error);
2453 		}
2454 	}
2455 
2456 	c = *cptr;
2457 	tokeniser->context.pending += len;
2458 
2459 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2460 		/* pass over in silence */
2461 	} else if (c == '"') {
2462 		cdoc->system_missing = false;
2463 		cdoc->system_id.len = 0;
2464 
2465 		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2466 	} else if (c == '\'') {
2467 		cdoc->system_missing = false;
2468 		cdoc->system_id.len = 0;
2469 
2470 		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2471 	} else if (c == '>') {
2472 		tokeniser->state = STATE_DATA;
2473 		return emit_current_doctype(tokeniser, false);
2474 	} else {
2475 		cdoc->force_quirks = true;
2476 		tokeniser->state = STATE_BOGUS_DOCTYPE;
2477 	}
2478 
2479 	return HUBBUB_OK;
2480 }
2481 
2482 
2483 
2484 #define SYSTEM		"SYSTEM"
2485 #define SYSTEM_LEN	(SLEN(SYSTEM) - 1)
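/* As with PUBLIC: the leading 'S' was consumed in the after doctype
 * name state, which set match_doctype.count to 1. */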
2486 
2487 hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
2488 {
2489 	size_t len;
2490 	const uint8_t *cptr;
2491 	parserutils_error error;
2492 	uint8_t c;
2493 
2494 	error = parserutils_inputstream_peek(tokeniser->input,
2495 			tokeniser->context.pending, &cptr, &len);
2496 
2497 	if (error != PARSERUTILS_OK) {
2498 		if (error == PARSERUTILS_EOF) {
2499 			tokeniser->context.current_doctype.force_quirks = true;
2500 			tokeniser->state = STATE_BOGUS_DOCTYPE;
2501 			return HUBBUB_OK;
2502 		} else {
2503 			return hubbub_error_from_parserutils_error(error);
2504 		}
2505 	}
2506 
2507 	c = *cptr;
2508 
2509 	assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);
2510 
2511 	if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2512 		tokeniser->context.current_doctype.force_quirks = true;
2513 		tokeniser->state = STATE_BOGUS_DOCTYPE;
2514 		return HUBBUB_OK;
2515 	}
2516 
2517 	tokeniser->context.pending += len;
2518 
2519 	if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
2520 		tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
2521 	}
2522 
2523 	tokeniser->context.match_doctype.count++;
2524 
2525 	return HUBBUB_OK;
2526 }
2527 
2528 #undef SYSTEM
2529 #undef SYSTEM_LEN
2530 
2531 hubbub_error hubbub_tokeniser_handle_before_doctype_system(
2532 		hubbub_tokeniser *tokeniser)
2533 {
2534 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2535 	size_t len;
2536 	const uint8_t *cptr;
2537 	parserutils_error error;
2538 	uint8_t c;
2539 
2540 	error = parserutils_inputstream_peek(tokeniser->input,
2541 			tokeniser->context.pending, &cptr, &len);
2542 
2543 	if (error != PARSERUTILS_OK) {
2544 		if (error == PARSERUTILS_EOF) {
2545 			tokeniser->state = STATE_DATA;
2546 			return emit_current_doctype(tokeniser, true);
2547 		} else {
2548 			return hubbub_error_from_parserutils_error(error);
2549 		}
2550 	}
2551 
2552 	c = *cptr;
2553 	tokeniser->context.pending += len;
2554 
2555 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2556 		/* pass over */
2557 	} else if (c == '"') {
2558 		cdoc->system_missing = false;
2559 		cdoc->system_id.len = 0;
2560 
2561 		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2562 	} else if (c == '\'') {
2563 		cdoc->system_missing = false;
2564 		cdoc->system_id.len = 0;
2565 
2566 		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2567 	} else if (c == '>') {
2568 		tokeniser->state = STATE_DATA;
2569 		return emit_current_doctype(tokeniser, true);
2570 	} else {
2571 		cdoc->force_quirks = true;
2572 		tokeniser->state = STATE_BOGUS_DOCTYPE;
2573 	}
2574 
2575 	return HUBBUB_OK;
2576 }
2577 
2578 hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
2579 		hubbub_tokeniser *tokeniser)
2580 {
2581 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2582 	size_t len;
2583 	const uint8_t *cptr;
2584 	parserutils_error error;
2585 	uint8_t c;
2586 
2587 	error = parserutils_inputstream_peek(tokeniser->input,
2588 			tokeniser->context.pending, &cptr, &len);
2589 
2590 	if (error != PARSERUTILS_OK) {
2591 		if (error == PARSERUTILS_EOF) {
2592 			tokeniser->state = STATE_DATA;
2593 			return emit_current_doctype(tokeniser, true);
2594 		} else {
2595 			return hubbub_error_from_parserutils_error(error);
2596 		}
2597 	}
2598 
2599 	c = *cptr;
2600 
2601 	if (c == '"') {
2602 		tokeniser->context.pending += len;
2603 		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2604 	} else if (c == '>') {
2605 		tokeniser->context.pending += len;
2606 		tokeniser->state = STATE_DATA;
2607 		return emit_current_doctype(tokeniser, true);
2608 	} else if (c == '\0') {
2609 		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2610 		tokeniser->context.pending += len;
2611 	} else if (c == '\r') {
2612 		error = parserutils_inputstream_peek(
2613 				tokeniser->input,
2614 				tokeniser->context.pending + len,
2615 				&cptr,
2616 				&len);
2617 
2618 		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2619 			return hubbub_error_from_parserutils_error(error);
2620 		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2621 			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2622 		}
2623 
2624 		/* Collect '\r' */
2625 		tokeniser->context.pending += 1;
2626 	} else {
2627 		COLLECT_MS(cdoc->system_id, cptr, len);
2628 		tokeniser->context.pending += len;
2629 	}
2630 
2631 	return HUBBUB_OK;
2632 }
2633 
2634 hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
2635 		hubbub_tokeniser *tokeniser)
2636 {
2637 	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2638 	size_t len;
2639 	const uint8_t *cptr;
2640 	parserutils_error error;
2641 	uint8_t c;
2642 
2643 	error = parserutils_inputstream_peek(tokeniser->input,
2644 			tokeniser->context.pending, &cptr, &len);
2645 
2646 	if (error != PARSERUTILS_OK) {
2647 		if (error == PARSERUTILS_EOF) {
2648 			tokeniser->state = STATE_DATA;
2649 			return emit_current_doctype(tokeniser, true);
2650 		} else {
2651 			return hubbub_error_from_parserutils_error(error);
2652 		}
2653 	}
2654 
2655 	c = *cptr;
2656 
2657 	if (c == '\'') {
2658 		tokeniser->context.pending += len;
2659 		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2660 	} else if (c == '>') {
2661 		tokeniser->context.pending += len;
2662 		tokeniser->state = STATE_DATA;
2663 		return emit_current_doctype(tokeniser, true);
2664 	} else if (c == '\0') {
2665 		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2666 		tokeniser->context.pending += len;
2667 	} else if (c == '\r') {
2668 		error = parserutils_inputstream_peek(
2669 				tokeniser->input,
2670 				tokeniser->context.pending + len,
2671 				&cptr,
2672 				&len);
2673 
2674 		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2675 			return hubbub_error_from_parserutils_error(error);
2676 		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2677 			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2678 		}
2679 
2680 		/* Collect '\r' */
2681 		tokeniser->context.pending += 1;
2682 	} else {
2683 		COLLECT_MS(cdoc->system_id, cptr, len);
2684 		tokeniser->context.pending += len;
2685 	}
2686 
2687 	return HUBBUB_OK;
2688 }
2689 
2690 hubbub_error hubbub_tokeniser_handle_after_doctype_system(
2691 		hubbub_tokeniser *tokeniser)
2692 {
2693 	size_t len;
2694 	const uint8_t *cptr;
2695 	parserutils_error error;
2696 	uint8_t c;
2697 
2698 	error = parserutils_inputstream_peek(tokeniser->input,
2699 			tokeniser->context.pending, &cptr, &len);
2700 
2701 	if (error != PARSERUTILS_OK) {
2702 		if (error == PARSERUTILS_EOF) {
2703 			tokeniser->state = STATE_DATA;
2704 			return emit_current_doctype(tokeniser, true);
2705 		} else {
2706 			return hubbub_error_from_parserutils_error(error);
2707 		}
2708 	}
2709 
2710 	c = *cptr;
2711 	tokeniser->context.pending += len;
2712 
2713 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2714 		/* pass over in silence */
2715 	} else if (c == '>') {
2716 		tokeniser->state = STATE_DATA;
2717 		return emit_current_doctype(tokeniser, false);
2718 	} else {
2719 		tokeniser->state = STATE_BOGUS_DOCTYPE;
2720 	}
2721 
2722 	return HUBBUB_OK;
2723 }
2724 
2725 
2726 hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
2727 {
2728 	size_t len;
2729 	const uint8_t *cptr;
2730 	parserutils_error error;
2731 	uint8_t c;
2732 
2733 	error = parserutils_inputstream_peek(tokeniser->input,
2734 			tokeniser->context.pending, &cptr, &len);
2735 
2736 	if (error != PARSERUTILS_OK) {
2737 		if (error == PARSERUTILS_EOF) {
2738 			tokeniser->state = STATE_DATA;
2739 			return emit_current_doctype(tokeniser, false);
2740 		} else {
2741 			return hubbub_error_from_parserutils_error(error);
2742 		}
2743 	}
2744 
2745 	c = *cptr;
2746 	tokeniser->context.pending += len;
2747 
2748 	if (c == '>') {
2749 		tokeniser->state = STATE_DATA;
2750 		return emit_current_doctype(tokeniser, false);
2751 	}
2752 
2753 	return HUBBUB_OK;
2754 }
2755 
2756 
2757 
2758 #define CDATA		"[CDATA["
2759 #define CDATA_LEN	(SLEN(CDATA) - 1)
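/* The leading '[' was consumed in the markup declaration open state,
 * which set match_cdata.count to 1. */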
2760 
2761 hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
2762 {
2763 	size_t len;
2764 	const uint8_t *cptr;
2765 	parserutils_error error;
2766 	uint8_t c;
2767 
2768 	error = parserutils_inputstream_peek(tokeniser->input,
2769 			tokeniser->context.pending, &cptr, &len);
2770 
2771 	if (error != PARSERUTILS_OK) {
2772 		if (error == PARSERUTILS_EOF) {
2773 			tokeniser->context.current_comment.len =
2774 					tokeniser->context.pending = 0;
2775 			tokeniser->state = STATE_BOGUS_COMMENT;
2776 			return HUBBUB_OK;
2777 		} else {
2778 			return hubbub_error_from_parserutils_error(error);
2779 		}
2780 	}
2781 
2782 	c = *cptr;
2783 
2784 	assert(tokeniser->context.match_cdata.count <= CDATA_LEN);
2785 
2786 	if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
2787 		tokeniser->context.current_comment.len =
2788 				tokeniser->context.pending = 0;
2790 		tokeniser->state = STATE_BOGUS_COMMENT;
2791 		return HUBBUB_OK;
2792 	}
2793 
2794 	tokeniser->context.pending += len;
2795 
2796 	if (tokeniser->context.match_cdata.count == CDATA_LEN) {
2797 		parserutils_inputstream_advance(tokeniser->input,
2798 				tokeniser->context.match_cdata.count + len);
2799 		tokeniser->context.pending = 0;
2800 		tokeniser->context.match_cdata.end = 0;
2801 		tokeniser->state = STATE_CDATA_BLOCK;
2802 	}
2803 
2804 	tokeniser->context.match_cdata.count += len;
2805 
2806 	return HUBBUB_OK;
2807 }
2808 
2809 #undef CDATA
2810 #undef CDATA_LEN
2811 
2812 
2813 hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
2814 {
2815 	size_t len;
2816 	const uint8_t *cptr;
2817 	parserutils_error error;
2818 	uint8_t c;
2819 
2820 	error = parserutils_inputstream_peek(tokeniser->input,
2821 			tokeniser->context.pending, &cptr, &len);
2822 
2823 	if (error != PARSERUTILS_OK) {
2824 		if (error == PARSERUTILS_EOF) {
2825 			tokeniser->state = STATE_DATA;
2826 			return emit_current_chars(tokeniser);
2827 		} else {
2828 			return hubbub_error_from_parserutils_error(error);
2829 		}
2830 	}
2831 
2832 	c = *cptr;
2833 
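	/* match_cdata.end counts the bytes of the trailing "]]" run seen
	 * so far, so the "]]>" terminator is recognised even when it is
	 * split across multiple peeks. */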
2834 	if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
2835 			tokeniser->context.match_cdata.end == 1)) {
2836 		tokeniser->context.pending += len;
2837 		tokeniser->context.match_cdata.end += len;
2838 	} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
2839 		/* Remove the previous two "]]" */
2840 		tokeniser->context.pending -= 2;
2841 
2842 		/* Emit any pending characters */
2843 		emit_current_chars(tokeniser);
2844 
2845 		/* Now move past the "]]>" bit */
2846 		parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));
2847 
2848 		tokeniser->state = STATE_DATA;
2849 	} else if (c == '\0') {
2850 		if (tokeniser->context.pending > 0) {
2851 			/* Emit any pending characters */
2852 			emit_current_chars(tokeniser);
2853 		}
2854 
2855 		/* Perform NUL-byte replacement */
2856 		emit_character_token(tokeniser, &u_fffd_str);
2857 
2858 		parserutils_inputstream_advance(tokeniser->input, len);
2859 		tokeniser->context.match_cdata.end = 0;
2860 	} else if (c == '\r') {
2861 		error = parserutils_inputstream_peek(
2862 				tokeniser->input,
2863 				tokeniser->context.pending + len,
2864 				&cptr,
2865 				&len);
2866 
2867 		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2868 			return hubbub_error_from_parserutils_error(error);
2869 		}
2870 
2871 		if (tokeniser->context.pending > 0) {
2872 			/* Emit any pending characters */
2873 			emit_current_chars(tokeniser);
2874 		}
2875 
2876 		if (error == PARSERUTILS_EOF || *cptr != '\n') {
2877 			/* Emit newline */
2878 			emit_character_token(tokeniser, &lf_str);
2879 		}
2880 
2881 		/* Advance over \r */
2882 		parserutils_inputstream_advance(tokeniser->input, 1);
2883 		tokeniser->context.match_cdata.end = 0;
2884 	} else {
2885 		tokeniser->context.pending += len;
2886 		tokeniser->context.match_cdata.end = 0;
2887 	}
2888 
2889 	return HUBBUB_OK;
2890 }
2891 
2892 
2893 hubbub_error hubbub_tokeniser_consume_character_reference(
2894 		hubbub_tokeniser *tokeniser, size_t pos)
2895 {
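	/* First phase of character reference handling: classify what
	 * follows the '&' and dispatch to the numbered or named entity
	 * state. The outcome is reported back to the calling state via
	 * context.match_entity.complete and .codepoint (0 meaning "no
	 * character reference"). */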
2896 	uint32_t allowed_char = tokeniser->context.allowed_char;
2897 
2898 	size_t len;
2899 	const uint8_t *cptr;
2900 	parserutils_error error;
2901 	uint8_t c;
2902 	size_t off;
2903 
2904 	error = parserutils_inputstream_peek(tokeniser->input, pos,
2905 			&cptr, &len);
2906 
2907 	/* We should always start on an ampersand */
2908 	assert(error == PARSERUTILS_OK);
2909 	assert(len == 1 && *cptr == '&');
2910 
2911 	off = pos + len;
2912 
2913 	/* Look at the character after the ampersand */
2914 	error = parserutils_inputstream_peek(tokeniser->input, off,
2915 			&cptr, &len);
2916 
2917 	if (error != PARSERUTILS_OK) {
2918 		if (error == PARSERUTILS_EOF) {
2919 			tokeniser->context.match_entity.complete = true;
2920 			tokeniser->context.match_entity.codepoint = 0;
2921 			return HUBBUB_OK;
2922 		} else {
2923 			return hubbub_error_from_parserutils_error(error);
2924 		}
2925 	}
2926 
2927 	c = *cptr;
2928 
2929 	/* Set things up */
2930 	tokeniser->context.match_entity.offset = off;
2931 	tokeniser->context.match_entity.poss_length = 0;
2932 	tokeniser->context.match_entity.length = 0;
2933 	tokeniser->context.match_entity.base = 0;
2934 	tokeniser->context.match_entity.codepoint = 0;
2935 	tokeniser->context.match_entity.had_data = false;
2936 	tokeniser->context.match_entity.return_state = tokeniser->state;
2937 	tokeniser->context.match_entity.complete = false;
2938 	tokeniser->context.match_entity.overflow = false;
2939 	tokeniser->context.match_entity.context = -1;
2940 	tokeniser->context.match_entity.prev_len = len;
2941 
2942 	/* Reset allowed character for future calls */
2943 	tokeniser->context.allowed_char = '\0';
2944 
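	/* An ampersand followed by whitespace, '<', '&' or the
	 * caller-supplied additional allowed character is literal text
	 * rather than the start of a character reference. */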
2945 	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
2946 			c == '<' || c == '&' ||
2947 			(allowed_char && c == allowed_char)) {
2948 		tokeniser->context.match_entity.complete = true;
2949 		tokeniser->context.match_entity.codepoint = 0;
2950 	} else if (c == '#') {
2951 		tokeniser->context.match_entity.length += len;
2952 		tokeniser->state = STATE_NUMBERED_ENTITY;
2953 	} else {
2954 		tokeniser->state = STATE_NAMED_ENTITY;
2955 	}
2956 
2957 	return HUBBUB_OK;
2958 }
2959 
2960 
2961 hubbub_error hubbub_tokeniser_handle_numbered_entity(
2962 		hubbub_tokeniser *tokeniser)
2963 {
2964 	hubbub_tokeniser_context *ctx = &tokeniser->context;
2965 
2966 	size_t len;
2967 	const uint8_t *cptr;
2968 	parserutils_error error;
2969 
2970 	error = parserutils_inputstream_peek(tokeniser->input,
2971 			ctx->match_entity.offset + ctx->match_entity.length,
2972 			&cptr, &len);
2973 
2974 	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2975 		return hubbub_error_from_parserutils_error(error);
2976 	}
2977 
2978 	if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
2979 		uint8_t c = *cptr;
2980 		if ((c & ~0x20) == 'X') {
2981 			ctx->match_entity.base = 16;
2982 			ctx->match_entity.length += len;
2983 		} else {
2984 			ctx->match_entity.base = 10;
2985 		}
2986 	}
2987 
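	/* Accumulate digits: for example "&#65;" gives base 10 and
	 * codepoint 65 ('A'), while "&#x41;" gives base 16 and codepoint
	 * 0x41. */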
2988 	while ((error = parserutils_inputstream_peek(tokeniser->input,
2989 			ctx->match_entity.offset + ctx->match_entity.length,
2990 			&cptr, &len)) == PARSERUTILS_OK) {
2991 		uint8_t c = *cptr;
2992 
2993 		if (ctx->match_entity.base == 10 &&
2994 				('0' <= c && c <= '9')) {
2995 			ctx->match_entity.had_data = true;
2996 			ctx->match_entity.codepoint =
2997 				ctx->match_entity.codepoint * 10 + (c - '0');
2998 
2999 			ctx->match_entity.length += len;
3000 		} else if (ctx->match_entity.base == 16 &&
3001 				(('0' <= c && c <= '9') ||
3002 				('A' <= (c & ~0x20) &&
3003 						(c & ~0x20) <= 'F'))) {
3004 			ctx->match_entity.had_data = true;
3005 			ctx->match_entity.codepoint *= 16;
3006 
3007 			if ('0' <= c && c <= '9') {
3008 				ctx->match_entity.codepoint += (c - '0');
3009 			} else {
3010 				ctx->match_entity.codepoint +=
3011 						((c & ~0x20) - 'A' + 10);
3012 			}
3013 
3014 			ctx->match_entity.length += len;
3015 		} else {
3016 			break;
3017 		}
3018 
3019 		if (ctx->match_entity.codepoint >= 0x10FFFF) {
3020 			ctx->match_entity.overflow = true;
3021 		}
3022 	}
3023 
3024 	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3025 		return hubbub_error_from_parserutils_error(error);
3026 	}
3027 
3028 	/* Eat trailing semicolon, if any */
3029 	if (error != PARSERUTILS_EOF && *cptr == ';') {
3030 		ctx->match_entity.length += len;
3031 	}
3032 
3033 	/* Had data, so calculate final codepoint */
3034 	if (ctx->match_entity.had_data) {
3035 		uint32_t cp = ctx->match_entity.codepoint;
3036 
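		/* Codepoints 0x80-0x9F are C1 controls; for compatibility
		 * they are remapped through the Windows-1252 table above
		 * (e.g. &#128; becomes U+20AC), CR becomes LF, and
		 * surrogates, non-characters and the like become U+FFFD. */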
3037 		if (0x80 <= cp && cp <= 0x9F) {
3038 			cp = cp1252Table[cp - 0x80];
3039 		} else if (cp == 0x0D) {
3040 			cp = 0x000A;
3041 		} else if (ctx->match_entity.overflow ||
3042 				cp <= 0x0008 || cp == 0x000B ||
3043 				(0x000E <= cp && cp <= 0x001F) ||
3044 				(0x007F <= cp && cp <= 0x009F) ||
3045 				(0xD800 <= cp && cp <= 0xDFFF) ||
3046 				(0xFDD0 <= cp && cp <= 0xFDEF) ||
3047 				(cp & 0xFFFE) == 0xFFFE) {
3048 			/* the check for cp > 0x10FFFF per spec is performed
3049 			 * in the loop above to avoid overflow */
3050 			cp = 0xFFFD;
3051 		}
3052 
3053 		ctx->match_entity.codepoint = cp;
3054 	}
3055 
3056 	/* Flag completion */
3057 	ctx->match_entity.complete = true;
3058 
3059 	/* And back to the state we were entered in */
3060 	tokeniser->state = ctx->match_entity.return_state;
3061 
3062 	return HUBBUB_OK;
3063 }
3064 
3065 hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
3066 {
3067 	hubbub_tokeniser_context *ctx = &tokeniser->context;
3068 
3069 	size_t len;
3070 	const uint8_t *cptr;
3071 	parserutils_error error;
3072 
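	/* Walk the entity dictionary one character at a time:
	 * hubbub_entities_search_step reports either a (possibly interim)
	 * match, a definite mismatch, or the need for more characters.
	 * The longest match seen so far is retained. */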
3073 	while ((error = parserutils_inputstream_peek(tokeniser->input,
3074 			ctx->match_entity.offset +
3075 					ctx->match_entity.poss_length,
3076 			&cptr, &len)) == PARSERUTILS_OK) {
3077 		uint32_t cp;
3078 
3079 		uint8_t c = *cptr;
3080 		hubbub_error error;
3081 
3082 		if (c > 0x7F) {
3083 			/* Entity names are ASCII only */
3084 			break;
3085 		}
3086 
3087 		error = hubbub_entities_search_step(c, &cp,
3088 				&ctx->match_entity.context);
3089 		if (error == HUBBUB_OK) {
3090 			/* Had a match - store it for later */
3091 			ctx->match_entity.codepoint = cp;
3092 
3093 			ctx->match_entity.length =
3094 					ctx->match_entity.poss_length + len;
3095 			ctx->match_entity.poss_length =
3096 					ctx->match_entity.length;
3097 		} else if (error == HUBBUB_INVALID) {
3098 			/* No further matches - use last found */
3099 			break;
3100 		} else {
3101 			/* Need more data */
3102 			ctx->match_entity.poss_length += len;
3103 		}
3104 	}
3105 
3106 	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3107 		return hubbub_error_from_parserutils_error(error);
3108 	}
3109 
3110 	if (ctx->match_entity.length > 0) {
3111 		uint8_t c;
3112 		error = parserutils_inputstream_peek(tokeniser->input,
3113 				ctx->match_entity.offset +
3114 					ctx->match_entity.length - 1,
3115 				&cptr, &len);
3116 		/* We're re-reading a character we've already read, so
3117 		 * there's no way an error can occur as a result. */
3119 		assert(error == PARSERUTILS_OK);
3120 
3121 		c = *cptr;
3122 
3123 		if ((tokeniser->context.match_entity.return_state ==
3124 				STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
3125 				c != ';') {
3126 			error = parserutils_inputstream_peek(tokeniser->input,
3127 					ctx->match_entity.offset +
3128 						ctx->match_entity.length,
3129 					&cptr, &len);
3130 			/* We must have attempted to read one more character
3131 			 * than was present in the entity name, as that is the
3132 			 * only way to break out of the loop above. If that
3133 			 * failed, then any non-EOF case will have been handled
3134 			 * by the if statement after the loop thus it cannot
3135 			 * occur here. */
3136 			assert(error == PARSERUTILS_OK ||
3137 					error == PARSERUTILS_EOF);
3138 
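			/* A named reference in an attribute value that is
			 * not terminated by ';' and is followed by an
			 * alphanumeric is not a character reference, so
			 * e.g. the "&not" in "&notit;" stays literal. */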
3139 			if (error == PARSERUTILS_EOF) {
3140 				ctx->match_entity.codepoint = 0;
3141 			} else {
3142 				c = *cptr;
3143 				if ((0x0030 <= c && c <= 0x0039) ||
3144 						(0x0041 <= c && c <= 0x005A) ||
3145 						(0x0061 <= c && c <= 0x007A))
3146 					ctx->match_entity.codepoint = 0;
3147 			}
3149 		}
3150 	}
3151 
3152 	/* Flag completion */
3153 	ctx->match_entity.complete = true;
3154 
3155 	/* And back to the state from whence we came */
3156 	tokeniser->state = ctx->match_entity.return_state;
3157 
3158 	return HUBBUB_OK;
3159 }
3160 
3161 
3162 
3163 /*** Token emitting bits ***/
3164 
3165 /**
3166  * Emit a character token.
3167  *
3168  * \param tokeniser	Tokeniser instance
3169  * \param chars		Pointer to hubbub_string to emit
3170  * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3171  */
3172 hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
3173 		const hubbub_string *chars)
3174 {
3175 	hubbub_token token;
3176 
3177 	token.type = HUBBUB_TOKEN_CHARACTER;
3178 	token.data.character = *chars;
3179 
3180 	return hubbub_tokeniser_emit_token(tokeniser, &token);
3181 }
3182 
3183 /**
3184  * Emit the current pending characters being stored in the tokeniser context.
3185  *
3186  * \param tokeniser	Tokeniser instance
3187  * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3188  */
3189 hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
3190 {
3191 	hubbub_token token;
3192 	size_t len;
3193 	const uint8_t *cptr = NULL;
3194 	parserutils_error error;
3195 
3196 	/* Calling this with nothing to output is a probable bug */
3197 	assert(tokeniser->context.pending > 0);
3198 
3199 	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
3200 	if (error != PARSERUTILS_OK)
3201 		return hubbub_error_from_parserutils_error(error);
3202 
3203 	token.type = HUBBUB_TOKEN_CHARACTER;
3204 	token.data.character.ptr = cptr;
3205 	token.data.character.len = tokeniser->context.pending;
3206 
3207 	return hubbub_tokeniser_emit_token(tokeniser, &token);
3208 }
3209 
3210 /**
3211  * Emit the current tag token being stored in the tokeniser context.
3212  *
3213  * \param tokeniser	Tokeniser instance
3214  * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3215  */
3216 hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
3217 {
3218 	hubbub_error err;
3219 	hubbub_token token;
3220 	uint32_t n_attributes;
3221 	hubbub_attribute *attrs;
3222 	uint8_t *ptr;
3223 	uint32_t i, j;
3224 
3225 	/* Emit current tag */
3226 	token.type = tokeniser->context.current_tag_type;
3227 	token.data.tag = tokeniser->context.current_tag;
3228 	token.data.tag.ns = HUBBUB_NS_HTML;
3229 
3230 
3231 	n_attributes = token.data.tag.n_attributes;
3232 	attrs = token.data.tag.attributes;
3233 
3234 	/* Set pointers correctly... */
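	/* The tag name, then each attribute's name and value, are packed
	 * contiguously into tokeniser->buffer in collection order, so
	 * walking the buffer rebuilds every hubbub_string pointer. */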
3235 	ptr = tokeniser->buffer->data;
3236 	token.data.tag.name.ptr = tokeniser->buffer->data;
3237 	ptr += token.data.tag.name.len;
3238 
3239 	for (i = 0; i < n_attributes; i++) {
3240 		attrs[i].name.ptr = ptr;
3241 		ptr += attrs[i].name.len;
3242 		attrs[i].value.ptr = ptr;
3243 		ptr += attrs[i].value.len;
3244 	}
3245 
3246 
3247 	/* Discard duplicate attributes */
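	/* Only the first occurrence of each attribute name is kept;
	 * later duplicates are shuffled out of the array below. */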
3248 	for (i = 0; i < n_attributes; i++) {
3249 		for (j = 0; j < n_attributes; j++) {
3250 			uint32_t move;
3251 
3252 			if (j == i ||
3253 				attrs[i].name.len !=
3254 						attrs[j].name.len ||
3255 				strncmp((char *) attrs[i].name.ptr,
3256 					(char *) attrs[j].name.ptr,
3257 					attrs[i].name.len) != 0) {
3258 				/* Attributes don't match */
3259 				continue;
3260 			}
3261 
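			/* Any duplicate preceding attrs[i] was removed on an
			 * earlier pass, so a surviving match can only lie at
			 * a later index. */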
3262 			assert(i < j);
3263 
3264 			/* Calculate amount to move */
3265 			move = (n_attributes - 1 - j) *
3266 					sizeof(hubbub_attribute);
3267 
3268 			if (move > 0) {
3269 				memmove(&attrs[j], &attrs[j+1], move);
3270 			}
3271 
3272 			/* We've deleted an item, so we need to
3273 			 * reprocess this index */
3274 			j--;
3275 
3276 			/* And reduce the number of attributes */
3277 			n_attributes--;
3278 		}
3279 	}
3280 
3281 	token.data.tag.n_attributes = n_attributes;
3282 
3283 	err = hubbub_tokeniser_emit_token(tokeniser, &token);
3284 
3285 	if (token.type == HUBBUB_TOKEN_START_TAG) {
3286 		/* Save start tag name for R?CDATA */
3287 		if (token.data.tag.name.len <
3288 			sizeof(tokeniser->context.last_start_tag_name)) {
3289 			strncpy((char *) tokeniser->context.last_start_tag_name,
3290 				(const char *) token.data.tag.name.ptr,
3291 				token.data.tag.name.len);
3292 			tokeniser->context.last_start_tag_len =
3293 					token.data.tag.name.len;
3294 		} else {
3295 			tokeniser->context.last_start_tag_name[0] = '\0';
3296 			tokeniser->context.last_start_tag_len = 0;
3297 		}
3298 	} else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
3299 		/* Reset content model after R?CDATA elements */
3300 		tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
3301 	}
3302 
3303 	/* Reset the self-closing flag */
3304 	tokeniser->context.current_tag.self_closing = false;
3305 
3306 	return err;
3307 }
3308 
3309 /**
3310  * Emit the current comment token being stored in the tokeniser context.
3311  *
3312  * \param tokeniser	Tokeniser instance
3313  * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3314  */
3315 hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
3316 {
3317 	hubbub_token token;
3318 
3319 	token.type = HUBBUB_TOKEN_COMMENT;
3320 	token.data.comment.ptr = tokeniser->buffer->data;
3321 	token.data.comment.len = tokeniser->buffer->length;
3322 
3323 	return hubbub_tokeniser_emit_token(tokeniser, &token);
3324 }
3325 
3326 /**
3327  * Emit the current doctype token being stored in the tokeniser context.
3328  *
3329  * \param tokeniser	Tokeniser instance
3330  * \param force_quirks	Force quirks mode on this document
3331  * \return	HUBBUB_OK on success, appropriate hubbub_error otherwise
3332  */
3333 hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
3334 		bool force_quirks)
3335 {
3336 	hubbub_token token;
3337 
3338 	/* Emit doctype */
3339 	token.type = HUBBUB_TOKEN_DOCTYPE;
3340 	token.data.doctype = tokeniser->context.current_doctype;
3341 	if (force_quirks == true)
3342 		token.data.doctype.force_quirks = true;
3343 
3344 	/* Set pointers correctly */
3345 	token.data.doctype.name.ptr = tokeniser->buffer->data;
3346 
3347 	if (token.data.doctype.public_missing == false) {
3348 		token.data.doctype.public_id.ptr = tokeniser->buffer->data +
3349 				token.data.doctype.name.len;
3350 	}
3351 
3352 	if (token.data.doctype.system_missing == false) {
3353 		token.data.doctype.system_id.ptr = tokeniser->buffer->data +
3354 				token.data.doctype.name.len +
3355 				token.data.doctype.public_id.len;
3356 	}
3357 
3358 	return hubbub_tokeniser_emit_token(tokeniser, &token);
3359 }
3360 
3361 /**
3362  * Emit a token, performing sanity checks if necessary
3363  *
3364  * \param tokeniser  Tokeniser instance
3365  * \param token      Token to emit
3366  */
3367 hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
3368 		hubbub_token *token)
3369 {
3370 	hubbub_error err = HUBBUB_OK;
3371 
3372 	assert(tokeniser != NULL);
3373 	assert(token != NULL);
3374 	assert(tokeniser->insert_buf->length == 0);
3375 
3376 #ifndef NDEBUG
3377 	/* Sanity checks */
3378 	switch (token->type) {
3379 	case HUBBUB_TOKEN_DOCTYPE:
3380 		assert(memchr(token->data.doctype.name.ptr, 0xff,
3381 				token->data.doctype.name.len) == NULL);
3382 		if (token->data.doctype.public_missing == false)
3383 			assert(memchr(token->data.doctype.public_id.ptr, 0xff,
3384 				token->data.doctype.public_id.len) == NULL);
3385 		if (token->data.doctype.system_missing == false)
3386 			assert(memchr(token->data.doctype.system_id.ptr, 0xff,
3387 				token->data.doctype.system_id.len) == NULL);
3388 		break;
3389 	case HUBBUB_TOKEN_START_TAG:
3390 	case HUBBUB_TOKEN_END_TAG:
3391 	{
3392 		uint32_t i;
3393 		assert(memchr(token->data.tag.name.ptr, 0xff,
3394 				token->data.tag.name.len) == NULL);
3395 		for (i = 0; i < token->data.tag.n_attributes; i++) {
3396 			hubbub_attribute *attr = &token->data.tag.attributes[i];
3397 
3398 			assert(memchr(attr->name.ptr, 0xff, attr->name.len) ==
3399 					NULL);
3400 			assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
3401 					NULL);
3402 		}
3403 	}
3404 		break;
3405 	case HUBBUB_TOKEN_COMMENT:
3406 		assert(memchr(token->data.comment.ptr, 0xff,
3407 				token->data.comment.len) == NULL);
3408 		break;
3409 	case HUBBUB_TOKEN_CHARACTER:
3410 		assert(memchr(token->data.character.ptr, 0xff,
3411 				token->data.character.len) == NULL);
3412 		break;
3413 	case HUBBUB_TOKEN_EOF:
3414 		break;
3415 	}
3416 #endif
3417 
3418 	/* Emit the token */
3419 	if (tokeniser->token_handler) {
3420 		err = tokeniser->token_handler(token, tokeniser->token_pw);
3421 	}
3422 
3423 	/* Discard current buffer */
3424 	if (tokeniser->buffer->length) {
3425 		parserutils_buffer_discard(tokeniser->buffer, 0,
3426 				tokeniser->buffer->length);
3427 	}
3428 
3429 	/* Advance the pointer */
3430 	if (tokeniser->context.pending) {
3431 		parserutils_inputstream_advance(tokeniser->input,
3432 				tokeniser->context.pending);
3433 		tokeniser->context.pending = 0;
3434 	}
3435 
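	/* If the token handler queued text for insertion, splice it
	 * into the input stream now that the consumed source has been
	 * advanced past. */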
3436 	if (tokeniser->insert_buf->length > 0) {
3437 		parserutils_inputstream_insert(tokeniser->input,
3438 				tokeniser->insert_buf->data,
3439 				tokeniser->insert_buf->length);
3440 		parserutils_buffer_discard(tokeniser->insert_buf, 0,
3441 				tokeniser->insert_buf->length);
3442 	}
3443 
3444 	/* Honour a request from the callback to pause tokenisation */
3445 	if (err == HUBBUB_PAUSED) {
3446 		tokeniser->paused = true;
3447 	}
3448 
3449 	return err;
3450 }
3451