/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 * http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <stdio.h>

#include <parserutils/charset/utf8.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "hubbub/errors.h"
#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};
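
/*
 * Presumably used by the numeric entity handler: a character reference
 * whose codepoint lands in 128-159 is remapped through this table, so
 * e.g. &#x80; becomes cp1252Table[0x80 - 0x80] == 0x20AC (EURO SIGN).
 */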

/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;
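
/* Each state above is dispatched to its corresponding handler by the
 * switch in hubbub_tokeniser_run() below. */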

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		int32_t context;		/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity has
						 * overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */

} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections */
	bool paused;			/**< Whether parsing is currently paused */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */
	parserutils_buffer *insert_buf; /**< Stream insertion buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */
};

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = malloc(sizeof(hubbub_tokeniser));
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(&tok->buffer);
	if (perror != PARSERUTILS_OK) {
		free(tok);
		return hubbub_error_from_parserutils_error(perror);
	}

	perror = parserutils_buffer_create(&tok->insert_buf);
	if (perror != PARSERUTILS_OK) {
		parserutils_buffer_destroy(tok->buffer);
		free(tok);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->paused = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}
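
/*
 * Typical usage (a sketch; `stream`, `my_handler` and `my_pw` are the
 * caller's own, not defined in this file):
 *
 *	hubbub_tokeniser *tok;
 *	hubbub_tokeniser_optparams params;
 *
 *	hubbub_tokeniser_create(stream, &tok);
 *	params.token_handler.handler = my_handler;
 *	params.token_handler.pw = my_pw;
 *	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER, &params);
 *	hubbub_tokeniser_run(tok);
 *	hubbub_tokeniser_destroy(tok);
 */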

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		free(tokeniser->context.current_tag.attributes);
	}

	parserutils_buffer_destroy(tokeniser->insert_buf);

	parserutils_buffer_destroy(tokeniser->buffer);

	free(tokeniser);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	hubbub_error err = HUBBUB_OK;

	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	case HUBBUB_TOKENISER_PAUSE:
		if (params->pause_parse == true) {
			tokeniser->paused = true;
		} else {
			if (tokeniser->paused == true) {
				tokeniser->paused = false;
				/* When unpausing, if we have had something
				 * akin to document.write() happen while
				 * we were paused, then the insert_buf will
				 * have some content.
				 * In this case, we need to prepend it to
				 * the input buffer before we resume parsing,
				 * discarding the insert_buf as we go.
				 */
				if (tokeniser->insert_buf->length > 0) {
					parserutils_inputstream_insert(
							tokeniser->input,
							tokeniser->insert_buf->data,
							tokeniser->insert_buf->length);
					parserutils_buffer_discard(
							tokeniser->insert_buf, 0,
							tokeniser->insert_buf->length);
				}

				err = hubbub_tokeniser_run(tokeniser);
			}
		}
	}

	return err;
}
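
/*
 * Pause/resume sketch (assumes `tok` is a configured tokeniser and
 * `data`/`len` hold script-generated input; see
 * hubbub_tokeniser_insert_chunk() below):
 *
 *	params.pause_parse = true;
 *	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_PAUSE, &params);
 *	hubbub_tokeniser_insert_chunk(tok, data, len);
 *	params.pause_parse = false;
 *	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_PAUSE, &params);
 *
 * Unpausing prepends any inserted data to the input stream and
 * immediately re-runs the tokeniser.
 */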

/**
 * Insert a chunk of data into the input stream.
 *
 * Inserts the given data into the input stream ready for parsing but
 * does not cause any additional processing of the input.
 *
 * \param tokeniser  Tokeniser instance
 * \param data       Data to insert (UTF-8 encoded)
 * \param len        Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_insert_chunk(hubbub_tokeniser *tokeniser,
		const uint8_t *data, size_t len)
{
	parserutils_error perror;

	if (tokeniser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	perror = parserutils_buffer_append(tokeniser->insert_buf, data, len);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	return HUBBUB_OK;
}

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->paused == true)
		return HUBBUB_PAUSED;

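/* The state() macro below expands to a plain case label; switch the
 * "#if 0" to "#if 1" to trace state transitions on stdout. */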
#if 0
#define state(x) \
	case x: \
		printf(#x "\n");
#else
#define state(x) \
	case x:
#endif

	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}


/**
 * Various macros for manipulating buffers.
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)
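
/*
 * Example (sketch): collecting a tag name. START_BUF appends the first
 * bytes to tokeniser->buffer and sets str.len; COLLECT and COLLECT_MS
 * append further bytes and extend str.len (COLLECT additionally asserts
 * that the string has already been started):
 *
 *	START_BUF(ctag->name, cptr, len);	(str).len = len
 *	COLLECT(ctag->name, cptr, len);		(str).len += len
 *
 * Only the length is tracked here; the bytes themselves live in the
 * shared tokeniser->buffer.
 */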


/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
					tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;
			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			/* no need to check that there are enough characters,
			 * since you can only run into this if the flag is
			 * true in the first place, which requires four
			 * characters. */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&len);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *) cptr, "-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
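			/* Normalise newlines: a CR not followed by LF is
			 * emitted as a lone LF; for CRLF the CR is skipped
			 * here and the LF is collected on the next
			 * iteration. */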
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
			(tokeniser->state != STATE_DATA ||
					error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);

		return HUBBUB_NEEDDATA;
	}

	return hubbub_error_from_parserutils_error(error);
}

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
	/* assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
	/* assert(tokeniser->context.chars.ptr[0] == '<'); */
	/* assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
				tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
				tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
				ctx->pending +
					ctx->close_tag_match.count,
				&cptr,
				&len)) == PARSERUTILS_OK) {
			c = *cptr;

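			/* Compare ASCII case-insensitively: clearing bit
			 * 0x20 folds 'a'-'z' onto 'A'-'Z'. */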
1004 if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
1005 != (c & ~0x20)) {
1006 break;
1007 }
1008
1009 ctx->close_tag_match.count += len;
1010
1011 if (ctx->close_tag_match.count == start_tag_len) {
1012 ctx->close_tag_match.match = true;
1013 break;
1014 }
1015 }
1016
1017 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1018 return hubbub_error_from_parserutils_error(error);
1019 }
1020
1021 if (ctx->close_tag_match.match == true) {
1022 error = parserutils_inputstream_peek(
1023 tokeniser->input,
1024 ctx->pending +
1025 ctx->close_tag_match.count,
1026 &cptr,
1027 &len);
1028
1029 if (error != PARSERUTILS_OK &&
1030 error != PARSERUTILS_EOF) {
1031 return hubbub_error_from_parserutils_error(
1032 error);
1033 } else if (error != PARSERUTILS_EOF) {
1034 c = *cptr;
1035
1036 if (c != '\t' && c != '\n' && c != '\f' &&
1037 c != ' ' && c != '>' &&
1038 c != '/') {
1039 ctx->close_tag_match.match = false;
1040 }
1041 }
1042 }
1043 }
1044
1045 if (ctx->close_tag_match.match == false &&
1046 tokeniser->content_model !=
1047 HUBBUB_CONTENT_MODEL_PCDATA) {
1048 /* We should emit "</" here, but instead we leave it in the
1049 * buffer so the data state emits it with any characters
1050 * following it */
1051 tokeniser->state = STATE_DATA;
1052 } else {
1053 error = parserutils_inputstream_peek(tokeniser->input,
1054 tokeniser->context.pending, &cptr, &len);
1055
1056 if (error == PARSERUTILS_EOF) {
1057 /** \todo parse error */
1058
1059 /* Return to data state with "</" pending */
1060 tokeniser->state = STATE_DATA;
1061 return HUBBUB_OK;
1062 } else if (error != PARSERUTILS_OK) {
1063 return hubbub_error_from_parserutils_error(error);
1064 }
1065
1066 c = *cptr;
1067
1068 if ('A' <= c && c <= 'Z') {
1069 uint8_t lc = (c + 0x20);
1070 START_BUF(tokeniser->context.current_tag.name,
1071 &lc, len);
1072 tokeniser->context.current_tag.n_attributes = 0;
1073
1074 tokeniser->context.current_tag_type =
1075 HUBBUB_TOKEN_END_TAG;
1076
1077 tokeniser->context.pending += len;
1078
1079 tokeniser->state = STATE_TAG_NAME;
1080 } else if ('a' <= c && c <= 'z') {
1081 START_BUF(tokeniser->context.current_tag.name,
1082 cptr, len);
1083 tokeniser->context.current_tag.n_attributes = 0;
1084
1085 tokeniser->context.current_tag_type =
1086 HUBBUB_TOKEN_END_TAG;
1087
1088 tokeniser->context.pending += len;
1089
1090 tokeniser->state = STATE_TAG_NAME;
1091 } else if (c == '>') {
1092 /* Cursor still at "</", need to collect ">" */
1093 tokeniser->context.pending += len;
1094
1095 /* Now need to advance past "</>" */
1096 parserutils_inputstream_advance(tokeniser->input,
1097 tokeniser->context.pending);
1098 tokeniser->context.pending = 0;
1099
1100 /** \todo parse error */
1101 tokeniser->state = STATE_DATA;
1102 } else {
1103 /** \todo parse error */
1104
1105 /* Cursor still at "</", need to advance past it */
1106 parserutils_inputstream_advance(tokeniser->input,
1107 tokeniser->context.pending);
1108 tokeniser->context.pending = 0;
1109
1110 tokeniser->state = STATE_BOGUS_COMMENT;
1111 }
1112 }
1113
1114 return HUBBUB_OK;
1115 }
1116
1117 /* this state expects tokeniser->context.current_tag to already have its
1118 first character set */
hubbub_tokeniser_handle_tag_name(hubbub_tokeniser * tokeniser)1119 hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
1120 {
1121 hubbub_tag *ctag = &tokeniser->context.current_tag;
1122
1123 size_t len;
1124 const uint8_t *cptr;
1125 parserutils_error error;
1126 uint8_t c;
1127
1128 assert(tokeniser->context.pending > 0);
1129 /* assert(tokeniser->context.chars.ptr[0] == '<'); */
1130 assert(ctag->name.len > 0);
1131 /* assert(ctag->name.ptr); */
1132
1133 error = parserutils_inputstream_peek(tokeniser->input,
1134 tokeniser->context.pending, &cptr, &len);
1135
1136 if (error != PARSERUTILS_OK) {
1137 if (error == PARSERUTILS_EOF) {
1138 tokeniser->state = STATE_DATA;
1139 return emit_current_tag(tokeniser);
1140 } else {
1141 return hubbub_error_from_parserutils_error(error);
1142 }
1143 }
1144
1145 c = *cptr;
1146
1147 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1148 tokeniser->context.pending += len;
1149 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1150 } else if (c == '>') {
1151 tokeniser->context.pending += len;
1152 tokeniser->state = STATE_DATA;
1153 return emit_current_tag(tokeniser);
1154 } else if (c == '\0') {
1155 COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
1156 tokeniser->context.pending += len;
1157 } else if (c == '/') {
1158 tokeniser->context.pending += len;
1159 tokeniser->state = STATE_SELF_CLOSING_START_TAG;
1160 } else if ('A' <= c && c <= 'Z') {
1161 uint8_t lc = (c + 0x20);
1162 COLLECT(ctag->name, &lc, len);
1163 tokeniser->context.pending += len;
1164 } else {
1165 COLLECT(ctag->name, cptr, len);
1166 tokeniser->context.pending += len;
1167 }
1168
1169 return HUBBUB_OK;
1170 }
1171
hubbub_tokeniser_handle_before_attribute_name(hubbub_tokeniser * tokeniser)1172 hubbub_error hubbub_tokeniser_handle_before_attribute_name(
1173 hubbub_tokeniser *tokeniser)
1174 {
1175 hubbub_tag *ctag = &tokeniser->context.current_tag;
1176
1177 size_t len;
1178 const uint8_t *cptr;
1179 parserutils_error error;
1180 uint8_t c;
1181
1182 error = parserutils_inputstream_peek(tokeniser->input,
1183 tokeniser->context.pending, &cptr, &len);
1184
1185 if (error != PARSERUTILS_OK) {
1186 if (error == PARSERUTILS_EOF) {
1187 tokeniser->state = STATE_DATA;
1188 return emit_current_tag(tokeniser);
1189 } else {
1190 return hubbub_error_from_parserutils_error(error);
1191 }
1192 }
1193
1194 c = *cptr;
1195
1196 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1197 /* pass over in silence */
1198 tokeniser->context.pending += len;
1199 } else if (c == '>') {
1200 tokeniser->context.pending += len;
1201 tokeniser->state = STATE_DATA;
1202 return emit_current_tag(tokeniser);
1203 } else if (c == '/') {
1204 tokeniser->context.pending += len;
1205 tokeniser->state = STATE_SELF_CLOSING_START_TAG;
1206 } else {
1207 hubbub_attribute *attr;
1208
1209 if (c == '"' || c == '\'' || c == '=') {
1210 /** \todo parse error */
1211 }
1212
1213 attr = realloc(ctag->attributes,
1214 (ctag->n_attributes + 1) *
1215 sizeof(hubbub_attribute));
1216 if (attr == NULL)
1217 return HUBBUB_NOMEM;
1218
1219 ctag->attributes = attr;
1220
1221 if ('A' <= c && c <= 'Z') {
1222 uint8_t lc = (c + 0x20);
1223 START_BUF(attr[ctag->n_attributes].name, &lc, len);
1224 } else if (c == '\0') {
1225 START_BUF(attr[ctag->n_attributes].name,
1226 u_fffd, sizeof(u_fffd));
1227 } else {
1228 START_BUF(attr[ctag->n_attributes].name, cptr, len);
1229 }
1230
1231 attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
1232 attr[ctag->n_attributes].value.ptr = NULL;
1233 attr[ctag->n_attributes].value.len = 0;
1234
1235 ctag->n_attributes++;
1236
1237 tokeniser->context.pending += len;
1238 tokeniser->state = STATE_ATTRIBUTE_NAME;
1239 }
1240
1241 return HUBBUB_OK;
1242 }
1243
hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser * tokeniser)1244 hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
1245 {
1246 hubbub_tag *ctag = &tokeniser->context.current_tag;
1247
1248 size_t len;
1249 const uint8_t *cptr;
1250 parserutils_error error;
1251 uint8_t c;
1252
1253 assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);
1254
1255 error = parserutils_inputstream_peek(tokeniser->input,
1256 tokeniser->context.pending, &cptr, &len);
1257
1258 if (error != PARSERUTILS_OK) {
1259 if (error == PARSERUTILS_EOF) {
1260 tokeniser->state = STATE_DATA;
1261 return emit_current_tag(tokeniser);
1262 } else {
1263 return hubbub_error_from_parserutils_error(error);
1264 }
1265 }
1266
1267 c = *cptr;
1268
1269 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1270 tokeniser->context.pending += len;
1271 tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
1272 } else if (c == '=') {
1273 tokeniser->context.pending += len;
1274 tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
1275 } else if (c == '>') {
1276 tokeniser->context.pending += len;
1277 tokeniser->state = STATE_DATA;
1278 return emit_current_tag(tokeniser);
1279 } else if (c == '/') {
1280 tokeniser->context.pending += len;
1281 tokeniser->state = STATE_SELF_CLOSING_START_TAG;
1282 } else if (c == '\0') {
1283 COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
1284 u_fffd, sizeof(u_fffd));
1285 tokeniser->context.pending += len;
1286 } else if ('A' <= c && c <= 'Z') {
1287 uint8_t lc = (c + 0x20);
1288 COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
1289 &lc, len);
1290 tokeniser->context.pending += len;
1291 } else {
1292 COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
1293 cptr, len);
1294 tokeniser->context.pending += len;
1295 }
1296
1297 return HUBBUB_OK;
1298 }
1299
hubbub_tokeniser_handle_after_attribute_name(hubbub_tokeniser * tokeniser)1300 hubbub_error hubbub_tokeniser_handle_after_attribute_name(
1301 hubbub_tokeniser *tokeniser)
1302 {
1303 hubbub_tag *ctag = &tokeniser->context.current_tag;
1304
1305 size_t len;
1306 const uint8_t *cptr;
1307 parserutils_error error;
1308 uint8_t c;
1309
1310 error = parserutils_inputstream_peek(tokeniser->input,
1311 tokeniser->context.pending, &cptr, &len);
1312
1313 if (error != PARSERUTILS_OK) {
1314 if (error == PARSERUTILS_EOF) {
1315 tokeniser->state = STATE_DATA;
1316 return emit_current_tag(tokeniser);
1317 } else {
1318 return hubbub_error_from_parserutils_error(error);
1319 }
1320 }
1321
1322 c = *cptr;
1323
1324 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1325 tokeniser->context.pending += len;
1326 } else if (c == '=') {
1327 tokeniser->context.pending += len;
1328 tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
1329 } else if (c == '>') {
1330 tokeniser->context.pending += len;
1331
1332 tokeniser->state = STATE_DATA;
1333 return emit_current_tag(tokeniser);
1334 } else if (c == '/') {
1335 tokeniser->context.pending += len;
1336 tokeniser->state = STATE_SELF_CLOSING_START_TAG;
1337 } else {
1338 hubbub_attribute *attr;
1339
1340 if (c == '"' || c == '\'') {
1341 /** \todo parse error */
1342 }
1343
1344 attr = realloc(ctag->attributes,
1345 (ctag->n_attributes + 1) *
1346 sizeof(hubbub_attribute));
1347 if (attr == NULL)
1348 return HUBBUB_NOMEM;
1349
1350 ctag->attributes = attr;
1351
1352 if ('A' <= c && c <= 'Z') {
1353 uint8_t lc = (c + 0x20);
1354 START_BUF(attr[ctag->n_attributes].name, &lc, len);
1355 } else if (c == '\0') {
1356 START_BUF(attr[ctag->n_attributes].name,
1357 u_fffd, sizeof(u_fffd));
1358 } else {
1359 START_BUF(attr[ctag->n_attributes].name, cptr, len);
1360 }
1361
1362 attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
1363 attr[ctag->n_attributes].value.ptr = NULL;
1364 attr[ctag->n_attributes].value.len = 0;
1365
1366 ctag->n_attributes++;
1367
1368 tokeniser->context.pending += len;
1369 tokeniser->state = STATE_ATTRIBUTE_NAME;
1370 }
1371
1372 return HUBBUB_OK;
1373 }
1374
1375 /* this state is only ever triggered by an '=' */
hubbub_tokeniser_handle_before_attribute_value(hubbub_tokeniser * tokeniser)1376 hubbub_error hubbub_tokeniser_handle_before_attribute_value(
1377 hubbub_tokeniser *tokeniser)
1378 {
1379 hubbub_tag *ctag = &tokeniser->context.current_tag;
1380
1381 size_t len;
1382 const uint8_t *cptr;
1383 parserutils_error error;
1384 uint8_t c;
1385
1386 error = parserutils_inputstream_peek(tokeniser->input,
1387 tokeniser->context.pending, &cptr, &len);
1388
1389 if (error != PARSERUTILS_OK) {
1390 if (error == PARSERUTILS_EOF) {
1391 /** \todo parse error */
1392 tokeniser->state = STATE_DATA;
1393 return emit_current_tag(tokeniser);
1394 } else {
1395 return hubbub_error_from_parserutils_error(error);
1396 }
1397 }
1398
1399 c = *cptr;
1400
1401 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1402 tokeniser->context.pending += len;
1403 } else if (c == '"') {
1404 tokeniser->context.pending += len;
1405 tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
1406 } else if (c == '&') {
1407 /* Don't consume the '&' -- reprocess in UQ state */
1408 tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
1409 } else if (c == '\'') {
1410 tokeniser->context.pending += len;
1411 tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
1412 } else if (c == '>') {
1413 /** \todo parse error */
1414 tokeniser->context.pending += len;
1415
1416 tokeniser->state = STATE_DATA;
1417 return emit_current_tag(tokeniser);
1418 } else if (c == '\0') {
1419 START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
1420 u_fffd, sizeof(u_fffd));
1421 tokeniser->context.pending += len;
1422 tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
1423 } else {
1424 if (c == '=') {
1425 /** \todo parse error */
1426 }
1427
1428 START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
1429 cptr, len);
1430
1431 tokeniser->context.pending += len;
1432 tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
1433 }
1434
1435 return HUBBUB_OK;
1436 }
1437
hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser * tokeniser)1438 hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
1439 hubbub_tokeniser *tokeniser)
1440 {
1441 hubbub_tag *ctag = &tokeniser->context.current_tag;
1442
1443 size_t len;
1444 const uint8_t *cptr;
1445 parserutils_error error;
1446 uint8_t c;
1447
1448 error = parserutils_inputstream_peek(tokeniser->input,
1449 tokeniser->context.pending, &cptr, &len);
1450
1451 if (error != PARSERUTILS_OK) {
1452 if (error == PARSERUTILS_EOF) {
1453 tokeniser->state = STATE_DATA;
1454 return emit_current_tag(tokeniser);
1455 } else {
1456 return hubbub_error_from_parserutils_error(error);
1457 }
1458 }
1459
1460 c = *cptr;
1461
1462 if (c == '"') {
1463 tokeniser->context.pending += len;
1464 tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
1465 } else if (c == '&') {
1466 tokeniser->context.prev_state = tokeniser->state;
1467 tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
1468 tokeniser->context.allowed_char = '"';
1469 /* Don't eat the '&'; it'll be handled by entity consumption */
1470 } else if (c == '\0') {
1471 COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
1472 u_fffd, sizeof(u_fffd));
1473 tokeniser->context.pending += len;
1474 } else if (c == '\r') {
1475 error = parserutils_inputstream_peek(
1476 tokeniser->input,
1477 tokeniser->context.pending + len,
1478 &cptr,
1479 &len);
1480
1481 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1482 return hubbub_error_from_parserutils_error(error);
1483 } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1484 COLLECT_MS(ctag->attributes[
1485 ctag->n_attributes - 1].value,
1486 &lf, sizeof(lf));
1487 }
1488
1489 /* Consume '\r' */
1490 tokeniser->context.pending += 1;
1491 } else {
1492 COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
1493 cptr, len);
1494 tokeniser->context.pending += len;
1495 }
1496
1497 return HUBBUB_OK;
1498 }
1499
hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser * tokeniser)1500 hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
1501 hubbub_tokeniser *tokeniser)
1502 {
1503 hubbub_tag *ctag = &tokeniser->context.current_tag;
1504
1505 size_t len;
1506 const uint8_t *cptr;
1507 parserutils_error error;
1508 uint8_t c;
1509
1510 error = parserutils_inputstream_peek(tokeniser->input,
1511 tokeniser->context.pending, &cptr, &len);
1512
1513 if (error != PARSERUTILS_OK) {
1514 if (error == PARSERUTILS_EOF) {
1515 tokeniser->state = STATE_DATA;
1516 return emit_current_tag(tokeniser);
1517 } else {
1518 return hubbub_error_from_parserutils_error(error);
1519 }
1520 }
1521
1522 c = *cptr;
1523
1524 if (c == '\'') {
1525 tokeniser->context.pending += len;
1526 tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
1527 } else if (c == '&') {
1528 tokeniser->context.prev_state = tokeniser->state;
1529 tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
1530 tokeniser->context.allowed_char = '\'';
1531 /* Don't eat the '&'; it'll be handled by entity consumption */
1532 } else if (c == '\0') {
1533 COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
1534 u_fffd, sizeof(u_fffd));
1535 tokeniser->context.pending += len;
1536 } else if (c == '\r') {
1537 error = parserutils_inputstream_peek(
1538 tokeniser->input,
1539 tokeniser->context.pending + len,
1540 &cptr,
1541 &len);
1542
1543 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1544 return hubbub_error_from_parserutils_error(error);
1545 } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1546 COLLECT_MS(ctag->attributes[
1547 ctag->n_attributes - 1].value,
1548 &lf, sizeof(lf));
1549 }
1550
1551 /* Consume \r */
1552 tokeniser->context.pending += 1;
1553 } else {
1554 COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
1555 cptr, len);
1556 tokeniser->context.pending += len;
1557 }
1558
1559 return HUBBUB_OK;
1560 }
1561
hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser * tokeniser)1562 hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
1563 hubbub_tokeniser *tokeniser)
1564 {
1565 hubbub_tag *ctag = &tokeniser->context.current_tag;
1566 uint8_t c;
1567
1568 size_t len;
1569 const uint8_t *cptr;
1570 parserutils_error error;
1571
1572 error = parserutils_inputstream_peek(tokeniser->input,
1573 tokeniser->context.pending, &cptr, &len);
1574
1575 if (error != PARSERUTILS_OK) {
1576 if (error == PARSERUTILS_EOF) {
1577 tokeniser->state = STATE_DATA;
1578 return emit_current_tag(tokeniser);
1579 } else {
1580 return hubbub_error_from_parserutils_error(error);
1581 }
1582 }
1583
1584 c = *cptr;
1585
1586 assert(c == '&' ||
1587 ctag->attributes[ctag->n_attributes - 1].value.len >= 1);
1588
1589 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1590 tokeniser->context.pending += len;
1591 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1592 } else if (c == '&') {
1593 tokeniser->context.prev_state = tokeniser->state;
1594 tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
1595 /* Don't eat the '&'; it'll be handled by entity consumption */
1596 } else if (c == '>') {
1597 tokeniser->context.pending += len;
1598 tokeniser->state = STATE_DATA;
1599 return emit_current_tag(tokeniser);
1600 } else if (c == '\0') {
1601 COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
1602 u_fffd, sizeof(u_fffd));
1603 tokeniser->context.pending += len;
1604 } else {
1605 if (c == '"' || c == '\'' || c == '=') {
1606 /** \todo parse error */
1607 }
1608
1609 COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
1610 cptr, len);
1611 tokeniser->context.pending += len;
1612 }
1613
1614 return HUBBUB_OK;
1615 }
1616
hubbub_tokeniser_handle_character_reference_in_attribute_value(hubbub_tokeniser * tokeniser)1617 hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
1618 hubbub_tokeniser *tokeniser)
1619 {
1620 if (tokeniser->context.match_entity.complete == false) {
1621 return hubbub_tokeniser_consume_character_reference(tokeniser,
1622 tokeniser->context.pending);
1623 } else {
1624 hubbub_tag *ctag = &tokeniser->context.current_tag;
1625 hubbub_attribute *attr = &ctag->attributes[
1626 ctag->n_attributes - 1];
1627
1628 uint8_t utf8[6];
1629 uint8_t *utf8ptr = utf8;
1630 size_t len = sizeof(utf8);
1631
1632 if (tokeniser->context.match_entity.codepoint) {
1633 parserutils_charset_utf8_from_ucs4(
1634 tokeniser->context.match_entity.codepoint,
1635 &utf8ptr, &len);
1636
1637 COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);
1638
1639 /* +1 for the ampersand */
1640 tokeniser->context.pending +=
1641 tokeniser->context.match_entity.length
1642 + 1;
1643 } else {
1644 size_t len = 0;
1645 const uint8_t *cptr = NULL;
1646 parserutils_error error;
1647
1648 error = parserutils_inputstream_peek(
1649 tokeniser->input,
1650 tokeniser->context.pending,
1651 &cptr,
1652 &len);
1653 if (error != PARSERUTILS_OK) {
1654 return hubbub_error_from_parserutils_error(
1655 error);
1656 }
1657
1658 /* Insert the ampersand */
1659 COLLECT_MS(attr->value, cptr, len);
1660 tokeniser->context.pending += len;
1661 }
1662
1663 /* Reset for next time */
1664 tokeniser->context.match_entity.complete = false;
1665
1666 /* And back to the previous state */
1667 tokeniser->state = tokeniser->context.prev_state;
1668 }
1669
1670 return HUBBUB_OK;
1671 }
1672
1673 /* always switches state */
hubbub_tokeniser_handle_after_attribute_value_q(hubbub_tokeniser * tokeniser)1674 hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
1675 hubbub_tokeniser *tokeniser)
1676 {
1677 size_t len;
1678 const uint8_t *cptr;
1679 parserutils_error error;
1680 uint8_t c;
1681
1682 error = parserutils_inputstream_peek(tokeniser->input,
1683 tokeniser->context.pending, &cptr, &len);
1684
1685 if (error != PARSERUTILS_OK) {
1686 if (error == PARSERUTILS_EOF) {
1687 tokeniser->state = STATE_DATA;
1688 return emit_current_tag(tokeniser);
1689 } else {
1690 return hubbub_error_from_parserutils_error(error);
1691 }
1692 }
1693
1694 c = *cptr;
1695
1696 if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
1697 tokeniser->context.pending += len;
1698 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1699 } else if (c == '>') {
1700 tokeniser->context.pending += len;
1701
1702 tokeniser->state = STATE_DATA;
1703 return emit_current_tag(tokeniser);
1704 } else if (c == '/') {
1705 tokeniser->context.pending += len;
1706 tokeniser->state = STATE_SELF_CLOSING_START_TAG;
1707 } else {
1708 /** \todo parse error */
1709 /* Reprocess character in before attribute name state */
1710 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1711 }
1712
1713 return HUBBUB_OK;
1714 }
1715
hubbub_tokeniser_handle_self_closing_start_tag(hubbub_tokeniser * tokeniser)1716 hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
1717 hubbub_tokeniser *tokeniser)
1718 {
1719 size_t len;
1720 const uint8_t *cptr;
1721 parserutils_error error;
1722 uint8_t c;
1723
1724 error = parserutils_inputstream_peek(tokeniser->input,
1725 tokeniser->context.pending, &cptr, &len);
1726
1727 if (error != PARSERUTILS_OK) {
1728 if (error == PARSERUTILS_EOF) {
1729 tokeniser->state = STATE_DATA;
1730 return emit_current_tag(tokeniser);
1731 } else {
1732 return hubbub_error_from_parserutils_error(error);
1733 }
1734 }
1735
1736 c = *cptr;
1737
1738 if (c == '>') {
1739 tokeniser->context.pending += len;
1740 tokeniser->state = STATE_DATA;
1741
1742 tokeniser->context.current_tag.self_closing = true;
1743 return emit_current_tag(tokeniser);
1744 } else {
1745 /* Reprocess character in before attribute name state */
1746 tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
1747 }
1748
1749 return HUBBUB_OK;
1750 }
1751
1752 /* this state expects tokeniser->context.chars to be empty on first entry */
hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser * tokeniser)1753 hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
1754 {
1755 size_t len;
1756 const uint8_t *cptr;
1757 parserutils_error error;
1758 uint8_t c;
1759
1760 error = parserutils_inputstream_peek(tokeniser->input,
1761 tokeniser->context.pending, &cptr, &len);
1762
1763 if (error != PARSERUTILS_OK) {
1764 if (error == PARSERUTILS_EOF) {
1765 tokeniser->state = STATE_DATA;
1766 return emit_current_comment(tokeniser);
1767 } else {
1768 return hubbub_error_from_parserutils_error(error);
1769 }
1770 }
1771
1772 c = *cptr;
1773
1774 if (c == '>') {
1775 tokeniser->context.pending += len;
1776 tokeniser->state = STATE_DATA;
1777 return emit_current_comment(tokeniser);
1778 } else if (c == '\0') {
1779 error = parserutils_buffer_append(tokeniser->buffer,
1780 u_fffd, sizeof(u_fffd));
1781 if (error != PARSERUTILS_OK)
1782 return hubbub_error_from_parserutils_error(error);
1783
1784 tokeniser->context.pending += len;
1785 } else if (c == '\r') {
1786 error = parserutils_inputstream_peek(
1787 tokeniser->input,
1788 tokeniser->context.pending,
1789 &cptr,
1790 &len);
1791
1792 if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1793 return hubbub_error_from_parserutils_error(error);
1794 } else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1795 error = parserutils_buffer_append(tokeniser->buffer,
1796 &lf, sizeof(lf));
1797 if (error != PARSERUTILS_OK) {
1798 return hubbub_error_from_parserutils_error(
1799 error);
1800 }
1801 }
1802 tokeniser->context.pending += len;
1803 } else {
1804 error = parserutils_buffer_append(tokeniser->buffer,
1805 (uint8_t *) cptr, len);
1806 if (error != PARSERUTILS_OK)
1807 return hubbub_error_from_parserutils_error(error);
1808
1809 tokeniser->context.pending += len;
1810 }
1811
1812 return HUBBUB_OK;
1813 }
1814
1815 /* this state always switches to another state straight away */
hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 0);

	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '-') {
		tokeniser->context.pending = len;
		tokeniser->state = STATE_MATCH_COMMENT;
	} else if ((c & ~0x20) == 'D') {
		tokeniser->context.pending = len;
		tokeniser->context.match_doctype.count = len;
		tokeniser->state = STATE_MATCH_DOCTYPE;
	} else if (tokeniser->process_cdata_section == true && c == '[') {
		tokeniser->context.pending = len;
		tokeniser->context.match_cdata.count = len;
		tokeniser->state = STATE_MATCH_CDATA;
	} else {
		tokeniser->state = STATE_BOGUS_COMMENT;
	}

	return HUBBUB_OK;
}


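/**
 * Handle the "match comment" state: look for the second '-' of "<!--",
 * falling back to a bogus comment if it is absent.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */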
hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.pending =
				tokeniser->context.current_comment.len = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	tokeniser->context.pending = tokeniser->context.current_comment.len = 0;

	if (*cptr == '-') {
		parserutils_inputstream_advance(tokeniser->input, SLEN("--"));
		tokeniser->state = STATE_COMMENT_START;
	} else {
		tokeniser->state = STATE_BOGUS_COMMENT;
	}

	return HUBBUB_OK;
}


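/**
 * Handle the comment states (comment start, comment start dash, comment,
 * comment end dash and comment end), which are distinguished by
 * tokeniser->state.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */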
hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_comment(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
			tokeniser->state == STATE_COMMENT_START ||
			tokeniser->state == STATE_COMMENT_END)) {
		tokeniser->context.pending += len;

		/** \todo parse error if state != COMMENT_END */
		tokeniser->state = STATE_DATA;
		return emit_current_comment(tokeniser);
	} else if (c == '-') {
		if (tokeniser->state == STATE_COMMENT_START) {
			tokeniser->state = STATE_COMMENT_START_DASH;
		} else if (tokeniser->state == STATE_COMMENT_START_DASH) {
			tokeniser->state = STATE_COMMENT_END;
		} else if (tokeniser->state == STATE_COMMENT) {
			tokeniser->state = STATE_COMMENT_END_DASH;
		} else if (tokeniser->state == STATE_COMMENT_END_DASH) {
			tokeniser->state = STATE_COMMENT_END;
		} else if (tokeniser->state == STATE_COMMENT_END) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "-", SLEN("-"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		tokeniser->context.pending += len;
	} else {
		if (tokeniser->state == STATE_COMMENT_START_DASH ||
				tokeniser->state == STATE_COMMENT_END_DASH) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "-", SLEN("-"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		} else if (tokeniser->state == STATE_COMMENT_END) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "--", SLEN("--"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		if (c == '\0') {
			error = parserutils_buffer_append(tokeniser->buffer,
					u_fffd, sizeof(u_fffd));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		} else if (c == '\r') {
			size_t next_len;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&next_len);
			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error == PARSERUTILS_EOF ||
					*cptr != '\n') {
				/* Normalise the CR to an LF, folding a CRLF
				 * pair into a single LF */
				error = parserutils_buffer_append(
						tokeniser->buffer,
						&lf, sizeof(lf));
				if (error != PARSERUTILS_OK) {
					return hubbub_error_from_parserutils_error(
							error);
				}
			}
		} else {
			error = parserutils_buffer_append(tokeniser->buffer,
					cptr, len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		tokeniser->context.pending += len;
		tokeniser->state = STATE_COMMENT;
	}

	return HUBBUB_OK;
}


#define DOCTYPE "DOCTYPE"
#define DOCTYPE_LEN (SLEN(DOCTYPE) - 1)

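/**
 * Handle the "match doctype" state: match the remainder of "DOCTYPE"
 * case-insensitively, falling back to a bogus comment on mismatch.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */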
hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.match_doctype.count, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_comment.len =
					tokeniser->context.pending = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);

	if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_comment.len =
				tokeniser->context.pending = 0;
		tokeniser->state = STATE_BOGUS_COMMENT;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
		/* Skip over the DOCTYPE bit */
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.pending);

		memset(&tokeniser->context.current_doctype, 0,
				sizeof tokeniser->context.current_doctype);
		tokeniser->context.current_doctype.public_missing = true;
		tokeniser->context.current_doctype.system_missing = true;
		tokeniser->context.pending = 0;

		tokeniser->state = STATE_DOCTYPE;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef DOCTYPE
#undef DOCTYPE_LEN

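/**
 * Handle the "doctype" state: consume at most one whitespace character
 * after "DOCTYPE", then move to the "before doctype name" state.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */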
hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	}

	tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;

	return HUBBUB_OK;
}

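/**
 * Handle the "before doctype name" state: skip whitespace, then begin
 * collecting the doctype name (lowercased, with NULs replaced).
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */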
hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			/* Emit current doctype, force-quirks on */
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		if (c == '\0') {
			START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = c + 0x20;

			START_BUF(cdoc->name, &lc, len);
		} else {
			START_BUF(cdoc->name, cptr, len);
		}

		tokeniser->context.pending += len;
		tokeniser->state = STATE_DOCTYPE_NAME;
	}

	return HUBBUB_OK;
}

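/**
 * Handle the "doctype name" state: collect the doctype name until
 * whitespace or '>', lowercasing A-Z and replacing NULs with U+FFFD.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */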
hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else if (c == '\0') {
		COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = c + 0x20;
		COLLECT(cdoc->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(cdoc->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

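/**
 * Handle the "after doctype name" state: look for a PUBLIC or SYSTEM
 * keyword or the terminating '>'; anything else forces quirks mode and
 * a bogus doctype.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */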
hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else if ((c & ~0x20) == 'P') {
		tokeniser->context.match_doctype.count = 1;
		tokeniser->state = STATE_MATCH_PUBLIC;
	} else if ((c & ~0x20) == 'S') {
		tokeniser->context.match_doctype.count = 1;
		tokeniser->state = STATE_MATCH_SYSTEM;
	} else {
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		tokeniser->context.current_doctype.force_quirks = true;
	}

	return HUBBUB_OK;
}

#define PUBLIC "PUBLIC"
#define PUBLIC_LEN (SLEN(PUBLIC) - 1)

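/**
 * Handle the "match public" state: match the remainder of "PUBLIC"
 * case-insensitively, falling back to a bogus doctype on mismatch.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */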
hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_doctype.force_quirks = true;
			tokeniser->state = STATE_BOGUS_DOCTYPE;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);

	if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_doctype.force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
		tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef PUBLIC
#undef PUBLIC_LEN

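/**
 * Handle the "before doctype public" state: skip whitespace and find
 * the opening quote of the public identifier.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */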
hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '"') {
		cdoc->public_missing = false;
		cdoc->public_id.len = 0;
		tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
	} else if (c == '\'') {
		cdoc->public_missing = false;
		cdoc->public_id.len = 0;
		tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		cdoc->force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}

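/**
 * Handle the "doctype public identifier (double-quoted)" state: collect
 * the public identifier up to the closing '"'.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */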
hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		size_t next_len;

		/* Peek past the CR so that a CRLF pair collapses to a
		 * single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&next_len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->public_id, cptr, len);

		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

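/**
 * Handle the "doctype public identifier (single-quoted)" state: collect
 * the public identifier up to the closing single quote.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */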
hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		size_t next_len;

		/* Peek past the CR so that a CRLF pair collapses to a
		 * single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&next_len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->public_id, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}


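/**
 * Handle the "after doctype public" state: skip whitespace, then find
 * an optional quoted system identifier or the terminating '>'.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */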
hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '"') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
	} else if (c == '\'') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else {
		cdoc->force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}


#define SYSTEM "SYSTEM"
#define SYSTEM_LEN (SLEN(SYSTEM) - 1)

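/**
 * Handle the "match system" state: match the remainder of "SYSTEM"
 * case-insensitively, falling back to a bogus doctype on mismatch.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */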
hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_doctype.force_quirks = true;
			tokeniser->state = STATE_BOGUS_DOCTYPE;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);

	if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_doctype.force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
		tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef SYSTEM
#undef SYSTEM_LEN

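/**
 * Handle the "before doctype system" state: skip whitespace and find
 * the opening quote of the system identifier.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */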
hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '"') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
	} else if (c == '\'') {
		cdoc->system_missing = false;
		cdoc->system_id.len = 0;

		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		cdoc->force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}

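/**
 * Handle the "doctype system identifier (double-quoted)" state: collect
 * the system identifier up to the closing '"'.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */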
hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		size_t next_len;

		/* Peek past the CR so that a CRLF pair collapses to a
		 * single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&next_len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->system_id, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

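/**
 * Handle the "doctype system identifier (single-quoted)" state: collect
 * the system identifier up to the closing single quote.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */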
hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else if (c == '\0') {
		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		size_t next_len;

		/* Peek past the CR so that a CRLF pair collapses to a
		 * single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&next_len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
		}

		/* Collect '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(cdoc->system_id, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

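/**
 * Handle the "after doctype system" state: skip whitespace until the
 * terminating '>'; anything else leads to a bogus doctype.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */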
hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
	} else if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else {
		tokeniser->state = STATE_BOGUS_DOCTYPE;
	}

	return HUBBUB_OK;
}


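/**
 * Handle the "bogus doctype" state: discard input until '>' or EOF,
 * then emit the current doctype token.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */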
hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, false);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;
	tokeniser->context.pending += len;

	if (c == '>') {
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	}

	return HUBBUB_OK;
}


#define CDATA "[CDATA["
#define CDATA_LEN (SLEN(CDATA) - 1)

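/**
 * Handle the "match CDATA" state: match the remainder of "[CDATA[",
 * falling back to a bogus comment on mismatch.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */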
hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_comment.len =
					tokeniser->context.pending = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_cdata.count <= CDATA_LEN);

	if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
		tokeniser->context.current_comment.len =
				tokeniser->context.pending = 0;
		tokeniser->state = STATE_BOGUS_COMMENT;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_cdata.count == CDATA_LEN) {
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.match_cdata.count + len);
		tokeniser->context.pending = 0;
		tokeniser->context.match_cdata.end = 0;
		tokeniser->state = STATE_CDATA_BLOCK;
	}

	tokeniser->context.match_cdata.count += len;

	return HUBBUB_OK;
}

#undef CDATA
#undef CDATA_LEN


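/**
 * Handle the "CDATA block" state: emit the section's contents as
 * character tokens until the terminating "]]>" is found.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */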
hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_chars(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
			tokeniser->context.match_cdata.end == 1)) {
		tokeniser->context.pending += len;
		tokeniser->context.match_cdata.end += len;
	} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
		/* Remove the previous two "]]" */
		tokeniser->context.pending -= 2;

		if (tokeniser->context.pending > 0) {
			/* Emit any pending characters */
			emit_current_chars(tokeniser);
		}

		/* Now move past the "]]>" bit */
		parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));

		tokeniser->state = STATE_DATA;
	} else if (c == '\0') {
		if (tokeniser->context.pending > 0) {
			/* Emit any pending characters */
			emit_current_chars(tokeniser);
		}

		/* Perform NUL-byte replacement */
		emit_character_token(tokeniser, &u_fffd_str);

		parserutils_inputstream_advance(tokeniser->input, len);
		tokeniser->context.match_cdata.end = 0;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (tokeniser->context.pending > 0) {
			/* Emit any pending characters */
			emit_current_chars(tokeniser);
		}

		if (error == PARSERUTILS_EOF || *cptr != '\n') {
			/* Emit newline */
			emit_character_token(tokeniser, &lf_str);
		}

		/* Advance over \r */
		parserutils_inputstream_advance(tokeniser->input, 1);
		tokeniser->context.match_cdata.end = 0;
	} else {
		tokeniser->context.pending += len;
		tokeniser->context.match_cdata.end = 0;
	}

	return HUBBUB_OK;
}


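/**
 * Start consuming a character reference.
 *
 * The input at \a pos must be an ampersand. Sets up the entity-matching
 * context and switches to the numbered or named entity state; a bare
 * ampersand (followed by whitespace, '<', '&', EOF or the current
 * additional allowed character) is flagged complete with codepoint 0.
 *
 * \param tokeniser	Tokeniser instance
 * \param pos	Byte offset of the ampersand in the input stream
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */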
hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t pos)
{
	uint32_t allowed_char = tokeniser->context.allowed_char;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;
	size_t off;

	error = parserutils_inputstream_peek(tokeniser->input, pos,
			&cptr, &len);

	/* We should always start on an ampersand */
	assert(error == PARSERUTILS_OK);
	assert(len == 1 && *cptr == '&');

	off = pos + len;

	/* Look at the character after the ampersand */
	error = parserutils_inputstream_peek(tokeniser->input, off,
			&cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.match_entity.complete = true;
			tokeniser->context.match_entity.codepoint = 0;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	/* Set things up */
	tokeniser->context.match_entity.offset = off;
	tokeniser->context.match_entity.poss_length = 0;
	tokeniser->context.match_entity.length = 0;
	tokeniser->context.match_entity.base = 0;
	tokeniser->context.match_entity.codepoint = 0;
	tokeniser->context.match_entity.had_data = false;
	tokeniser->context.match_entity.return_state = tokeniser->state;
	tokeniser->context.match_entity.complete = false;
	tokeniser->context.match_entity.overflow = false;
	tokeniser->context.match_entity.context = -1;
	tokeniser->context.match_entity.prev_len = len;

	/* Reset allowed character for future calls */
	tokeniser->context.allowed_char = '\0';

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
			c == '<' || c == '&' ||
			(allowed_char && c == allowed_char)) {
		tokeniser->context.match_entity.complete = true;
		tokeniser->context.match_entity.codepoint = 0;
	} else if (c == '#') {
		tokeniser->context.match_entity.length += len;
		tokeniser->state = STATE_NUMBERED_ENTITY;
	} else {
		tokeniser->state = STATE_NAMED_ENTITY;
	}

	return HUBBUB_OK;
}


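/**
 * Handle the "numbered entity" state: parse a decimal or hexadecimal
 * character reference, remapping Windows-1252 codepoints 128-159 via
 * cp1252Table and replacing disallowed codepoints with U+FFFD.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */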
hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			ctx->match_entity.offset + ctx->match_entity.length,
			&cptr, &len);

	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
		return hubbub_error_from_parserutils_error(error);
	}

	if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
		uint8_t c = *cptr;
		if ((c & ~0x20) == 'X') {
			ctx->match_entity.base = 16;
			ctx->match_entity.length += len;
		} else {
			ctx->match_entity.base = 10;
		}
	}

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			ctx->match_entity.offset + ctx->match_entity.length,
			&cptr, &len)) == PARSERUTILS_OK) {
		uint8_t c = *cptr;

		if (ctx->match_entity.base == 10 &&
				('0' <= c && c <= '9')) {
			ctx->match_entity.had_data = true;
			ctx->match_entity.codepoint =
				ctx->match_entity.codepoint * 10 + (c - '0');

			ctx->match_entity.length += len;
		} else if (ctx->match_entity.base == 16 &&
				(('0' <= c && c <= '9') ||
				('A' <= (c & ~0x20) &&
						(c & ~0x20) <= 'F'))) {
			ctx->match_entity.had_data = true;
			ctx->match_entity.codepoint *= 16;

			if ('0' <= c && c <= '9') {
				ctx->match_entity.codepoint += (c - '0');
			} else {
				ctx->match_entity.codepoint +=
						((c & ~0x20) - 'A' + 10);
			}

			ctx->match_entity.length += len;
		} else {
			break;
		}

		if (ctx->match_entity.codepoint >= 0x10FFFF) {
			ctx->match_entity.overflow = true;
		}
	}

	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
		return hubbub_error_from_parserutils_error(error);
	}

	/* Eat trailing semicolon, if any */
	if (error != PARSERUTILS_EOF && *cptr == ';') {
		ctx->match_entity.length += len;
	}

	/* Had data, so calculate final codepoint */
	if (ctx->match_entity.had_data) {
		uint32_t cp = ctx->match_entity.codepoint;

		if (0x80 <= cp && cp <= 0x9F) {
			cp = cp1252Table[cp - 0x80];
		} else if (cp == 0x0D) {
			cp = 0x000A;
		} else if (ctx->match_entity.overflow ||
				cp <= 0x0008 || cp == 0x000B ||
				(0x000E <= cp && cp <= 0x001F) ||
				(0x007F <= cp && cp <= 0x009F) ||
				(0xD800 <= cp && cp <= 0xDFFF) ||
				(0xFDD0 <= cp && cp <= 0xFDEF) ||
				(cp & 0xFFFE) == 0xFFFE) {
			/* the check for cp > 0x10FFFF per spec is performed
			 * in the loop above to avoid overflow */
			cp = 0xFFFD;
		}

		ctx->match_entity.codepoint = cp;
	}

	/* Flag completion */
	ctx->match_entity.complete = true;

	/* And back to the state we were entered in */
	tokeniser->state = ctx->match_entity.return_state;

	return HUBBUB_OK;
}

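/**
 * Handle the "named entity" state: incrementally match the longest
 * known entity name, rejecting semicolon-less matches in attribute
 * values that are followed by an alphanumeric character.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */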
hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			ctx->match_entity.offset +
					ctx->match_entity.poss_length,
			&cptr, &len)) == PARSERUTILS_OK) {
		uint32_t cp;

		uint8_t c = *cptr;
		hubbub_error error;

		if (c > 0x7F) {
			/* Entity names are ASCII only */
			break;
		}

		error = hubbub_entities_search_step(c, &cp,
				&ctx->match_entity.context);
		if (error == HUBBUB_OK) {
			/* Had a match - store it for later */
			ctx->match_entity.codepoint = cp;

			ctx->match_entity.length =
					ctx->match_entity.poss_length + len;
			ctx->match_entity.poss_length =
					ctx->match_entity.length;
		} else if (error == HUBBUB_INVALID) {
			/* No further matches - use last found */
			break;
		} else {
			/* Need more data */
			ctx->match_entity.poss_length += len;
		}
	}

	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
		return hubbub_error_from_parserutils_error(error);
	}

	if (ctx->match_entity.length > 0) {
		uint8_t c;
		error = parserutils_inputstream_peek(tokeniser->input,
				ctx->match_entity.offset +
						ctx->match_entity.length - 1,
				&cptr, &len);
		/* We're re-reading a character we've already read, so no
		 * error can occur as a result. */
		assert(error == PARSERUTILS_OK);

		c = *cptr;

		if ((tokeniser->context.match_entity.return_state ==
				STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
				c != ';') {
			error = parserutils_inputstream_peek(tokeniser->input,
					ctx->match_entity.offset +
							ctx->match_entity.length,
					&cptr, &len);
			/* We must have attempted to read one more character
			 * than was present in the entity name, as that is the
			 * only way to break out of the loop above. If that
			 * failed, then any non-EOF case will have been handled
			 * by the if statement after the loop, thus it cannot
			 * occur here. */
			assert(error == PARSERUTILS_OK ||
					error == PARSERUTILS_EOF);

			if (error == PARSERUTILS_EOF) {
				ctx->match_entity.codepoint = 0;
			} else {
				/* An alphanumeric after the match means the
				 * reference is part of a longer name and is
				 * therefore not valid here */
				c = *cptr;
				if ((0x0030 <= c && c <= 0x0039) ||
						(0x0041 <= c && c <= 0x005A) ||
						(0x0061 <= c && c <= 0x007A)) {
					ctx->match_entity.codepoint = 0;
				}
			}
		}
	}

	/* Flag completion */
	ctx->match_entity.complete = true;

	/* And back to the state from whence we came */
	tokeniser->state = ctx->match_entity.return_state;

	return HUBBUB_OK;
}



/*** Token emitting bits ***/

/**
 * Emit a character token.
 *
 * \param tokeniser	Tokeniser instance
 * \param chars	Pointer to hubbub_string to emit
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars)
{
	hubbub_token token;

	token.type = HUBBUB_TOKEN_CHARACTER;
	token.data.character = *chars;

	return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit the current pending characters being stored in the tokeniser context.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
{
	hubbub_token token;
	size_t len;
	const uint8_t *cptr = NULL;
	parserutils_error error;

	/* Calling this with nothing to output is a probable bug */
	assert(tokeniser->context.pending > 0);

	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
	if (error != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(error);

	token.type = HUBBUB_TOKEN_CHARACTER;
	token.data.character.ptr = cptr;
	token.data.character.len = tokeniser->context.pending;

	return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit the current tag token being stored in the tokeniser context.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
{
	hubbub_error err;
	hubbub_token token;
	uint32_t n_attributes;
	hubbub_attribute *attrs;
	uint8_t *ptr;
	uint32_t i, j;

	/* Emit current tag */
	token.type = tokeniser->context.current_tag_type;
	token.data.tag = tokeniser->context.current_tag;
	token.data.tag.ns = HUBBUB_NS_HTML;

	n_attributes = token.data.tag.n_attributes;
	attrs = token.data.tag.attributes;

	/* Set pointers correctly... */
	ptr = tokeniser->buffer->data;
	token.data.tag.name.ptr = tokeniser->buffer->data;
	ptr += token.data.tag.name.len;

	for (i = 0; i < n_attributes; i++) {
		attrs[i].name.ptr = ptr;
		ptr += attrs[i].name.len;
		attrs[i].value.ptr = ptr;
		ptr += attrs[i].value.len;
	}

	/* Discard duplicate attributes */
	for (i = 0; i < n_attributes; i++) {
		for (j = 0; j < n_attributes; j++) {
			uint32_t move;

			if (j == i ||
					attrs[i].name.len !=
							attrs[j].name.len ||
					strncmp((char *) attrs[i].name.ptr,
							(char *) attrs[j].name.ptr,
							attrs[i].name.len) != 0) {
				/* Attributes don't match */
				continue;
			}

			assert(i < j);

			/* Calculate amount to move */
			move = (n_attributes - 1 - j) *
					sizeof(hubbub_attribute);

			if (move > 0) {
				memmove(&attrs[j], &attrs[j + 1], move);
			}

			/* We've deleted an item, so we need to
			 * reprocess this index */
			j--;

			/* And reduce the number of attributes */
			n_attributes--;
		}
	}

	token.data.tag.n_attributes = n_attributes;

	err = hubbub_tokeniser_emit_token(tokeniser, &token);

	if (token.type == HUBBUB_TOKEN_START_TAG) {
		/* Save start tag name for R?CDATA */
		if (token.data.tag.name.len <
				sizeof(tokeniser->context.last_start_tag_name)) {
			strncpy((char *) tokeniser->context.last_start_tag_name,
					(const char *) token.data.tag.name.ptr,
					token.data.tag.name.len);
			tokeniser->context.last_start_tag_len =
					token.data.tag.name.len;
		} else {
			tokeniser->context.last_start_tag_name[0] = '\0';
			tokeniser->context.last_start_tag_len = 0;
		}
	} else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
		/* Reset content model after R?CDATA elements */
		tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
	}

	/* Reset the self-closing flag */
	tokeniser->context.current_tag.self_closing = false;

	return err;
}

/**
 * Emit the current comment token being stored in the tokeniser context.
 *
 * \param tokeniser	Tokeniser instance
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
{
	hubbub_token token;

	token.type = HUBBUB_TOKEN_COMMENT;
	token.data.comment.ptr = tokeniser->buffer->data;
	token.data.comment.len = tokeniser->buffer->length;

	return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit the current doctype token being stored in the tokeniser context.
 *
 * \param tokeniser	Tokeniser instance
 * \param force_quirks	Force quirks mode on this document
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks)
{
	hubbub_token token;

	/* Emit doctype */
	token.type = HUBBUB_TOKEN_DOCTYPE;
	token.data.doctype = tokeniser->context.current_doctype;
	if (force_quirks == true)
		token.data.doctype.force_quirks = true;

	/* Set pointers correctly */
	token.data.doctype.name.ptr = tokeniser->buffer->data;

	if (token.data.doctype.public_missing == false) {
		token.data.doctype.public_id.ptr = tokeniser->buffer->data +
				token.data.doctype.name.len;
	}

	if (token.data.doctype.system_missing == false) {
		token.data.doctype.system_id.ptr = tokeniser->buffer->data +
				token.data.doctype.name.len +
				token.data.doctype.public_id.len;
	}

	return hubbub_tokeniser_emit_token(tokeniser, &token);
}

/**
 * Emit a token, performing sanity checks if necessary
 *
 * \param tokeniser	Tokeniser instance
 * \param token	Token to emit
 * \return	HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token)
{
	hubbub_error err = HUBBUB_OK;

	assert(tokeniser != NULL);
	assert(token != NULL);
	assert(tokeniser->insert_buf->length == 0);

#ifndef NDEBUG
	/* Sanity checks */
	switch (token->type) {
	case HUBBUB_TOKEN_DOCTYPE:
		assert(memchr(token->data.doctype.name.ptr, 0xff,
				token->data.doctype.name.len) == NULL);
		if (token->data.doctype.public_missing == false)
			assert(memchr(token->data.doctype.public_id.ptr, 0xff,
					token->data.doctype.public_id.len) == NULL);
		if (token->data.doctype.system_missing == false)
			assert(memchr(token->data.doctype.system_id.ptr, 0xff,
					token->data.doctype.system_id.len) == NULL);
		break;
	case HUBBUB_TOKEN_START_TAG:
	case HUBBUB_TOKEN_END_TAG:
	{
		uint32_t i;
		assert(memchr(token->data.tag.name.ptr, 0xff,
				token->data.tag.name.len) == NULL);
		for (i = 0; i < token->data.tag.n_attributes; i++) {
			hubbub_attribute *attr = &token->data.tag.attributes[i];

			assert(memchr(attr->name.ptr, 0xff, attr->name.len) ==
					NULL);
			assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
					NULL);
		}
	}
		break;
	case HUBBUB_TOKEN_COMMENT:
		assert(memchr(token->data.comment.ptr, 0xff,
				token->data.comment.len) == NULL);
		break;
	case HUBBUB_TOKEN_CHARACTER:
		assert(memchr(token->data.character.ptr, 0xff,
				token->data.character.len) == NULL);
		break;
	case HUBBUB_TOKEN_EOF:
		break;
	}
#endif

	/* Emit the token */
	if (tokeniser->token_handler) {
		err = tokeniser->token_handler(token, tokeniser->token_pw);
	}

	/* Discard current buffer */
	if (tokeniser->buffer->length) {
		parserutils_buffer_discard(tokeniser->buffer, 0,
				tokeniser->buffer->length);
	}

	/* Advance the pointer */
	if (tokeniser->context.pending) {
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.pending);
		tokeniser->context.pending = 0;
	}

	if (tokeniser->insert_buf->length > 0) {
		parserutils_inputstream_insert(tokeniser->input,
				tokeniser->insert_buf->data,
				tokeniser->insert_buf->length);
		parserutils_buffer_discard(tokeniser->insert_buf, 0,
				tokeniser->insert_buf->length);
	}

	/* Ensure callback can pause the tokeniser */
	if (err == HUBBUB_PAUSED) {
		tokeniser->paused = true;
	}

	return err;
}
