1package HTML::HTML5::Parser::Tokenizer; # -*- Perl -*-
2## skip Test::Tabs
3use strict;
4use warnings;
5our $VERSION='0.992';
6
## This module implements the tokenization phase of both HTML5 and
## XML5.  Notes like this are usually based on the latest HTML
## specification.  Since XML is different from HTML, and since the XML5
## specification is no longer maintained, there are a few
## differences from HTML's tokenization.  Such differences are marked
## by the prefix "XML5:".
13
14## Warnings that depend on the HTML/XML input stream, such as ones
15## related to surrogate code positions, are not useful.
16no warnings 'utf8';
17
18## ------ Token types ------
19
## Export set-up, done at compile time.  This block makes the package an
## Exporter subclass and fills @EXPORT_OK and the ":token" export tag
## with the same list of token type constant names.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## The list is written once and reused for both export variables.
  my @token_types = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK   = @token_types;
  our %EXPORT_TAGS = (token => [@token_types]);
}
60
## Numeric token type codes (the values stored in a token's ->{type}).
## Each entry is a compile-time constant taking no arguments.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
75
76## XML5: XML5 has "empty tag token".  In this implementation, it is
77## represented as a start tag token with $self->{self_closing} flag
78## set to true.
79
## XML5: XML5 has "short end tag token".  In this implementation, it
## is represented as an end tag token with $token->{tag_name} set to
## an empty string.
83
84package HTML::HTML5::Parser::TagSoupParser;
85
86BEGIN { HTML::HTML5::Parser::Tokenizer->import (':token') }
87
88use HTML::HTML5::Entities qw[%entity2char];
89
90## ------ Tokenizer states ------
91
## Tokenizer state identifiers.  The numbers are only labels: each state
## has its own distinct value, and the values do not follow declaration
## order.  Every entry is a compile-time constant taking no arguments.

## States shared by HTML and XML tokenization.
use constant {
  DATA_STATE                                    => 0,
  RCDATA_STATE                                  => 107,
  RAWTEXT_STATE                                 => 108,
  SCRIPT_DATA_STATE                             => 109,
  PLAINTEXT_STATE                               => 110,
  TAG_OPEN_STATE                                => 2,
  RCDATA_LT_STATE                               => 111,
  RAWTEXT_LT_STATE                              => 112,
  SCRIPT_DATA_LT_STATE                          => 113,
  CLOSE_TAG_OPEN_STATE                          => 3,
  RCDATA_END_TAG_OPEN_STATE                     => 114,
  RAWTEXT_END_TAG_OPEN_STATE                    => 115,
  SCRIPT_DATA_END_TAG_OPEN_STATE                => 116,
  SCRIPT_DATA_ESCAPE_START_STATE                => 1,
  SCRIPT_DATA_ESCAPE_START_DASH_STATE           => 12,
  SCRIPT_DATA_ESCAPED_STATE                     => 117,
  SCRIPT_DATA_ESCAPED_DASH_STATE                => 118,
  SCRIPT_DATA_ESCAPED_DASH_DASH_STATE           => 119,
  SCRIPT_DATA_ESCAPED_LT_STATE                  => 120,
  SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE        => 121,
  SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE         => 122,
  SCRIPT_DATA_DOUBLE_ESCAPED_STATE              => 123,
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE         => 124,
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE    => 125,
  SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE           => 126,
  SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE           => 127,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_BANG_STATE                        => 102,
  #COMMENT_END_SPACE_STATE                      => 103, ## REMOVED
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE            => 104,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE   => 105,
  AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE            => 106,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE  => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE   => 38, # "markup declaration open state" in the spec
  #CDATA_RCDATA_CLOSE_TAG_STATE => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE => 41, # "CDATA section state" in the spec
  PUBLIC_STATE => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE => 43, # "after DOCTYPE name state" in the spec
};

##
## NOTE: "Entity data state", "entity in attribute value state", and
## the "consume a character reference" algorithm, are jointly
## implemented as the following six states:
use constant {
  ENTITY_STATE      => 44,
  ENTITY_HASH_STATE => 45,
  NCR_NUM_STATE     => 46,
  HEXREF_X_STATE    => 47,
  HEXREF_HEX_STATE  => 48,
  ENTITY_NAME_STATE => 49,
};

##
## XML-only states
use constant {
  DATA_MSE1_STATE                                    => 50,
  DATA_MSE2_STATE                                    => 128, # last
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BEFORE_NDATA_STATE                                 => 85,
  NDATA_STATE                                        => 86,
  AFTER_NDATA_STATE                                  => 87,
  BEFORE_NOTATION_NAME_STATE                         => 88,
  NOTATION_NAME_STATE                                => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE           => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE           => 91,
  ENTITY_VALUE_ENTITY_STATE                          => 92,
  AFTER_ELEMENT_NAME_STATE                           => 93,
  BEFORE_ELEMENT_CONTENT_STATE                       => 94,
  CONTENT_KEYWORD_STATE                              => 95,
  AFTER_CM_GROUP_OPEN_STATE                          => 96,
  CM_ELEMENT_NAME_STATE                              => 97,
  AFTER_CM_ELEMENT_NAME_STATE                        => 98,
  AFTER_CM_GROUP_CLOSE_STATE                         => 99,
  AFTER_MD_DEF_STATE                                 => 100,
  BOGUS_MD_STATE                                     => 101,
};
227
228## ------ Tree constructor state constants ------
229
## Whether the parsed string is in a foreign island or not affects
## how tokenization is done, unfortunately.  These are a copy of some
## of the tokenization state constants.  See Whatpm::HTML for the full
## list and the descriptions of the constants.
234
## Single-bit flag value (bit 11, i.e. 2048).
use constant FOREIGN_EL => 0b1_00000000000;
236
237## ------ Character reference mappings ------
238
## Code point replacement table (hash reference, code point => code
## point).  Presumably consulted when a numeric character reference is
## resolved -- the consumer is outside this section.
my $charref_map = do {
  my %replacement = (
    0x00 => 0xFFFD, # REPLACEMENT CHARACTER
    0x0D => 0x000D, # CARRIAGE RETURN

    ## 0x80..0x9F: several entries map to themselves.
    0x80 => 0x20AC, 0x81 => 0x0081, 0x82 => 0x201A, 0x83 => 0x0192,
    0x84 => 0x201E, 0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021,
    0x88 => 0x02C6, 0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039,
    0x8C => 0x0152, 0x8D => 0x008D, 0x8E => 0x017D, 0x8F => 0x008F,
    0x90 => 0x0090, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
    0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
    0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
    0x9C => 0x0153, 0x9D => 0x009D, 0x9E => 0x017E, 0x9F => 0x0178,
  );

  ## Every surrogate code position becomes U+FFFD REPLACEMENT CHARACTER.
  my @surrogates = (0xD800 .. 0xDFFF);
  @replacement{@surrogates} = (0xFFFD) x @surrogates;

  ## Code points that are listed explicitly but map to themselves:
  ## some C0 controls, DELETE, U+FDD0..U+FDEF, and the last two code
  ## points (xFFFE and xFFFF) of each of the 17 planes.
  my @kept_as_is = (
    0x0001 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F,
    0xFDD0 .. 0xFDEF,
    (map { my $plane = $_ << 16; ($plane | 0xFFFE, $plane | 0xFFFF) }
         0x00 .. 0x10),
  );
  @replacement{@kept_as_is} = @kept_as_is;

  \%replacement;
}; # $charref_map
285
286## ------ Special character-like constants ------
287
## Negative pseudo-character codes; no real input character has a
## negative code.
use constant {
  ## The "EOF" pseudo-character in the HTML parsing algorithm.
  EOF_CHAR   => -1,

  ## A pseudo-character code that can never appear in the input stream.
  NEVER_CHAR => -2,
};
293
294## ------ The tokenizer ------
295
## Implementations MUST act as if they used the state machine in the spec.
297
## Reset the tokenizer fields of the parser object and read the first
## input character.
##
## Argument: $self, the parser object (a hash reference).  It must
## already carry a |set_nc| code reference, which is called with $self
## as its only argument.
##
## Effects: sets |state| to DATA_STATE, clears the current token,
## current attribute and last start tag name, drops the self-closing
## flag, empties the character buffer, sets |nc| to -1, invokes
## |set_nc|, and installs an empty |token| array.  The value of the
## final assignment (the new |token| array reference) is returned
## implicitly.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer was emptied just above, so there can be no
  ## buffered character to take from it; the next input character
  ## always comes from the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer
334
335## A token has:
336##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
337##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
338##   ->{name} (DOCTYPE_TOKEN)
339##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
340##   ->{target} (PI_TOKEN)
341##   ->{pubid} (DOCTYPE_TOKEN)
342##   ->{sysid} (DOCTYPE_TOKEN)
343##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
344##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
345##        ->{name}
346##        ->{value}
347##        ->{has_reference} == 1 or 0
348##        ->{index}: Index of the attribute in a tag.
349##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
350##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
351##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
352##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
353
## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
355##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
356##     while the token is pushed back to the stack.
357
358## Emitted token MUST immediately be handled by the tree construction state.
359
360## Before each step, UA MAY check to see if either one of the scripts in
361## "list of scripts that will execute as soon as possible" or the first
362## script in the "list of scripts that will execute asynchronously",
363## has completed loading.  If one has, then it MUST be executed
364## and removed from the list.
365
366## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
367## (This requirement was dropped from HTML5 spec, unfortunately.)
368
## Lookup table of the code points treated as space characters: the
## key exists (with value 1) for a space character and is absent
## otherwise.
my $is_space = {
  map { ($_ => 1) } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    ## 0x000B LINE TABULATION (VT) is intentionally not listed.
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x000D, # CARRIAGE RETURN (CR)
    0x0020, # SPACE (SP)
  )
};
377
## Character-class keys.  The action tables in this file use them as
## second-level indexes (e.g. ->[DATA_STATE]->[KEY_EOF_CHAR]) next to
## plain code points.
use constant {
  KEY_ELSE_CHAR   => 255,
  KEY_ULATIN_CHAR => 254,
  KEY_LLATIN_CHAR => 253,
  KEY_EOF_CHAR    => 252,
  KEY_SPACE_CHAR  => 251,
};
383
## Tokenizer action tables, indexed as ->[state]->[key], where the key
## is a code point or one of the KEY_*_CHAR class keys.  $XMLAction
## holds the XML-specific entries; how the two tables are combined is
## decided by code outside this section.
my ($Action, $XMLAction);

## --- Data state ---
$Action->[DATA_STATE][0x0026] = { # &
  name      => 'data &',
  ## "entity data state" + "consume a character reference"
  state     => ENTITY_STATE,
  state_set => {prev_state => DATA_STATE, entity_add => -1},
};
$Action->[DATA_STATE][0x003C] = { # <
  state => TAG_OPEN_STATE,
  name  => 'data <',
};
$Action->[DATA_STATE][KEY_EOF_CHAR] = {
  reconsume => 1,
  emit      => END_OF_FILE_TOKEN,
  name      => 'data eof',
};
$Action->[DATA_STATE][0x0000] = {
  error => 'NULL',
  emit  => CHARACTER_TOKEN,
  name  => 'data null',
};
$Action->[DATA_STATE][KEY_ELSE_CHAR] = {
  emit_data_read_until => qq{\x00<&},
  emit                 => CHARACTER_TOKEN,
  name                 => 'data else',
};

## XML variants of the data state.
$XMLAction->[DATA_STATE][0x005D] = { # ]
  emit  => CHARACTER_TOKEN,
  state => DATA_MSE1_STATE,
  name  => 'data ]',
};
$XMLAction->[DATA_STATE][KEY_ELSE_CHAR] = {
  emit_data_read_until => qq{\x00<&\]},
  emit                 => CHARACTER_TOKEN,
  name                 => 'data else xml',
};
## --- RCDATA, RAWTEXT, script data and PLAINTEXT states ---
{
  ## All four states reuse the data state's EOF action object.
  my $eof_action = $Action->[DATA_STATE][KEY_EOF_CHAR];

  ## U+0000 raises a 'NULL' error and is emitted as U+FFFD; a single
  ## action object is shared by all four states.
  my $null_action = {
    name      => 'rcdata null',
    error     => 'NULL',
    emit      => CHARACTER_TOKEN,
    emit_data => "\x{FFFD}",
  };

  ## RAWTEXT and script data share one "anything else" action object.
  my $rawtext_else = {
    name                 => 'rawtext else',
    emit_data_read_until => qq{\x00<},
    emit                 => CHARACTER_TOKEN,
  };

  for my $state (RCDATA_STATE, RAWTEXT_STATE, SCRIPT_DATA_STATE,
                 PLAINTEXT_STATE) {
    $Action->[$state][KEY_EOF_CHAR] = $eof_action;
    $Action->[$state][0x0000] = $null_action;
  }
  $Action->[$_][KEY_ELSE_CHAR] = $rawtext_else
      for (RAWTEXT_STATE, SCRIPT_DATA_STATE);
}

$Action->[RCDATA_STATE][0x0026] = { # &
  name      => 'rcdata &',
  ## "entity data state" + "consume a character reference"
  state     => ENTITY_STATE,
  state_set => {prev_state => RCDATA_STATE, entity_add => -1},
};
$Action->[RCDATA_STATE][0x003C] = { # <
  state => RCDATA_LT_STATE,
  name  => 'rcdata <',
};
$Action->[RCDATA_STATE][KEY_ELSE_CHAR] = {
  emit_data_read_until => qq{\x00<&},
  emit                 => CHARACTER_TOKEN,
  name                 => 'rcdata else',
};

$Action->[RAWTEXT_STATE][0x003C] = { # <
  state => RAWTEXT_LT_STATE,
  name  => 'rawtext <',
};

$Action->[SCRIPT_DATA_STATE][0x003C] = { # <
  state => SCRIPT_DATA_LT_STATE,
  name  => 'script data <',
};

$Action->[PLAINTEXT_STATE][KEY_ELSE_CHAR] = {
  emit_data_read_until => qq{\x00},
  emit                 => CHARACTER_TOKEN,
  name                 => 'plaintext else',
};
# "Tag open state" is known as "tag state" in XML5.
{
  ## Builds a fresh action that creates a start tag token and moves to
  ## the tag name state.  Every call returns new hashes, so no two
  ## table entries share an action or a |ct| object.
  my $start_tag_action = sub {
    my ($label, $append) = @_;
    return {
      name  => $label,
      state => TAG_NAME_STATE,
      ct    => {
        type            => START_TAG_TOKEN,
        delta           => 1,
        append_tag_name => $append,
      },
    };
  };

  $Action->[TAG_OPEN_STATE][0x0021] = { # !
    state => MARKUP_DECLARATION_OPEN_STATE,
    name  => 'tag open !',
  };
  $Action->[TAG_OPEN_STATE][0x002F] = { # /
    state => CLOSE_TAG_OPEN_STATE,
    name  => 'tag open /',
  };

  $Action->[TAG_OPEN_STATE][KEY_ULATIN_CHAR]
      = $start_tag_action->('tag open uc', 0x0020); # UC -> lc
  $XMLAction->[TAG_OPEN_STATE][KEY_ULATIN_CHAR]
      = $start_tag_action->('tag open uc xml', 0x0000);
  $Action->[TAG_OPEN_STATE][KEY_LLATIN_CHAR]
      = $start_tag_action->('tag open lc', 0x0000);

  $Action->[TAG_OPEN_STATE][0x003F] = { # ?
    name        => 'tag open ?',
    error       => 'pio',
    error_delta => 1,
    ct          => {type => COMMENT_TOKEN},
    state       => BOGUS_COMMENT_STATE,
    reconsume   => 1, ## $self->{nc} is intentionally left as is
  };
  $XMLAction->[TAG_OPEN_STATE][0x003F] = { # ?
    state => PI_STATE,
    name  => 'tag open ? xml',
  };

  ## Space characters, ">" and "anything else" share one action object.
  my $bare_lt = {
    name        => 'tag open else',
    error       => 'bare stago',
    error_delta => 1,
    emit        => CHARACTER_TOKEN,
    emit_data   => '<',
    emit_delta  => 1,
    state       => DATA_STATE,
    reconsume   => 1,
  };
  $Action->[TAG_OPEN_STATE][$_] = $bare_lt
      for (KEY_SPACE_CHAR, 0x003E, KEY_ELSE_CHAR);

  $XMLAction->[TAG_OPEN_STATE][0x0000] = +{
    %{ $start_tag_action->('tag open null xml', 0xFFFD) },
    error => 'NULL',
  };
  ## XML5: "<:" has a parse error.
  $XMLAction->[TAG_OPEN_STATE][KEY_ELSE_CHAR]
      = $start_tag_action->('tag open else xml', 0x0000);
}
## --- "<" seen in RCDATA, RAWTEXT, script data or escaped script data ---
## The four states follow one pattern: "/" clears the buffer and moves
## to the matching end tag open state; anything else emits the pending
## "<" and reconsumes the character in the text state.
for my $row (
  ## [ state, "/" label, "/" target, "else" label, "else" target ]
  [RCDATA_LT_STATE,
   'rcdata lt /', RCDATA_END_TAG_OPEN_STATE,
   'rcdata lt else', RCDATA_STATE],
  [RAWTEXT_LT_STATE,
   'rawtext lt /', RAWTEXT_END_TAG_OPEN_STATE,
   'rawtext lt else', RAWTEXT_STATE],
  [SCRIPT_DATA_LT_STATE,
   'script data lt /', SCRIPT_DATA_END_TAG_OPEN_STATE,
   'script data lt else', SCRIPT_DATA_STATE],
  [SCRIPT_DATA_ESCAPED_LT_STATE,
   'script data escaped lt /', SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE,
   'script data escaped lt else', SCRIPT_DATA_ESCAPED_STATE],
) {
  my ($lt_state, $slash_label, $end_tag_open_state,
      $else_label, $text_state) = @$row;

  $Action->[$lt_state][0x002F] = { # /
    name   => $slash_label,
    buffer => {clear => 1},
    state  => $end_tag_open_state,
  };
  $Action->[$lt_state][KEY_ELSE_CHAR] = {
    name      => $else_label,
    emit      => CHARACTER_TOKEN,
    emit_data => '<',
    state     => $text_state,
    reconsume => 1,
  };
}

## "<!" inside script data: both characters are emitted as text.
$Action->[SCRIPT_DATA_LT_STATE][0x0021] = { # !
  name      => 'script data lt !',
  emit      => CHARACTER_TOKEN,
  emit_data => '<!',
  state     => SCRIPT_DATA_ESCAPE_START_STATE,
};

## "<" + letter inside escaped script data: restart the buffer with the
## letter and move to the double escape start state.
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE][KEY_ULATIN_CHAR] = {
  name             => 'script data escaped lt uc',
  buffer           => {clear => 1, append => 0x0020}, # UC -> lc
  emit             => CHARACTER_TOKEN,
  emit_data        => '<',
  emit_data_append => 1,
  state            => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE][KEY_LLATIN_CHAR] = {
  name             => 'script data escaped lt lc',
  buffer           => {clear => 1, append => 0x0000},
  emit             => CHARACTER_TOKEN,
  emit_data        => '<',
  emit_data_append => 1,
  state            => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
## XXX "End tag token" in latest HTML5 and in XML5.
{
  ## Builds a fresh action that creates an end tag token and moves to
  ## the tag name state.  Every call returns new hashes, so no two
  ## table entries share an action or a |ct| object.
  my $end_tag_action = sub {
    my ($label, $append) = @_;
    return {
      name  => $label,
      state => TAG_NAME_STATE, ## XML5: "end tag name state".
      ct    => {
        type            => END_TAG_TOKEN,
        delta           => 2,
        append_tag_name => $append,
      },
    };
  };

  $Action->[CLOSE_TAG_OPEN_STATE][KEY_ULATIN_CHAR]
      = $end_tag_action->('end tag open uc', 0x0020); # UC -> lc
  $XMLAction->[CLOSE_TAG_OPEN_STATE][KEY_ULATIN_CHAR]
      = $end_tag_action->('end tag open uc xml', 0x0000);
  $Action->[CLOSE_TAG_OPEN_STATE][KEY_LLATIN_CHAR]
      = $end_tag_action->('end tag open lc', 0x0000);

  $Action->[CLOSE_TAG_OPEN_STATE][0x003E] = { # >
    name        => 'end tag open >',
    state       => DATA_STATE,
    error       => 'empty end tag',
    error_delta => 2, # "<" in "</>"
  };

  ## XML5: No parse error.

  ## NOTE: This parser raises a parse error, since it supports XML1,
  ## not XML5.

  ## NOTE: A short end tag token.
  $XMLAction->[CLOSE_TAG_OPEN_STATE][0x003E] = { # >
    name        => 'end tag open > xml',
    state       => DATA_STATE,
    error       => 'empty end tag',
    error_delta => 2, # "<" in "</>"
    ct          => {type => END_TAG_TOKEN, delta => 2},
    emit        => '',
  };

  $Action->[CLOSE_TAG_OPEN_STATE][KEY_EOF_CHAR] = {
    name       => 'end tag open eof',
    error      => 'bare etago',
    emit       => CHARACTER_TOKEN,
    emit_data  => '</',
    emit_delta => 2,
    state      => DATA_STATE,
    reconsume  => 1,
  };

  ## Space characters and "anything else" share one action object.
  ## NOTE: $self->{nc} is intentionally left as is.  Although the
  ## "anything else" case of the spec does not explicitly state that
  ## the next input character is to be reconsumed, it will be included
  ## in the |data| of the comment token generated from the bogus end
  ## tag, as defined in the "bogus comment state" entry.
  my $bogus_end_tag = {
    name        => 'end tag open else',
    error       => 'bogus end tag',
    error_delta => 2, # "<" of "</"
    ct          => {
      type  => COMMENT_TOKEN,
      delta => 2, # "<" of "</"
    },
    state       => BOGUS_COMMENT_STATE,
    reconsume   => 1,
  };
  $Action->[CLOSE_TAG_OPEN_STATE][$_] = $bogus_end_tag
      for (KEY_SPACE_CHAR, KEY_ELSE_CHAR);

  $XMLAction->[CLOSE_TAG_OPEN_STATE][0x0000] = +{
    %{ $end_tag_action->('end tag open null xml', 0xFFFD) },
    error => 'NULL',
  };
  ## XML5: "</:" is a parse error.
  $XMLAction->[CLOSE_TAG_OPEN_STATE][KEY_ELSE_CHAR]
      = $end_tag_action->('end tag open else xml', 0x0000);
}
## These entries implement "tag name state", "RCDATA end tag name
## state", "RAWTEXT end tag name state", and "script data end tag name
## state" jointly with the implementation of "RCDATA end tag open
## state" and so on.

## Entries that leave the tag name state.
$Action->[TAG_NAME_STATE][KEY_SPACE_CHAR] = {
  state => BEFORE_ATTRIBUTE_NAME_STATE,
  name  => 'tag name sp',
};
$Action->[TAG_NAME_STATE][0x002F] = { # /
  state => SELF_CLOSING_START_TAG_STATE,
  name  => 'tag name /',
};
$Action->[TAG_NAME_STATE][0x003E] = { # >
  emit  => '',
  state => DATA_STATE,
  name  => 'tag name >',
};
$Action->[TAG_NAME_STATE][KEY_EOF_CHAR] = {
  reconsume => 1,
  state     => DATA_STATE,
  error     => 'unclosed tag',
  name      => 'tag name eof',
};

## Entries that extend the tag name and set no new state.
$Action->[TAG_NAME_STATE][KEY_ULATIN_CHAR] = {
  ct   => {append_tag_name => 0x0020}, # UC -> lc
  name => 'tag name uc',
};
$XMLAction->[TAG_NAME_STATE][KEY_ULATIN_CHAR] = {
  ct   => {append_tag_name => 0x0000},
  name => 'tag name uc xml',
};
$Action->[TAG_NAME_STATE][0x0000] = {
  error => 'NULL',
  ct    => {append_tag_name => 0xFFFD},
  name  => 'tag name null',
};
$Action->[TAG_NAME_STATE][KEY_ELSE_CHAR] = {
  ct   => {append_tag_name => 0x0000},
  name => 'tag name else',
};
## --- Script data escape start / escape start dash states ---
## These two states consume the "--" that follows "<!" in script data.

## First "-": emit it and wait for the second one.
$Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[0x002D] = {
  name => 'script data escape start -',
  state => SCRIPT_DATA_ESCAPE_START_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};

## Second "-": "<!--" is now complete.  The HTML specification switches
## to the script data escaped dash dash state here, not to the plain
## escaped state, so that a ">" coming right after ("<!-->") closes the
## escape again and returns to the script data state.
$Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[0x002D] = {
  name => 'script data escape start dash -',
  state => SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};

## Anything else: not an escape after all; reconsume the character in
## the script data state.  Both states share this action object.
$Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escape start else',
  state => SCRIPT_DATA_STATE,
  reconsume => 1,
};
$Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[KEY_ELSE_CHAR] = $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR];
## --- "-" in the escaped and double escaped script data states ---
## Each "-" is emitted as a character token.  The first one moves to
## the "dash" state, the second to the "dash dash" state, and further
## ones set no new state.

$Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x002D] = {
  name => 'script data escaped -',
  state => SCRIPT_DATA_ESCAPED_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x002D] = {
  name => 'script data escaped dash -',
  state => SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x002D] = {
  name => 'script data escaped dash dash -',
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x002D] = {
  name => 'script data double escaped -',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
## The label names the "dash" state this entry belongs to, following
## the pattern of the escaped entries above; every action label in this
## group is now unique.
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x002D] = {
  name => 'script data double escaped dash -',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x002D] = {
  name => 'script data double escaped dash dash -',
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
## --- "<" and ">" in the escaped and double escaped script data states ---

## "<" in an escaped state: move to the escaped less-than sign state
## without emitting anything.
for my $row (
  [SCRIPT_DATA_ESCAPED_STATE, 'script data escaped <'],
  [SCRIPT_DATA_ESCAPED_DASH_STATE, 'script data escaped dash <'],
  [SCRIPT_DATA_ESCAPED_DASH_DASH_STATE, 'script data escaped dash dash <'],
) {
  my ($from_state, $label) = @$row;
  $Action->[$from_state][0x003C] = {
    name  => $label,
    state => SCRIPT_DATA_ESCAPED_LT_STATE,
  };
}

## "<" in a double escaped state: emit it and move to the double
## escaped less-than sign state.
for my $row (
  [SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
   'script data double escaped <'],
  [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE,
   'script data double escaped dash <'],
  [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE,
   'script data double escaped dash dash <'],
) {
  my ($from_state, $label) = @$row;
  $Action->[$from_state][0x003C] = {
    name      => $label,
    emit      => CHARACTER_TOKEN,
    emit_data => '<',
    state     => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
  };
}

## ">" after "--": emit it and return to the script data state.  The
## escaped and double escaped variants share one action object.
{
  my $dash_dash_gt = {
    name      => 'script data escaped dash dash >',
    emit      => CHARACTER_TOKEN,
    emit_data => '>',
    state     => SCRIPT_DATA_STATE,
  };
  $Action->[$_][0x003E] = $dash_dash_gt
      for (SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
           SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE);
}
## --- EOF and U+0000 in the escaped and double escaped script data states ---

## EOF: report the error and reconsume the EOF in the data state.  All
## six states share this action object.
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] = {
  name => 'script data escaped eof',
  error => 'eof in escaped script data', # XXXdocumentation
  state => DATA_STATE,
  reconsume => 1,
};

## U+0000 in an escaped state: report the error, emit U+FFFD and go
## (back) to the script data escaped state.  Shared by the three
## escaped states.
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
  name => 'script data escaped null',
  emit => CHARACTER_TOKEN,
  emit_data => "\x{FFFD}",
  error => 'NULL',
  state => SCRIPT_DATA_ESCAPED_STATE,
};

## U+0000 in a double escaped state: same handling, but the tokenizer
## must stay in (or return to) the script data DOUBLE escaped state, as
## the HTML specification requires.  Dropping to the single escaped
## state would let a following "</script>" close the element too early.
## Shared by the three double escaped states.
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
  name => 'script data double escaped null',
  emit => CHARACTER_TOKEN,
  emit_data => "\x{FFFD}",
  error => 'NULL',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
};
## --- "Anything else" in the escaped and double escaped script data states ---
## The character is emitted as a character token.  The escaped states
## go (back) to the script data escaped state and the double escaped
## states to the script data double escaped state.  Each state gets its
## own action object.
for my $row (
  ## [ state, label, next state ]
  [SCRIPT_DATA_ESCAPED_STATE,
   'script data escaped else',
   SCRIPT_DATA_ESCAPED_STATE],
  [SCRIPT_DATA_ESCAPED_DASH_STATE,
   'script data escaped dash else',
   SCRIPT_DATA_ESCAPED_STATE],
  [SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
   'script data escaped dash dash else',
   SCRIPT_DATA_ESCAPED_STATE],
  [SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
   'script data double escaped else',
   SCRIPT_DATA_DOUBLE_ESCAPED_STATE],
  [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE,
   'script data double escaped dash else',
   SCRIPT_DATA_DOUBLE_ESCAPED_STATE],
  [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE,
   'script data double escaped dash dash else',
   SCRIPT_DATA_DOUBLE_ESCAPED_STATE],
) {
  my ($from_state, $label, $next_state) = @$row;
  $Action->[$from_state][KEY_ELSE_CHAR] = {
    name  => $label,
    state => $next_state,
    emit  => CHARACTER_TOKEN,
  };
}
## --- Script data double escape start / end states ---
{
  my @start_and_end = (SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
                       SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE);

  ## Space characters, ">" and "/": the table only marks these with
  ## |skip|; presumably the tokenizer loop handles them with dedicated
  ## code (not visible in this section).  One action object is shared
  ## by all six slots.
  my $delimiter = {
    name => 'script data double escape start sp>/',
    skip => 1,
  };
  ## Letters are emitted and appended to the buffer; one shared object
  ## per letter case.
  my $upper = {
    name   => 'script data double escape start uc',
    buffer => {append => 0x0020}, # UC -> lc
    emit   => CHARACTER_TOKEN,
  };
  my $lower = {
    name   => 'script data double escape start lc',
    buffer => {append => 0x0000},
    emit   => CHARACTER_TOKEN,
  };

  for my $state (@start_and_end) {
    $Action->[$state][$_] = $delimiter
        for (KEY_SPACE_CHAR, 0x003E, 0x002F);
    $Action->[$state][KEY_ULATIN_CHAR] = $upper;
    $Action->[$state][KEY_LLATIN_CHAR] = $lower;
  }
}

## Anything else: reconsume the character in the state the sequence
## started from.
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE][KEY_ELSE_CHAR] = {
  reconsume => 1,
  state     => SCRIPT_DATA_ESCAPED_STATE,
  name      => 'script data double escape start else',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE][KEY_ELSE_CHAR] = {
  reconsume => 1,
  state     => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
  name      => 'script data double escape end else',
};

## --- Script data double escaped less-than sign state ---
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE][0x002F] = { # /
  name      => 'script data double escaped lt /',
  emit      => CHARACTER_TOKEN,
  emit_data => '/',
  state     => SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE,
  buffer    => {clear => 1},
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE][KEY_ELSE_CHAR] = {
  reconsume => 1,
  state     => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
  name      => 'script data double escaped lt else',
};
949      ## XML5: Part of the "data state".
## Entries of |DATA_MSE1_STATE| and |DATA_MSE2_STATE|, which watch for
## a "]]>" sequence in character data ("mse" presumably stands for
## "marked section end" -- the states are entered from code that is
## not part of this table section).
{
  my $mse1 = ($Action->[DATA_MSE1_STATE] ||= []);
  my $mse2 = ($Action->[DATA_MSE2_STATE] ||= []);

  ## MSE1: a "]" is passed through as a character token and moves on
  ## to MSE2; anything else is reprocessed in the data state.
  $mse1->[0x005D] = { # ]
    name => 'data mse1 ]',
    emit => CHARACTER_TOKEN,
    emit_data => ']',
    state => DATA_MSE2_STATE,
  };
  $mse1->[KEY_ELSE_CHAR] = {
    name => 'data mse1 else',
    reconsume => 1,
    state => DATA_STATE,
  };

  ## MSE2: ">" is reported as an error located two columns back (see
  ## |error_delta|) and is still emitted as a character token.
  $mse2->[0x003E] = { # >
    name => 'data mse2 >',
    emit => CHARACTER_TOKEN,
    emit_data => '>',
    error => 'unmatched mse', # XML5: Not a parse error. # XXXdocumentation
    error_delta => 2,
    state => DATA_STATE,
  };
  ## A further "]" is emitted and the tokenizer stays in MSE2.
  $mse2->[0x005D] = { # ]
    name => 'data mse2 ]',
    emit => CHARACTER_TOKEN,
    emit_data => ']',
  };
  $mse2->[KEY_ELSE_CHAR] = {
    name => 'data mse2 else',
    reconsume => 1,
    state => DATA_STATE,
  };
}
979      ## XML5: "Tag attribute name before state".
## Entries of the "before attribute name state".
##
## |ca => {set_name => N}| starts a new current attribute
## (|$self->{ca}|) whose name is the current character with N added to
## its code point (0x0020 lowercases an uppercase Latin letter) and
## whose value is empty.  |emit => ''| emits the current tag token.
## Entries in |$XMLAction| take precedence when parsing XML.

## White space is ignored.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'before attr name sp',
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003E] = { # >
  name => 'before attr name >',
  emit => '',
  state => DATA_STATE,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'before attr name uc',
  ca => {
    set_name => 0x0020, # UC -> lc
  },
  state => ATTRIBUTE_NAME_STATE,
};
## XML is case-sensitive: the letter is kept as is.
$XMLAction->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'before attr name uc xml',
  ca => {
    set_name => 0x0000,
  },
  state => ATTRIBUTE_NAME_STATE,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x002F] = { # /
  name => 'before attr name /',
  state => SELF_CLOSING_START_TAG_STATE,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'before attr name eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  ## The EOF is reprocessed in the data state, as in the EOF entries of
  ## the other attribute states, rather than reading past the end of
  ## the input first.
  reconsume => 1,
};
## These characters are errors in HTML but still start an attribute
## name.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0022] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0027] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003C] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003D] = {
  name => q[before attr name "'<=],
  error => 'bad attribute name', ## XML5: Not a parse error.
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
## NULL: the current character is 0x0000, so adding 0xFFFD yields
## U+FFFD REPLACEMENT CHARACTER as the first character of the name.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0000] = {
  name => 'before attr name null',
  ca => {set_name => 0xFFFD},
  error => 'NULL',
  state => ATTRIBUTE_NAME_STATE,
};
## XML5: ":" raises a parse error and is ignored.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => 'before attr name else',
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
1032
1033      ## XML5: "Tag attribute name state".
## Entries of the "attribute name state".
##
## |ca => {name => N}| appends the current character, with N added to
## its code point, to the name of the current attribute.
## |ca => {leave => 1}| finishes the attribute: it is stored in the
## current token's |attributes| unless an attribute of that name is
## already there, in which case a 'duplicate attribute' error is
## reported and the attribute is dropped.  Entries in |$XMLAction| take
## precedence when parsing XML; their labels carry an " xml" suffix so
## that they can be told apart from the HTML entries.
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'attr name sp',
  ca => {leave => 1},
  state => AFTER_ATTRIBUTE_NAME_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x003D] = { # =
  name => 'attr name =',
  ca => {leave => 1},
  state => BEFORE_ATTRIBUTE_VALUE_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x003E] = { # >
  name => 'attr name >',
  ca => {leave => 1},
  emit => '',
  state => DATA_STATE,
};
$XMLAction->[ATTRIBUTE_NAME_STATE]->[0x003E] = { # >
  name => 'attr name > xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  ca => {leave => 1},
  emit => '',
  state => DATA_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'attr name uc',
  ca => {name => 0x0020}, # UC -> lc
};
## XML is case-sensitive: the letter is appended as is.
$XMLAction->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'attr name uc xml',
  ca => {name => 0x0000},
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x002F] = { # /
  name => 'attr name /',
  ca => {leave => 1},
  state => SELF_CLOSING_START_TAG_STATE,
};
$XMLAction->[ATTRIBUTE_NAME_STATE]->[0x002F] = { # /
  name => 'attr name / xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  ca => {leave => 1},
  state => SELF_CLOSING_START_TAG_STATE,
};
## EOF: the attribute is finished and the EOF is reprocessed in the
## data state.
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'attr name eof',
  error => 'unclosed tag',
  ca => {leave => 1},
  state => DATA_STATE,
  reconsume => 1,
};
## These characters are errors in HTML but are still appended to the
## name.
$Action->[ATTRIBUTE_NAME_STATE]->[0x0022] =
$Action->[ATTRIBUTE_NAME_STATE]->[0x0027] =
$Action->[ATTRIBUTE_NAME_STATE]->[0x003C] = {
  name => q[attr name "'<],
  error => 'bad attribute name', ## XML5: Not a parse error.
  ca => {name => 0x0000},
};
## NULL: 0x0000 + 0xFFFD appends U+FFFD REPLACEMENT CHARACTER.
$Action->[ATTRIBUTE_NAME_STATE]->[0x0000] = {
  name => 'attr name null',
  ca => {name => 0xFFFD},
  error => 'NULL',
};
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => 'attr name else',
  ca => {name => 0x0000},
};
1099      ## XML5: "Tag attribute name after state".
## Entries of the "after attribute name state".
##
## |ca => {set_name => N}| starts a new current attribute whose name is
## the current character with N added to its code point.  Entries in
## |$XMLAction| take precedence when parsing XML; there, an attribute
## that is not followed by "=" is reported as 'no attr value'.  Labels
## of XML entries carry an " xml" suffix so that they can be told apart
## from the HTML entries.

## White space is ignored.
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'after attr name sp',
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003D] = { # =
  name => 'after attr name =',
  state => BEFORE_ATTRIBUTE_VALUE_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = { # >
  name => 'after attr name >',
  emit => '',
  state => DATA_STATE,
};
$XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = { # >
  name => 'after attr name > xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  emit => '',
  state => DATA_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'after attr name uc',
  ca => {set_name => 0x0020}, # UC -> lc
  state => ATTRIBUTE_NAME_STATE,
};
$XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'after attr name uc xml',
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = { # /
  name => 'after attr name /',
  state => SELF_CLOSING_START_TAG_STATE,
};
$XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = { # /
  name => 'after attr name / xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  state => SELF_CLOSING_START_TAG_STATE,
};
## EOF is reprocessed in the data state.
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'after attr name eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  reconsume => 1,
};
## These characters are errors in HTML but still start a new attribute
## name.
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0022] =
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0027] =
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003C] = {
  name => q[after attr name "'<],
  error => 'bad attribute name', ## XML5: Not a parse error.
  #error2(xml) => 'no attr value', ## XML5: Not a parse error.
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
## NULL: 0x0000 + 0xFFFD makes U+FFFD REPLACEMENT CHARACTER the first
## character of the new attribute name.
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0000] = {
  name => q[after attr name null],
  ca => {set_name => 0xFFFD},
  error => 'NULL',
  #error2(xml) => 'no attr value', ## XML5: Not a parse error.
  state => ATTRIBUTE_NAME_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => q[after attr name else],
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
$XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => q[after attr name else xml],
  error => 'no attr value', ## XML5: Not a parse error.
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
1170      ## XML5: "Tag attribute value before state".
## Entries of the "before attribute value state".
##
## |ca => {value => 1}| appends the current character to the value of
## the current attribute; any other true |value| is appended literally.
## Entries in |$XMLAction| take precedence when parsing XML, where an
## unquoted value is reported as 'unquoted attr value'.  Labels of XML
## entries carry an " xml" suffix so that they can be told apart from
## the HTML entries.

## White space is ignored.
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_SPACE_CHAR] = {
  name => 'before attr value sp',
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0022] = { # "
  name => 'before attr value "',
  state => ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE,
};
## "&" starts an unquoted value and is reprocessed there.
$XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = { # &
  name => 'before attr value & xml',
  error => 'unquoted attr value', ## XML5: Not a parse error.
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
  reconsume => 1,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = { # &
  name => 'before attr value &',
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
  reconsume => 1,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0027] = { # '
  name => "before attr value '",
  state => ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003E] = { # >
  name => 'before attr value >',
  error => 'empty unquoted attribute value',
  emit => '',
  state => DATA_STATE,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_EOF_CHAR] = {
  name => 'before attr value eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  ## The EOF is reprocessed in the data state, as in the EOF entries of
  ## the other attribute states, rather than reading past the end of
  ## the input first.
  reconsume => 1,
};
## These characters are errors in HTML but still become the first
## character of an unquoted value.
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003C] =
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003D] =
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0060] = {
  name => 'before attr value <=`',
  error => 'bad attribute value', ## XML5: Not a parse error.
  #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error.
  ca => {value => 1},
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
};
## NULL: U+FFFD REPLACEMENT CHARACTER is appended instead.
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0000] = {
  name => 'before attr value null',
  ca => {value => "\x{FFFD}"},
  error => 'NULL',
  #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error.
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
};
$XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = {
  name => 'before attr value else xml',
  error => 'unquoted attr value', ## XML5: Not a parse error. # XXXdocumentation
  ca => {value => 1},
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = {
  name => 'before attr value else',
  ca => {value => 1},
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
};
1231
## Entries of the "after attribute value (quoted) state".
{
  my $after_quoted = ($Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE] ||= []);

  ## White space: the next attribute may follow.
  $after_quoted->[KEY_SPACE_CHAR] = {
    state => BEFORE_ATTRIBUTE_NAME_STATE,
    name => 'after attr value quoted sp',
  };
  ## ">": the current tag token is emitted.
  $after_quoted->[0x003E] = {
    state => DATA_STATE,
    emit => '',
    name => 'after attr value quoted >',
  };
  ## "/"
  $after_quoted->[0x002F] = {
    state => SELF_CLOSING_START_TAG_STATE,
    name => 'after attr value quoted /',
  };
  ## EOF is reprocessed in the data state.
  $after_quoted->[KEY_EOF_CHAR] = {
    state => DATA_STATE,
    reconsume => 1,
    error => 'unclosed tag',
    name => 'after attr value quoted eof',
  };
  ## Anything else directly after the closing quote is an error; the
  ## character is reprocessed as the possible start of an attribute.
  $after_quoted->[KEY_ELSE_CHAR] = {
    state => BEFORE_ATTRIBUTE_NAME_STATE,
    reconsume => 1,
    error => 'no space between attributes',
    name => 'after attr value quoted else',
  };
}
## Entries of the "self-closing start tag state", installed in one
## slice assignment: ">", EOF, anything else.
@{$Action->[SELF_CLOSING_START_TAG_STATE]}[
  0x003E, KEY_EOF_CHAR, KEY_ELSE_CHAR,
] = (
  ## ">" is flagged |skip|: the table-driven part of |_get_next_token|
  ## ignores the entry, so the character is handled by other code.
  {
    skip => 1,
    name => 'self closing start tag >',
  },
  ## EOF is reprocessed in the data state.
  {
    reconsume => 1,
    state => DATA_STATE, ## XML5: "Tag attribute name before state".
    error => 'unclosed tag',
    name => 'self closing start tag eof',
  },
  ## Anything else: the "/" was not followed by ">"; the character is
  ## reprocessed as the possible start of an attribute.
  {
    reconsume => 1,
    state => BEFORE_ATTRIBUTE_NAME_STATE,
    error => 'nestc', # XXX This error type is wrong.
    name => 'self closing start tag else',
  },
);
## Entries of |MD_HYPHEN_STATE|.  In both cases a comment token is
## created whose position is taken three columns back from the previous
## character (|delta => 3|).
{
  my $md_hyphen = ($Action->[MD_HYPHEN_STATE] ||= []);

  ## "-": a comment with empty data begins.
  $md_hyphen->[0x002D] = {
    state => COMMENT_START_STATE, ## XML5: "comment state".
    ct => {delta => 3, data => '', type => COMMENT_TOKEN},
    name => 'md hyphen -',
  };
  ## Anything else: an error is reported, the comment data starts with
  ## "-" and the character is reprocessed in the bogus comment state.
  $md_hyphen->[KEY_ELSE_CHAR] = {
    state => BOGUS_COMMENT_STATE,
    reconsume => 1,
    ct => {delta => 3, data => '-', type => COMMENT_TOKEN},
    error => 'bogus comment',
    error_delta => 3,
    name => 'md hyphen else',
  };
}
1286
## Maps the next input character (|$self->{nc}|) to the column used in
## the |$Action| / |$XMLAction| tables.  |_get_next_token| consults it
## only for code points up to 0x007F; larger ones are always treated as
## |KEY_ELSE_CHAR|.
##
## ASCII code points map to themselves, except that white space and
## Latin letters are folded into their class keys.
my $c_to_key = [0x0000 .. 0x007F];
## Slot 255 holds the EOF key.  NOTE(review): EOF is represented as -1
## in the tokenizer, so |$c_to_key->[-1]| presumably reaches this slot
## because it is the last element of the array -- this relies on
## |$is_space| having no key above 255; confirm.
$c_to_key->[255] = KEY_EOF_CHAR; # EOF_CHAR
for my $space_code (keys %$is_space) {
  $c_to_key->[$space_code] = KEY_SPACE_CHAR;
}
@{$c_to_key}[0x0041 .. 0x005A] = (KEY_ULATIN_CHAR) x 26; # A-Z
@{$c_to_key}[0x0061 .. 0x007A] = (KEY_LLATIN_CHAR) x 26; # a-z
1293
1294sub _get_next_token ($) {
1295  my $self = shift;
1296
1297  if ($self->{self_closing}) {
1298    ## NOTE: The |$self->{self_closing}| flag can never be set to
1299    ## tokens except for start tag tokens.  A start tag token is
1300    ## always set to |$self->{ct}| before it is emitted.
1301    $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1302    delete $self->{self_closing};
1303  }
1304
1305  if (@{$self->{token}}) {
1306    $self->{self_closing} = $self->{token}->[0]->{self_closing};
1307    return shift @{$self->{token}};
1308  }
1309
1310  A: {
1311    my $nc = $self->{nc};
1312    my $state = $self->{state};
1313
1314
1315
1316    my $c = $nc > 0x007F ? KEY_ELSE_CHAR : $c_to_key->[$nc];
1317    my $action = $Action->[$state]->[$c] || $Action->[$state]->[KEY_ELSE_CHAR];
1318    if ($self->{is_xml}) {
1319      $action = $XMLAction->[$state]->[$c]
1320          || $Action->[$state]->[$c]
1321          || $XMLAction->[$state]->[KEY_ELSE_CHAR]
1322          || $Action->[$state]->[KEY_ELSE_CHAR];
1323    }
1324
1325    if ($action and not $action->{skip}) {
1326
1327
1328      if (defined $action->{error}) {
1329        if ($action->{error_delta}) {
1330          $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error},
1331                          line => $self->{line_prev},
1332                          column => $self->{column_prev} - $action->{error_delta} + 1);
1333        } else {
1334          $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error});
1335        }
1336      }
1337
1338      if (defined $action->{state}) {
1339        $self->{state} = $action->{state};
1340
1341        if ($action->{state_set}) {
1342          for (keys %{$action->{state_set}}) {
1343            $self->{$_} = $action->{state_set}->{$_};
1344          }
1345        }
1346      }
1347
1348      if (my $act = $action->{ct}) {
1349        if (defined $act->{type}) {
1350          $self->{ct} = {type => $act->{type},
1351                         tag_name => '', data => $act->{data}};
1352          if ($act->{delta}) {
1353            $self->{ct}->{line} = $self->{line_prev};
1354            $self->{ct}->{column} = $self->{column_prev} - $act->{delta} + 1;
1355          } else {
1356            $self->{ct}->{line} = $self->{line};
1357            $self->{ct}->{column} = $self->{column};
1358          }
1359        }
1360
1361        if (defined $act->{append_tag_name}) {
1362          $self->{ct}->{tag_name} .= chr ($nc + $act->{append_tag_name});
1363        }
1364      }
1365
1366      if (my $aca = $action->{ca}) {
1367        if ($aca->{value}) {
1368          $self->{ca}->{value} .= $aca->{value} ne '1' ? $aca->{value} : chr $nc;
1369        } elsif (defined $aca->{name}) {
1370          $self->{ca}->{name} .= chr ($nc + $aca->{name});
1371        } elsif (defined $aca->{set_name}) {
1372          $self->{ca} = {
1373            name => chr ($nc + $aca->{set_name}),
1374            value => '',
1375            line => $self->{line}, column => $self->{column},
1376          };
1377        } elsif ($aca->{leave}) {
1378          if (exists $self->{ct}->{attributes}->{$self->{ca}->{name}}) {
1379
1380            $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1381            ## Discard $self->{ca}.
1382          } else {
1383
1384            $self->{ct}->{attributes}->{$self->{ca}->{name}} = $self->{ca};
1385            $self->{ca}->{index} = ++$self->{ct}->{last_index};
1386          }
1387        }
1388      }
1389
1390      if (defined $action->{buffer}) {
1391        $self->{kwd} = '' if $action->{buffer}->{clear};
1392        $self->{kwd} .= chr ($nc + $action->{buffer}->{append})
1393            if defined $action->{buffer}->{append};
1394
1395
1396      }
1397
1398      if (defined $action->{emit}) {
1399        if ($action->{emit} eq '') {
1400          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1401
1402            $self->{last_stag_name} = $self->{ct}->{tag_name};
1403          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1404            if ($self->{ct}->{attributes}) {
1405
1406              $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1407            } else {
1408
1409            }
1410          } else {
1411            die "$0: $self->{ct}->{type}: Unknown token type";
1412          }
1413
1414          if ($action->{reconsume}) {
1415            #
1416          } else {
1417
1418    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1419      $self->{line_prev} = $self->{line};
1420      $self->{column_prev} = $self->{column};
1421      $self->{column}++;
1422      $self->{nc}
1423          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1424    } else {
1425      $self->{set_nc}->($self);
1426    }
1427
1428          }
1429          return  ($self->{ct});
1430        } else {
1431          my $token = {type => $action->{emit}};
1432          if (defined $action->{emit_data}) {
1433            $token->{data} = $action->{emit_data};
1434            if ($action->{emit_data_append}) {
1435              $token->{data} .= chr $nc;
1436            }
1437          } elsif ($action->{emit} == CHARACTER_TOKEN) {
1438            $token->{data} .= chr $nc;
1439          }
1440          if ($action->{emit_delta}) {
1441            $token->{line} = $self->{line_prev};
1442            $token->{column} = $self->{column_prev} - $action->{emit_delta} + 1;
1443          } else {
1444            $token->{line} = $self->{line};
1445            $token->{column} = $self->{column};
1446          }
1447          if (defined $action->{emit_data_read_until}) {
1448            $self->{read_until}->($token->{data},
1449                                  $action->{emit_data_read_until},
1450                                  length $token->{data});
1451          }
1452
1453          if ($action->{reconsume}) {
1454            #
1455          } else {
1456
1457    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1458      $self->{line_prev} = $self->{line};
1459      $self->{column_prev} = $self->{column};
1460      $self->{column}++;
1461      $self->{nc}
1462          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1463    } else {
1464      $self->{set_nc}->($self);
1465    }
1466
1467          }
1468          return  ($token);
1469        }
1470      } else {
1471        if ($action->{reconsume}) {
1472          #
1473        } else {
1474
1475    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1476      $self->{line_prev} = $self->{line};
1477      $self->{column_prev} = $self->{column};
1478      $self->{column}++;
1479      $self->{nc}
1480          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1481    } else {
1482      $self->{set_nc}->($self);
1483    }
1484
1485        }
1486      }
1487
1488      redo A;
1489    }
1490
1491    if ({
1492      (RCDATA_END_TAG_OPEN_STATE) => 1,
1493      (RAWTEXT_END_TAG_OPEN_STATE) => 1,
1494      (SCRIPT_DATA_END_TAG_OPEN_STATE) => 1,
1495      (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) => 1,
1496    }->{$state}) {
1497      ## This switch-case implements "RCDATA end tag open state",
1498      ## "RAWTEXT end tag open state", "script data end tag open
1499      ## state", "RCDATA end tag name state", "RAWTEXT end tag name
1500      ## state", and "script end tag name state" jointly with the
1501      ## implementation of the "tag name" state.
1502
1503      my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1504
1505      if (defined $self->{last_stag_name}) {
1506        #
1507      } else {
1508        ## No start tag token has ever been emitted
1509        ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
1510
1511        $self->{state} = {
1512          (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1513          (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1514          (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1515          (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1516              => SCRIPT_DATA_ESCAPED_STATE,
1517        }->{$state} or die "${state}'s next state not found";
1518        ## Reconsume.
1519        return  ({type => CHARACTER_TOKEN, data => '</',
1520                  line => $l, column => $c});
1521        redo A;
1522      }
1523
1524      my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
1525      if (length $ch) {
1526        my $CH = $ch;
1527        $ch =~ tr/a-z/A-Z/;
1528        my $nch = chr $nc;
1529        if ($nch eq $ch or $nch eq $CH) {
1530
1531          ## Stay in the state.
1532          $self->{kwd} .= $nch;
1533
1534    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1535      $self->{line_prev} = $self->{line};
1536      $self->{column_prev} = $self->{column};
1537      $self->{column}++;
1538      $self->{nc}
1539          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1540    } else {
1541      $self->{set_nc}->($self);
1542    }
1543
1544          redo A;
1545        } else {
1546
1547          $self->{state} = {
1548            (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1549            (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1550            (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1551            (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1552                => SCRIPT_DATA_ESCAPED_STATE,
1553          }->{$state} or die "${state}'s next state not found";
1554          ## Reconsume.
1555          return  ({type => CHARACTER_TOKEN,
1556                    data => '</' . $self->{kwd},
1557                    line => $self->{line_prev},
1558                    column => $self->{column_prev} - 1 - length $self->{kwd},
1559                   });
1560          redo A;
1561        }
1562      } else { # after "</{tag-name}"
1563        unless ($is_space->{$nc} or
1564	        {
1565                 0x003E => 1, # >
1566                 0x002F => 1, # /
1567                }->{$nc}) {
1568
1569          ## Reconsume.
1570          $self->{state} = {
1571            (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE,
1572            (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE,
1573            (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE,
1574            (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE)
1575                => SCRIPT_DATA_ESCAPED_STATE,
1576          }->{$self->{state}} or die "${state}'s next state not found";
1577          return  ({type => CHARACTER_TOKEN,
1578                    data => '</' . $self->{kwd},
1579                    line => $self->{line_prev},
1580                    column => $self->{column_prev} - 1 - length $self->{kwd},
1581                   });
1582          redo A;
1583        } else {
1584
1585          $self->{ct}
1586              = {type => END_TAG_TOKEN,
1587                 tag_name => $self->{last_stag_name},
1588                 line => $self->{line_prev},
1589                 column => $self->{column_prev} - 1 - length $self->{kwd}};
1590          $self->{state} = TAG_NAME_STATE;
1591          ## Reconsume.
1592          redo A;
1593        }
1594      }
1595    } elsif ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE or
1596             $state == SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) {
1597      if ($is_space->{$nc} or
1598          $nc == 0x002F or # /
1599          $nc == 0x003E) { # >
1600        my $token = {type => CHARACTER_TOKEN,
1601                     data => chr $nc,
1602                     line => $self->{line}, column => $self->{column}};
1603        if ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE) {
1604          $self->{state} = $self->{kwd} eq 'script' # "temporary buffer"
1605              ? SCRIPT_DATA_DOUBLE_ESCAPED_STATE
1606              : SCRIPT_DATA_ESCAPED_STATE;
1607        } else {
1608          $self->{state} = $self->{kwd} eq 'script' # "temporary buffer"
1609              ? SCRIPT_DATA_ESCAPED_STATE
1610              : SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
1611        }
1612
1613    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1614      $self->{line_prev} = $self->{line};
1615      $self->{column_prev} = $self->{column};
1616      $self->{column}++;
1617      $self->{nc}
1618          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1619    } else {
1620      $self->{set_nc}->($self);
1621    }
1622
1623        return  ($token);
1624        redo A;
1625      } else {
1626        die "$state/$nc is implemented";
1627      }
1628    } elsif ($state == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1629      ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1630      ## ATTLIST attribute value double quoted state".
1631
1632      if ($nc == 0x0022) { # "
1633        if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1634
1635          ## XML5: "DOCTYPE ATTLIST name after state".
1636          push @{$self->{ct}->{attrdefs}}, $self->{ca};
1637          $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1638        } else {
1639
1640          ## XML5: "Tag attribute name before state".
1641          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1642        }
1643
1644    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1645      $self->{line_prev} = $self->{line};
1646      $self->{column_prev} = $self->{column};
1647      $self->{column}++;
1648      $self->{nc}
1649          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1650    } else {
1651      $self->{set_nc}->($self);
1652    }
1653
1654        redo A;
1655      } elsif ($nc == 0x0026) { # &
1656
1657        ## XML5: Not defined yet.
1658
1659        ## NOTE: In the spec, the tokenizer is switched to the
1660        ## "entity in attribute value state".  In this implementation, the
1661        ## tokenizer is switched to the |ENTITY_STATE|, which is an
1662        ## implementation of the "consume a character reference" algorithm.
1663        $self->{prev_state} = $state;
1664        $self->{entity_add} = 0x0022; # "
1665        $self->{state} = ENTITY_STATE;
1666
1667    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1668      $self->{line_prev} = $self->{line};
1669      $self->{column_prev} = $self->{column};
1670      $self->{column}++;
1671      $self->{nc}
1672          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1673    } else {
1674      $self->{set_nc}->($self);
1675    }
1676
1677        redo A;
1678      } elsif ($self->{is_xml} and
1679               $is_space->{$nc}) {
1680
1681        $self->{ca}->{value} .= ' ';
1682        ## Stay in the state.
1683
1684    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1685      $self->{line_prev} = $self->{line};
1686      $self->{column_prev} = $self->{column};
1687      $self->{column}++;
1688      $self->{nc}
1689          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1690    } else {
1691      $self->{set_nc}->($self);
1692    }
1693
1694        redo A;
1695      } elsif ($nc == -1) {
1696        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1697        if ($self->{ct}->{type} == START_TAG_TOKEN) {
1698
1699          $self->{last_stag_name} = $self->{ct}->{tag_name};
1700
1701          $self->{state} = DATA_STATE;
1702          ## reconsume
1703          return  ($self->{ct}); # start tag
1704          redo A;
1705        } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1706          if ($self->{ct}->{attributes}) {
1707
1708            $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1709          } else {
1710            ## NOTE: This state should never be reached.
1711
1712          }
1713
1714          $self->{state} = DATA_STATE;
1715          ## reconsume
1716
1717          ## Discard the token.
1718          #return  ($self->{ct}); # end tag
1719
1720          redo A;
1721        } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1722          ## XML5: No parse error above; not defined yet.
1723          push @{$self->{ct}->{attrdefs}}, $self->{ca};
1724          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1725          ## Reconsume.
1726
1727          ## Discard the token.
1728          #return  ($self->{ct}); # ATTLIST
1729
1730          redo A;
1731        } else {
1732          die "$0: $self->{ct}->{type}: Unknown token type";
1733        }
1734      } elsif ($nc == 0x0000) {
1735        $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
1736        $self->{ca}->{value} .= "\x{FFFD}";
1737        ## Stay in the state
1738
1739    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1740      $self->{line_prev} = $self->{line};
1741      $self->{column_prev} = $self->{column};
1742      $self->{column}++;
1743      $self->{nc}
1744          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1745    } else {
1746      $self->{set_nc}->($self);
1747    }
1748
1749        redo A;
1750      } else {
1751        ## XML5 [ATTLIST]: Not defined yet.
1752        if ($self->{is_xml} and $nc == 0x003C) { # <
1753
1754          ## XML5: Not a parse error.
1755          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1756        } else {
1757
1758        }
1759        $self->{ca}->{value} .= chr ($nc);
1760        $self->{read_until}->($self->{ca}->{value},
1761                              qq[\x00"&<\x09\x0C\x20],
1762                              length $self->{ca}->{value});
1763
1764        ## Stay in the state
1765
1766    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1767      $self->{line_prev} = $self->{line};
1768      $self->{column_prev} = $self->{column};
1769      $self->{column}++;
1770      $self->{nc}
1771          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1772    } else {
1773      $self->{set_nc}->($self);
1774    }
1775
1776        redo A;
1777      }
1778    } elsif ($state == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1779      ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1780      ## ATTLIST attribute value single quoted state".
1781
1782      if ($nc == 0x0027) { # '
1783        if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1784
1785          ## XML5: "DOCTYPE ATTLIST name after state".
1786          push @{$self->{ct}->{attrdefs}}, $self->{ca};
1787          $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1788        } else {
1789
1790          ## XML5: "Before attribute name state" (sic).
1791          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1792        }
1793
1794    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795      $self->{line_prev} = $self->{line};
1796      $self->{column_prev} = $self->{column};
1797      $self->{column}++;
1798      $self->{nc}
1799          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1800    } else {
1801      $self->{set_nc}->($self);
1802    }
1803
1804        redo A;
1805      } elsif ($nc == 0x0026) { # &
1806
1807        ## XML5: Not defined yet.
1808
1809        ## NOTE: In the spec, the tokenizer is switched to the
1810        ## "entity in attribute value state".  In this implementation, the
1811        ## tokenizer is switched to the |ENTITY_STATE|, which is an
1812        ## implementation of the "consume a character reference" algorithm.
1813        $self->{entity_add} = 0x0027; # '
1814        $self->{prev_state} = $state;
1815        $self->{state} = ENTITY_STATE;
1816
1817    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1818      $self->{line_prev} = $self->{line};
1819      $self->{column_prev} = $self->{column};
1820      $self->{column}++;
1821      $self->{nc}
1822          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1823    } else {
1824      $self->{set_nc}->($self);
1825    }
1826
1827        redo A;
1828      } elsif ($self->{is_xml} and
1829               $is_space->{$nc}) {
1830
1831        $self->{ca}->{value} .= ' ';
1832        ## Stay in the state.
1833
1834    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1835      $self->{line_prev} = $self->{line};
1836      $self->{column_prev} = $self->{column};
1837      $self->{column}++;
1838      $self->{nc}
1839          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1840    } else {
1841      $self->{set_nc}->($self);
1842    }
1843
1844        redo A;
1845      } elsif ($nc == -1) {
1846        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1847        if ($self->{ct}->{type} == START_TAG_TOKEN) {
1848
1849          $self->{last_stag_name} = $self->{ct}->{tag_name};
1850
1851          $self->{state} = DATA_STATE;
1852          ## reconsume
1853
1854          ## Discard the token.
1855          #return  ($self->{ct}); # start tag
1856
1857          redo A;
1858        } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1859          if ($self->{ct}->{attributes}) {
1860
1861            $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1862          } else {
1863            ## NOTE: This state should never be reached.
1864
1865          }
1866
1867          $self->{state} = DATA_STATE;
1868          ## reconsume
1869
1870          ## Discard the token.
1871          #return  ($self->{ct}); # end tag
1872
1873          redo A;
1874        } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1875          ## XML5: No parse error above; not defined yet.
1876          push @{$self->{ct}->{attrdefs}}, $self->{ca};
1877          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1878          ## Reconsume.
1879
1880          ## Discard the token.
1881          #return  ($self->{ct}); # ATTLIST
1882
1883          redo A;
1884        } else {
1885          die "$0: $self->{ct}->{type}: Unknown token type";
1886        }
1887      } elsif ($nc == 0x0000) {
1888        $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
1889        $self->{ca}->{value} .= "\x{FFFD}";
1890        ## Stay in the state
1891
1892    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1893      $self->{line_prev} = $self->{line};
1894      $self->{column_prev} = $self->{column};
1895      $self->{column}++;
1896      $self->{nc}
1897          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1898    } else {
1899      $self->{set_nc}->($self);
1900    }
1901
1902        redo A;
1903      } else {
1904        ## XML5 [ATTLIST]: Not defined yet.
1905        if ($self->{is_xml} and $nc == 0x003C) { # <
1906
1907          ## XML5: Not a parse error.
1908          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1909        } else {
1910
1911        }
1912        $self->{ca}->{value} .= chr ($nc);
1913        $self->{read_until}->($self->{ca}->{value},
1914                              qq[\x00'&<\x09\x0C\x20],
1915                              length $self->{ca}->{value});
1916
1917        ## Stay in the state
1918
1919    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1920      $self->{line_prev} = $self->{line};
1921      $self->{column_prev} = $self->{column};
1922      $self->{column}++;
1923      $self->{nc}
1924          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1925    } else {
1926      $self->{set_nc}->($self);
1927    }
1928
1929        redo A;
1930      }
1931    } elsif ($state == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1932      ## XML5: "Tag attribute value unquoted state".
1933
1934      if ($is_space->{$nc}) {
1935        if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1936
1937          push @{$self->{ct}->{attrdefs}}, $self->{ca};
1938          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1939        } else {
1940
1941          ## XML5: "Tag attribute name before state".
1942          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1943        }
1944
1945    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1946      $self->{line_prev} = $self->{line};
1947      $self->{column_prev} = $self->{column};
1948      $self->{column}++;
1949      $self->{nc}
1950          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1951    } else {
1952      $self->{set_nc}->($self);
1953    }
1954
1955        redo A;
1956      } elsif ($nc == 0x0026) { # &
1957
1958
1959        ## XML5: Not defined yet.
1960
1961        ## NOTE: In the spec, the tokenizer is switched to the
1962        ## "character reference in attribute value state".  In this
1963        ## implementation, the tokenizer is switched to the
1964        ## |ENTITY_STATE|, which is an implementation of the "consume
1965        ## a character reference" algorithm.
1966        $self->{entity_add} = 0x003E; # >
1967        $self->{prev_state} = $state;
1968        $self->{state} = ENTITY_STATE;
1969
1970    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1971      $self->{line_prev} = $self->{line};
1972      $self->{column_prev} = $self->{column};
1973      $self->{column}++;
1974      $self->{nc}
1975          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1976    } else {
1977      $self->{set_nc}->($self);
1978    }
1979
1980        redo A;
1981      } elsif ($nc == 0x003E) { # >
1982        if ($self->{ct}->{type} == START_TAG_TOKEN) {
1983
1984          $self->{last_stag_name} = $self->{ct}->{tag_name};
1985
1986          $self->{state} = DATA_STATE;
1987
1988    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1989      $self->{line_prev} = $self->{line};
1990      $self->{column_prev} = $self->{column};
1991      $self->{column}++;
1992      $self->{nc}
1993          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1994    } else {
1995      $self->{set_nc}->($self);
1996    }
1997
1998          return  ($self->{ct}); # start tag
1999          redo A;
2000        } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2001          if ($self->{ct}->{attributes}) {
2002
2003            $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2004          } else {
2005            ## NOTE: This state should never be reached.
2006
2007          }
2008
2009          $self->{state} = DATA_STATE;
2010
2011    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012      $self->{line_prev} = $self->{line};
2013      $self->{column_prev} = $self->{column};
2014      $self->{column}++;
2015      $self->{nc}
2016          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2017    } else {
2018      $self->{set_nc}->($self);
2019    }
2020
2021          return  ($self->{ct}); # end tag
2022          redo A;
2023        } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2024          push @{$self->{ct}->{attrdefs}}, $self->{ca};
2025          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2026
2027    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2028      $self->{line_prev} = $self->{line};
2029      $self->{column_prev} = $self->{column};
2030      $self->{column}++;
2031      $self->{nc}
2032          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2033    } else {
2034      $self->{set_nc}->($self);
2035    }
2036
2037          return  ($self->{ct}); # ATTLIST
2038          redo A;
2039        } else {
2040          die "$0: $self->{ct}->{type}: Unknown token type";
2041        }
2042      } elsif ($nc == -1) {
2043        if ($self->{ct}->{type} == START_TAG_TOKEN) {
2044
2045          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2046          $self->{last_stag_name} = $self->{ct}->{tag_name};
2047
2048          $self->{state} = DATA_STATE;
2049          ## reconsume
2050
2051          ## Discard the token.
2052          #return  ($self->{ct}); # start tag
2053
2054          redo A;
2055        } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2056          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2057          if ($self->{ct}->{attributes}) {
2058
2059            $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2060          } else {
2061            ## NOTE: This state should never be reached.
2062
2063          }
2064
2065          $self->{state} = DATA_STATE;
2066          ## reconsume
2067
2068          ## Discard the token.
2069          #return  ($self->{ct}); # end tag
2070
2071          redo A;
2072        } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2073          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2074          push @{$self->{ct}->{attrdefs}}, $self->{ca};
2075          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2076          ## Reconsume.
2077
2078          ## Discard the token.
2079          #return  ($self->{ct}); # ATTLIST
2080
2081          redo A;
2082        } else {
2083          die "$0: $self->{ct}->{type}: Unknown token type";
2084        }
2085      } elsif ($nc == 0x0000) {
2086        $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2087        $self->{ca}->{value} .= "\x{FFFD}";
2088        ## Stay in the state
2089
2090    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2091      $self->{line_prev} = $self->{line};
2092      $self->{column_prev} = $self->{column};
2093      $self->{column}++;
2094      $self->{nc}
2095          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2096    } else {
2097      $self->{set_nc}->($self);
2098    }
2099
2100        redo A;
2101      } else {
2102        if ({
2103             0x0022 => 1, # "
2104             0x0027 => 1, # '
2105             0x003D => 1, # =
2106             0x003C => 1, # <
2107             0x0060 => 1, # `
2108            }->{$nc}) {
2109
2110          ## XML5: Not a parse error.
2111          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2112        } else {
2113
2114        }
2115        $self->{ca}->{value} .= chr ($nc);
2116        $self->{read_until}->($self->{ca}->{value},
2117                              qq[\x00"'=&` \x09\x0C<>],
2118                              length $self->{ca}->{value});
2119
2120        ## Stay in the state
2121
2122    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2123      $self->{line_prev} = $self->{line};
2124      $self->{column_prev} = $self->{column};
2125      $self->{column}++;
2126      $self->{nc}
2127          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2128    } else {
2129      $self->{set_nc}->($self);
2130    }
2131
2132        redo A;
2133      }
2134    } elsif ($state == SELF_CLOSING_START_TAG_STATE) {
2135      ## XML5: "Empty tag state".
2136
2137      if ($nc == 0x003E) { # >
2138        if ($self->{ct}->{type} == END_TAG_TOKEN) {
2139
2140          $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2141          ## XXX: Different type than slash in start tag
2142          if ($self->{ct}->{attributes}) {
2143
2144            $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2145          } else {
2146
2147          }
2148          ## XXX: Test |<title></title/>|
2149        } else {
2150
2151          $self->{self_closing} = 1;
2152        }
2153
2154        $self->{state} = DATA_STATE;
2155
2156    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2157      $self->{line_prev} = $self->{line};
2158      $self->{column_prev} = $self->{column};
2159      $self->{column}++;
2160      $self->{nc}
2161          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2162    } else {
2163      $self->{set_nc}->($self);
2164    }
2165
2166
2167        return  ($self->{ct}); # start tag or end tag
2168
2169        redo A;
2170      } else {
2171        die "$state/$nc is implemented";
2172      }
2173    } elsif ($state == BOGUS_COMMENT_STATE) {
2174      ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2175
2176      ## NOTE: Unlike spec's "bogus comment state", this implementation
2177      ## consumes characters one-by-one basis.
2178
2179      if ($nc == 0x003E) { # >
2180        if ($self->{in_subset}) {
2181
2182          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2183        } else {
2184
2185          $self->{state} = DATA_STATE;
2186        }
2187
2188    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2189      $self->{line_prev} = $self->{line};
2190      $self->{column_prev} = $self->{column};
2191      $self->{column}++;
2192      $self->{nc}
2193          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2194    } else {
2195      $self->{set_nc}->($self);
2196    }
2197
2198
2199        return  ($self->{ct}); # comment
2200        redo A;
2201      } elsif ($nc == -1) {
2202        if ($self->{in_subset}) {
2203
2204          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2205        } else {
2206
2207          $self->{state} = DATA_STATE;
2208        }
2209        ## reconsume
2210
2211        return  ($self->{ct}); # comment
2212        redo A;
2213      } elsif ($nc == 0x0000) {
2214        $self->{ct}->{data} .= "\x{FFFD}"; # comment
2215        ## Stay in the state.
2216
2217    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2218      $self->{line_prev} = $self->{line};
2219      $self->{column_prev} = $self->{column};
2220      $self->{column}++;
2221      $self->{nc}
2222          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2223    } else {
2224      $self->{set_nc}->($self);
2225    }
2226
2227        redo A;
2228      } else {
2229
2230        $self->{ct}->{data} .= chr ($nc); # comment
2231        $self->{read_until}->($self->{ct}->{data},
2232                              qq[\x00>],
2233                              length $self->{ct}->{data});
2234
2235        ## Stay in the state.
2236
2237    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2238      $self->{line_prev} = $self->{line};
2239      $self->{column_prev} = $self->{column};
2240      $self->{column}++;
2241      $self->{nc}
2242          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2243    } else {
2244      $self->{set_nc}->($self);
2245    }
2246
2247        redo A;
2248      }
2249    } elsif ($state == MARKUP_DECLARATION_OPEN_STATE) {
2250      ## XML5: "Markup declaration state".
2251
2252      if ($nc == 0x002D) { # -
2253
2254        $self->{state} = MD_HYPHEN_STATE;
2255
2256    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2257      $self->{line_prev} = $self->{line};
2258      $self->{column_prev} = $self->{column};
2259      $self->{column}++;
2260      $self->{nc}
2261          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2262    } else {
2263      $self->{set_nc}->($self);
2264    }
2265
2266        redo A;
2267      } elsif ($nc == 0x0044 or # D
2268               $nc == 0x0064) { # d
2269        ## ASCII case-insensitive.
2270
2271        $self->{state} = MD_DOCTYPE_STATE;
2272        $self->{kwd} = chr $nc;
2273
2274    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2275      $self->{line_prev} = $self->{line};
2276      $self->{column_prev} = $self->{column};
2277      $self->{column}++;
2278      $self->{nc}
2279          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2280    } else {
2281      $self->{set_nc}->($self);
2282    }
2283
2284        redo A;
2285#               $nc == 0x005B) { # [
2286
2287        $self->{state} = MD_CDATA_STATE;
2288        $self->{kwd} = '[';
2289
2290    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2291      $self->{line_prev} = $self->{line};
2292      $self->{column_prev} = $self->{column};
2293      $self->{column}++;
2294      $self->{nc}
2295          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2296    } else {
2297      $self->{set_nc}->($self);
2298    }
2299
2300        redo A;
2301      } else {
2302
2303      }
2304
2305      $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2306                      line => $self->{line_prev},
2307                      column => $self->{column_prev} - 1);
2308      ## Reconsume.
2309      $self->{state} = BOGUS_COMMENT_STATE;
2310      $self->{ct} = {type => COMMENT_TOKEN, data => '',
2311                                line => $self->{line_prev},
2312                                column => $self->{column_prev} - 1,
2313                               };
2314      redo A;
2315    } elsif ($state == MD_DOCTYPE_STATE) {
2316      ## ASCII case-insensitive.
2317      if ($nc == [
2318            undef,
2319            0x004F, # O
2320            0x0043, # C
2321            0x0054, # T
2322            0x0059, # Y
2323            0x0050, # P
2324            NEVER_CHAR, # (E)
2325          ]->[length $self->{kwd}] or
2326          $nc == [
2327            undef,
2328            0x006F, # o
2329            0x0063, # c
2330            0x0074, # t
2331            0x0079, # y
2332            0x0070, # p
2333            NEVER_CHAR, # (e)
2334          ]->[length $self->{kwd}]) {
2335
2336        ## Stay in the state.
2337        $self->{kwd} .= chr $nc;
2338
2339    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2340      $self->{line_prev} = $self->{line};
2341      $self->{column_prev} = $self->{column};
2342      $self->{column}++;
2343      $self->{nc}
2344          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2345    } else {
2346      $self->{set_nc}->($self);
2347    }
2348
2349        redo A;
2350      } elsif ((length $self->{kwd}) == 6 and
2351               ($nc == 0x0045 or # E
2352                $nc == 0x0065)) { # e
2353        if ($self->{is_xml} and
2354            ($self->{kwd} ne 'DOCTYP' or $nc == 0x0065)) {
2355
2356          ## XML5: case-sensitive.
2357          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2358                          text => 'DOCTYPE',
2359                          line => $self->{line_prev},
2360                          column => $self->{column_prev} - 5);
2361        } else {
2362
2363        }
2364        $self->{state} = DOCTYPE_STATE;
2365        $self->{ct} = {type => DOCTYPE_TOKEN,
2366                                  quirks => 1,
2367                                  line => $self->{line_prev},
2368                                  column => $self->{column_prev} - 7,
2369                                 };
2370
2371    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2372      $self->{line_prev} = $self->{line};
2373      $self->{column_prev} = $self->{column};
2374      $self->{column}++;
2375      $self->{nc}
2376          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2377    } else {
2378      $self->{set_nc}->($self);
2379    }
2380
2381        redo A;
2382      } else {
2383
2384        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2385                        line => $self->{line_prev},
2386                        column => $self->{column_prev} - 1 - length $self->{kwd});
2387        $self->{state} = BOGUS_COMMENT_STATE;
2388        ## Reconsume.
2389        $self->{ct} = {type => COMMENT_TOKEN,
2390                                  data => $self->{kwd},
2391                                  line => $self->{line_prev},
2392                                  column => $self->{column_prev} - 1 - length $self->{kwd},
2393                                 };
2394        redo A;
2395      }
2396    } elsif ($state == MD_CDATA_STATE) {
2397      if ($nc == {
2398            '[' => 0x0043, # C
2399            '[C' => 0x0044, # D
2400            '[CD' => 0x0041, # A
2401            '[CDA' => 0x0054, # T
2402            '[CDAT' => 0x0041, # A
2403            '[CDATA' => NEVER_CHAR, # ([)
2404          }->{$self->{kwd}}) {
2405
2406        ## Stay in the state.
2407        $self->{kwd} .= chr $nc;
2408
2409    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2410      $self->{line_prev} = $self->{line};
2411      $self->{column_prev} = $self->{column};
2412      $self->{column}++;
2413      $self->{nc}
2414          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2415    } else {
2416      $self->{set_nc}->($self);
2417    }
2418
2419        redo A;
2420      } elsif ($self->{kwd} eq '[CDATA' and
2421               $nc == 0x005B) { # [
2422        if ($self->{is_xml} and
2423            not $self->{tainted} and
2424            @{$self->{open_elements} or []} == 0) {
2425
2426          $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2427                          line => $self->{line_prev},
2428                          column => $self->{column_prev} - 7);
2429          $self->{tainted} = 1;
2430        } else {
2431
2432        }
2433
2434        $self->{ct} = {type => CHARACTER_TOKEN,
2435                                  data => '',
2436                                  line => $self->{line_prev},
2437                                  column => $self->{column_prev} - 7};
2438        $self->{state} = CDATA_SECTION_STATE;
2439
2440    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2441      $self->{line_prev} = $self->{line};
2442      $self->{column_prev} = $self->{column};
2443      $self->{column}++;
2444      $self->{nc}
2445          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2446    } else {
2447      $self->{set_nc}->($self);
2448    }
2449
2450        redo A;
2451      } else {
2452
2453        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2454                        line => $self->{line_prev},
2455                        column => $self->{column_prev} - 1 - length $self->{kwd});
2456        $self->{state} = BOGUS_COMMENT_STATE;
2457        ## Reconsume.
2458        $self->{ct} = {type => COMMENT_TOKEN,
2459                                  data => $self->{kwd},
2460                                  line => $self->{line_prev},
2461                                  column => $self->{column_prev} - 1 - length $self->{kwd},
2462                                 };
2463        redo A;
2464      }
2465    } elsif ($state == COMMENT_START_STATE) {
2466      if ($nc == 0x002D) { # -
2467
2468        $self->{state} = COMMENT_START_DASH_STATE;
2469
2470    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2471      $self->{line_prev} = $self->{line};
2472      $self->{column_prev} = $self->{column};
2473      $self->{column}++;
2474      $self->{nc}
2475          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2476    } else {
2477      $self->{set_nc}->($self);
2478    }
2479
2480        redo A;
2481      } elsif ($nc == 0x003E) { # >
2482        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2483        if ($self->{in_subset}) {
2484
2485          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2486        } else {
2487
2488          $self->{state} = DATA_STATE;
2489        }
2490
2491    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2492      $self->{line_prev} = $self->{line};
2493      $self->{column_prev} = $self->{column};
2494      $self->{column}++;
2495      $self->{nc}
2496          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2497    } else {
2498      $self->{set_nc}->($self);
2499    }
2500
2501
2502        return  ($self->{ct}); # comment
2503
2504        redo A;
2505      } elsif ($nc == -1) {
2506        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2507        if ($self->{in_subset}) {
2508
2509          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2510        } else {
2511
2512          $self->{state} = DATA_STATE;
2513        }
2514        ## reconsume
2515
2516        return  ($self->{ct}); # comment
2517
2518        redo A;
2519      } elsif ($nc == 0x0000) {
2520        $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2521        $self->{ct}->{data} .= "\x{FFFD}"; # comment
2522        $self->{state} = COMMENT_STATE;
2523
2524    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525      $self->{line_prev} = $self->{line};
2526      $self->{column_prev} = $self->{column};
2527      $self->{column}++;
2528      $self->{nc}
2529          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2530    } else {
2531      $self->{set_nc}->($self);
2532    }
2533
2534        redo A;
2535      } else {
2536
2537        $self->{ct}->{data} # comment
2538            .= chr ($nc);
2539        $self->{state} = COMMENT_STATE;
2540
2541    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2542      $self->{line_prev} = $self->{line};
2543      $self->{column_prev} = $self->{column};
2544      $self->{column}++;
2545      $self->{nc}
2546          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2547    } else {
2548      $self->{set_nc}->($self);
2549    }
2550
2551        redo A;
2552      }
2553    } elsif ($state == COMMENT_START_DASH_STATE) {
2554      if ($nc == 0x002D) { # -
2555
2556        $self->{state} = COMMENT_END_STATE;
2557
2558    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559      $self->{line_prev} = $self->{line};
2560      $self->{column_prev} = $self->{column};
2561      $self->{column}++;
2562      $self->{nc}
2563          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2564    } else {
2565      $self->{set_nc}->($self);
2566    }
2567
2568        redo A;
2569      } elsif ($nc == 0x003E) { # >
2570        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2571        if ($self->{in_subset}) {
2572
2573          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2574        } else {
2575
2576          $self->{state} = DATA_STATE;
2577        }
2578
2579    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2580      $self->{line_prev} = $self->{line};
2581      $self->{column_prev} = $self->{column};
2582      $self->{column}++;
2583      $self->{nc}
2584          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2585    } else {
2586      $self->{set_nc}->($self);
2587    }
2588
2589
2590        return  ($self->{ct}); # comment
2591
2592        redo A;
2593      } elsif ($nc == -1) {
2594        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2595        if ($self->{in_subset}) {
2596
2597          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2598        } else {
2599
2600          $self->{state} = DATA_STATE;
2601        }
2602        ## reconsume
2603
2604        return  ($self->{ct}); # comment
2605
2606        redo A;
2607      } elsif ($nc == 0x0000) {
2608        $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2609        $self->{ct}->{data} .= "-\x{FFFD}"; # comment
2610        $self->{state} = COMMENT_STATE;
2611
2612    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2613      $self->{line_prev} = $self->{line};
2614      $self->{column_prev} = $self->{column};
2615      $self->{column}++;
2616      $self->{nc}
2617          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2618    } else {
2619      $self->{set_nc}->($self);
2620    }
2621
2622        redo A;
2623      } else {
2624
2625        $self->{ct}->{data} # comment
2626            .= '-' . chr ($nc);
2627        $self->{state} = COMMENT_STATE;
2628
2629    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2630      $self->{line_prev} = $self->{line};
2631      $self->{column_prev} = $self->{column};
2632      $self->{column}++;
2633      $self->{nc}
2634          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2635    } else {
2636      $self->{set_nc}->($self);
2637    }
2638
2639        redo A;
2640      }
2641    } elsif ($state == COMMENT_STATE) {
2642      ## XML5: "Comment state" and "DOCTYPE comment state".
2643
2644      if ($nc == 0x002D) { # -
2645
2646        $self->{state} = COMMENT_END_DASH_STATE;
2647
2648    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2649      $self->{line_prev} = $self->{line};
2650      $self->{column_prev} = $self->{column};
2651      $self->{column}++;
2652      $self->{nc}
2653          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2654    } else {
2655      $self->{set_nc}->($self);
2656    }
2657
2658        redo A;
2659      } elsif ($nc == -1) {
2660        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2661        if ($self->{in_subset}) {
2662
2663          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2664        } else {
2665
2666          $self->{state} = DATA_STATE;
2667        }
2668        ## reconsume
2669
2670        return  ($self->{ct}); # comment
2671
2672        redo A;
2673      } elsif ($nc == 0x0000) {
2674        $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2675        $self->{ct}->{data} .= "\x{FFFD}"; # comment
2676
2677    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2678      $self->{line_prev} = $self->{line};
2679      $self->{column_prev} = $self->{column};
2680      $self->{column}++;
2681      $self->{nc}
2682          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2683    } else {
2684      $self->{set_nc}->($self);
2685    }
2686
2687        redo A;
2688      } else {
2689
2690        $self->{ct}->{data} .= chr ($nc); # comment
2691        $self->{read_until}->($self->{ct}->{data},
2692                              qq[-\x00],
2693                              length $self->{ct}->{data});
2694
2695        ## Stay in the state
2696
2697    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2698      $self->{line_prev} = $self->{line};
2699      $self->{column_prev} = $self->{column};
2700      $self->{column}++;
2701      $self->{nc}
2702          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2703    } else {
2704      $self->{set_nc}->($self);
2705    }
2706
2707        redo A;
2708      }
2709    } elsif ($state == COMMENT_END_DASH_STATE) {
2710      ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2711
2712      if ($nc == 0x002D) { # -
2713
2714        $self->{state} = COMMENT_END_STATE;
2715
2716    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2717      $self->{line_prev} = $self->{line};
2718      $self->{column_prev} = $self->{column};
2719      $self->{column}++;
2720      $self->{nc}
2721          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2722    } else {
2723      $self->{set_nc}->($self);
2724    }
2725
2726        redo A;
2727      } elsif ($nc == -1) {
2728        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2729        if ($self->{in_subset}) {
2730
2731          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732        } else {
2733
2734          $self->{state} = DATA_STATE;
2735        }
2736        ## reconsume
2737
2738        return  ($self->{ct}); # comment
2739
2740        redo A;
2741      } elsif ($nc == 0x0000) {
2742        $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2743        $self->{ct}->{data} .= "-\x{FFFD}"; # comment
2744        $self->{state} = COMMENT_STATE;
2745
2746    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2747      $self->{line_prev} = $self->{line};
2748      $self->{column_prev} = $self->{column};
2749      $self->{column}++;
2750      $self->{nc}
2751          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2752    } else {
2753      $self->{set_nc}->($self);
2754    }
2755
2756        redo A;
2757      } else {
2758
2759        $self->{ct}->{data} .= '-' . chr ($nc); # comment
2760        $self->{state} = COMMENT_STATE;
2761
2762    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2763      $self->{line_prev} = $self->{line};
2764      $self->{column_prev} = $self->{column};
2765      $self->{column}++;
2766      $self->{nc}
2767          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2768    } else {
2769      $self->{set_nc}->($self);
2770    }
2771
2772        redo A;
2773      }
2774    } elsif ($state == COMMENT_END_STATE or
2775             $state == COMMENT_END_BANG_STATE) {
2776      ## XML5: "Comment end state" and "DOCTYPE comment end state".
2777      ## (No comment end bang state.)
2778
2779      if ($nc == 0x003E) { # >
2780        if ($self->{in_subset}) {
2781
2782          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2783        } else {
2784
2785          $self->{state} = DATA_STATE;
2786        }
2787
2788    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2789      $self->{line_prev} = $self->{line};
2790      $self->{column_prev} = $self->{column};
2791      $self->{column}++;
2792      $self->{nc}
2793          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2794    } else {
2795      $self->{set_nc}->($self);
2796    }
2797
2798
2799        return  ($self->{ct}); # comment
2800
2801        redo A;
2802      } elsif ($nc == 0x002D) { # -
2803        if ($state == COMMENT_END_BANG_STATE) {
2804
2805          $self->{ct}->{data} .= '--!'; # comment
2806          $self->{state} = COMMENT_END_DASH_STATE;
2807        } else {
2808
2809          ## XML5: Not a parse error.
2810          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2811                          line => $self->{line_prev},
2812                          column => $self->{column_prev});
2813          $self->{ct}->{data} .= '-'; # comment
2814          ## Stay in the state
2815        }
2816
2817    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2818      $self->{line_prev} = $self->{line};
2819      $self->{column_prev} = $self->{column};
2820      $self->{column}++;
2821      $self->{nc}
2822          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2823    } else {
2824      $self->{set_nc}->($self);
2825    }
2826
2827        redo A;
2828      } elsif ($state != COMMENT_END_BANG_STATE and
2829               $nc == 0x0021) { # !
2830
2831        $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
2832        $self->{state} = COMMENT_END_BANG_STATE;
2833
2834    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2835      $self->{line_prev} = $self->{line};
2836      $self->{column_prev} = $self->{column};
2837      $self->{column}++;
2838      $self->{nc}
2839          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2840    } else {
2841      $self->{set_nc}->($self);
2842    }
2843
2844        redo A;
2845      } elsif ($nc == -1) {
2846        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2847        if ($self->{in_subset}) {
2848
2849          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2850        } else {
2851
2852          $self->{state} = DATA_STATE;
2853        }
2854        ## Reconsume.
2855
2856        return  ($self->{ct}); # comment
2857
2858        redo A;
2859      } elsif ($nc == 0x0000) {
2860        $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
2861        if ($state == COMMENT_END_BANG_STATE) {
2862          $self->{ct}->{data} .= "--!\x{FFFD}"; # comment
2863        } else {
2864          $self->{ct}->{data} .= "--\x{FFFD}"; # comment
2865        }
2866        $self->{state} = COMMENT_STATE;
2867
2868    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2869      $self->{line_prev} = $self->{line};
2870      $self->{column_prev} = $self->{column};
2871      $self->{column}++;
2872      $self->{nc}
2873          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2874    } else {
2875      $self->{set_nc}->($self);
2876    }
2877
2878        redo A;
2879      } else {
2880
2881        if ($state == COMMENT_END_BANG_STATE) {
2882          $self->{ct}->{data} .= '--!' . chr ($nc); # comment
2883        } else {
2884          $self->{ct}->{data} .= '--' . chr ($nc); # comment
2885        }
2886        $self->{state} = COMMENT_STATE;
2887
2888    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2889      $self->{line_prev} = $self->{line};
2890      $self->{column_prev} = $self->{column};
2891      $self->{column}++;
2892      $self->{nc}
2893          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2894    } else {
2895      $self->{set_nc}->($self);
2896    }
2897
2898        redo A;
2899      }
2900    } elsif ($state == DOCTYPE_STATE) {
2901      if ($is_space->{$nc}) {
2902
2903        $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2904
2905    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2906      $self->{line_prev} = $self->{line};
2907      $self->{column_prev} = $self->{column};
2908      $self->{column}++;
2909      $self->{nc}
2910          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2911    } else {
2912      $self->{set_nc}->($self);
2913    }
2914
2915        redo A;
2916      } elsif ($nc == -1) {
2917
2918        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2919        $self->{ct}->{quirks} = 1;
2920
2921        $self->{state} = DATA_STATE;
2922        ## Reconsume.
2923        return  ($self->{ct}); # DOCTYPE (quirks)
2924
2925        redo A;
2926      } else {
2927
        ## XML5: Switch to the bogus comment state.
2929        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2930        $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2931        ## reconsume
2932        redo A;
2933      }
2934    } elsif ($state == BEFORE_DOCTYPE_NAME_STATE) {
2935      ## XML5: "DOCTYPE root name before state".
2936
2937      if ($is_space->{$nc}) {
2938
2939        ## Stay in the state
2940
2941    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2942      $self->{line_prev} = $self->{line};
2943      $self->{column_prev} = $self->{column};
2944      $self->{column}++;
2945      $self->{nc}
2946          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2947    } else {
2948      $self->{set_nc}->($self);
2949    }
2950
2951        redo A;
2952      } elsif ($nc == 0x003E) { # >
2953
2954        ## XML5: No parse error.
2955        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2956        $self->{state} = DATA_STATE;
2957
2958    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2959      $self->{line_prev} = $self->{line};
2960      $self->{column_prev} = $self->{column};
2961      $self->{column}++;
2962      $self->{nc}
2963          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2964    } else {
2965      $self->{set_nc}->($self);
2966    }
2967
2968
2969        return  ($self->{ct}); # DOCTYPE (quirks)
2970
2971        redo A;
2972      } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z
2973
2974        $self->{ct}->{name} # DOCTYPE
2975            = chr ($nc + ($self->{is_xml} ? 0 : 0x0020));
2976        delete $self->{ct}->{quirks};
2977        $self->{state} = DOCTYPE_NAME_STATE;
2978
2979    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2980      $self->{line_prev} = $self->{line};
2981      $self->{column_prev} = $self->{column};
2982      $self->{column}++;
2983      $self->{nc}
2984          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2985    } else {
2986      $self->{set_nc}->($self);
2987    }
2988
2989        redo A;
2990      } elsif ($nc == -1) {
2991
2992        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2993        $self->{state} = DATA_STATE;
2994        ## reconsume
2995
2996        return  ($self->{ct}); # DOCTYPE (quirks)
2997
2998        redo A;
2999      } elsif ($self->{is_xml} and $nc == 0x005B) { # [
3000
3001        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3002        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3003        $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3004        $self->{in_subset} = 1;
3005
3006    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3007      $self->{line_prev} = $self->{line};
3008      $self->{column_prev} = $self->{column};
3009      $self->{column}++;
3010      $self->{nc}
3011          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3012    } else {
3013      $self->{set_nc}->($self);
3014    }
3015
3016        return  ($self->{ct}); # DOCTYPE
3017        redo A;
3018      } elsif ($nc == 0x0000) {
3019        $self->{ct}->{name} = "\x{FFFD}";
3020        delete $self->{ct}->{quirks};
3021        $self->{state} = DOCTYPE_NAME_STATE;
3022
3023    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3024      $self->{line_prev} = $self->{line};
3025      $self->{column_prev} = $self->{column};
3026      $self->{column}++;
3027      $self->{nc}
3028          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3029    } else {
3030      $self->{set_nc}->($self);
3031    }
3032
3033        redo A;
3034      } else {
3035
3036        $self->{ct}->{name} = chr $nc;
3037        delete $self->{ct}->{quirks};
3038        $self->{state} = DOCTYPE_NAME_STATE;
3039
3040    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3041      $self->{line_prev} = $self->{line};
3042      $self->{column_prev} = $self->{column};
3043      $self->{column}++;
3044      $self->{nc}
3045          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3046    } else {
3047      $self->{set_nc}->($self);
3048    }
3049
3050        redo A;
3051      }
3052    } elsif ($state == DOCTYPE_NAME_STATE) {
3053      ## XML5: "DOCTYPE root name state".
3054
3055      if ($is_space->{$nc}) {
3056
3057        $self->{state} = AFTER_DOCTYPE_NAME_STATE;
3058
3059    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3060      $self->{line_prev} = $self->{line};
3061      $self->{column_prev} = $self->{column};
3062      $self->{column}++;
3063      $self->{nc}
3064          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3065    } else {
3066      $self->{set_nc}->($self);
3067    }
3068
3069        redo A;
3070      } elsif ($nc == 0x003E) { # >
3071
3072        $self->{state} = DATA_STATE;
3073
3074    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3075      $self->{line_prev} = $self->{line};
3076      $self->{column_prev} = $self->{column};
3077      $self->{column}++;
3078      $self->{nc}
3079          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3080    } else {
3081      $self->{set_nc}->($self);
3082    }
3083
3084
3085        return  ($self->{ct}); # DOCTYPE
3086
3087        redo A;
3088      } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z
3089
3090        $self->{ct}->{name} # DOCTYPE
3091            .= chr ($nc + ($self->{is_xml} ? 0 : 0x0020));
3092        delete $self->{ct}->{quirks};
3093        ## Stay in the state.
3094
3095    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3096      $self->{line_prev} = $self->{line};
3097      $self->{column_prev} = $self->{column};
3098      $self->{column}++;
3099      $self->{nc}
3100          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3101    } else {
3102      $self->{set_nc}->($self);
3103    }
3104
3105        redo A;
3106      } elsif ($nc == -1) {
3107
3108        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3109        $self->{state} = DATA_STATE;
3110        ## reconsume
3111
3112        $self->{ct}->{quirks} = 1;
3113        return  ($self->{ct}); # DOCTYPE
3114
3115        redo A;
3116      } elsif ($self->{is_xml} and $nc == 0x005B) { # [
3117
3118        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3119        $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3120        $self->{in_subset} = 1;
3121
3122    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3123      $self->{line_prev} = $self->{line};
3124      $self->{column_prev} = $self->{column};
3125      $self->{column}++;
3126      $self->{nc}
3127          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3128    } else {
3129      $self->{set_nc}->($self);
3130    }
3131
3132        return  ($self->{ct}); # DOCTYPE
3133        redo A;
3134      } elsif ($nc == 0x0000) {
3135        $self->{ct}->{name} .= "\x{FFFD}"; # DOCTYPE
3136        ## Stay in the state.
3137
3138    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3139      $self->{line_prev} = $self->{line};
3140      $self->{column_prev} = $self->{column};
3141      $self->{column}++;
3142      $self->{nc}
3143          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3144    } else {
3145      $self->{set_nc}->($self);
3146    }
3147
3148        redo A;
3149      } else {
3150
3151        $self->{ct}->{name} .= chr ($nc); # DOCTYPE
3152        ## Stay in the state.
3153
3154    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3155      $self->{line_prev} = $self->{line};
3156      $self->{column_prev} = $self->{column};
3157      $self->{column}++;
3158      $self->{nc}
3159          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3160    } else {
3161      $self->{set_nc}->($self);
3162    }
3163
3164        redo A;
3165      }
3166    } elsif ($state == AFTER_DOCTYPE_NAME_STATE) {
3167      ## XML5: Corresponding to XML5's "DOCTYPE root name after
3168      ## state", but implemented differently.
3169
3170      if ($is_space->{$nc}) {
3171
3172        ## Stay in the state
3173
3174    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3175      $self->{line_prev} = $self->{line};
3176      $self->{column_prev} = $self->{column};
3177      $self->{column}++;
3178      $self->{nc}
3179          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3180    } else {
3181      $self->{set_nc}->($self);
3182    }
3183
3184        redo A;
3185      } elsif ($nc == 0x003E) { # >
3186        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3187
3188          $self->{state} = DATA_STATE;
3189        } else {
3190
3191          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3192          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3193        }
3194
3195
3196    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3197      $self->{line_prev} = $self->{line};
3198      $self->{column_prev} = $self->{column};
3199      $self->{column}++;
3200      $self->{nc}
3201          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3202    } else {
3203      $self->{set_nc}->($self);
3204    }
3205
3206        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3207        redo A;
3208      } elsif ($nc == -1) {
3209        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3210
3211          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3212          $self->{state} = DATA_STATE;
3213          $self->{ct}->{quirks} = 1;
3214        } else {
3215
3216          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3217          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3218        }
3219
3220        ## Reconsume.
3221        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3222        redo A;
3223      } elsif ($nc == 0x0050 or # P
3224               $nc == 0x0070) { # p
3225
3226        $self->{state} = PUBLIC_STATE;
3227        $self->{kwd} = chr $nc;
3228
3229    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3230      $self->{line_prev} = $self->{line};
3231      $self->{column_prev} = $self->{column};
3232      $self->{column}++;
3233      $self->{nc}
3234          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3235    } else {
3236      $self->{set_nc}->($self);
3237    }
3238
3239        redo A;
3240      } elsif ($nc == 0x0053 or # S
3241               $nc == 0x0073) { # s
3242
3243        $self->{state} = SYSTEM_STATE;
3244        $self->{kwd} = chr $nc;
3245
3246    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3247      $self->{line_prev} = $self->{line};
3248      $self->{column_prev} = $self->{column};
3249      $self->{column}++;
3250      $self->{nc}
3251          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3252    } else {
3253      $self->{set_nc}->($self);
3254    }
3255
3256        redo A;
3257      } elsif ($nc == 0x0022 and # "
3258               ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3259                $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3260
3261        $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3262        $self->{ct}->{value} = ''; # ENTITY
3263
3264    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265      $self->{line_prev} = $self->{line};
3266      $self->{column_prev} = $self->{column};
3267      $self->{column}++;
3268      $self->{nc}
3269          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270    } else {
3271      $self->{set_nc}->($self);
3272    }
3273
3274        redo A;
3275      } elsif ($nc == 0x0027 and # '
3276               ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277                $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278
3279        $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3280        $self->{ct}->{value} = ''; # ENTITY
3281
3282    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283      $self->{line_prev} = $self->{line};
3284      $self->{column_prev} = $self->{column};
3285      $self->{column}++;
3286      $self->{nc}
3287          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288    } else {
3289      $self->{set_nc}->($self);
3290    }
3291
3292        redo A;
3293      } elsif ($self->{is_xml} and
3294               $self->{ct}->{type} == DOCTYPE_TOKEN and
3295               $nc == 0x005B) { # [
3296
3297        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3298        $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3299        $self->{in_subset} = 1;
3300
3301    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3302      $self->{line_prev} = $self->{line};
3303      $self->{column_prev} = $self->{column};
3304      $self->{column}++;
3305      $self->{nc}
3306          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3307    } else {
3308      $self->{set_nc}->($self);
3309    }
3310
3311        return  ($self->{ct}); # DOCTYPE
3312        redo A;
3313      } else {
3314        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3315
3316        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3317
3318          $self->{ct}->{quirks} = 1;
3319          $self->{state} = BOGUS_DOCTYPE_STATE;
3320        } else {
3321
3322          $self->{state} = BOGUS_MD_STATE;
3323        }
3324
3325
3326    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3327      $self->{line_prev} = $self->{line};
3328      $self->{column_prev} = $self->{column};
3329      $self->{column}++;
3330      $self->{nc}
3331          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3332    } else {
3333      $self->{set_nc}->($self);
3334    }
3335
3336        redo A;
3337      }
3338    } elsif ($state == PUBLIC_STATE) {
3339      ## ASCII case-insensitive
3340      if ($nc == [
3341            undef,
3342            0x0055, # U
3343            0x0042, # B
3344            0x004C, # L
3345            0x0049, # I
3346            NEVER_CHAR, # (C)
3347          ]->[length $self->{kwd}] or
3348          $nc == [
3349            undef,
3350            0x0075, # u
3351            0x0062, # b
3352            0x006C, # l
3353            0x0069, # i
3354            NEVER_CHAR, # (c)
3355          ]->[length $self->{kwd}]) {
3356
3357        ## Stay in the state.
3358        $self->{kwd} .= chr $nc;
3359
3360    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3361      $self->{line_prev} = $self->{line};
3362      $self->{column_prev} = $self->{column};
3363      $self->{column}++;
3364      $self->{nc}
3365          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3366    } else {
3367      $self->{set_nc}->($self);
3368    }
3369
3370        redo A;
3371      } elsif ((length $self->{kwd}) == 5 and
3372               ($nc == 0x0043 or # C
3373                $nc == 0x0063)) { # c
3374        if ($self->{is_xml} and
3375            ($self->{kwd} ne 'PUBLI' or $nc == 0x0063)) { # c
3376
3377          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3378                          text => 'PUBLIC',
3379                          line => $self->{line_prev},
3380                          column => $self->{column_prev} - 4);
3381        } else {
3382
3383        }
3384        $self->{state} = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
3385
3386    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3387      $self->{line_prev} = $self->{line};
3388      $self->{column_prev} = $self->{column};
3389      $self->{column}++;
3390      $self->{nc}
3391          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3392    } else {
3393      $self->{set_nc}->($self);
3394    }
3395
3396        redo A;
3397      } else {
3398        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3399                        line => $self->{line_prev},
3400                        column => $self->{column_prev} + 1 - length $self->{kwd});
3401        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3402
3403          $self->{ct}->{quirks} = 1;
3404          $self->{state} = BOGUS_DOCTYPE_STATE;
3405        } else {
3406
3407          $self->{state} = BOGUS_MD_STATE;
3408        }
3409        ## Reconsume.
3410        redo A;
3411      }
3412    } elsif ($state == SYSTEM_STATE) {
3413      ## ASCII case-insensitive
3414      if ($nc == [
3415            undef,
3416            0x0059, # Y
3417            0x0053, # S
3418            0x0054, # T
3419            0x0045, # E
3420            NEVER_CHAR, # (M)
3421          ]->[length $self->{kwd}] or
3422          $nc == [
3423            undef,
3424            0x0079, # y
3425            0x0073, # s
3426            0x0074, # t
3427            0x0065, # e
3428            NEVER_CHAR, # (m)
3429          ]->[length $self->{kwd}]) {
3430
3431        ## Stay in the state.
3432        $self->{kwd} .= chr $nc;
3433
3434    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3435      $self->{line_prev} = $self->{line};
3436      $self->{column_prev} = $self->{column};
3437      $self->{column}++;
3438      $self->{nc}
3439          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3440    } else {
3441      $self->{set_nc}->($self);
3442    }
3443
3444        redo A;
3445      } elsif ((length $self->{kwd}) == 5 and
3446               ($nc == 0x004D or # M
3447                $nc == 0x006D)) { # m
3448        if ($self->{is_xml} and
3449            ($self->{kwd} ne 'SYSTE' or $nc == 0x006D)) { # m
3450
3451          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3452                          text => 'SYSTEM',
3453                          line => $self->{line_prev},
3454                          column => $self->{column_prev} - 4);
3455        } else {
3456
3457        }
3458        $self->{state} = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
3459
3460    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3461      $self->{line_prev} = $self->{line};
3462      $self->{column_prev} = $self->{column};
3463      $self->{column}++;
3464      $self->{nc}
3465          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3466    } else {
3467      $self->{set_nc}->($self);
3468    }
3469
3470        redo A;
3471      } else {
3472        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
3473                        line => $self->{line_prev},
3474                        column => $self->{column_prev} + 1 - length $self->{kwd});
3475        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3476
3477          $self->{ct}->{quirks} = 1;
3478          $self->{state} = BOGUS_DOCTYPE_STATE;
3479        } else {
3480
3481          $self->{state} = BOGUS_MD_STATE;
3482        }
3483        ## Reconsume.
3484        redo A;
3485      }
3486    } elsif ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE or
3487             $state == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3488      if ($is_space->{$nc}) {
3489
3490        ## Stay in or switch to the state.
3491        $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3492
3493    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3494      $self->{line_prev} = $self->{line};
3495      $self->{column_prev} = $self->{column};
3496      $self->{column}++;
3497      $self->{nc}
3498          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3499    } else {
3500      $self->{set_nc}->($self);
3501    }
3502
3503        redo A;
3504      } elsif ($nc == 0x0022) { # "
3505        if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
3506
3507          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation
3508        } else {
3509
3510        }
3511        $self->{ct}->{pubid} = ''; # DOCTYPE
3512        $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3513
3514    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3515      $self->{line_prev} = $self->{line};
3516      $self->{column_prev} = $self->{column};
3517      $self->{column}++;
3518      $self->{nc}
3519          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3520    } else {
3521      $self->{set_nc}->($self);
3522    }
3523
3524        redo A;
3525      } elsif ($nc == 0x0027) { # '
3526        if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
3527
3528          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation
3529        } else {
3530
3531        }
3532        $self->{ct}->{pubid} = ''; # DOCTYPE
3533        $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3534
3535    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3536      $self->{line_prev} = $self->{line};
3537      $self->{column_prev} = $self->{column};
3538      $self->{column}++;
3539      $self->{nc}
3540          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3541    } else {
3542      $self->{set_nc}->($self);
3543    }
3544
3545        redo A;
3546      } elsif ($nc == 0x003E) { # >
3547        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3548
3549        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3550
3551          $self->{state} = DATA_STATE;
3552          $self->{ct}->{quirks} = 1;
3553        } else {
3554
3555          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3556        }
3557
3558
3559    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3560      $self->{line_prev} = $self->{line};
3561      $self->{column_prev} = $self->{column};
3562      $self->{column}++;
3563      $self->{nc}
3564          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3565    } else {
3566      $self->{set_nc}->($self);
3567    }
3568
3569        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3570        redo A;
3571      } elsif ($nc == EOF_CHAR) {
3572        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3573
3574          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3575          $self->{state} = DATA_STATE;
3576          $self->{ct}->{quirks} = 1;
3577        } else {
3578
3579          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3580          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3581        }
3582
3583        ## Reconsume.
3584        return  ($self->{ct}); # DOCTYPE
3585        redo A;
3586      } elsif ($self->{is_xml} and
3587               $self->{ct}->{type} == DOCTYPE_TOKEN and
3588               $nc == 0x005B) { # [
3589
3590        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3591        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3592        $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3593        $self->{in_subset} = 1;
3594
3595    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3596      $self->{line_prev} = $self->{line};
3597      $self->{column_prev} = $self->{column};
3598      $self->{column}++;
3599      $self->{nc}
3600          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3601    } else {
3602      $self->{set_nc}->($self);
3603    }
3604
3605        return  ($self->{ct}); # DOCTYPE
3606        redo A;
3607      } else {
3608        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3609
3610        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3611
3612          $self->{ct}->{quirks} = 1;
3613          $self->{state} = BOGUS_DOCTYPE_STATE;
3614        } else {
3615
3616          $self->{state} = BOGUS_MD_STATE;
3617        }
3618
3619
3620    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3621      $self->{line_prev} = $self->{line};
3622      $self->{column_prev} = $self->{column};
3623      $self->{column}++;
3624      $self->{nc}
3625          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3626    } else {
3627      $self->{set_nc}->($self);
3628    }
3629
3630        redo A;
3631      }
3632    } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3633      if ($nc == 0x0022) { # "
3634
3635        $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3636
3637    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3638      $self->{line_prev} = $self->{line};
3639      $self->{column_prev} = $self->{column};
3640      $self->{column}++;
3641      $self->{nc}
3642          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3643    } else {
3644      $self->{set_nc}->($self);
3645    }
3646
3647        redo A;
3648      } elsif ($nc == 0x003E) { # >
3649        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3650
3651        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3652
3653          $self->{state} = DATA_STATE;
3654          $self->{ct}->{quirks} = 1;
3655        } else {
3656
3657          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3658        }
3659
3660
3661    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3662      $self->{line_prev} = $self->{line};
3663      $self->{column_prev} = $self->{column};
3664      $self->{column}++;
3665      $self->{nc}
3666          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3667    } else {
3668      $self->{set_nc}->($self);
3669    }
3670
3671        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3672        redo A;
3673      } elsif ($nc == -1) {
3674        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3675
3676        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3677
3678          $self->{state} = DATA_STATE;
3679          $self->{ct}->{quirks} = 1;
3680        } else {
3681
3682          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3683        }
3684
3685        ## Reconsume.
3686        return  ($self->{ct}); # DOCTYPE
3687        redo A;
3688      } elsif ($nc == 0x0000) {
3689        $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
3690        ## Stay in the state.
3691
3692    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3693      $self->{line_prev} = $self->{line};
3694      $self->{column_prev} = $self->{column};
3695      $self->{column}++;
3696      $self->{nc}
3697          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3698    } else {
3699      $self->{set_nc}->($self);
3700    }
3701
3702        redo A;
3703      } else {
3704
3705        $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
3706        $self->{read_until}->($self->{ct}->{pubid}, qq[\x00">],
3707                              length $self->{ct}->{pubid});
3708
3709        ## Stay in the state.
3710
3711    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3712      $self->{line_prev} = $self->{line};
3713      $self->{column_prev} = $self->{column};
3714      $self->{column}++;
3715      $self->{nc}
3716          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3717    } else {
3718      $self->{set_nc}->($self);
3719    }
3720
3721        redo A;
3722      }
3723    } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3724      if ($nc == 0x0027) { # '
3725
3726        $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3727
3728    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3729      $self->{line_prev} = $self->{line};
3730      $self->{column_prev} = $self->{column};
3731      $self->{column}++;
3732      $self->{nc}
3733          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3734    } else {
3735      $self->{set_nc}->($self);
3736    }
3737
3738        redo A;
3739      } elsif ($nc == 0x003E) { # >
3740        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3741
3742        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3743
3744          $self->{state} = DATA_STATE;
3745          $self->{ct}->{quirks} = 1;
3746        } else {
3747
3748          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3749        }
3750
3751
3752    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3753      $self->{line_prev} = $self->{line};
3754      $self->{column_prev} = $self->{column};
3755      $self->{column}++;
3756      $self->{nc}
3757          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3758    } else {
3759      $self->{set_nc}->($self);
3760    }
3761
3762        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3763        redo A;
3764      } elsif ($nc == -1) {
3765        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3766
3767        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3768
3769          $self->{state} = DATA_STATE;
3770          $self->{ct}->{quirks} = 1;
3771        } else {
3772
3773          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3774        }
3775
3776        ## reconsume
3777        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3778        redo A;
3779      } elsif ($nc == 0x0000) {
3780        $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
3781        ## Stay in the state.
3782
3783    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3784      $self->{line_prev} = $self->{line};
3785      $self->{column_prev} = $self->{column};
3786      $self->{column}++;
3787      $self->{nc}
3788          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3789    } else {
3790      $self->{set_nc}->($self);
3791    }
3792
3793        redo A;
3794      } else {
3795
3796        $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
3797        $self->{read_until}->($self->{ct}->{pubid}, qq[\x00'>],
3798                              length $self->{ct}->{pubid});
3799
3800        ## Stay in the state
3801
3802    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3803      $self->{line_prev} = $self->{line};
3804      $self->{column_prev} = $self->{column};
3805      $self->{column}++;
3806      $self->{nc}
3807          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3808    } else {
3809      $self->{set_nc}->($self);
3810    }
3811
3812        redo A;
3813      }
3814    } elsif ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE or
3815             $state == BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE) {
3816      if ($is_space->{$nc}) {
3817
3818        ## Stay in or switch to the state.
3819        $self->{state} = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE;
3820
3821    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3822      $self->{line_prev} = $self->{line};
3823      $self->{column_prev} = $self->{column};
3824      $self->{column}++;
3825      $self->{nc}
3826          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3827    } else {
3828      $self->{set_nc}->($self);
3829    }
3830
3831        redo A;
3832      } elsif ($nc == 0x0022) { # "
3833        if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3834
3835          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3836        } else {
3837
3838        }
3839        $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3840        $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3841
3842    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3843      $self->{line_prev} = $self->{line};
3844      $self->{column_prev} = $self->{column};
3845      $self->{column}++;
3846      $self->{nc}
3847          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3848    } else {
3849      $self->{set_nc}->($self);
3850    }
3851
3852        redo A;
3853      } elsif ($nc == 0x0027) { # '
3854        if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3855
3856          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3857        } else {
3858
3859        }
3860        $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3861        $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3862
3863    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3864      $self->{line_prev} = $self->{line};
3865      $self->{column_prev} = $self->{column};
3866      $self->{column}++;
3867      $self->{nc}
3868          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3869    } else {
3870      $self->{set_nc}->($self);
3871    }
3872
3873        redo A;
3874      } elsif ($nc == 0x003E) { # >
3875        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3876          if ($self->{is_xml}) {
3877
3878            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3879          } else {
3880
3881          }
3882          $self->{state} = DATA_STATE;
3883        } else {
3884          if ($self->{ct}->{type} == NOTATION_TOKEN) {
3885
3886          } else {
3887
3888            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3889          }
3890          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3891        }
3892
3893
3894    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3895      $self->{line_prev} = $self->{line};
3896      $self->{column_prev} = $self->{column};
3897      $self->{column}++;
3898      $self->{nc}
3899          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3900    } else {
3901      $self->{set_nc}->($self);
3902    }
3903
3904        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3905        redo A;
3906      } elsif ($nc == EOF_CHAR) {
3907        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3908
3909          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3910
3911          $self->{state} = DATA_STATE;
3912          $self->{ct}->{quirks} = 1;
3913        } else {
3914          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3915          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3916        }
3917
3918        ## Reconsume.
3919        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3920        redo A;
3921      } elsif ($self->{is_xml} and
3922               $self->{ct}->{type} == DOCTYPE_TOKEN and
3923               $nc == 0x005B) { # [
3924
3925        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3926        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3927        $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3928        $self->{in_subset} = 1;
3929
3930    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3931      $self->{line_prev} = $self->{line};
3932      $self->{column_prev} = $self->{column};
3933      $self->{column}++;
3934      $self->{nc}
3935          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3936    } else {
3937      $self->{set_nc}->($self);
3938    }
3939
3940        return  ($self->{ct}); # DOCTYPE
3941        redo A;
3942      } else {
3943        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3944
3945        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3946
3947          $self->{ct}->{quirks} = 1;
3948          $self->{state} = BOGUS_DOCTYPE_STATE;
3949        } else {
3950
3951          $self->{state} = BOGUS_MD_STATE;
3952        }
3953
3954
3955    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3956      $self->{line_prev} = $self->{line};
3957      $self->{column_prev} = $self->{column};
3958      $self->{column}++;
3959      $self->{nc}
3960          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3961    } else {
3962      $self->{set_nc}->($self);
3963    }
3964
3965        redo A;
3966      }
3967    } elsif ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE or
3968             $state == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3969      if ($is_space->{$nc}) {
3970
3971        ## Stay in or switch to the state.
3972        $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3973
3974    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3975      $self->{line_prev} = $self->{line};
3976      $self->{column_prev} = $self->{column};
3977      $self->{column}++;
3978      $self->{nc}
3979          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3980    } else {
3981      $self->{set_nc}->($self);
3982    }
3983
3984        redo A;
3985      } elsif ($nc == 0x0022) { # "
3986        if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
3987
3988          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
3989        } else {
3990
3991        }
3992        $self->{ct}->{sysid} = ''; # DOCTYPE
3993        $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3994
3995    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3996      $self->{line_prev} = $self->{line};
3997      $self->{column_prev} = $self->{column};
3998      $self->{column}++;
3999      $self->{nc}
4000          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4001    } else {
4002      $self->{set_nc}->($self);
4003    }
4004
4005        redo A;
4006      } elsif ($nc == 0x0027) { # '
4007        if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
4008
4009          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation
4010        } else {
4011
4012        }
4013        $self->{ct}->{sysid} = ''; # DOCTYPE
4014        $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4015
4016    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4017      $self->{line_prev} = $self->{line};
4018      $self->{column_prev} = $self->{column};
4019      $self->{column}++;
4020      $self->{nc}
4021          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4022    } else {
4023      $self->{set_nc}->($self);
4024    }
4025
4026        redo A;
4027      } elsif ($nc == 0x003E) { # >
4028        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4029
4030    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4031      $self->{line_prev} = $self->{line};
4032      $self->{column_prev} = $self->{column};
4033      $self->{column}++;
4034      $self->{nc}
4035          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4036    } else {
4037      $self->{set_nc}->($self);
4038    }
4039
4040
4041        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4042
4043          $self->{state} = DATA_STATE;
4044          $self->{ct}->{quirks} = 1;
4045        } else {
4046
4047          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4048        }
4049
4050        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4051        redo A;
4052      } elsif ($nc == EOF_CHAR) {
4053        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4054
4055          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4056          $self->{state} = DATA_STATE;
4057          $self->{ct}->{quirks} = 1;
4058        } else {
4059
4060          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4061          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4062        }
4063
4064        ## Reconsume.
4065        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4066        redo A;
4067      } elsif ($self->{is_xml} and
4068               $self->{ct}->{type} == DOCTYPE_TOKEN and
4069               $nc == 0x005B) { # [
4070
4071        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4072
4073        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4074        $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4075        $self->{in_subset} = 1;
4076
4077    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078      $self->{line_prev} = $self->{line};
4079      $self->{column_prev} = $self->{column};
4080      $self->{column}++;
4081      $self->{nc}
4082          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083    } else {
4084      $self->{set_nc}->($self);
4085    }
4086
4087        return  ($self->{ct}); # DOCTYPE
4088        redo A;
4089      } else {
4090        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
4091
4092        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4093
4094          $self->{ct}->{quirks} = 1;
4095          $self->{state} = BOGUS_DOCTYPE_STATE;
4096        } else {
4097
4098          $self->{state} = BOGUS_MD_STATE;
4099        }
4100
4101
4102    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103      $self->{line_prev} = $self->{line};
4104      $self->{column_prev} = $self->{column};
4105      $self->{column}++;
4106      $self->{nc}
4107          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4108    } else {
4109      $self->{set_nc}->($self);
4110    }
4111
4112        redo A;
4113      }
4114    } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
4115      if ($nc == 0x0022) { # "
4116
4117        $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4118
4119    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4120      $self->{line_prev} = $self->{line};
4121      $self->{column_prev} = $self->{column};
4122      $self->{column}++;
4123      $self->{nc}
4124          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4125    } else {
4126      $self->{set_nc}->($self);
4127    }
4128
4129        redo A;
4130      } elsif (not $self->{is_xml} and $nc == 0x003E) { # >
4131        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4132
4133        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4134
4135          $self->{state} = DATA_STATE;
4136          $self->{ct}->{quirks} = 1;
4137        } else {
4138
4139          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4140        }
4141
4142
4143    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4144      $self->{line_prev} = $self->{line};
4145      $self->{column_prev} = $self->{column};
4146      $self->{column}++;
4147      $self->{nc}
4148          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4149    } else {
4150      $self->{set_nc}->($self);
4151    }
4152
4153        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4154        redo A;
4155      } elsif ($nc == -1) {
4156        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4157
4158        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4159
4160          $self->{state} = DATA_STATE;
4161          $self->{ct}->{quirks} = 1;
4162        } else {
4163
4164          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4165        }
4166
4167        ## reconsume
4168        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4169        redo A;
4170      } elsif ($nc == 0x0000) {
4171        $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
4172        ## Stay in the state.
4173
4174    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4175      $self->{line_prev} = $self->{line};
4176      $self->{column_prev} = $self->{column};
4177      $self->{column}++;
4178      $self->{nc}
4179          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4180    } else {
4181      $self->{set_nc}->($self);
4182    }
4183
4184        redo A;
4185      } else {
4186
4187        $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
4188        $self->{read_until}->($self->{ct}->{sysid}, qq[\x00">],
4189                              length $self->{ct}->{sysid});
4190
4191        ## Stay in the state
4192
4193    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4194      $self->{line_prev} = $self->{line};
4195      $self->{column_prev} = $self->{column};
4196      $self->{column}++;
4197      $self->{nc}
4198          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4199    } else {
4200      $self->{set_nc}->($self);
4201    }
4202
4203        redo A;
4204      }
4205    } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
4206      if ($nc == 0x0027) { # '
4207
4208        $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
4209
4210    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4211      $self->{line_prev} = $self->{line};
4212      $self->{column_prev} = $self->{column};
4213      $self->{column}++;
4214      $self->{nc}
4215          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4216    } else {
4217      $self->{set_nc}->($self);
4218    }
4219
4220        redo A;
4221      } elsif (not $self->{is_xml} and $nc == 0x003E) { # >
4222
4223        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4224
4225        $self->{state} = DATA_STATE;
4226
4227    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4228      $self->{line_prev} = $self->{line};
4229      $self->{column_prev} = $self->{column};
4230      $self->{column}++;
4231      $self->{nc}
4232          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4233    } else {
4234      $self->{set_nc}->($self);
4235    }
4236
4237
4238        $self->{ct}->{quirks} = 1;
4239        return  ($self->{ct}); # DOCTYPE
4240
4241        redo A;
4242      } elsif ($nc == -1) {
4243        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4244
4245        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4246
4247          $self->{state} = DATA_STATE;
4248          $self->{ct}->{quirks} = 1;
4249        } else {
4250
4251          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4252        }
4253
4254        ## reconsume
4255        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4256        redo A;
4257      } elsif ($nc == 0x0000) {
4258        $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION
4259        ## Stay in the state.
4260
4261    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4262      $self->{line_prev} = $self->{line};
4263      $self->{column_prev} = $self->{column};
4264      $self->{column}++;
4265      $self->{nc}
4266          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4267    } else {
4268      $self->{set_nc}->($self);
4269    }
4270
4271        redo A;
4272      } else {
4273
4274        $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION
4275        $self->{read_until}->($self->{ct}->{sysid}, qq[\x00'>],
4276                              length $self->{ct}->{sysid});
4277
4278        ## Stay in the state
4279
4280    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4281      $self->{line_prev} = $self->{line};
4282      $self->{column_prev} = $self->{column};
4283      $self->{column}++;
4284      $self->{nc}
4285          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4286    } else {
4287      $self->{set_nc}->($self);
4288    }
4289
4290        redo A;
4291      }
4292    } elsif ($state == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4293      if ($is_space->{$nc}) {
4294        if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4295
4296          $self->{state} = BEFORE_NDATA_STATE;
4297        } else {
4298
4299          ## Stay in the state
4300        }
4301
4302    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4303      $self->{line_prev} = $self->{line};
4304      $self->{column_prev} = $self->{column};
4305      $self->{column}++;
4306      $self->{nc}
4307          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4308    } else {
4309      $self->{set_nc}->($self);
4310    }
4311
4312        redo A;
4313      } elsif ($nc == 0x003E) { # >
4314        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4315
4316          $self->{state} = DATA_STATE;
4317        } else {
4318
4319          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4320        }
4321
4322
4323    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4324      $self->{line_prev} = $self->{line};
4325      $self->{column_prev} = $self->{column};
4326      $self->{column}++;
4327      $self->{nc}
4328          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4329    } else {
4330      $self->{set_nc}->($self);
4331    }
4332
4333        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4334        redo A;
4335      } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4336               ($nc == 0x004E or # N
4337                $nc == 0x006E)) { # n
4338
4339        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4340        $self->{state} = NDATA_STATE;
4341        $self->{kwd} = chr $nc;
4342
4343    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4344      $self->{line_prev} = $self->{line};
4345      $self->{column_prev} = $self->{column};
4346      $self->{column}++;
4347      $self->{nc}
4348          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4349    } else {
4350      $self->{set_nc}->($self);
4351    }
4352
4353        redo A;
4354      } elsif ($nc == -1) {
4355        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4356
4357          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4358          $self->{state} = DATA_STATE;
4359          $self->{ct}->{quirks} = 1;
4360        } else {
4361
4362          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4363          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4364        }
4365
4366        ## reconsume
4367        return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4368        redo A;
4369      } elsif ($self->{is_xml} and
4370               $self->{ct}->{type} == DOCTYPE_TOKEN and
4371               $nc == 0x005B) { # [
4372
4373        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4374        $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4375        $self->{in_subset} = 1;
4376
4377    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4378      $self->{line_prev} = $self->{line};
4379      $self->{column_prev} = $self->{column};
4380      $self->{column}++;
4381      $self->{nc}
4382          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4383    } else {
4384      $self->{set_nc}->($self);
4385    }
4386
4387        return  ($self->{ct}); # DOCTYPE
4388        redo A;
4389      } else {
4390        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4391
4392        if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4393
4394          #$self->{ct}->{quirks} = 1;
4395          $self->{state} = BOGUS_DOCTYPE_STATE;
4396        } else {
4397
4398          $self->{state} = BOGUS_MD_STATE;
4399        }
4400
4401
4402    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403      $self->{line_prev} = $self->{line};
4404      $self->{column_prev} = $self->{column};
4405      $self->{column}++;
4406      $self->{nc}
4407          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408    } else {
4409      $self->{set_nc}->($self);
4410    }
4411
4412        redo A;
4413      }
4414    } elsif ($state == BEFORE_NDATA_STATE) {
4415      if ($is_space->{$nc}) {
4416
4417        ## Stay in the state.
4418
4419    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4420      $self->{line_prev} = $self->{line};
4421      $self->{column_prev} = $self->{column};
4422      $self->{column}++;
4423      $self->{nc}
4424          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4425    } else {
4426      $self->{set_nc}->($self);
4427    }
4428
4429        redo A;
4430      } elsif ($nc == 0x003E) { # >
4431
4432        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4433
4434    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4435      $self->{line_prev} = $self->{line};
4436      $self->{column_prev} = $self->{column};
4437      $self->{column}++;
4438      $self->{nc}
4439          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4440    } else {
4441      $self->{set_nc}->($self);
4442    }
4443
4444        return  ($self->{ct}); # ENTITY
4445        redo A;
4446      } elsif ($nc == 0x004E or # N
4447               $nc == 0x006E) { # n
4448
4449        $self->{state} = NDATA_STATE;
4450        $self->{kwd} = chr $nc;
4451
4452    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4453      $self->{line_prev} = $self->{line};
4454      $self->{column_prev} = $self->{column};
4455      $self->{column}++;
4456      $self->{nc}
4457          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4458    } else {
4459      $self->{set_nc}->($self);
4460    }
4461
4462        redo A;
4463      } elsif ($nc == -1) {
4464
4465        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4466        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4467        ## reconsume
4468        return  ($self->{ct}); # ENTITY
4469        redo A;
4470      } else {
4471
4472        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4473        $self->{state} = BOGUS_MD_STATE;
4474
4475    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4476      $self->{line_prev} = $self->{line};
4477      $self->{column_prev} = $self->{column};
4478      $self->{column}++;
4479      $self->{nc}
4480          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4481    } else {
4482      $self->{set_nc}->($self);
4483    }
4484
4485        redo A;
4486      }
4487    } elsif ($state == BOGUS_DOCTYPE_STATE) {
4488      if ($nc == 0x003E) { # >
4489
4490        $self->{state} = DATA_STATE;
4491
4492    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493      $self->{line_prev} = $self->{line};
4494      $self->{column_prev} = $self->{column};
4495      $self->{column}++;
4496      $self->{nc}
4497          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4498    } else {
4499      $self->{set_nc}->($self);
4500    }
4501
4502
4503        return  ($self->{ct}); # DOCTYPE
4504
4505        redo A;
4506      } elsif ($self->{is_xml} and $nc == 0x005B) { # [
4507
4508        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4509        $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4510        $self->{in_subset} = 1;
4511
4512    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4513      $self->{line_prev} = $self->{line};
4514      $self->{column_prev} = $self->{column};
4515      $self->{column}++;
4516      $self->{nc}
4517          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4518    } else {
4519      $self->{set_nc}->($self);
4520    }
4521
4522        return  ($self->{ct}); # DOCTYPE
4523        redo A;
4524      } elsif ($nc == -1) {
4525
4526        $self->{state} = DATA_STATE;
4527        ## reconsume
4528
4529        return  ($self->{ct}); # DOCTYPE
4530
4531        redo A;
4532      } else {
4533
4534        my $s = '';
4535        $self->{read_until}->($s, q{>[}, 0);
4536
4537        ## Stay in the state
4538
4539    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4540      $self->{line_prev} = $self->{line};
4541      $self->{column_prev} = $self->{column};
4542      $self->{column}++;
4543      $self->{nc}
4544          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4545    } else {
4546      $self->{set_nc}->($self);
4547    }
4548
4549        redo A;
4550      }
4551    } elsif ($state == CDATA_SECTION_STATE) {
4552      ## NOTE: "CDATA section state" in the spec is jointly implemented
4553      ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4554      ## and |CDATA_SECTION_MSE2_STATE|.
4555
4556      ## XML5: "CDATA state".
4557
4558      if ($nc == 0x005D) { # ]
4559
4560        $self->{state} = CDATA_SECTION_MSE1_STATE;
4561
4562    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4563      $self->{line_prev} = $self->{line};
4564      $self->{column_prev} = $self->{column};
4565      $self->{column}++;
4566      $self->{nc}
4567          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4568    } else {
4569      $self->{set_nc}->($self);
4570    }
4571
4572        redo A;
4573      } elsif ($nc == -1) {
4574        if ($self->{is_xml}) {
4575
4576          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4577        } else {
4578
4579        }
4580
4581        $self->{state} = DATA_STATE;
4582        ## Reconsume.
4583        if (length $self->{ct}->{data}) { # character
4584
4585          return  ($self->{ct}); # character
4586        } else {
4587
4588          ## No token to emit. $self->{ct} is discarded.
4589        }
4590        redo A;
4591      } else {
4592
4593        $self->{ct}->{data} .= chr $nc;
4594        $self->{read_until}->($self->{ct}->{data},
4595                              qq<\x00]>,
4596                              length $self->{ct}->{data});
4597        ## NOTE: NULLs are left as is (see spec's comment).  However,
4598        ## a token cannot contain more than one U+0000 NULL character
4599        ## for the ease of processing in the tree constructor.
4600
4601        ## Stay in the state.
4602
4603    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4604      $self->{line_prev} = $self->{line};
4605      $self->{column_prev} = $self->{column};
4606      $self->{column}++;
4607      $self->{nc}
4608          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4609    } else {
4610      $self->{set_nc}->($self);
4611    }
4612
4613        redo A;
4614      }
4615
4616      ## ISSUE: "text tokens" in spec.
4617    } elsif ($state == CDATA_SECTION_MSE1_STATE) {
4618      ## XML5: "CDATA bracket state".
4619
4620      if ($nc == 0x005D) { # ]
4621
4622        $self->{state} = CDATA_SECTION_MSE2_STATE;
4623
4624    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4625      $self->{line_prev} = $self->{line};
4626      $self->{column_prev} = $self->{column};
4627      $self->{column}++;
4628      $self->{nc}
4629          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4630    } else {
4631      $self->{set_nc}->($self);
4632    }
4633
4634        redo A;
4635      } else {
4636
4637        ## XML5: If EOF, "]" is not appended and changed to the data state.
4638        $self->{ct}->{data} .= ']';
4639        $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4640        ## Reconsume.
4641        redo A;
4642      }
4643    } elsif ($state == CDATA_SECTION_MSE2_STATE) {
4644      ## XML5: "CDATA end state".
4645
4646      if ($nc == 0x003E) { # >
4647        $self->{state} = DATA_STATE;
4648
4649    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4650      $self->{line_prev} = $self->{line};
4651      $self->{column_prev} = $self->{column};
4652      $self->{column}++;
4653      $self->{nc}
4654          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4655    } else {
4656      $self->{set_nc}->($self);
4657    }
4658
4659        if (length $self->{ct}->{data}) { # character
4660
4661          return  ($self->{ct}); # character
4662        } else {
4663
4664          ## No token to emit. $self->{ct} is discarded.
4665        }
4666        redo A;
4667      } elsif ($nc == 0x005D) { # ]
4668         # character
4669        $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
4670        ## Stay in the state.
4671
4672    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4673      $self->{line_prev} = $self->{line};
4674      $self->{column_prev} = $self->{column};
4675      $self->{column}++;
4676      $self->{nc}
4677          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4678    } else {
4679      $self->{set_nc}->($self);
4680    }
4681
4682        redo A;
4683      } else {
4684
4685        $self->{ct}->{data} .= ']]'; # character
4686        $self->{state} = CDATA_SECTION_STATE;
4687        ## Reconsume. ## XML5: Emit.
4688        redo A;
4689      }
4690    } elsif ($state == ENTITY_STATE) {
4691      if ($is_space->{$nc} or
4692          {
4693            0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4694
4695            ## The following characters are added here to detect the
4696            ## parse error for "=" of "&=" in an unquoted attribute value.
4697            ## Though this disagrees with the Web Applications 1.0
4698            ## spec, the resulting token sequences of both algorithms
4699            ## should be the same, as these characters cannot form a
4700            ## part of character references.
4701            0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', `
4702            0x003D => 1, # =
4703
4704            ## As a result of the addition above, the following clause
4705            ## has no effect in fact.
4706            $self->{entity_add} => 1,
4707          }->{$nc}) {
4708        if ($self->{is_xml}) {
4709
4710          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4711                          line => $self->{line_prev},
4712                          column => $self->{column_prev}
4713                              + ($nc == -1 ? 1 : 0));
4714        } else {
4715
4716          ## No error
4717        }
4718        ## Don't consume
4719        ## Return nothing.
4720        #
4721      } elsif ($nc == 0x0023) { # #
4722
4723        $self->{state} = ENTITY_HASH_STATE;
4724        $self->{kwd} = '#';
4725
4726    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4727      $self->{line_prev} = $self->{line};
4728      $self->{column_prev} = $self->{column};
4729      $self->{column}++;
4730      $self->{nc}
4731          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4732    } else {
4733      $self->{set_nc}->($self);
4734    }
4735
4736        redo A;
4737      } elsif ($self->{is_xml} or
4738               (0x0041 <= $nc and
4739                $nc <= 0x005A) or # A..Z
4740               (0x0061 <= $nc and
4741                $nc <= 0x007A)) { # a..z
4742
4743        #require HTML::HTML5::Parser::NamedEntityList;
4744        $self->{state} = ENTITY_NAME_STATE;
4745        $self->{kwd} = chr $nc;
4746        $self->{entity__value} = $self->{kwd};
4747        $self->{entity__match} = 0;
4748
4749    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4750      $self->{line_prev} = $self->{line};
4751      $self->{column_prev} = $self->{column};
4752      $self->{column}++;
4753      $self->{nc}
4754          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4755    } else {
4756      $self->{set_nc}->($self);
4757    }
4758
4759        redo A;
4760      } else {
4761
4762        ## Return nothing.
4763        #
4764      }
4765
4766      ## We implement the "consume a character reference" algorithm in
4767      ## a slightly different way from the spec's algorithm, though the
4768      ## end result should be exactly the same.
4769
4770      ## NOTE: No character is consumed by the "consume a character
4771      ## reference" algorithm.  In other words, there is an "&" character
4772      ## that does not introduce a character reference, which will be
4773      ## appended to the parent element or the attribute value by the
4774      ## later processing of the tokenizer.
4775
4776      if ($self->{prev_state} == DATA_STATE or
4777          $self->{prev_state} == RCDATA_STATE) {
4778
4779        $self->{state} = $self->{prev_state};
4780        ## Reconsume.
4781        return  ({type => CHARACTER_TOKEN, data => '&',
4782                  line => $self->{line_prev},
4783                  column => $self->{column_prev},
4784                 });
4785        redo A;
4786      } else {
4787
4788        $self->{ca}->{value} .= '&';
4789        $self->{state} = $self->{prev_state};
4790        ## Reconsume.
4791        redo A;
4792      }
4793    } elsif ($state == ENTITY_HASH_STATE) {
4794      if ($nc == 0x0078) { # x
4795
4796        $self->{state} = HEXREF_X_STATE;
4797        $self->{kwd} .= chr $nc;
4798
4799    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4800      $self->{line_prev} = $self->{line};
4801      $self->{column_prev} = $self->{column};
4802      $self->{column}++;
4803      $self->{nc}
4804          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4805    } else {
4806      $self->{set_nc}->($self);
4807    }
4808
4809        redo A;
4810      } elsif ($nc == 0x0058) { # X
4811
4812        if ($self->{is_xml}) {
4813          $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4814        }
4815        $self->{state} = HEXREF_X_STATE;
4816        $self->{kwd} .= chr $nc;
4817
4818    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4819      $self->{line_prev} = $self->{line};
4820      $self->{column_prev} = $self->{column};
4821      $self->{column}++;
4822      $self->{nc}
4823          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4824    } else {
4825      $self->{set_nc}->($self);
4826    }
4827
4828        redo A;
4829      } elsif (0x0030 <= $nc and
4830               $nc <= 0x0039) { # 0..9
4831
4832        $self->{state} = NCR_NUM_STATE;
4833        $self->{kwd} = $nc - 0x0030;
4834
4835    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4836      $self->{line_prev} = $self->{line};
4837      $self->{column_prev} = $self->{column};
4838      $self->{column}++;
4839      $self->{nc}
4840          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4841    } else {
4842      $self->{set_nc}->($self);
4843    }
4844
4845        redo A;
4846      } else {
4847        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
4848                        line => $self->{line_prev},
4849                        column => $self->{column_prev} - 1);
4850
4851        ## NOTE: According to the spec algorithm, nothing is returned,
4852        ## and then "&#" is appended to the parent element or the attribute
4853        ## value in the later processing.
4854
4855        if ($self->{prev_state} == DATA_STATE or
4856            $self->{prev_state} == RCDATA_STATE) {
4857
4858          $self->{state} = $self->{prev_state};
4859          ## Reconsume.
4860          return  ({type => CHARACTER_TOKEN,
4861                    data => '&#',
4862                    line => $self->{line_prev},
4863                    column => $self->{column_prev} - 1,
4864                   });
4865          redo A;
4866        } else {
4867
4868          $self->{ca}->{value} .= '&#';
4869          $self->{state} = $self->{prev_state};
4870          ## Reconsume.
4871          redo A;
4872        }
4873      }
4874    } elsif ($state == NCR_NUM_STATE) {
4875      if (0x0030 <= $nc and
4876          $nc <= 0x0039) { # 0..9
4877
4878        $self->{kwd} *= 10;
4879        $self->{kwd} += $nc - 0x0030;
4880
4881        ## Stay in the state.
4882
4883    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4884      $self->{line_prev} = $self->{line};
4885      $self->{column_prev} = $self->{column};
4886      $self->{column}++;
4887      $self->{nc}
4888          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4889    } else {
4890      $self->{set_nc}->($self);
4891    }
4892
4893        redo A;
4894      } elsif ($nc == 0x003B) { # ;
4895
4896
4897    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4898      $self->{line_prev} = $self->{line};
4899      $self->{column_prev} = $self->{column};
4900      $self->{column}++;
4901      $self->{nc}
4902          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4903    } else {
4904      $self->{set_nc}->($self);
4905    }
4906
4907        #
4908      } else {
4909
4910        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4911        ## Reconsume.
4912        #
4913      }
4914
4915      my $code = $self->{kwd};
4916      my $l = $self->{line_prev};
4917      my $c = $self->{column_prev};
4918      if ((not $self->{is_xml} and $charref_map->{$code}) or
4919          ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4920          ($self->{is_xml} and $code == 0x0000)) {
4921
4922        $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4923                        text => (sprintf 'U+%04X', $code),
4924                        line => $l, column => $c);
4925        $code = $charref_map->{$code};
4926      } elsif ($code > 0x10FFFF) {
4927
4928        $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4929                        text => (sprintf 'U-%08X', $code),
4930                        line => $l, column => $c);
4931        $code = 0xFFFD;
4932      }
4933
4934      if ($self->{prev_state} == DATA_STATE or
4935          $self->{prev_state} == RCDATA_STATE) {
4936
4937        $self->{state} = $self->{prev_state};
4938        ## Reconsume.
4939        return  ({type => CHARACTER_TOKEN, data => chr $code,
4940                  has_reference => 1,
4941                  line => $l, column => $c,
4942                 });
4943        redo A;
4944      } else {
4945
4946        $self->{ca}->{value} .= chr $code;
4947        $self->{ca}->{has_reference} = 1;
4948        $self->{state} = $self->{prev_state};
4949        ## Reconsume.
4950        redo A;
4951      }
4952    } elsif ($state == HEXREF_X_STATE) {
4953      if ((0x0030 <= $nc and $nc <= 0x0039) or
4954          (0x0041 <= $nc and $nc <= 0x0046) or
4955          (0x0061 <= $nc and $nc <= 0x0066)) {
4956        # 0..9, A..F, a..f
4957
4958        $self->{state} = HEXREF_HEX_STATE;
4959        $self->{kwd} = 0;
4960        ## Reconsume.
4961        redo A;
4962      } else {
4963        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4964                        line => $self->{line_prev},
4965                        column => $self->{column_prev} - 2);
4966
4967        ## NOTE: According to the spec algorithm, nothing is returned,
4968        ## and then "&#" followed by "X" or "x" is appended to the parent
4969        ## element or the attribute value in the later processing.
4970
4971        if ($self->{prev_state} == DATA_STATE or
4972            $self->{prev_state} == RCDATA_STATE) {
4973
4974          $self->{state} = $self->{prev_state};
4975          ## Reconsume.
4976          return  ({type => CHARACTER_TOKEN,
4977                    data => '&' . $self->{kwd},
4978                    line => $self->{line_prev},
4979                    column => $self->{column_prev} - length $self->{kwd},
4980                   });
4981          redo A;
4982        } else {
4983
4984          $self->{ca}->{value} .= '&' . $self->{kwd};
4985          $self->{state} = $self->{prev_state};
4986          ## Reconsume.
4987          redo A;
4988        }
4989      }
4990    } elsif ($state == HEXREF_HEX_STATE) {
4991      if (0x0030 <= $nc and $nc <= 0x0039) {
4992        # 0..9
4993
4994        $self->{kwd} *= 0x10;
4995        $self->{kwd} += $nc - 0x0030;
4996        ## Stay in the state.
4997
4998    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4999      $self->{line_prev} = $self->{line};
5000      $self->{column_prev} = $self->{column};
5001      $self->{column}++;
5002      $self->{nc}
5003          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5004    } else {
5005      $self->{set_nc}->($self);
5006    }
5007
5008        redo A;
5009      } elsif (0x0061 <= $nc and
5010               $nc <= 0x0066) { # a..f
5011
5012        $self->{kwd} *= 0x10;
5013        $self->{kwd} += $nc - 0x0060 + 9;
5014        ## Stay in the state.
5015
5016    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5017      $self->{line_prev} = $self->{line};
5018      $self->{column_prev} = $self->{column};
5019      $self->{column}++;
5020      $self->{nc}
5021          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5022    } else {
5023      $self->{set_nc}->($self);
5024    }
5025
5026        redo A;
5027      } elsif (0x0041 <= $nc and
5028               $nc <= 0x0046) { # A..F
5029
5030        $self->{kwd} *= 0x10;
5031        $self->{kwd} += $nc - 0x0040 + 9;
5032        ## Stay in the state.
5033
5034    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5035      $self->{line_prev} = $self->{line};
5036      $self->{column_prev} = $self->{column};
5037      $self->{column}++;
5038      $self->{nc}
5039          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5040    } else {
5041      $self->{set_nc}->($self);
5042    }
5043
5044        redo A;
5045      } elsif ($nc == 0x003B) { # ;
5046
5047
5048    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5049      $self->{line_prev} = $self->{line};
5050      $self->{column_prev} = $self->{column};
5051      $self->{column}++;
5052      $self->{nc}
5053          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5054    } else {
5055      $self->{set_nc}->($self);
5056    }
5057
5058        #
5059      } else {
5060
5061        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
5062                        line => $self->{line},
5063                        column => $self->{column});
5064        ## Reconsume.
5065        #
5066      }
5067
5068      my $code = $self->{kwd};
5069      my $l = $self->{line_prev};
5070      my $c = $self->{column_prev};
5071      if ((not $self->{is_xml} and $charref_map->{$code}) or
5072          ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5073          ($self->{is_xml} and $code == 0x0000)) {
5074
5075        $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5076                        text => (sprintf 'U+%04X', $code),
5077                        line => $l, column => $c);
5078        $code = $charref_map->{$code};
5079      } elsif ($code > 0x10FFFF) {
5080
5081        $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5082                        text => (sprintf 'U-%08X', $code),
5083                        line => $l, column => $c);
5084        $code = 0xFFFD;
5085      }
5086
5087      if ($self->{prev_state} == DATA_STATE or
5088          $self->{prev_state} == RCDATA_STATE) {
5089
5090        $self->{state} = $self->{prev_state};
5091        ## Reconsume.
5092        return  ({type => CHARACTER_TOKEN, data => chr $code,
5093                  has_reference => 1,
5094                  line => $l, column => $c,
5095                 });
5096        redo A;
5097      } else {
5098
5099        $self->{ca}->{value} .= chr $code;
5100        $self->{ca}->{has_reference} = 1;
5101        $self->{state} = $self->{prev_state};
5102        ## Reconsume.
5103        redo A;
5104      }
5105    } elsif ($state == ENTITY_NAME_STATE) {
5106      if ((0x0041 <= $nc and # A
5107           $nc <= 0x005A) or # Z
5108          (0x0061 <= $nc and # a
5109           $nc <= 0x007A) or # z
5110          (0x0030 <= $nc and # 0
5111           $nc <= 0x0039) or # 9
5112          $nc == 0x003B or # ;
5113          ($self->{is_xml} and
5114           not ($is_space->{$nc} or
5115                {
5116                  0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5117
5118                  ## See comment in the |ENTITY_STATE|'s |if|
5119                  ## statement for the rationale of addition of these
5120                  ## characters.
5121                  0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', `
5122                  0x003D => 1, # =
5123
5124                  ## This is redundant for the same reason.
5125                  $self->{entity_add} => 1,
5126                }->{$nc}))) {
5127        #local %entity2char;
5128        $self->{kwd} .= chr $nc; ## Bare entity name.
5129        if (defined $entity2char{$self->{kwd}} or ## HTML charrefs.
5130            $self->{ge}->{$self->{kwd}}) { ## XML general entities.
5131          if ($nc == 0x003B) { # ;
5132            if (defined $self->{ge}->{$self->{kwd}}) {
5133              ## A declared XML entity.
5134              if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5135
5136                $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5137              } else {
5138                if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5139
5140                  $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5141                                  value => $self->{kwd});
5142                } else {
5143
5144                }
5145                $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5146              }
5147            } else {
5148              ## An HTML character reference.
5149              if ($self->{is_xml}) {
5150                ## Not a declared XML entity.
5151
5152                $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5153                                value => $self->{kwd},
5154                                level => {
5155                                          'amp;' => $self->{level}->{warn},
5156                                          'quot;' => $self->{level}->{warn},
5157                                          'lt;' => $self->{level}->{warn},
5158                                          'gt;' => $self->{level}->{warn},
5159                                          'apos;' => $self->{level}->{warn},
5160                                         }->{$self->{kwd}} ||
5161                                         $self->{level}->{must},
5162                                line => $self->{line_prev},
5163                                column => $self->{column} - length $self->{kwd});
5164              } else {
5165
5166              }
5167              $self->{entity__value} = $entity2char{$self->{kwd}};
5168            }
5169            $self->{entity__match} = 1; ## Matched exactly with ";" entity.
5170
5171    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5172      $self->{line_prev} = $self->{line};
5173      $self->{column_prev} = $self->{column};
5174      $self->{column}++;
5175      $self->{nc}
5176          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5177    } else {
5178      $self->{set_nc}->($self);
5179    }
5180
5181            #
5182          } else {
5183
5184            $self->{entity__value} = $entity2char{$self->{kwd}};
5185            $self->{entity__match} = -1; ## Exactly matched to non-";" entity.
5186            ## Stay in the state.
5187
5188    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5189      $self->{line_prev} = $self->{line};
5190      $self->{column_prev} = $self->{column};
5191      $self->{column}++;
5192      $self->{nc}
5193          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5194    } else {
5195      $self->{set_nc}->($self);
5196    }
5197
5198            redo A;
5199          }
5200        } else {
5201          if ($nc == 0x003B) { # ;
5202            ## A reserved HTML character reference or an undeclared
5203            ## XML entity reference.
5204
5205            $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## XXXtype
5206                            value => $self->{kwd},
5207                            level => $self->{level}->{must},
5208                            line => $self->{line_prev},
5209                            column => $self->{column} - length $self->{kwd});
5210            $self->{entity__value} .= chr $nc;
5211            $self->{entity__match} *= 2; ## Matched (positive) or not (zero)
5212
5213    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5214      $self->{line_prev} = $self->{line};
5215      $self->{column_prev} = $self->{column};
5216      $self->{column}++;
5217      $self->{nc}
5218          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5219    } else {
5220      $self->{set_nc}->($self);
5221    }
5222
5223            #
5224          } else {
5225
5226            $self->{entity__value} .= chr $nc;
5227            $self->{entity__match} *= 2; ## Matched (positive) or not (zero)
5228            ## Stay in the state.
5229
5230    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5231      $self->{line_prev} = $self->{line};
5232      $self->{column_prev} = $self->{column};
5233      $self->{column}++;
5234      $self->{nc}
5235          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5236    } else {
5237      $self->{set_nc}->($self);
5238    }
5239
5240            redo A;
5241          }
5242        }
5243      } elsif ($nc == 0x003D) { # =
5244        if ($self->{entity__match} < 0 and
5245            $self->{prev_state} != DATA_STATE and # in attribute
5246            $self->{prev_state} != RCDATA_STATE) {
5247          $self->{entity__match} = 0;
5248        }
5249      }
5250
5251      my $data;
5252      my $has_ref;
5253      if ($self->{entity__match} > 0) { ## A ";" entity.
5254
5255        $data = $self->{entity__value};
5256        ## Strictly speaking the $has_ref flag should not be set if
5257        ## there is no matched entity.  However, this flag is used
5258        ## only in contexts where use of an
5259        ## unexpanded-entity-reference-like string is in no way
5260        ## allowed, so it should not make any difference in theory.
5261        $has_ref = 1;
5262        #
5263      } elsif ($self->{entity__match} < 0) { ## Matched to non-";" entity.
5264        if ($self->{prev_state} != DATA_STATE and # in attribute
5265            $self->{prev_state} != RCDATA_STATE and
5266            $self->{entity__match} < -1) {
5267          ## In attribute-value contexts, a matched non-";" string is
5268          ## left as is if there are trailing alphabetical letters.
5269
5270          $data = '&' . $self->{kwd};
5271          #
5272        } else {
5273          ## In attribute-value contexts, exactly matched non-";"
5274          ## string is replaced as a character reference.  In any
5275          ## context, matched non-";" string with or without trailing
5276          ## alphabetical letters is replaced as a character reference
5277          ## (with trailing letters).  Note that use of a no-";"
5278          ## character reference is always non-conforming.
5279
5280          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
5281          $data = $self->{entity__value};
5282          $has_ref = 1;
5283          #
5284        }
5285      } else { ## Unmatched string.
5286        if ($self->{is_xml} and not $self->{kwd} =~ /;$/) {
5287
5288          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5289                          line => $self->{line_prev},
5290                          column => $self->{column_prev} - length $self->{kwd});
5291        } else {
5292
5293        }
5294        $data = '&' . $self->{kwd};
5295        #
5296      }
5297
5298      ## NOTE: In these cases, when a character reference is found,
5299      ## it is consumed and a character token is returned, or, otherwise,
5300      ## nothing is consumed and returned, according to the spec algorithm.
5301      ## In this implementation, anything that has been examined by the
5302      ## tokenizer is appended to the parent element or the attribute value
5303      ## as string, either literal string when no character reference or
5304      ## entity-replaced string otherwise, in this stage, since any characters
5305      ## that would not be consumed are appended in the data state or in an
5306      ## appropriate attribute value state anyway.
5307
5308      if ($self->{prev_state} == DATA_STATE or
5309          $self->{prev_state} == RCDATA_STATE) {
5310
5311        $self->{state} = $self->{prev_state};
5312        ## Reconsume.
5313        return  ({type => CHARACTER_TOKEN,
5314                  data => $data,
5315                  has_reference => $has_ref,
5316                  line => $self->{line_prev},
5317                  column => $self->{column_prev} + 1 - length $self->{kwd},
5318                 });
5319        redo A;
5320      } else {
5321
5322        $self->{ca}->{value} .= $data;
5323        $self->{ca}->{has_reference} = 1 if $has_ref;
5324        $self->{state} = $self->{prev_state};
5325        ## Reconsume.
5326        redo A;
5327      }
5328
5329    ## ========== XML-only states ==========
5330
    } elsif ($state == PI_STATE) {
      ## XML5: "Pi state" and "DOCTYPE pi state".
      ##
      ## Inspects the first character of a would-be processing
      ## instruction (the DOCTYPE tag state switches here on "?") and
      ## either starts a PI token or falls back to a bogus comment.

      if ($is_space->{$nc} or
          $nc == 0x003F or # ?
          $nc == -1) {
        ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
        ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
        ## "DOCTYPE pi state": Parse error, switch to the "data
        ## state".
        ##
        ## White space, "?" or EOF cannot start a target: report the
        ## error and continue as a bogus comment whose data begins with
        ## "?".  The reported column is moved back by one unless the
        ## current character is EOF.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
                        line => $self->{line_prev},
                        column => $self->{column_prev}
                            - 1 * ($nc != -1));
        $self->{state} = BOGUS_COMMENT_STATE;
        ## Reconsume.
        $self->{ct} = {type => COMMENT_TOKEN,
                       data => '?',
                       line => $self->{line_prev},
                       column => $self->{column_prev}
                           - 1 * ($nc != -1),
                      };
        redo A;
      } else {
        ## XML5: "DOCTYPE pi state": Stay in the state.
        ##
        ## Any other character becomes the first character of the
        ## target.  U+0000 is reported and stored as U+FFFD.
        if ($nc == 0x0000) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
        }
        $self->{ct} = {type => PI_TOKEN,
                       target => $nc == 0x0000 ? "\x{FFFD}" : chr $nc,
                       data => '',
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 1,
                      };
        $self->{state} = PI_TARGET_STATE;

    ## Consume the next input character: take it from the character
    ## buffer when one is left there, otherwise ask the set_nc callback.
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      }
    } elsif ($state == PI_TARGET_STATE) {
      ## Accumulates the PI target in $self->{ct}->{target} until white
      ## space, "?" or EOF is seen.
      if ($is_space->{$nc}) {
        ## End of the target.  The white space is kept in the
        ## "temporary buffer" ($self->{kwd}), not in the token.
        $self->{state} = PI_TARGET_AFTER_STATE;
        $self->{kwd} = chr $nc; # "temporary buffer"

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ($nc == EOF_CHAR) {
        ## Unterminated PI: report it and emit what was read as a
        ## comment token ("?" followed by the target) instead of a PI.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
        if ($self->{in_subset}) {
          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
        } else {
          $self->{state} = DATA_STATE;
        }
        ## Reconsume.
        return  ({type => COMMENT_TOKEN,
                  data => '?' . $self->{ct}->{target},
                  line => $self->{ct}->{line},
                  column => $self->{ct}->{column}});
        ## Not reached: the return above already left this code.
        redo A;
      } elsif ($nc == 0x003F) { # ?
        ## "?" directly after the target: there is no white space, so
        ## the temporary buffer starts out empty.
        $self->{state} = PI_AFTER_STATE;
        $self->{kwd} = ''; # "temporary buffer"

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } else {
        ## XML5: typo ("tag name" -> "target")
        ##
        ## Ordinary target character.  U+0000 is reported and stored as
        ## U+FFFD.  Stay in the state.
        if ($nc == 0x0000) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
        }
        $self->{ct}->{target} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      }
5442    } elsif ($state == PI_TARGET_AFTER_STATE) {
5443      if ($is_space->{$nc}) {
5444        $self->{kwd} .= chr $nc; # "temporary buffer"
5445        ## Stay in the state.
5446
5447    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5448      $self->{line_prev} = $self->{line};
5449      $self->{column_prev} = $self->{column};
5450      $self->{column}++;
5451      $self->{nc}
5452          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5453    } else {
5454      $self->{set_nc}->($self);
5455    }
5456
5457        redo A;
5458      } else {
5459        $self->{state} = PI_DATA_STATE;
5460        ## Reprocess.
5461        redo A;
5462      }
    } elsif ($state == PI_DATA_STATE) {
      ## Accumulates the PI data in $self->{ct}->{data} until a "?" or
      ## EOF is seen.
      if ($nc == 0x003F) { # ?
        ## Possibly the start of "?>".  The "?" is not stored here; the
        ## PI data after state decides what to do with it.
        $self->{state} = PI_DATA_AFTER_STATE;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ($nc == EOF_CHAR) {
        ## Unterminated PI: report it and emit everything read so far
        ## ("?", target, buffered white space, data) as a comment token.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
        if ($self->{in_subset}) {
          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
        } else {
          $self->{state} = DATA_STATE;
        }
        ## Reprocess.
        return  ({type => COMMENT_TOKEN,
                  data => '?' . $self->{ct}->{target} .
                      $self->{kwd} . # "temporary buffer"
                      $self->{ct}->{data},
                  line => $self->{ct}->{line},
                  column => $self->{ct}->{column}});
        ## Not reached: the return above already left this code.
        redo A;
      } else {
        ## Ordinary data character.  U+0000 is reported and stored as
        ## U+FFFD.
        if ($nc == 0x0000) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
        }
        $self->{ct}->{data} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi
        ## NOTE(review): read_until is presumably a bulk read that
        ## appends following input to the data, starting at the given
        ## offset, until U+0000 or "?" is met -- confirm against the
        ## read_until implementation.
        $self->{read_until}->($self->{ct}->{data}, qq[\x00?],
                              length $self->{ct}->{data});
        ## Stay in the state.

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        ## Continue in this state with the character just read.
        redo A;
      }
    } elsif ($state == PI_AFTER_STATE) {
      ## XML5: Part of "Pi after state".
      ##
      ## Reached from the PI target state when "?" follows the target
      ## directly, i.e. without any white space in between.

      if ($nc == 0x003E) { # >
        ## "?>": the PI is complete and has no data.  Go back to the
        ## internal subset if we are inside one, else to the data state.
        if ($self->{in_subset}) {
          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
        } else {
          $self->{state} = DATA_STATE;
        }

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        return  ($self->{ct}); # pi
        ## Not reached: the return above already left this code.
        redo A;
      } elsif ($nc == 0x003F) { # ?
        ## "??": the first "?" was data that follows the target without
        ## a separating space.  Store it; the second "?" stays pending.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
                        line => $self->{line_prev},
                        column => $self->{column_prev}); ## XML5: no error
        $self->{ct}->{data} .= '?';
        $self->{state} = PI_DATA_AFTER_STATE;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } else {
        ## Anything else (including EOF): the "?" was data without a
        ## separating space.  The reported column is moved forward by
        ## one when the current character is EOF.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
                        line => $self->{line_prev},
                        column => $self->{column_prev}
                            + 1 * ($nc == -1)); ## XML5: no error
        $self->{ct}->{data} .= '?'; ## XML5: not appended
        $self->{state} = PI_DATA_STATE;
        ## Reprocess.
        redo A;
      }
5565    } elsif ($state == PI_DATA_AFTER_STATE) {
5566      ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5567
5568      if ($nc == 0x003E) { # >
5569        if ($self->{in_subset}) {
5570          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5571        } else {
5572          $self->{state} = DATA_STATE;
5573        }
5574
5575    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5576      $self->{line_prev} = $self->{line};
5577      $self->{column_prev} = $self->{column};
5578      $self->{column}++;
5579      $self->{nc}
5580          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5581    } else {
5582      $self->{set_nc}->($self);
5583    }
5584
5585        return  ($self->{ct}); # pi
5586        redo A;
5587      } elsif ($nc == 0x003F) { # ?
5588        $self->{ct}->{data} .= '?';
5589        ## Stay in the state.
5590
5591    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5592      $self->{line_prev} = $self->{line};
5593      $self->{column_prev} = $self->{column};
5594      $self->{column}++;
5595      $self->{nc}
5596          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5597    } else {
5598      $self->{set_nc}->($self);
5599    }
5600
5601        redo A;
5602      } else {
5603        $self->{ct}->{data} .= '?'; ## XML5: not appended
5604        $self->{state} = PI_DATA_STATE;
5605        ## Reprocess.
5606        redo A;
5607      }
5608
5609    } elsif ($state == DOCTYPE_INTERNAL_SUBSET_STATE) {
5610      if ($nc == 0x003C) { # <
5611        $self->{state} = DOCTYPE_TAG_STATE;
5612
5613    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5614      $self->{line_prev} = $self->{line};
5615      $self->{column_prev} = $self->{column};
5616      $self->{column}++;
5617      $self->{nc}
5618          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5619    } else {
5620      $self->{set_nc}->($self);
5621    }
5622
5623        redo A;
5624      } elsif ($nc == 0x0025) { # %
5625        ## XML5: Not defined yet.
5626
5627        ## TODO: parameter entity expansion
5628
5629        if (not $self->{stop_processing} and
5630            not $self->{document}->xml_standalone) {
5631          $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5632                          level => $self->{level}->{info});
5633          $self->{stop_processing} = 1;
5634        }
5635
5636
5637    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5638      $self->{line_prev} = $self->{line};
5639      $self->{column_prev} = $self->{column};
5640      $self->{column}++;
5641      $self->{nc}
5642          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5643    } else {
5644      $self->{set_nc}->($self);
5645    }
5646
5647        redo A;
5648      } elsif ($nc == 0x005D) { # ]
5649        delete $self->{in_subset};
5650        $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5651
5652    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5653      $self->{line_prev} = $self->{line};
5654      $self->{column_prev} = $self->{column};
5655      $self->{column}++;
5656      $self->{nc}
5657          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5658    } else {
5659      $self->{set_nc}->($self);
5660    }
5661
5662        redo A;
5663      } elsif ($is_space->{$nc}) {
5664        ## Stay in the state.
5665
5666    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5667      $self->{line_prev} = $self->{line};
5668      $self->{column_prev} = $self->{column};
5669      $self->{column}++;
5670      $self->{nc}
5671          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5672    } else {
5673      $self->{set_nc}->($self);
5674    }
5675
5676        redo A;
5677      } elsif ($nc == EOF_CHAR) {
5678        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5679        delete $self->{in_subset};
5680        $self->{state} = DATA_STATE;
5681        ## Reconsume.
5682        return  ({type => END_OF_DOCTYPE_TOKEN});
5683        redo A;
5684      } else {
5685        unless ($self->{internal_subset_tainted}) {
5686          ## XML5: No parse error.
5687          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5688          $self->{internal_subset_tainted} = 1;
5689        }
5690        ## Stay in the state.
5691
5692    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5693      $self->{line_prev} = $self->{line};
5694      $self->{column_prev} = $self->{column};
5695      $self->{column}++;
5696      $self->{nc}
5697          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5698    } else {
5699      $self->{set_nc}->($self);
5700    }
5701
5702        redo A;
5703      }
5704    } elsif ($state == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5705      if ($nc == 0x003E) { # >
5706        $self->{state} = DATA_STATE;
5707
5708    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5709      $self->{line_prev} = $self->{line};
5710      $self->{column_prev} = $self->{column};
5711      $self->{column}++;
5712      $self->{nc}
5713          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5714    } else {
5715      $self->{set_nc}->($self);
5716    }
5717
5718        return  ({type => END_OF_DOCTYPE_TOKEN});
5719        redo A;
5720      } elsif ($nc == EOF_CHAR) {
5721        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5722        $self->{state} = DATA_STATE;
5723        ## Reconsume.
5724        return  ({type => END_OF_DOCTYPE_TOKEN});
5725        redo A;
5726      } else {
5727        ## XML5: No parse error and stay in the state.
5728        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5729
5730        $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5731
5732    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5733      $self->{line_prev} = $self->{line};
5734      $self->{column_prev} = $self->{column};
5735      $self->{column}++;
5736      $self->{nc}
5737          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5738    } else {
5739      $self->{set_nc}->($self);
5740    }
5741
5742        redo A;
5743      }
5744    } elsif ($state == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5745      if ($nc == 0x003E) { # >
5746        $self->{state} = DATA_STATE;
5747
5748    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5749      $self->{line_prev} = $self->{line};
5750      $self->{column_prev} = $self->{column};
5751      $self->{column}++;
5752      $self->{nc}
5753          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5754    } else {
5755      $self->{set_nc}->($self);
5756    }
5757
5758        return  ({type => END_OF_DOCTYPE_TOKEN});
5759        redo A;
5760      } elsif ($nc == EOF_CHAR) {
5761        $self->{state} = DATA_STATE;
5762        ## Reconsume.
5763        return  ({type => END_OF_DOCTYPE_TOKEN});
5764        redo A;
5765      } else {
5766        ## Stay in the state.
5767
5768    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5769      $self->{line_prev} = $self->{line};
5770      $self->{column_prev} = $self->{column};
5771      $self->{column}++;
5772      $self->{nc}
5773          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5774    } else {
5775      $self->{set_nc}->($self);
5776    }
5777
5778        redo A;
5779      }
5780    } elsif ($state == DOCTYPE_TAG_STATE) {
5781      if ($nc == 0x0021) { # !
5782        $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5783
5784    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5785      $self->{line_prev} = $self->{line};
5786      $self->{column_prev} = $self->{column};
5787      $self->{column}++;
5788      $self->{nc}
5789          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5790    } else {
5791      $self->{set_nc}->($self);
5792    }
5793
5794        redo A;
5795      } elsif ($nc == 0x003F) { # ?
5796        $self->{state} = PI_STATE;
5797
5798    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5799      $self->{line_prev} = $self->{line};
5800      $self->{column_prev} = $self->{column};
5801      $self->{column}++;
5802      $self->{nc}
5803          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5804    } else {
5805      $self->{set_nc}->($self);
5806    }
5807
5808        redo A;
5809      } elsif ($nc == EOF_CHAR) {
5810        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5811        $self->{state} = DATA_STATE;
5812        ## Reconsume.
5813        redo A;
5814      } else {
5815        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5816                        line => $self->{line_prev},
5817                        column => $self->{column_prev});
5818        $self->{state} = BOGUS_COMMENT_STATE;
5819        $self->{ct} = {type => COMMENT_TOKEN,
5820                       data => '',
5821                      }; ## NOTE: Will be discarded.
5822
5823    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5824      $self->{line_prev} = $self->{line};
5825      $self->{column_prev} = $self->{column};
5826      $self->{column}++;
5827      $self->{nc}
5828          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5829    } else {
5830      $self->{set_nc}->($self);
5831    }
5832
5833        redo A;
5834      }
    } elsif ($state == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
      ## XML5: "DOCTYPE markup declaration state".
      ##
      ## "<!" has been read inside the internal subset.  The next
      ## character selects the keyword-matching state; for a letter it
      ## also becomes the first character of $self->{kwd}.  Every
      ## branch that matches leaves through "redo A"; only the final
      ## empty branch falls through to the bogus comment handling at
      ## the bottom.

      if ($nc == 0x002D) { # -
        $self->{state} = MD_HYPHEN_STATE;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ($nc == 0x0045 or # E
               $nc == 0x0065) { # e
        ## ENTITY or ELEMENT; the MD E state tells them apart.
        $self->{state} = MD_E_STATE;
        $self->{kwd} = chr $nc;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ($nc == 0x0041 or # A
               $nc == 0x0061) { # a
        $self->{state} = MD_ATTLIST_STATE;
        $self->{kwd} = chr $nc;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ($nc == 0x004E or # N
               $nc == 0x006E) { # n
        $self->{state} = MD_NOTATION_STATE;
        $self->{kwd} = chr $nc;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } else {
        ## No keyword can start here; fall through to the error below.
        #
      }

      ## XML5: No parse error.
      ## NOTE(review): unlike the MD_* keyword states, this column is
      ## not adjusted when the current character is EOF -- confirm
      ## whether that is intended.
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                      line => $self->{line_prev},
                      column => $self->{column_prev} - 1);
      ## Reconsume.
      $self->{state} = BOGUS_COMMENT_STATE;
      $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
      redo A;
5912    } elsif ($state == MD_E_STATE) {
5913      if ($nc == 0x004E or # N
5914          $nc == 0x006E) { # n
5915        $self->{state} = MD_ENTITY_STATE;
5916        $self->{kwd} .= chr $nc;
5917
5918    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5919      $self->{line_prev} = $self->{line};
5920      $self->{column_prev} = $self->{column};
5921      $self->{column}++;
5922      $self->{nc}
5923          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5924    } else {
5925      $self->{set_nc}->($self);
5926    }
5927
5928        redo A;
5929      } elsif ($nc == 0x004C or # L
5930               $nc == 0x006C) { # l
5931        ## XML5: <!ELEMENT> not supported.
5932        $self->{state} = MD_ELEMENT_STATE;
5933        $self->{kwd} .= chr $nc;
5934
5935    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5936      $self->{line_prev} = $self->{line};
5937      $self->{column_prev} = $self->{column};
5938      $self->{column}++;
5939      $self->{nc}
5940          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5941    } else {
5942      $self->{set_nc}->($self);
5943    }
5944
5945        redo A;
5946      } else {
5947        ## XML5: No parse error.
5948        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5949                        line => $self->{line_prev},
5950                        column => $self->{column_prev} - 2
5951                            + 1 * ($nc == EOF_CHAR));
5952        ## Reconsume.
5953        $self->{state} = BOGUS_COMMENT_STATE;
5954        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5955        redo A;
5956      }
    } elsif ($state == MD_ENTITY_STATE) {
      ## Matches the rest of the keyword "ENTITY" case-insensitively.
      ## $self->{kwd} holds the letters read so far, exactly as typed
      ## (the MD E state switches here after appending "N" or "n"), so
      ## its length selects the code point expected next: the first
      ## table holds the uppercase letters, the second the lowercase
      ## ones.
      ## NOTE(review): NEVER_CHAR presumably never equals an input
      ## character, which leaves the final letter to the next branch --
      ## confirm against its definition.
      if ($nc == [
            undef,
            undef,
            0x0054, # T
            0x0049, # I
            0x0054, # T
            NEVER_CHAR, # (Y)
          ]->[length $self->{kwd}] or
          $nc == [
            undef,
            undef,
            0x0074, # t
            0x0069, # i
            0x0074, # t
            NEVER_CHAR, # (y)
          ]->[length $self->{kwd}]) {
        ## Stay in the state.
        $self->{kwd} .= chr $nc;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ((length $self->{kwd}) == 5 and
               ($nc == 0x0059 or # Y
                $nc == 0x0079)) { # y
        ## Final letter: the keyword is complete.  Anything other than
        ## the exact uppercase spelling is reported, but still accepted.
        if ($self->{kwd} ne 'ENTIT' or $nc == 0x0079) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                          text => 'ENTITY',
                          line => $self->{line_prev},
                          column => $self->{column_prev} - 4);
        }
        ## Start the declaration token; its name is filled in later.
        $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 6};
        $self->{state} = DOCTYPE_MD_STATE;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } else {
        ## Not the keyword: hand over to the bogus comment state.  The
        ## reported column is moved back by the number of letters
        ## already read, and forward by one when the current character
        ## is EOF.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                        line => $self->{line_prev},
                        column => $self->{column_prev} - 1
                            - (length $self->{kwd})
                            + 1 * ($nc == EOF_CHAR));
        $self->{state} = BOGUS_COMMENT_STATE;
        ## Reconsume.
        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
        redo A;
      }
    } elsif ($state == MD_ELEMENT_STATE) {
      ## Matches the rest of the keyword "ELEMENT" case-insensitively.
      ## $self->{kwd} holds the letters read so far, exactly as typed
      ## (the MD E state switches here after appending "L" or "l"), so
      ## its length selects the code point expected next: the first
      ## table holds the uppercase letters, the second the lowercase
      ## ones.
      ## NOTE(review): NEVER_CHAR presumably never equals an input
      ## character, which leaves the final letter to the next branch --
      ## confirm against its definition.
      if ($nc == [
           undef,
           undef,
           0x0045, # E
           0x004D, # M
           0x0045, # E
           0x004E, # N
           NEVER_CHAR, # (T)
          ]->[length $self->{kwd}] or
          $nc == [
           undef,
           undef,
           0x0065, # e
           0x006D, # m
           0x0065, # e
           0x006E, # n
           NEVER_CHAR, # (t)
          ]->[length $self->{kwd}]) {
        ## Stay in the state.
        $self->{kwd} .= chr $nc;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ((length $self->{kwd}) == 6 and
               ($nc == 0x0054 or # T
                $nc == 0x0074)) { # t
        ## Final letter: the keyword is complete.  Anything other than
        ## the exact uppercase spelling is reported, but still accepted.
        if ($self->{kwd} ne 'ELEMEN' or $nc == 0x0074) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                          text => 'ELEMENT',
                          line => $self->{line_prev},
                          column => $self->{column_prev} - 5);
        }
        ## Start the declaration token; its name is filled in later.
        $self->{ct} = {type => ELEMENT_TOKEN, name => '',
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 7};
        $self->{state} = DOCTYPE_MD_STATE;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } else {
        ## Not the keyword: hand over to the bogus comment state.  The
        ## reported column is moved back by the number of letters
        ## already read, and forward by one when the current character
        ## is EOF.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                        line => $self->{line_prev},
                        column => $self->{column_prev} - 1
                            - (length $self->{kwd})
                            + 1 * ($nc == EOF_CHAR));
        $self->{state} = BOGUS_COMMENT_STATE;
        ## Reconsume.
        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
        redo A;
      }
    } elsif ($state == MD_ATTLIST_STATE) {
      ## Matches the rest of the keyword "ATTLIST" case-insensitively.
      ## $self->{kwd} holds the letters read so far, exactly as typed
      ## (it starts out as the "A" or "a" stored by the markup
      ## declaration open state), so its length selects the code point
      ## expected next: the first table holds the uppercase letters,
      ## the second the lowercase ones.
      ## NOTE(review): NEVER_CHAR presumably never equals an input
      ## character, which leaves the final letter to the next branch --
      ## confirm against its definition.
      if ($nc == [
           undef,
           0x0054, # T
           0x0054, # T
           0x004C, # L
           0x0049, # I
           0x0053, # S
           NEVER_CHAR, # (T)
          ]->[length $self->{kwd}] or
          $nc == [
           undef,
           0x0074, # t
           0x0074, # t
           0x006C, # l
           0x0069, # i
           0x0073, # s
           NEVER_CHAR, # (t)
          ]->[length $self->{kwd}]) {
        ## Stay in the state.
        $self->{kwd} .= chr $nc;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ((length $self->{kwd}) == 6 and
               ($nc == 0x0054 or # T
                $nc == 0x0074)) { # t
        ## Final letter: the keyword is complete.  Anything other than
        ## the exact uppercase spelling is reported, but still accepted.
        if ($self->{kwd} ne 'ATTLIS' or $nc == 0x0074) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                          text => 'ATTLIST',
                          line => $self->{line_prev},
                          column => $self->{column_prev} - 5);
        }
        ## Start the declaration token with an empty name and an empty
        ## list of attribute definitions, both filled in later.
        $self->{ct} = {type => ATTLIST_TOKEN, name => '',
                       attrdefs => [],
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 7};
        $self->{state} = DOCTYPE_MD_STATE;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } else {
        ## Not the keyword: hand over to the bogus comment state.  The
        ## reported column is moved back by the number of letters
        ## already read, and forward by one when the current character
        ## is EOF.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                        line => $self->{line_prev},
                        column => $self->{column_prev} - 1
                             - (length $self->{kwd})
                             + 1 * ($nc == EOF_CHAR));
        $self->{state} = BOGUS_COMMENT_STATE;
        ## Reconsume.
        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
        redo A;
      }
    } elsif ($state == MD_NOTATION_STATE) {
      ## Matches the rest of the keyword "NOTATION" case-insensitively.
      ## $self->{kwd} holds the letters read so far, exactly as typed
      ## (it starts out as the "N" or "n" stored by the markup
      ## declaration open state), so its length selects the code point
      ## expected next: the first table holds the uppercase letters,
      ## the second the lowercase ones.
      ## NOTE(review): NEVER_CHAR presumably never equals an input
      ## character, which leaves the final letter to the next branch --
      ## confirm against its definition.
      if ($nc == [
           undef,
           0x004F, # O
           0x0054, # T
           0x0041, # A
           0x0054, # T
           0x0049, # I
           0x004F, # O
           NEVER_CHAR, # (N)
          ]->[length $self->{kwd}] or
          $nc == [
           undef,
           0x006F, # o
           0x0074, # t
           0x0061, # a
           0x0074, # t
           0x0069, # i
           0x006F, # o
           NEVER_CHAR, # (n)
          ]->[length $self->{kwd}]) {
        ## Stay in the state.
        $self->{kwd} .= chr $nc;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } elsif ((length $self->{kwd}) == 7 and
               ($nc == 0x004E or # N
                $nc == 0x006E)) { # n
        ## Final letter: the keyword is complete.  Anything other than
        ## the exact uppercase spelling is reported, but still accepted.
        if ($self->{kwd} ne 'NOTATIO' or $nc == 0x006E) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
                          text => 'NOTATION',
                          line => $self->{line_prev},
                          column => $self->{column_prev} - 6);
        }
        ## Start the declaration token; its name is filled in later.
        $self->{ct} = {type => NOTATION_TOKEN, name => '',
                       line => $self->{line_prev},
                       column => $self->{column_prev} - 8};
        $self->{state} = DOCTYPE_MD_STATE;

    ## Consume the next input character (buffer first, then set_nc).
    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      $self->{line_prev} = $self->{line};
      $self->{column_prev} = $self->{column};
      $self->{column}++;
      $self->{nc}
          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
    } else {
      $self->{set_nc}->($self);
    }

        redo A;
      } else {
        ## Not the keyword: hand over to the bogus comment state.  The
        ## reported column is moved back by the number of letters
        ## already read, and forward by one when the current character
        ## is EOF.
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
                        line => $self->{line_prev},
                        column => $self->{column_prev} - 1
                            - (length $self->{kwd})
                            + 1 * ($nc == EOF_CHAR));
        $self->{state} = BOGUS_COMMENT_STATE;
        ## Reconsume.
        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
        redo A;
      }
6234    } elsif ($state == DOCTYPE_MD_STATE) {
6235      ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6236      ## "DOCTYPE NOTATION state".
6237
6238      if ($is_space->{$nc}) {
6239        ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6240        $self->{state} = BEFORE_MD_NAME_STATE;
6241
6242    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6243      $self->{line_prev} = $self->{line};
6244      $self->{column_prev} = $self->{column};
6245      $self->{column}++;
6246      $self->{nc}
6247          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6248    } else {
6249      $self->{set_nc}->($self);
6250    }
6251
6252        redo A;
6253      } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6254               $nc == 0x0025) { # %
6255        ## XML5: Switch to the "DOCTYPE bogus comment state".
6256        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6257        $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6258
6259    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6260      $self->{line_prev} = $self->{line};
6261      $self->{column_prev} = $self->{column};
6262      $self->{column}++;
6263      $self->{nc}
6264          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6265    } else {
6266      $self->{set_nc}->($self);
6267    }
6268
6269        redo A;
6270      } elsif ($nc == EOF_CHAR) {
6271        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6272        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6273        ## Reconsume.
6274        redo A;
6275      } elsif ($nc == 0x003E) { # >
6276        ## XML5: Switch to the "DOCTYPE bogus comment state".
6277        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6278        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6279
6280    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6281      $self->{line_prev} = $self->{line};
6282      $self->{column_prev} = $self->{column};
6283      $self->{column}++;
6284      $self->{nc}
6285          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6286    } else {
6287      $self->{set_nc}->($self);
6288    }
6289
6290        redo A;
6291      } else {
6292        ## XML5: Switch to the "DOCTYPE bogus comment state".
6293        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6294        $self->{state} = BEFORE_MD_NAME_STATE;
6295        redo A;
6296      }
6297    } elsif ($state == BEFORE_MD_NAME_STATE) {
6298      ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6299      ## before state", "DOCTYPE ATTLIST name before state".
6300
6301      if ($is_space->{$nc}) {
6302        ## Stay in the state.
6303
6304    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6305      $self->{line_prev} = $self->{line};
6306      $self->{column_prev} = $self->{column};
6307      $self->{column}++;
6308      $self->{nc}
6309          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6310    } else {
6311      $self->{set_nc}->($self);
6312    }
6313
6314        redo A;
6315      } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6316               $nc == 0x0025) { # %
6317        $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6318
6319    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6320      $self->{line_prev} = $self->{line};
6321      $self->{column_prev} = $self->{column};
6322      $self->{column}++;
6323      $self->{nc}
6324          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6325    } else {
6326      $self->{set_nc}->($self);
6327    }
6328
6329        redo A;
6330      } elsif ($nc == 0x003E) { # >
6331        ## XML5: Same as "Anything else".
6332        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6333        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6334
6335    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6336      $self->{line_prev} = $self->{line};
6337      $self->{column_prev} = $self->{column};
6338      $self->{column}++;
6339      $self->{nc}
6340          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6341    } else {
6342      $self->{set_nc}->($self);
6343    }
6344
6345        redo A;
6346      } elsif ($nc == EOF_CHAR) {
6347        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6348        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6349        ## Reconsume.
6350        redo A;
6351      } else {
6352        ## XML5: [ATTLIST] Not defined yet.
6353        if ($nc == 0x0000) {
6354          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6355        }
6356        $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6357        $self->{state} = MD_NAME_STATE;
6358
6359    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6360      $self->{line_prev} = $self->{line};
6361      $self->{column_prev} = $self->{column};
6362      $self->{column}++;
6363      $self->{nc}
6364          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6365    } else {
6366      $self->{set_nc}->($self);
6367    }
6368
6369        redo A;
6370      }
6371    } elsif ($state == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6372      if ($is_space->{$nc}) {
6373        ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6374        $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6375        $self->{state} = BEFORE_MD_NAME_STATE;
6376
6377    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6378      $self->{line_prev} = $self->{line};
6379      $self->{column_prev} = $self->{column};
6380      $self->{column}++;
6381      $self->{nc}
6382          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6383    } else {
6384      $self->{set_nc}->($self);
6385    }
6386
6387        redo A;
6388      } elsif ($nc == 0x003E) { # >
6389        ## XML5: Same as "Anything else".
6390        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6391        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6392
6393    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6394      $self->{line_prev} = $self->{line};
6395      $self->{column_prev} = $self->{column};
6396      $self->{column}++;
6397      $self->{nc}
6398          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6399    } else {
6400      $self->{set_nc}->($self);
6401    }
6402
6403        redo A;
6404      } elsif ($nc == EOF_CHAR) {
6405        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6406        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6407        ## Reconsume.
6408        redo A;
6409      } else {
6410        ## XML5: No parse error.
6411        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6412        $self->{state} = BOGUS_COMMENT_STATE;
6413        $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6414        ## Reconsume.
6415        redo A;
6416      }
6417    } elsif ($state == MD_NAME_STATE) {
6418      ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6419
6420      if ($is_space->{$nc}) {
6421        if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6422          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6423        } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6424          $self->{state} = AFTER_ELEMENT_NAME_STATE;
6425        } else { # ENTITY/NOTATION
6426          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6427        }
6428
6429    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6430      $self->{line_prev} = $self->{line};
6431      $self->{column_prev} = $self->{column};
6432      $self->{column}++;
6433      $self->{nc}
6434          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6435    } else {
6436      $self->{set_nc}->($self);
6437    }
6438
6439        redo A;
6440      } elsif ($nc == 0x003E) { # >
6441        if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6442          #
6443        } else {
6444          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6445        }
6446        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6447
6448    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6449      $self->{line_prev} = $self->{line};
6450      $self->{column_prev} = $self->{column};
6451      $self->{column}++;
6452      $self->{nc}
6453          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6454    } else {
6455      $self->{set_nc}->($self);
6456    }
6457
6458        return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6459        redo A;
6460      } elsif ($nc == EOF_CHAR) {
6461        ## XML5: [ATTLIST] No parse error.
6462        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6463        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6464        ## Reconsume.
6465        redo A;
6466      } else {
6467        ## XML5: [ATTLIST] Not defined yet.
6468        if ($nc == 0x0000) {
6469          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6470        }
6471        $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6472        ## Stay in the state.
6473
6474    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6475      $self->{line_prev} = $self->{line};
6476      $self->{column_prev} = $self->{column};
6477      $self->{column}++;
6478      $self->{nc}
6479          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6480    } else {
6481      $self->{set_nc}->($self);
6482    }
6483
6484        redo A;
6485      }
6486    } elsif ($state == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6487      if ($is_space->{$nc}) {
6488        ## Stay in the state.
6489
6490    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6491      $self->{line_prev} = $self->{line};
6492      $self->{column_prev} = $self->{column};
6493      $self->{column}++;
6494      $self->{nc}
6495          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6496    } else {
6497      $self->{set_nc}->($self);
6498    }
6499
6500        redo A;
6501      } elsif ($nc == 0x003E) { # >
6502        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6503
6504    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6505      $self->{line_prev} = $self->{line};
6506      $self->{column_prev} = $self->{column};
6507      $self->{column}++;
6508      $self->{nc}
6509          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6510    } else {
6511      $self->{set_nc}->($self);
6512    }
6513
6514        return  ($self->{ct}); # ATTLIST
6515        redo A;
6516      } elsif ($nc == EOF_CHAR) {
6517        ## XML5: No parse error.
6518        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6519        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6520        ## Discard the current token.
6521        redo A;
6522      } else {
6523        ## XML5: Not defined yet.
6524        if ($nc == 0x0000) {
6525          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6526        }
6527        $self->{ca} = {name => $nc == 0x0000 ? "\x{FFFD}" : chr $nc, # attrdef
6528                       tokens => [],
6529                       line => $self->{line}, column => $self->{column}};
6530        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6531
6532    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6533      $self->{line_prev} = $self->{line};
6534      $self->{column_prev} = $self->{column};
6535      $self->{column}++;
6536      $self->{nc}
6537          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6538    } else {
6539      $self->{set_nc}->($self);
6540    }
6541
6542        redo A;
6543      }
6544    } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6545      if ($is_space->{$nc}) {
6546        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6547
6548    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6549      $self->{line_prev} = $self->{line};
6550      $self->{column_prev} = $self->{column};
6551      $self->{column}++;
6552      $self->{nc}
6553          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6554    } else {
6555      $self->{set_nc}->($self);
6556    }
6557
6558        redo A;
6559      } elsif ($nc == 0x003E) { # >
6560        ## XML5: Same as "anything else".
6561        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6562        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6563
6564    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6565      $self->{line_prev} = $self->{line};
6566      $self->{column_prev} = $self->{column};
6567      $self->{column}++;
6568      $self->{nc}
6569          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6570    } else {
6571      $self->{set_nc}->($self);
6572    }
6573
6574        return  ($self->{ct}); # ATTLIST
6575        redo A;
6576      } elsif ($nc == 0x0028) { # (
6577        ## XML5: Same as "anything else".
6578        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6579        $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6580
6581    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6582      $self->{line_prev} = $self->{line};
6583      $self->{column_prev} = $self->{column};
6584      $self->{column}++;
6585      $self->{nc}
6586          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6587    } else {
6588      $self->{set_nc}->($self);
6589    }
6590
6591        redo A;
6592      } elsif ($nc == EOF_CHAR) {
6593        ## XML5: No parse error.
6594        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6595        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6596
6597    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6598      $self->{line_prev} = $self->{line};
6599      $self->{column_prev} = $self->{column};
6600      $self->{column}++;
6601      $self->{nc}
6602          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6603    } else {
6604      $self->{set_nc}->($self);
6605    }
6606
6607        ## Discard the current token.
6608        redo A;
6609      } else {
6610        ## XML5: Not defined yet.
6611        if ($nc == 0x0000) {
6612          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
6613        }
6614        $self->{ca}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
6615        ## Stay in the state.
6616
6617    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6618      $self->{line_prev} = $self->{line};
6619      $self->{column_prev} = $self->{column};
6620      $self->{column}++;
6621      $self->{nc}
6622          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6623    } else {
6624      $self->{set_nc}->($self);
6625    }
6626
6627        redo A;
6628      }
6629    } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6630      if ($is_space->{$nc}) {
6631        ## Stay in the state.
6632
6633    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6634      $self->{line_prev} = $self->{line};
6635      $self->{column_prev} = $self->{column};
6636      $self->{column}++;
6637      $self->{nc}
6638          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6639    } else {
6640      $self->{set_nc}->($self);
6641    }
6642
6643        redo A;
6644      } elsif ($nc == 0x003E) { # >
6645        ## XML5: Same as "anything else".
6646        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6647        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6648
6649    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6650      $self->{line_prev} = $self->{line};
6651      $self->{column_prev} = $self->{column};
6652      $self->{column}++;
6653      $self->{nc}
6654          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6655    } else {
6656      $self->{set_nc}->($self);
6657    }
6658
6659        return  ($self->{ct}); # ATTLIST
6660        redo A;
6661      } elsif ($nc == 0x0028) { # (
6662        ## XML5: Same as "anything else".
6663        $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6664
6665    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6666      $self->{line_prev} = $self->{line};
6667      $self->{column_prev} = $self->{column};
6668      $self->{column}++;
6669      $self->{nc}
6670          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6671    } else {
6672      $self->{set_nc}->($self);
6673    }
6674
6675        redo A;
6676      } elsif ($nc == EOF_CHAR) {
6677        ## XML5: No parse error.
6678        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6679        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6680
6681    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6682      $self->{line_prev} = $self->{line};
6683      $self->{column_prev} = $self->{column};
6684      $self->{column}++;
6685      $self->{nc}
6686          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6687    } else {
6688      $self->{set_nc}->($self);
6689    }
6690
6691        ## Discard the token.
6692        redo A;
6693      } else {
6694        ## XML5: Not defined yet.
6695        $self->{ca}->{type} = chr $nc;
6696        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6697
6698    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6699      $self->{line_prev} = $self->{line};
6700      $self->{column_prev} = $self->{column};
6701      $self->{column}++;
6702      $self->{nc}
6703          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6704    } else {
6705      $self->{set_nc}->($self);
6706    }
6707
6708        redo A;
6709      }
6710    } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6711      if ($is_space->{$nc}) {
6712        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6713
6714    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6715      $self->{line_prev} = $self->{line};
6716      $self->{column_prev} = $self->{column};
6717      $self->{column}++;
6718      $self->{nc}
6719          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6720    } else {
6721      $self->{set_nc}->($self);
6722    }
6723
6724        redo A;
6725      } elsif ($nc == 0x0023) { # #
6726        ## XML5: Same as "anything else".
6727        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6728        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6729
6730    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6731      $self->{line_prev} = $self->{line};
6732      $self->{column_prev} = $self->{column};
6733      $self->{column}++;
6734      $self->{nc}
6735          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6736    } else {
6737      $self->{set_nc}->($self);
6738    }
6739
6740        redo A;
6741      } elsif ($nc == 0x0022) { # "
6742        ## XML5: Same as "anything else".
6743        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6744        $self->{ca}->{value} = '';
6745        $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6746
6747    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6748      $self->{line_prev} = $self->{line};
6749      $self->{column_prev} = $self->{column};
6750      $self->{column}++;
6751      $self->{nc}
6752          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6753    } else {
6754      $self->{set_nc}->($self);
6755    }
6756
6757        redo A;
6758      } elsif ($nc == 0x0027) { # '
6759        ## XML5: Same as "anything else".
6760        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6761        $self->{ca}->{value} = '';
6762        $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6763
6764    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6765      $self->{line_prev} = $self->{line};
6766      $self->{column_prev} = $self->{column};
6767      $self->{column}++;
6768      $self->{nc}
6769          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6770    } else {
6771      $self->{set_nc}->($self);
6772    }
6773
6774        redo A;
6775      } elsif ($nc == 0x003E) { # >
6776        ## XML5: Same as "anything else".
6777        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6778        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6779
6780    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6781      $self->{line_prev} = $self->{line};
6782      $self->{column_prev} = $self->{column};
6783      $self->{column}++;
6784      $self->{nc}
6785          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6786    } else {
6787      $self->{set_nc}->($self);
6788    }
6789
6790        return  ($self->{ct}); # ATTLIST
6791        redo A;
6792      } elsif ($nc == 0x0028) { # (
6793        ## XML5: Same as "anything else".
6794        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6795        $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6796
6797    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6798      $self->{line_prev} = $self->{line};
6799      $self->{column_prev} = $self->{column};
6800      $self->{column}++;
6801      $self->{nc}
6802          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6803    } else {
6804      $self->{set_nc}->($self);
6805    }
6806
6807        redo A;
6808      } elsif ($nc == EOF_CHAR) {
6809        ## XML5: No parse error.
6810        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6811        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6812
6813    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6814      $self->{line_prev} = $self->{line};
6815      $self->{column_prev} = $self->{column};
6816      $self->{column}++;
6817      $self->{nc}
6818          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6819    } else {
6820      $self->{set_nc}->($self);
6821    }
6822
6823        ## Discard the token.
6824        redo A;
6825      } else {
6826        ## XML5: Not defined yet.
6827        $self->{ca}->{type} .= chr $nc;
6828        ## Stay in the state.
6829
6830    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6831      $self->{line_prev} = $self->{line};
6832      $self->{column_prev} = $self->{column};
6833      $self->{column}++;
6834      $self->{nc}
6835          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6836    } else {
6837      $self->{set_nc}->($self);
6838    }
6839
6840        redo A;
6841      }
6842    } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6843      if ($is_space->{$nc}) {
6844        ## Stay in the state.
6845
6846    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6847      $self->{line_prev} = $self->{line};
6848      $self->{column_prev} = $self->{column};
6849      $self->{column}++;
6850      $self->{nc}
6851          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6852    } else {
6853      $self->{set_nc}->($self);
6854    }
6855
6856        redo A;
6857      } elsif ($nc == 0x0028) { # (
6858        ## XML5: Same as "anything else".
6859        $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6860
6861    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6862      $self->{line_prev} = $self->{line};
6863      $self->{column_prev} = $self->{column};
6864      $self->{column}++;
6865      $self->{nc}
6866          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6867    } else {
6868      $self->{set_nc}->($self);
6869    }
6870
6871        redo A;
6872      } elsif ($nc == 0x0023) { # #
6873        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6874
6875    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876      $self->{line_prev} = $self->{line};
6877      $self->{column_prev} = $self->{column};
6878      $self->{column}++;
6879      $self->{nc}
6880          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881    } else {
6882      $self->{set_nc}->($self);
6883    }
6884
6885        redo A;
6886      } elsif ($nc == 0x0022) { # "
6887        ## XML5: Same as "anything else".
6888        $self->{ca}->{value} = '';
6889        $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6890
6891    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6892      $self->{line_prev} = $self->{line};
6893      $self->{column_prev} = $self->{column};
6894      $self->{column}++;
6895      $self->{nc}
6896          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6897    } else {
6898      $self->{set_nc}->($self);
6899    }
6900
6901        redo A;
6902      } elsif ($nc == 0x0027) { # '
6903        ## XML5: Same as "anything else".
6904        $self->{ca}->{value} = '';
6905        $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6906
6907    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6908      $self->{line_prev} = $self->{line};
6909      $self->{column_prev} = $self->{column};
6910      $self->{column}++;
6911      $self->{nc}
6912          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6913    } else {
6914      $self->{set_nc}->($self);
6915    }
6916
6917        redo A;
6918      } elsif ($nc == 0x003E) { # >
6919        ## XML5: Same as "anything else".
6920        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6921        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6922
6923    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6924      $self->{line_prev} = $self->{line};
6925      $self->{column_prev} = $self->{column};
6926      $self->{column}++;
6927      $self->{nc}
6928          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6929    } else {
6930      $self->{set_nc}->($self);
6931    }
6932
6933        return  ($self->{ct}); # ATTLIST
6934        redo A;
6935      } elsif ($nc == EOF_CHAR) {
6936        ## XML5: No parse error.
6937        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6938        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6939
6940    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6941      $self->{line_prev} = $self->{line};
6942      $self->{column_prev} = $self->{column};
6943      $self->{column}++;
6944      $self->{nc}
6945          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6946    } else {
6947      $self->{set_nc}->($self);
6948    }
6949
6950        ## Discard the current token.
6951        redo A;
6952      } else {
6953        ## XML5: Switch to the "DOCTYPE bogus comment state".
6954        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6955        $self->{ca}->{value} = '';
6956        $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6957        ## Reconsume.
6958        redo A;
6959      }
6960    } elsif ($state == BEFORE_ALLOWED_TOKEN_STATE) {
6961      if ($is_space->{$nc}) {
6962        ## Stay in the state.
6963
6964    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6965      $self->{line_prev} = $self->{line};
6966      $self->{column_prev} = $self->{column};
6967      $self->{column}++;
6968      $self->{nc}
6969          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6970    } else {
6971      $self->{set_nc}->($self);
6972    }
6973
6974        redo A;
6975      } elsif ($nc == 0x007C) { # |
6976        $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6977        ## Stay in the state.
6978
6979    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6980      $self->{line_prev} = $self->{line};
6981      $self->{column_prev} = $self->{column};
6982      $self->{column}++;
6983      $self->{nc}
6984          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6985    } else {
6986      $self->{set_nc}->($self);
6987    }
6988
6989        redo A;
6990      } elsif ($nc == 0x0029) { # )
6991        $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6992        $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6993
6994    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6995      $self->{line_prev} = $self->{line};
6996      $self->{column_prev} = $self->{column};
6997      $self->{column}++;
6998      $self->{nc}
6999          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7000    } else {
7001      $self->{set_nc}->($self);
7002    }
7003
7004        redo A;
7005      } elsif ($nc == 0x003E) { # >
7006        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7007        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7008
7009    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7010      $self->{line_prev} = $self->{line};
7011      $self->{column_prev} = $self->{column};
7012      $self->{column}++;
7013      $self->{nc}
7014          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7015    } else {
7016      $self->{set_nc}->($self);
7017    }
7018
7019        return  ($self->{ct}); # ATTLIST
7020        redo A;
7021      } elsif ($nc == EOF_CHAR) {
7022        ## XML5: No parse error.
7023        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7024        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7025
7026    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7027      $self->{line_prev} = $self->{line};
7028      $self->{column_prev} = $self->{column};
7029      $self->{column}++;
7030      $self->{nc}
7031          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7032    } else {
7033      $self->{set_nc}->($self);
7034    }
7035
7036        ## Discard the current token.
7037        redo A;
7038      } else {
7039        if ($nc == 0x000) {
7040          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7041        }
7042        push @{$self->{ca}->{tokens}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
7043        $self->{state} = ALLOWED_TOKEN_STATE;
7044
7045    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7046      $self->{line_prev} = $self->{line};
7047      $self->{column_prev} = $self->{column};
7048      $self->{column}++;
7049      $self->{nc}
7050          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7051    } else {
7052      $self->{set_nc}->($self);
7053    }
7054
7055        redo A;
7056      }
7057    } elsif ($state == ALLOWED_TOKEN_STATE) {
7058      if ($is_space->{$nc}) {
7059        $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7060
7061    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7062      $self->{line_prev} = $self->{line};
7063      $self->{column_prev} = $self->{column};
7064      $self->{column}++;
7065      $self->{nc}
7066          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7067    } else {
7068      $self->{set_nc}->($self);
7069    }
7070
7071        redo A;
7072      } elsif ($nc == 0x007C) { # |
7073        $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7074
7075    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7076      $self->{line_prev} = $self->{line};
7077      $self->{column_prev} = $self->{column};
7078      $self->{column}++;
7079      $self->{nc}
7080          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7081    } else {
7082      $self->{set_nc}->($self);
7083    }
7084
7085        redo A;
7086      } elsif ($nc == 0x0029) { # )
7087        $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7088
7089    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7090      $self->{line_prev} = $self->{line};
7091      $self->{column_prev} = $self->{column};
7092      $self->{column}++;
7093      $self->{nc}
7094          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7095    } else {
7096      $self->{set_nc}->($self);
7097    }
7098
7099        redo A;
7100      } elsif ($nc == 0x003E) { # >
7101        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7102        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7103
7104    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7105      $self->{line_prev} = $self->{line};
7106      $self->{column_prev} = $self->{column};
7107      $self->{column}++;
7108      $self->{nc}
7109          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7110    } else {
7111      $self->{set_nc}->($self);
7112    }
7113
7114        return  ($self->{ct}); # ATTLIST
7115        redo A;
7116      } elsif ($nc == EOF_CHAR) {
7117        ## XML5: No parse error.
7118        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7119        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7120
7121    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7122      $self->{line_prev} = $self->{line};
7123      $self->{column_prev} = $self->{column};
7124      $self->{column}++;
7125      $self->{nc}
7126          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7127    } else {
7128      $self->{set_nc}->($self);
7129    }
7130
7131        ## Discard the current token.
7132        redo A;
7133      } else {
7134        if ($nc == 0x0000) {
7135          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7136        }
7137        $self->{ca}->{tokens}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
7138        ## Stay in the state.
7139
7140    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7141      $self->{line_prev} = $self->{line};
7142      $self->{column_prev} = $self->{column};
7143      $self->{column}++;
7144      $self->{nc}
7145          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7146    } else {
7147      $self->{set_nc}->($self);
7148    }
7149
7150        redo A;
7151      }
7152    } elsif ($state == AFTER_ALLOWED_TOKEN_STATE) {
7153      if ($is_space->{$nc}) {
7154        ## Stay in the state.
7155
7156    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7157      $self->{line_prev} = $self->{line};
7158      $self->{column_prev} = $self->{column};
7159      $self->{column}++;
7160      $self->{nc}
7161          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7162    } else {
7163      $self->{set_nc}->($self);
7164    }
7165
7166        redo A;
7167      } elsif ($nc == 0x007C) { # |
7168        $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7169
7170    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7171      $self->{line_prev} = $self->{line};
7172      $self->{column_prev} = $self->{column};
7173      $self->{column}++;
7174      $self->{nc}
7175          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7176    } else {
7177      $self->{set_nc}->($self);
7178    }
7179
7180        redo A;
7181      } elsif ($nc == 0x0029) { # )
7182        $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7183
7184    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7185      $self->{line_prev} = $self->{line};
7186      $self->{column_prev} = $self->{column};
7187      $self->{column}++;
7188      $self->{nc}
7189          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7190    } else {
7191      $self->{set_nc}->($self);
7192    }
7193
7194        redo A;
7195      } elsif ($nc == 0x003E) { # >
7196        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7197        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7198
7199    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7200      $self->{line_prev} = $self->{line};
7201      $self->{column_prev} = $self->{column};
7202      $self->{column}++;
7203      $self->{nc}
7204          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7205    } else {
7206      $self->{set_nc}->($self);
7207    }
7208
7209        return  ($self->{ct}); # ATTLIST
7210        redo A;
7211      } elsif ($nc == EOF_CHAR) {
7212        ## XML5: No parse error.
7213        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7214        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7215
7216    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7217      $self->{line_prev} = $self->{line};
7218      $self->{column_prev} = $self->{column};
7219      $self->{column}++;
7220      $self->{nc}
7221          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7222    } else {
7223      $self->{set_nc}->($self);
7224    }
7225
7226        ## Discard the current token.
7227        redo A;
7228      } else {
7229        $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7230                        line => $self->{line_prev},
7231                        column => $self->{column_prev});
7232        if ($nc == 0x0000) {
7233          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7234        }
7235        $self->{ca}->{tokens}->[-1] .= ' ' . ($nc == 0x0000 ? "\x{FFFD}" : chr $nc);
7236        $self->{state} = ALLOWED_TOKEN_STATE;
7237
7238    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7239      $self->{line_prev} = $self->{line};
7240      $self->{column_prev} = $self->{column};
7241      $self->{column}++;
7242      $self->{nc}
7243          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7244    } else {
7245      $self->{set_nc}->($self);
7246    }
7247
7248        redo A;
7249      }
7250    } elsif ($state == AFTER_ALLOWED_TOKENS_STATE) {
7251      if ($is_space->{$nc}) {
7252        $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7253
7254    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7255      $self->{line_prev} = $self->{line};
7256      $self->{column_prev} = $self->{column};
7257      $self->{column}++;
7258      $self->{nc}
7259          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7260    } else {
7261      $self->{set_nc}->($self);
7262    }
7263
7264        redo A;
7265      } elsif ($nc == 0x0023) { # #
7266        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7267        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7268
7269    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7270      $self->{line_prev} = $self->{line};
7271      $self->{column_prev} = $self->{column};
7272      $self->{column}++;
7273      $self->{nc}
7274          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7275    } else {
7276      $self->{set_nc}->($self);
7277    }
7278
7279        redo A;
7280      } elsif ($nc == 0x0022) { # "
7281        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7282        $self->{ca}->{value} = '';
7283        $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7284
7285    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7286      $self->{line_prev} = $self->{line};
7287      $self->{column_prev} = $self->{column};
7288      $self->{column}++;
7289      $self->{nc}
7290          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7291    } else {
7292      $self->{set_nc}->($self);
7293    }
7294
7295        redo A;
7296      } elsif ($nc == 0x0027) { # '
7297        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7298        $self->{ca}->{value} = '';
7299        $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7300
7301    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7302      $self->{line_prev} = $self->{line};
7303      $self->{column_prev} = $self->{column};
7304      $self->{column}++;
7305      $self->{nc}
7306          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7307    } else {
7308      $self->{set_nc}->($self);
7309    }
7310
7311        redo A;
7312      } elsif ($nc == 0x003E) { # >
7313        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7314        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7315
7316    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7317      $self->{line_prev} = $self->{line};
7318      $self->{column_prev} = $self->{column};
7319      $self->{column}++;
7320      $self->{nc}
7321          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7322    } else {
7323      $self->{set_nc}->($self);
7324    }
7325
7326        return  ($self->{ct}); # ATTLIST
7327        redo A;
7328      } elsif ($nc == EOF_CHAR) {
7329        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7330        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7331
7332    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7333      $self->{line_prev} = $self->{line};
7334      $self->{column_prev} = $self->{column};
7335      $self->{column}++;
7336      $self->{nc}
7337          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7338    } else {
7339      $self->{set_nc}->($self);
7340    }
7341
7342        ## Discard the current token.
7343        redo A;
7344      } else {
7345        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7346        $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7347        ## Reconsume.
7348        redo A;
7349      }
7350    } elsif ($state == BEFORE_ATTR_DEFAULT_STATE) {
7351      if ($is_space->{$nc}) {
7352        ## Stay in the state.
7353
7354    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355      $self->{line_prev} = $self->{line};
7356      $self->{column_prev} = $self->{column};
7357      $self->{column}++;
7358      $self->{nc}
7359          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360    } else {
7361      $self->{set_nc}->($self);
7362    }
7363
7364        redo A;
7365      } elsif ($nc == 0x0023) { # #
7366        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7367
7368    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7369      $self->{line_prev} = $self->{line};
7370      $self->{column_prev} = $self->{column};
7371      $self->{column}++;
7372      $self->{nc}
7373          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7374    } else {
7375      $self->{set_nc}->($self);
7376    }
7377
7378        redo A;
7379      } elsif ($nc == 0x0022) { # "
7380        $self->{ca}->{value} = '';
7381        $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7382
7383    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7384      $self->{line_prev} = $self->{line};
7385      $self->{column_prev} = $self->{column};
7386      $self->{column}++;
7387      $self->{nc}
7388          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7389    } else {
7390      $self->{set_nc}->($self);
7391    }
7392
7393        redo A;
7394      } elsif ($nc == 0x0027) { # '
7395        $self->{ca}->{value} = '';
7396        $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7397
7398    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7399      $self->{line_prev} = $self->{line};
7400      $self->{column_prev} = $self->{column};
7401      $self->{column}++;
7402      $self->{nc}
7403          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7404    } else {
7405      $self->{set_nc}->($self);
7406    }
7407
7408        redo A;
7409      } elsif ($nc == 0x003E) { # >
7410        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7411        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7412
7413    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7414      $self->{line_prev} = $self->{line};
7415      $self->{column_prev} = $self->{column};
7416      $self->{column}++;
7417      $self->{nc}
7418          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7419    } else {
7420      $self->{set_nc}->($self);
7421    }
7422
7423        return  ($self->{ct}); # ATTLIST
7424        redo A;
7425      } elsif ($nc == EOF_CHAR) {
7426        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7427        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7428
7429    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7430      $self->{line_prev} = $self->{line};
7431      $self->{column_prev} = $self->{column};
7432      $self->{column}++;
7433      $self->{nc}
7434          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7435    } else {
7436      $self->{set_nc}->($self);
7437    }
7438
7439        ## Discard the current token.
7440        redo A;
7441      } else {
7442        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7443        $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7444        ## Reconsume.
7445        redo A;
7446      }
7447    } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7448      if ($is_space->{$nc}) {
7449        ## XML5: No parse error.
7450        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7451        $self->{state} = BOGUS_MD_STATE;
7452        ## Reconsume.
7453        redo A;
7454      } elsif ($nc == 0x0022) { # "
7455        # XXX parse error?
7456        ## XML5: Same as "anything else".
7457        $self->{ca}->{value} = '';
7458        $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7459
7460    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7461      $self->{line_prev} = $self->{line};
7462      $self->{column_prev} = $self->{column};
7463      $self->{column}++;
7464      $self->{nc}
7465          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7466    } else {
7467      $self->{set_nc}->($self);
7468    }
7469
7470        redo A;
7471      } elsif ($nc == 0x0027) { # '
7472        # XXX parse error?
7473        ## XML5: Same as "anything else".
7474        $self->{ca}->{value} = '';
7475        $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7476
7477    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7478      $self->{line_prev} = $self->{line};
7479      $self->{column_prev} = $self->{column};
7480      $self->{column}++;
7481      $self->{nc}
7482          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7483    } else {
7484      $self->{set_nc}->($self);
7485    }
7486
7487        redo A;
7488      } elsif ($nc == 0x003E) { # >
7489        ## XML5: Same as "anything else".
7490        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7491        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7492
7493    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7494      $self->{line_prev} = $self->{line};
7495      $self->{column_prev} = $self->{column};
7496      $self->{column}++;
7497      $self->{nc}
7498          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7499    } else {
7500      $self->{set_nc}->($self);
7501    }
7502
7503        return  ($self->{ct}); # ATTLIST
7504        redo A;
7505      } elsif ($nc == EOF_CHAR) {
7506        ## XML5: No parse error.
7507        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7508        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7509
7510    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7511      $self->{line_prev} = $self->{line};
7512      $self->{column_prev} = $self->{column};
7513      $self->{column}++;
7514      $self->{nc}
7515          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7516    } else {
7517      $self->{set_nc}->($self);
7518    }
7519
7520        ## Discard the current token.
7521        redo A;
7522      } else {
7523        $self->{ca}->{default} = chr $nc;
7524        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7525
7526    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7527      $self->{line_prev} = $self->{line};
7528      $self->{column_prev} = $self->{column};
7529      $self->{column}++;
7530      $self->{nc}
7531          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7532    } else {
7533      $self->{set_nc}->($self);
7534    }
7535
7536        redo A;
7537      }
7538    } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7539      if ($is_space->{$nc}) {
7540        $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7541
7542    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7543      $self->{line_prev} = $self->{line};
7544      $self->{column_prev} = $self->{column};
7545      $self->{column}++;
7546      $self->{nc}
7547          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7548    } else {
7549      $self->{set_nc}->($self);
7550    }
7551
7552        redo A;
7553      } elsif ($nc == 0x0022) { # "
7554        ## XML5: Same as "anything else".
7555        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7556        $self->{ca}->{value} = '';
7557        $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7558
7559    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7560      $self->{line_prev} = $self->{line};
7561      $self->{column_prev} = $self->{column};
7562      $self->{column}++;
7563      $self->{nc}
7564          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7565    } else {
7566      $self->{set_nc}->($self);
7567    }
7568
7569        redo A;
7570      } elsif ($nc == 0x0027) { # '
7571        ## XML5: Same as "anything else".
7572        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7573        $self->{ca}->{value} = '';
7574        $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7575
7576    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7577      $self->{line_prev} = $self->{line};
7578      $self->{column_prev} = $self->{column};
7579      $self->{column}++;
7580      $self->{nc}
7581          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7582    } else {
7583      $self->{set_nc}->($self);
7584    }
7585
7586        redo A;
7587      } elsif ($nc == 0x003E) { # >
7588        ## XML5: Same as "anything else".
7589        push @{$self->{ct}->{attrdefs}}, $self->{ca};
7590        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7591
7592    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7593      $self->{line_prev} = $self->{line};
7594      $self->{column_prev} = $self->{column};
7595      $self->{column}++;
7596      $self->{nc}
7597          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7598    } else {
7599      $self->{set_nc}->($self);
7600    }
7601
7602        return  ($self->{ct}); # ATTLIST
7603        redo A;
7604      } elsif ($nc == EOF_CHAR) {
7605        ## XML5: No parse error.
7606        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7607        push @{$self->{ct}->{attrdefs}}, $self->{ca};
7608        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7609
7610    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7611      $self->{line_prev} = $self->{line};
7612      $self->{column_prev} = $self->{column};
7613      $self->{column}++;
7614      $self->{nc}
7615          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7616    } else {
7617      $self->{set_nc}->($self);
7618    }
7619
7620        ## Discard the current token.
7621        redo A;
7622      } else {
7623        $self->{ca}->{default} .= chr $nc;
7624        ## Stay in the state.
7625
7626    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7627      $self->{line_prev} = $self->{line};
7628      $self->{column_prev} = $self->{column};
7629      $self->{column}++;
7630      $self->{nc}
7631          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7632    } else {
7633      $self->{set_nc}->($self);
7634    }
7635
7636        redo A;
7637      }
7638    } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7639      if ($is_space->{$nc}) {
7640        ## Stay in the state.
7641
7642    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7643      $self->{line_prev} = $self->{line};
7644      $self->{column_prev} = $self->{column};
7645      $self->{column}++;
7646      $self->{nc}
7647          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7648    } else {
7649      $self->{set_nc}->($self);
7650    }
7651
7652        redo A;
7653      } elsif ($nc == 0x0022) { # "
7654        $self->{ca}->{value} = '';
7655        $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7656
7657    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7658      $self->{line_prev} = $self->{line};
7659      $self->{column_prev} = $self->{column};
7660      $self->{column}++;
7661      $self->{nc}
7662          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7663    } else {
7664      $self->{set_nc}->($self);
7665    }
7666
7667        redo A;
7668      } elsif ($nc == 0x0027) { # '
7669        $self->{ca}->{value} = '';
7670        $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7671
7672    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7673      $self->{line_prev} = $self->{line};
7674      $self->{column_prev} = $self->{column};
7675      $self->{column}++;
7676      $self->{nc}
7677          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7678    } else {
7679      $self->{set_nc}->($self);
7680    }
7681
7682        redo A;
7683      } elsif ($nc == 0x003E) { # >
7684        push @{$self->{ct}->{attrdefs}}, $self->{ca};
7685        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7686
7687    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7688      $self->{line_prev} = $self->{line};
7689      $self->{column_prev} = $self->{column};
7690      $self->{column}++;
7691      $self->{nc}
7692          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7693    } else {
7694      $self->{set_nc}->($self);
7695    }
7696
7697        return  ($self->{ct}); # ATTLIST
7698        redo A;
7699      } elsif ($nc == EOF_CHAR) {
7700        ## XML5: No parse error.
7701        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7702        push @{$self->{ct}->{attrdefs}}, $self->{ca};
7703        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7704
7705    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7706      $self->{line_prev} = $self->{line};
7707      $self->{column_prev} = $self->{column};
7708      $self->{column}++;
7709      $self->{nc}
7710          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7711    } else {
7712      $self->{set_nc}->($self);
7713    }
7714
7715        ## Discard the current token.
7716        redo A;
7717      } else {
7718        ## XML5: Not defined yet.
7719        if ($self->{ca}->{default} eq 'FIXED') {
7720          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7721        } else {
7722          push @{$self->{ct}->{attrdefs}}, $self->{ca};
7723          $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7724        }
7725        ## Reconsume.
7726        redo A;
7727      }
7728    } elsif ($state == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7729      if ($is_space->{$nc} or
7730          $nc == EOF_CHAR or
7731          $nc == 0x003E) { # >
7732        $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7733        ## Reconsume.
7734        redo A;
7735      } else {
7736        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7737        $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7738        ## Reconsume.
7739        redo A;
7740      }
7741    } elsif ($state == NDATA_STATE) {
7742      ## ASCII case-insensitive
7743      if ($nc == [
7744            undef,
7745            0x0044, # D
7746            0x0041, # A
7747            0x0054, # T
7748            NEVER_CHAR, # (A)
7749          ]->[length $self->{kwd}] or
7750          $nc == [
7751            undef,
7752            0x0064, # d
7753            0x0061, # a
7754            0x0074, # t
7755            NEVER_CHAR, # (a)
7756          ]->[length $self->{kwd}]) {
7757
7758        ## Stay in the state.
7759        $self->{kwd} .= chr $nc;
7760
7761    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7762      $self->{line_prev} = $self->{line};
7763      $self->{column_prev} = $self->{column};
7764      $self->{column}++;
7765      $self->{nc}
7766          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7767    } else {
7768      $self->{set_nc}->($self);
7769    }
7770
7771        redo A;
7772      } elsif ((length $self->{kwd}) == 4 and
7773               ($nc == 0x0041 or # A
7774                $nc == 0x0061)) { # a
7775        if ($self->{kwd} ne 'NDAT' or $nc == 0x0061) { # a
7776
7777          $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7778                          text => 'NDATA',
7779                          line => $self->{line_prev},
7780                          column => $self->{column_prev} - 4);
7781        } else {
7782
7783        }
7784        $self->{state} = AFTER_NDATA_STATE;
7785
7786    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7787      $self->{line_prev} = $self->{line};
7788      $self->{column_prev} = $self->{column};
7789      $self->{column}++;
7790      $self->{nc}
7791          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7792    } else {
7793      $self->{set_nc}->($self);
7794    }
7795
7796        redo A;
7797      } else {
7798        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7799                        line => $self->{line_prev},
7800                        column => $self->{column_prev} + 1
7801                            - length $self->{kwd});
7802
7803        $self->{state} = BOGUS_MD_STATE;
7804        ## Reconsume.
7805        redo A;
7806      }
7807    } elsif ($state == AFTER_NDATA_STATE) {
7808      if ($is_space->{$nc}) {
7809        $self->{state} = BEFORE_NOTATION_NAME_STATE;
7810
7811    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7812      $self->{line_prev} = $self->{line};
7813      $self->{column_prev} = $self->{column};
7814      $self->{column}++;
7815      $self->{nc}
7816          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7817    } else {
7818      $self->{set_nc}->($self);
7819    }
7820
7821        redo A;
7822      } elsif ($nc == 0x003E) { # >
7823        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7824        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7825
7826    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7827      $self->{line_prev} = $self->{line};
7828      $self->{column_prev} = $self->{column};
7829      $self->{column}++;
7830      $self->{nc}
7831          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7832    } else {
7833      $self->{set_nc}->($self);
7834    }
7835
7836        return  ($self->{ct}); # ENTITY
7837        redo A;
7838      } elsif ($nc == EOF_CHAR) {
7839        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7840        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7841
7842    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7843      $self->{line_prev} = $self->{line};
7844      $self->{column_prev} = $self->{column};
7845      $self->{column}++;
7846      $self->{nc}
7847          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7848    } else {
7849      $self->{set_nc}->($self);
7850    }
7851
7852        ## Discard the current token.
7853        redo A;
7854      } else {
7855        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7856                        line => $self->{line_prev},
7857                        column => $self->{column_prev} + 1
7858                            - length $self->{kwd});
7859        $self->{state} = BOGUS_MD_STATE;
7860        ## Reconsume.
7861        redo A;
7862      }
7863    } elsif ($state == BEFORE_NOTATION_NAME_STATE) {
7864      if ($is_space->{$nc}) {
7865        ## Stay in the state.
7866
7867    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7868      $self->{line_prev} = $self->{line};
7869      $self->{column_prev} = $self->{column};
7870      $self->{column}++;
7871      $self->{nc}
7872          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7873    } else {
7874      $self->{set_nc}->($self);
7875    }
7876
7877        redo A;
7878      } elsif ($nc == 0x003E) { # >
7879        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7880        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7881
7882    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7883      $self->{line_prev} = $self->{line};
7884      $self->{column_prev} = $self->{column};
7885      $self->{column}++;
7886      $self->{nc}
7887          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7888    } else {
7889      $self->{set_nc}->($self);
7890    }
7891
7892        return  ($self->{ct}); # ENTITY
7893        redo A;
7894      } elsif ($nc == EOF_CHAR) {
7895        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7896        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7897
7898    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7899      $self->{line_prev} = $self->{line};
7900      $self->{column_prev} = $self->{column};
7901      $self->{column}++;
7902      $self->{nc}
7903          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7904    } else {
7905      $self->{set_nc}->($self);
7906    }
7907
7908        ## Discard the current token.
7909        redo A;
7910      } else {
7911        if ($nc == 0x0000) {
7912          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7913        }
7914        $self->{ct}->{notation} = $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
7915        $self->{state} = NOTATION_NAME_STATE;
7916
7917    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7918      $self->{line_prev} = $self->{line};
7919      $self->{column_prev} = $self->{column};
7920      $self->{column}++;
7921      $self->{nc}
7922          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7923    } else {
7924      $self->{set_nc}->($self);
7925    }
7926
7927        redo A;
7928      }
7929    } elsif ($state == NOTATION_NAME_STATE) {
7930      if ($is_space->{$nc}) {
7931        $self->{state} = AFTER_MD_DEF_STATE;
7932
7933    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7934      $self->{line_prev} = $self->{line};
7935      $self->{column_prev} = $self->{column};
7936      $self->{column}++;
7937      $self->{nc}
7938          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7939    } else {
7940      $self->{set_nc}->($self);
7941    }
7942
7943        redo A;
7944      } elsif ($nc == 0x003E) { # >
7945        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7946
7947    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7948      $self->{line_prev} = $self->{line};
7949      $self->{column_prev} = $self->{column};
7950      $self->{column}++;
7951      $self->{nc}
7952          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7953    } else {
7954      $self->{set_nc}->($self);
7955    }
7956
7957        return  ($self->{ct}); # ENTITY
7958        redo A;
7959      } elsif ($nc == EOF_CHAR) {
7960        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7961        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7962
7963    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7964      $self->{line_prev} = $self->{line};
7965      $self->{column_prev} = $self->{column};
7966      $self->{column}++;
7967      $self->{nc}
7968          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7969    } else {
7970      $self->{set_nc}->($self);
7971    }
7972
        ## Discard the current token.
7974        redo A;
7975      } else {
7976        if ($nc == 0x0000) {
7977          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
7978        }
7979        $self->{ct}->{notation} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
7980        ## Stay in the state.
7981
7982    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7983      $self->{line_prev} = $self->{line};
7984      $self->{column_prev} = $self->{column};
7985      $self->{column}++;
7986      $self->{nc}
7987          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7988    } else {
7989      $self->{set_nc}->($self);
7990    }
7991
7992        redo A;
7993      }
7994    } elsif ($state == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7995      if ($nc == 0x0022) { # "
7996        $self->{state} = AFTER_MD_DEF_STATE;
7997
7998    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7999      $self->{line_prev} = $self->{line};
8000      $self->{column_prev} = $self->{column};
8001      $self->{column}++;
8002      $self->{nc}
8003          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8004    } else {
8005      $self->{set_nc}->($self);
8006    }
8007
8008        redo A;
8009      } elsif ($nc == 0x0026) { # &
8010        $self->{prev_state} = $state;
8011        $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8012        $self->{entity_add} = 0x0022; # "
8013
8014    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8015      $self->{line_prev} = $self->{line};
8016      $self->{column_prev} = $self->{column};
8017      $self->{column}++;
8018      $self->{nc}
8019          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8020    } else {
8021      $self->{set_nc}->($self);
8022    }
8023
8024        redo A;
8025## TODO: %
8026      } elsif ($nc == EOF_CHAR) {
8027        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8028        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8029        ## Reconsume.
8030        ## Discard the current token.
8031        redo A;
8032      } else {
8033        if ($nc == 0x0000) {
8034          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8035        }
8036        $self->{ct}->{value} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
8037
8038    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8039      $self->{line_prev} = $self->{line};
8040      $self->{column_prev} = $self->{column};
8041      $self->{column}++;
8042      $self->{nc}
8043          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8044    } else {
8045      $self->{set_nc}->($self);
8046    }
8047
8048        redo A;
8049      }
8050    } elsif ($state == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8051      if ($nc == 0x0027) { # '
8052        $self->{state} = AFTER_MD_DEF_STATE;
8053
8054    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8055      $self->{line_prev} = $self->{line};
8056      $self->{column_prev} = $self->{column};
8057      $self->{column}++;
8058      $self->{nc}
8059          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8060    } else {
8061      $self->{set_nc}->($self);
8062    }
8063
8064        redo A;
8065      } elsif ($nc == 0x0026) { # &
8066        $self->{prev_state} = $state;
8067        $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8068        $self->{entity_add} = 0x0027; # '
8069
8070    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8071      $self->{line_prev} = $self->{line};
8072      $self->{column_prev} = $self->{column};
8073      $self->{column}++;
8074      $self->{nc}
8075          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8076    } else {
8077      $self->{set_nc}->($self);
8078    }
8079
8080        redo A;
8081## TODO: %
8082      } elsif ($nc == EOF_CHAR) {
8083        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8084        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8085        ## Reconsume.
8086        ## Discard the current token.
8087        redo A;
8088      } else {
8089        if ($nc == 0x0000) {
8090          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8091        }
8092        $self->{ct}->{value} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY
8093
8094    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8095      $self->{line_prev} = $self->{line};
8096      $self->{column_prev} = $self->{column};
8097      $self->{column}++;
8098      $self->{nc}
8099          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8100    } else {
8101      $self->{set_nc}->($self);
8102    }
8103
8104        redo A;
8105      }
8106    } elsif ($state == ENTITY_VALUE_ENTITY_STATE) {
8107      if ($is_space->{$nc} or
8108          {
8109            0x003C => 1, 0x0026 => 1, (EOF_CHAR) => 1, # <, &
8110            $self->{entity_add} => 1,
8111          }->{$nc}) {
8112        $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8113                        line => $self->{line_prev},
8114                        column => $self->{column_prev}
8115                            + ($nc == EOF_CHAR ? 1 : 0));
8116        ## Don't consume
8117        ## Return nothing.
8118        #
8119      } elsif ($nc == 0x0023) { # #
8120        $self->{ca} = $self->{ct};
8121        $self->{state} = ENTITY_HASH_STATE;
8122        $self->{kwd} = '#';
8123
8124    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8125      $self->{line_prev} = $self->{line};
8126      $self->{column_prev} = $self->{column};
8127      $self->{column}++;
8128      $self->{nc}
8129          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8130    } else {
8131      $self->{set_nc}->($self);
8132    }
8133
8134        redo A;
8135      } else {
8136        #
8137      }
8138
8139      $self->{ct}->{value} .= '&';
8140      $self->{state} = $self->{prev_state};
8141      ## Reconsume.
8142      redo A;
8143    } elsif ($state == AFTER_ELEMENT_NAME_STATE) {
8144      if ($is_space->{$nc}) {
8145        $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8146
8147    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8148      $self->{line_prev} = $self->{line};
8149      $self->{column_prev} = $self->{column};
8150      $self->{column}++;
8151      $self->{nc}
8152          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8153    } else {
8154      $self->{set_nc}->($self);
8155    }
8156
8157        redo A;
8158      } elsif ($nc == 0x0028) { # (
8159        $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8160        $self->{ct}->{content} = ['('];
8161        $self->{group_depth} = 1;
8162
8163    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8164      $self->{line_prev} = $self->{line};
8165      $self->{column_prev} = $self->{column};
8166      $self->{column}++;
8167      $self->{nc}
8168          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8169    } else {
8170      $self->{set_nc}->($self);
8171    }
8172
8173        redo A;
8174      } elsif ($nc == 0x003E) { # >
8175        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8176        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8177
8178    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8179      $self->{line_prev} = $self->{line};
8180      $self->{column_prev} = $self->{column};
8181      $self->{column}++;
8182      $self->{nc}
8183          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8184    } else {
8185      $self->{set_nc}->($self);
8186    }
8187
8188        return  ($self->{ct}); # ELEMENT
8189        redo A;
8190      } elsif ($nc == EOF_CHAR) {
8191        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8192        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8193
8194    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8195      $self->{line_prev} = $self->{line};
8196      $self->{column_prev} = $self->{column};
8197      $self->{column}++;
8198      $self->{nc}
8199          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8200    } else {
8201      $self->{set_nc}->($self);
8202    }
8203
8204        ## Discard the current token.
8205        redo A;
8206      } else {
8207        if ($nc == 0x0000) {
8208          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8209        }
8210        $self->{ct}->{content} = [$nc == 0x0000 ? "\x{FFFD}" : chr $nc];
8211        $self->{state} = CONTENT_KEYWORD_STATE;
8212
8213    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8214      $self->{line_prev} = $self->{line};
8215      $self->{column_prev} = $self->{column};
8216      $self->{column}++;
8217      $self->{nc}
8218          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8219    } else {
8220      $self->{set_nc}->($self);
8221    }
8222
8223        redo A;
8224      }
8225    } elsif ($state == CONTENT_KEYWORD_STATE) {
8226      if ($is_space->{$nc}) {
8227        $self->{state} = AFTER_MD_DEF_STATE;
8228
8229    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8230      $self->{line_prev} = $self->{line};
8231      $self->{column_prev} = $self->{column};
8232      $self->{column}++;
8233      $self->{nc}
8234          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8235    } else {
8236      $self->{set_nc}->($self);
8237    }
8238
8239        redo A;
8240      } elsif ($nc == 0x003E) { # >
8241        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8242
8243    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8244      $self->{line_prev} = $self->{line};
8245      $self->{column_prev} = $self->{column};
8246      $self->{column}++;
8247      $self->{nc}
8248          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8249    } else {
8250      $self->{set_nc}->($self);
8251    }
8252
8253        return  ($self->{ct}); # ELEMENT
8254        redo A;
8255      } elsif ($nc == EOF_CHAR) {
8256        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8257        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8258
8259    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8260      $self->{line_prev} = $self->{line};
8261      $self->{column_prev} = $self->{column};
8262      $self->{column}++;
8263      $self->{nc}
8264          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8265    } else {
8266      $self->{set_nc}->($self);
8267    }
8268
8269        ## Discard the current token.
8270        redo A;
8271      } else {
8272        if ($nc == 0x0000) {
8273          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8274        }
8275        $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ELEMENT
8276        ## Stay in the state.
8277
8278    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8279      $self->{line_prev} = $self->{line};
8280      $self->{column_prev} = $self->{column};
8281      $self->{column}++;
8282      $self->{nc}
8283          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8284    } else {
8285      $self->{set_nc}->($self);
8286    }
8287
8288        redo A;
8289      }
8290    } elsif ($state == AFTER_CM_GROUP_OPEN_STATE) {
8291      if ($is_space->{$nc}) {
8292        ## Stay in the state.
8293
8294    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8295      $self->{line_prev} = $self->{line};
8296      $self->{column_prev} = $self->{column};
8297      $self->{column}++;
8298      $self->{nc}
8299          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8300    } else {
8301      $self->{set_nc}->($self);
8302    }
8303
8304        redo A;
8305      } elsif ($nc == 0x0028) { # (
8306        $self->{group_depth}++;
8307        push @{$self->{ct}->{content}}, chr $nc;
8308        ## Stay in the state.
8309
8310    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8311      $self->{line_prev} = $self->{line};
8312      $self->{column_prev} = $self->{column};
8313      $self->{column}++;
8314      $self->{nc}
8315          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8316    } else {
8317      $self->{set_nc}->($self);
8318    }
8319
8320        redo A;
8321      } elsif ($nc == 0x007C or # |
8322               $nc == 0x002C) { # ,
8323        $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8324        ## Stay in the state.
8325
8326    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8327      $self->{line_prev} = $self->{line};
8328      $self->{column_prev} = $self->{column};
8329      $self->{column}++;
8330      $self->{nc}
8331          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8332    } else {
8333      $self->{set_nc}->($self);
8334    }
8335
8336        redo A;
8337      } elsif ($nc == 0x0029) { # )
8338        $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8339        push @{$self->{ct}->{content}}, chr $nc;
8340        $self->{group_depth}--;
8341        $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8342
8343    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8344      $self->{line_prev} = $self->{line};
8345      $self->{column_prev} = $self->{column};
8346      $self->{column}++;
8347      $self->{nc}
8348          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8349    } else {
8350      $self->{set_nc}->($self);
8351    }
8352
8353        redo A;
8354      } elsif ($nc == 0x003E) { # >
8355        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8356        push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8357        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8358
8359    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8360      $self->{line_prev} = $self->{line};
8361      $self->{column_prev} = $self->{column};
8362      $self->{column}++;
8363      $self->{nc}
8364          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8365    } else {
8366      $self->{set_nc}->($self);
8367    }
8368
8369        return  ($self->{ct}); # ELEMENT
8370        redo A;
8371      } elsif ($nc == EOF_CHAR) {
8372        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8373        #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8374        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8375
8376    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8377      $self->{line_prev} = $self->{line};
8378      $self->{column_prev} = $self->{column};
8379      $self->{column}++;
8380      $self->{nc}
8381          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8382    } else {
8383      $self->{set_nc}->($self);
8384    }
8385
8386        ## Discard the current token.
8387        redo A;
8388      } else {
8389        if ($nc == 0x0000) {
8390          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8391        }
8392        push @{$self->{ct}->{content}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
8393        $self->{state} = CM_ELEMENT_NAME_STATE;
8394
8395    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8396      $self->{line_prev} = $self->{line};
8397      $self->{column_prev} = $self->{column};
8398      $self->{column}++;
8399      $self->{nc}
8400          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8401    } else {
8402      $self->{set_nc}->($self);
8403    }
8404
8405        redo A;
8406      }
8407    } elsif ($state == CM_ELEMENT_NAME_STATE) {
8408      if ($is_space->{$nc}) {
8409        $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8410
8411    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8412      $self->{line_prev} = $self->{line};
8413      $self->{column_prev} = $self->{column};
8414      $self->{column}++;
8415      $self->{nc}
8416          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8417    } else {
8418      $self->{set_nc}->($self);
8419    }
8420
8421        redo A;
8422      } elsif ($nc == 0x002A or # *
8423               $nc == 0x002B or # +
8424               $nc == 0x003F) { # ?
8425        push @{$self->{ct}->{content}}, chr $nc;
8426        $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8427
8428    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8429      $self->{line_prev} = $self->{line};
8430      $self->{column_prev} = $self->{column};
8431      $self->{column}++;
8432      $self->{nc}
8433          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8434    } else {
8435      $self->{set_nc}->($self);
8436    }
8437
8438        redo A;
8439      } elsif ($nc == 0x007C or # |
8440               $nc == 0x002C) { # ,
8441        push @{$self->{ct}->{content}}, $nc == 0x007C ? ' | ' : ', ';
8442        $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8443
8444    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8445      $self->{line_prev} = $self->{line};
8446      $self->{column_prev} = $self->{column};
8447      $self->{column}++;
8448      $self->{nc}
8449          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8450    } else {
8451      $self->{set_nc}->($self);
8452    }
8453
8454        redo A;
8455      } elsif ($nc == 0x0029) { # )
8456        $self->{group_depth}--;
8457        push @{$self->{ct}->{content}}, chr $nc;
8458        $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8459
8460    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8461      $self->{line_prev} = $self->{line};
8462      $self->{column_prev} = $self->{column};
8463      $self->{column}++;
8464      $self->{nc}
8465          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8466    } else {
8467      $self->{set_nc}->($self);
8468    }
8469
8470        redo A;
8471      } elsif ($nc == 0x003E) { # >
8472        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8473        push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8474        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8475
8476    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8477      $self->{line_prev} = $self->{line};
8478      $self->{column_prev} = $self->{column};
8479      $self->{column}++;
8480      $self->{nc}
8481          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8482    } else {
8483      $self->{set_nc}->($self);
8484    }
8485
8486        return  ($self->{ct}); # ELEMENT
8487        redo A;
8488      } elsif ($nc == EOF_CHAR) {
8489        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8490        #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8491        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8492
8493    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8494      $self->{line_prev} = $self->{line};
8495      $self->{column_prev} = $self->{column};
8496      $self->{column}++;
8497      $self->{nc}
8498          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8499    } else {
8500      $self->{set_nc}->($self);
8501    }
8502
8503        ## Discard the token.
8504        redo A;
8505      } else {
8506        if ($nc == 0x0000) {
8507          $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
8508        }
8509        $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc;
8510        ## Stay in the state.
8511
8512    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8513      $self->{line_prev} = $self->{line};
8514      $self->{column_prev} = $self->{column};
8515      $self->{column}++;
8516      $self->{nc}
8517          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8518    } else {
8519      $self->{set_nc}->($self);
8520    }
8521
8522        redo A;
8523      }
8524    } elsif ($state == AFTER_CM_ELEMENT_NAME_STATE) {
8525      if ($is_space->{$nc}) {
8526        ## Stay in the state.
8527
8528    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8529      $self->{line_prev} = $self->{line};
8530      $self->{column_prev} = $self->{column};
8531      $self->{column}++;
8532      $self->{nc}
8533          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8534    } else {
8535      $self->{set_nc}->($self);
8536    }
8537
8538        redo A;
8539      } elsif ($nc == 0x007C or # |
8540               $nc == 0x002C) { # ,
8541        push @{$self->{ct}->{content}}, $nc == 0x007C ? ' | ' : ', ';
8542        $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8543
8544    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8545      $self->{line_prev} = $self->{line};
8546      $self->{column_prev} = $self->{column};
8547      $self->{column}++;
8548      $self->{nc}
8549          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8550    } else {
8551      $self->{set_nc}->($self);
8552    }
8553
8554        redo A;
8555      } elsif ($nc == 0x0029) { # )
8556        $self->{group_depth}--;
8557        push @{$self->{ct}->{content}}, chr $nc;
8558        $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8559
8560    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8561      $self->{line_prev} = $self->{line};
8562      $self->{column_prev} = $self->{column};
8563      $self->{column}++;
8564      $self->{nc}
8565          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8566    } else {
8567      $self->{set_nc}->($self);
8568    }
8569
8570        redo A;
8571      } elsif ($nc == 0x003E) { # >
8572        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8573        push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8574        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8575
8576    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8577      $self->{line_prev} = $self->{line};
8578      $self->{column_prev} = $self->{column};
8579      $self->{column}++;
8580      $self->{nc}
8581          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8582    } else {
8583      $self->{set_nc}->($self);
8584    }
8585
8586        return  ($self->{ct}); # ELEMENT
8587        redo A;
8588      } elsif ($nc == EOF_CHAR) {
8589        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8590        #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8591        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8592
8593    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8594      $self->{line_prev} = $self->{line};
8595      $self->{column_prev} = $self->{column};
8596      $self->{column}++;
8597      $self->{nc}
8598          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8599    } else {
8600      $self->{set_nc}->($self);
8601    }
8602
8603        ## Discard the current token.
8604        redo A;
8605      } else {
8606        $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8607        push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8608        $self->{state} = BOGUS_MD_STATE;
8609
8610    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8611      $self->{line_prev} = $self->{line};
8612      $self->{column_prev} = $self->{column};
8613      $self->{column}++;
8614      $self->{nc}
8615          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8616    } else {
8617      $self->{set_nc}->($self);
8618    }
8619
8620        redo A;
8621      }
8622    } elsif ($state == AFTER_CM_GROUP_CLOSE_STATE) {
8623      if ($is_space->{$nc}) {
8624        if ($self->{group_depth}) {
8625          $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8626        } else {
8627          $self->{state} = AFTER_MD_DEF_STATE;
8628        }
8629
8630    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8631      $self->{line_prev} = $self->{line};
8632      $self->{column_prev} = $self->{column};
8633      $self->{column}++;
8634      $self->{nc}
8635          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8636    } else {
8637      $self->{set_nc}->($self);
8638    }
8639
8640        redo A;
8641      } elsif ($nc == 0x002A or # *
8642               $nc == 0x002B or # +
8643               $nc == 0x003F) { # ?
8644        push @{$self->{ct}->{content}}, chr $nc;
8645        if ($self->{group_depth}) {
8646          $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8647        } else {
8648          $self->{state} = AFTER_MD_DEF_STATE;
8649        }
8650
8651    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8652      $self->{line_prev} = $self->{line};
8653      $self->{column_prev} = $self->{column};
8654      $self->{column}++;
8655      $self->{nc}
8656          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8657    } else {
8658      $self->{set_nc}->($self);
8659    }
8660
8661        redo A;
8662      } elsif ($nc == 0x0029) { # )
8663        if ($self->{group_depth}) {
8664          $self->{group_depth}--;
8665          push @{$self->{ct}->{content}}, chr $nc;
8666          ## Stay in the state.
8667
8668    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8669      $self->{line_prev} = $self->{line};
8670      $self->{column_prev} = $self->{column};
8671      $self->{column}++;
8672      $self->{nc}
8673          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8674    } else {
8675      $self->{set_nc}->($self);
8676    }
8677
8678          redo A;
8679        } else {
8680          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8681          $self->{state} = BOGUS_MD_STATE;
8682          ## Reconsume.
8683          redo A;
8684        }
8685      } elsif ($nc == 0x003E) { # >
8686        if ($self->{group_depth}) {
8687          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8688          push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8689        }
8690        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8691
8692    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8693      $self->{line_prev} = $self->{line};
8694      $self->{column_prev} = $self->{column};
8695      $self->{column}++;
8696      $self->{nc}
8697          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8698    } else {
8699      $self->{set_nc}->($self);
8700    }
8701
8702        return  ($self->{ct}); # ELEMENT
8703        redo A;
8704      } elsif ($nc == EOF_CHAR) {
8705        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8706        #push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8707        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8708
8709    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8710      $self->{line_prev} = $self->{line};
8711      $self->{column_prev} = $self->{column};
8712      $self->{column}++;
8713      $self->{nc}
8714          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8715    } else {
8716      $self->{set_nc}->($self);
8717    }
8718
8719        ## Discard the current token.
8720        redo A;
8721      } else {
8722        if ($self->{group_depth}) {
8723          $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8724        } else {
8725          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8726          $self->{state} = BOGUS_MD_STATE;
8727        }
8728        ## Reconsume.
8729        redo A;
8730      }
8731    } elsif ($state == AFTER_MD_DEF_STATE) {
8732      if ($is_space->{$nc}) {
8733        ## Stay in the state.
8734
8735    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8736      $self->{line_prev} = $self->{line};
8737      $self->{column_prev} = $self->{column};
8738      $self->{column}++;
8739      $self->{nc}
8740          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8741    } else {
8742      $self->{set_nc}->($self);
8743    }
8744
8745        redo A;
8746      } elsif ($nc == 0x003E) { # >
8747        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8748
8749    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8750      $self->{line_prev} = $self->{line};
8751      $self->{column_prev} = $self->{column};
8752      $self->{column}++;
8753      $self->{nc}
8754          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8755    } else {
8756      $self->{set_nc}->($self);
8757    }
8758
8759        return  ($self->{ct}); # ENTITY/ELEMENT
8760        redo A;
8761      } elsif ($nc == EOF_CHAR) {
8762        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8763        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8764
8765    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8766      $self->{line_prev} = $self->{line};
8767      $self->{column_prev} = $self->{column};
8768      $self->{column}++;
8769      $self->{nc}
8770          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8771    } else {
8772      $self->{set_nc}->($self);
8773    }
8774
8775        ## Discard the current token.
8776        redo A;
8777      } else {
8778        $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8779        $self->{state} = BOGUS_MD_STATE;
8780        ## Reconsume.
8781        redo A;
8782      }
8783    } elsif ($state == BOGUS_MD_STATE) {
8784      if ($nc == 0x003E) { # >
8785        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8786
8787    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8788      $self->{line_prev} = $self->{line};
8789      $self->{column_prev} = $self->{column};
8790      $self->{column}++;
8791      $self->{nc}
8792          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8793    } else {
8794      $self->{set_nc}->($self);
8795    }
8796
8797        return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8798        redo A;
8799      } elsif ($nc == EOF_CHAR) {
8800        $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8801        ## Reconsume.
8802        ## Discard the current token.
8803        redo A;
8804      } else {
8805        ## Stay in the state.
8806
8807    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8808      $self->{line_prev} = $self->{line};
8809      $self->{column_prev} = $self->{column};
8810      $self->{column}++;
8811      $self->{nc}
8812          = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8813    } else {
8814      $self->{set_nc}->($self);
8815    }
8816
8817        redo A;
8818      }
8819    } else {
8820      die "$0: $state: Unknown state";
8821    }
8822  } # A
8823
8824  die "$0: _get_next_token: unexpected case";
8825} # _get_next_token
8826
88271;
8828
8829# Copyright 2007-2011 Wakaba <w@suika.fam.cx>.
8830#
8831# This library is free software; you can redistribute it and/or modify
8832# it under the same terms as Perl itself.
8833
8834