# The following YAML grammar is LL(1) and is parsed by a recursive descent
# parser.
#
# stream            ::= STREAM-START implicit_document? explicit_document* STREAM-END
# implicit_document ::= block_node DOCUMENT-END*
# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
# block_node_or_indentless_sequence ::=
#                       ALIAS
#                       | properties (block_content | indentless_block_sequence)?
#                       | block_content
#                       | indentless_block_sequence
# block_node        ::= ALIAS
#                       | properties block_content?
#                       | block_content
# flow_node         ::= ALIAS
#                       | properties flow_content?
#                       | flow_content
# properties        ::= TAG ANCHOR? | ANCHOR TAG?
# block_content     ::= block_collection | flow_collection | SCALAR
# flow_content      ::= flow_collection | SCALAR
# block_collection  ::= block_sequence | block_mapping
# flow_collection   ::= flow_sequence | flow_mapping
# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
# indentless_sequence   ::= (BLOCK-ENTRY block_node?)+
# block_mapping     ::= BLOCK-MAPPING_START
#                       ((KEY block_node_or_indentless_sequence?)?
#                       (VALUE block_node_or_indentless_sequence?)?)*
#                       BLOCK-END
# flow_sequence     ::= FLOW-SEQUENCE-START
#                       (flow_sequence_entry FLOW-ENTRY)*
#                       flow_sequence_entry?
#                       FLOW-SEQUENCE-END
# flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
# flow_mapping      ::= FLOW-MAPPING-START
#                       (flow_mapping_entry FLOW-ENTRY)*
#                       flow_mapping_entry?
#                       FLOW-MAPPING-END
# flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
#
# FIRST sets:
#
# stream: { STREAM-START }
# explicit_document: { DIRECTIVE DOCUMENT-START }
# implicit_document: FIRST(block_node)
# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_sequence: { BLOCK-SEQUENCE-START }
# block_mapping: { BLOCK-MAPPING-START }
# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
# indentless_sequence: { BLOCK-ENTRY }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_sequence: { FLOW-SEQUENCE-START }
# flow_mapping: { FLOW-MAPPING-START }
# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
62__all__ = ['Parser', 'ParserError']
63
64from .error import MarkedYAMLError
65from .tokens import *
66from .events import *
67from .scanner import *
68
class ParserError(MarkedYAMLError):
    """Raised when the token stream cannot be parsed into a valid event stream."""
71
class Parser:
    # Since writing a recursive-descendant parser is a straightforward task, we
    # do not give many comments here.
    #
    # NOTE(review): check_token()/peek_token()/get_token() are not defined in
    # this class; presumably it is composed (via multiple inheritance) with a
    # scanner that supplies them -- confirm against the loader definition.
    #
    # The parser is a pushdown state machine: `self.state` holds the bound
    # method that produces the next event, `self.states` is the stack of
    # continuations to return to, and `self.marks` stacks the start marks of
    # currently open collections (used only for error messages).

    # Fallback handle->prefix resolutions, used when a document declares no
    # '%TAG' directives of its own.
    DEFAULT_TAGS = {
        '!':   '!',
        '!!':  'tag:yaml.org,2002:',
    }

    def __init__(self):
        # `current_event` caches one lookahead event for check/peek/get.
        self.current_event = None
        self.yaml_version = None
        self.tag_handles = {}
        self.states = []
        self.marks = []
        self.state = self.parse_stream_start

    def dispose(self):
        # Reset the state attributes (to clear self-references: `state` and
        # `states` hold bound methods that keep this instance alive).
        self.states = []
        self.state = None

    def check_event(self, *choices):
        # Check the type of the next event; with no arguments, just check
        # that some event is available.  Produces (and caches) the next
        # event on demand.
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        if self.current_event is not None:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek_event(self):
        # Get the next event without consuming it.
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        return self.current_event

    def get_event(self):
        # Get the next event and proceed further (the cached lookahead is
        # consumed).
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        value = self.current_event
        self.current_event = None
        return value

    # stream    ::= STREAM-START implicit_document? explicit_document* STREAM-END
    # implicit_document ::= block_node DOCUMENT-END*
    # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*

    def parse_stream_start(self):

        # Parse the stream start.
        token = self.get_token()
        event = StreamStartEvent(token.start_mark, token.end_mark,
                encoding=token.encoding)

        # Prepare the next state.
        self.state = self.parse_implicit_document_start

        return event

    def parse_implicit_document_start(self):

        # Parse an implicit document: a bare block node with no directives
        # and no '---' indicator.
        if not self.check_token(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            # The class-level dict is bound directly (no copy); it is never
            # mutated before process_directives() rebinds tag_handles.
            self.tag_handles = self.DEFAULT_TAGS
            token = self.peek_token()
            start_mark = end_mark = token.start_mark
            event = DocumentStartEvent(start_mark, end_mark,
                    explicit=False)

            # Prepare the next state.
            self.states.append(self.parse_document_end)
            self.state = self.parse_block_node

            return event

        else:
            return self.parse_document_start()

    def parse_document_start(self):

        # Parse any extra document end indicators ('...').
        while self.check_token(DocumentEndToken):
            self.get_token()

        # Parse an explicit document: DIRECTIVE* DOCUMENT-START.
        if not self.check_token(StreamEndToken):
            token = self.peek_token()
            start_mark = token.start_mark
            version, tags = self.process_directives()
            if not self.check_token(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.peek_token().id,
                        self.peek_token().start_mark)
            token = self.get_token()
            end_mark = token.end_mark
            event = DocumentStartEvent(start_mark, end_mark,
                    explicit=True, version=version, tags=tags)
            self.states.append(self.parse_document_end)
            self.state = self.parse_document_content
        else:
            # Parse the end of the stream.
            token = self.get_token()
            event = StreamEndEvent(token.start_mark, token.end_mark)
            # All open documents and collections must have been closed.
            assert not self.states
            assert not self.marks
            self.state = None
        return event

    def parse_document_end(self):

        # Parse the document end; the '...' indicator is optional
        # (explicit=False when it is absent).
        token = self.peek_token()
        start_mark = end_mark = token.start_mark
        explicit = False
        if self.check_token(DocumentEndToken):
            token = self.get_token()
            end_mark = token.end_mark
            explicit = True
        event = DocumentEndEvent(start_mark, end_mark,
                explicit=explicit)

        # Prepare the next state.
        self.state = self.parse_document_start

        return event

    def parse_document_content(self):
        # An explicit document with no content yields an empty scalar.
        if self.check_token(DirectiveToken,
                DocumentStartToken, DocumentEndToken, StreamEndToken):
            event = self.process_empty_scalar(self.peek_token().start_mark)
            self.state = self.states.pop()
            return event
        else:
            return self.parse_block_node()

    def process_directives(self):
        # Consume all DIRECTIVE tokens preceding a document.  Returns the
        # (version, tags) pair for DocumentStartEvent, where `tags` is a
        # copy of the explicit '%TAG' declarations or None if there were
        # none; the default handles are then merged into tag_handles for
        # node-tag resolution in parse_node().
        self.yaml_version = None
        self.tag_handles = {}
        while self.check_token(DirectiveToken):
            token = self.get_token()
            if token.name == 'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == 'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle,
                            token.start_mark)
                self.tag_handles[handle] = prefix
        if self.tag_handles:
            value = self.yaml_version, self.tag_handles.copy()
        else:
            value = self.yaml_version, None
        # Fill in the default handles without overriding explicit ones.
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]
        return value

    # block_node_or_indentless_sequence ::= ALIAS
    #               | properties (block_content | indentless_block_sequence)?
    #               | block_content
    #               | indentless_block_sequence
    # block_node    ::= ALIAS
    #                   | properties block_content?
    #                   | block_content
    # flow_node     ::= ALIAS
    #                   | properties flow_content?
    #                   | flow_content
    # properties    ::= TAG ANCHOR? | ANCHOR TAG?
    # block_content     ::= block_collection | flow_collection | SCALAR
    # flow_content      ::= flow_collection | SCALAR
    # block_collection  ::= block_sequence | block_mapping
    # flow_collection   ::= flow_sequence | flow_mapping

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # Parse a single node: an alias, or optional properties (anchor/tag
        # in either order) followed by node content.  `block` permits block
        # collections; `indentless_sequence` additionally permits a sequence
        # introduced directly by '-' entries (no BLOCK-SEQUENCE-START token).
        if self.check_token(AliasToken):
            token = self.get_token()
            event = AliasEvent(token.value, token.start_mark, token.end_mark)
            self.state = self.states.pop()
        else:
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            if self.check_token(AnchorToken):
                token = self.get_token()
                start_mark = token.start_mark
                end_mark = token.end_mark
                anchor = token.value
                if self.check_token(TagToken):
                    token = self.get_token()
                    tag_mark = token.start_mark
                    end_mark = token.end_mark
                    tag = token.value
            elif self.check_token(TagToken):
                token = self.get_token()
                start_mark = tag_mark = token.start_mark
                end_mark = token.end_mark
                tag = token.value
                if self.check_token(AnchorToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    anchor = token.value
            if tag is not None:
                # Resolve a (handle, suffix) pair against the declared tag
                # handles; a None handle means the tag is already verbatim.
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle,
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            #if tag == '!':
            #    raise ParserError("while parsing a node", start_mark,
            #            "found non-specific tag '!'", tag_mark,
            #            "Please check 'http://pyyaml.org/wiki/YAMLNonSpecificTag' and share your opinion.")
            if start_mark is None:
                # No properties were present; the node starts at the content.
                start_mark = end_mark = self.peek_token().start_mark
            event = None
            implicit = (tag is None or tag == '!')
            if indentless_sequence and self.check_token(BlockEntryToken):
                end_mark = self.peek_token().end_mark
                event = SequenceStartEvent(anchor, tag, implicit,
                        start_mark, end_mark)
                self.state = self.parse_indentless_sequence_entry
            else:
                if self.check_token(ScalarToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    # For scalars, `implicit` becomes a pair:
                    # (resolvable-if-plain, resolvable-if-non-plain).
                    if (token.plain and tag is None) or tag == '!':
                        implicit = (True, False)
                    elif tag is None:
                        implicit = (False, True)
                    else:
                        implicit = (False, False)
                    event = ScalarEvent(anchor, tag, implicit, token.value,
                            start_mark, end_mark, style=token.style)
                    self.state = self.states.pop()
                elif self.check_token(FlowSequenceStartToken):
                    end_mark = self.peek_token().end_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    self.state = self.parse_flow_sequence_first_entry
                elif self.check_token(FlowMappingStartToken):
                    end_mark = self.peek_token().end_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    self.state = self.parse_flow_mapping_first_key
                elif block and self.check_token(BlockSequenceStartToken):
                    end_mark = self.peek_token().start_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    self.state = self.parse_block_sequence_first_entry
                elif block and self.check_token(BlockMappingStartToken):
                    end_mark = self.peek_token().start_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    self.state = self.parse_block_mapping_first_key
                elif anchor is not None or tag is not None:
                    # Empty scalars are allowed even if a tag or an anchor is
                    # specified.
                    event = ScalarEvent(anchor, tag, (implicit, False), '',
                            start_mark, end_mark)
                    self.state = self.states.pop()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.peek_token()
                    raise ParserError("while parsing a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
        return event

    # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END

    def parse_block_sequence_first_entry(self):
        # Consume BLOCK-SEQUENCE-START and remember where the sequence
        # began, for error reporting.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_block_sequence_entry()

    def parse_block_sequence_entry(self):
        if self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken, BlockEndToken):
                self.states.append(self.parse_block_sequence_entry)
                return self.parse_block_node()
            else:
                # A '-' with no node after it: emit an empty scalar.
                self.state = self.parse_block_sequence_entry
                return self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while parsing a block collection", self.marks[-1],
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        event = SequenceEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    # indentless_sequence ::= (BLOCK-ENTRY block_node?)+

    def parse_indentless_sequence_entry(self):
        # Like parse_block_sequence_entry, but the sequence has no explicit
        # start/end tokens, so any of KEY/VALUE/BLOCK-END also terminates an
        # entry and the end event is zero-width at the next token.
        if self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_indentless_sequence_entry)
                return self.parse_block_node()
            else:
                self.state = self.parse_indentless_sequence_entry
                return self.process_empty_scalar(token.end_mark)
        token = self.peek_token()
        event = SequenceEndEvent(token.start_mark, token.start_mark)
        self.state = self.states.pop()
        return event

    # block_mapping     ::= BLOCK-MAPPING_START
    #                       ((KEY block_node_or_indentless_sequence?)?
    #                       (VALUE block_node_or_indentless_sequence?)?)*
    #                       BLOCK-END

    def parse_block_mapping_first_key(self):
        # Consume BLOCK-MAPPING-START and remember where the mapping began.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_block_mapping_key()

    def parse_block_mapping_key(self):
        if self.check_token(KeyToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_block_mapping_value)
                return self.parse_block_node_or_indentless_sequence()
            else:
                # '?' with no key node: the key is an empty scalar.
                self.state = self.parse_block_mapping_value
                return self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while parsing a block mapping", self.marks[-1],
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        event = MappingEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_block_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_block_mapping_key)
                return self.parse_block_node_or_indentless_sequence()
            else:
                # ':' with no value node: the value is an empty scalar.
                self.state = self.parse_block_mapping_key
                return self.process_empty_scalar(token.end_mark)
        else:
            # No VALUE token at all: a key with an omitted value.
            self.state = self.parse_block_mapping_key
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    # flow_sequence     ::= FLOW-SEQUENCE-START
    #                       (flow_sequence_entry FLOW-ENTRY)*
    #                       flow_sequence_entry?
    #                       FLOW-SEQUENCE-END
    # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
    #
    # Note that while production rules for both flow_sequence_entry and
    # flow_mapping_entry are equal, their interpretations are different.
    # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
    # generate an inline mapping (set syntax).

    def parse_flow_sequence_first_entry(self):
        # Consume FLOW-SEQUENCE-START and remember where '[' appeared.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_flow_sequence_entry(first=True)

    def parse_flow_sequence_entry(self, first=False):
        if not self.check_token(FlowSequenceEndToken):
            if not first:
                # Every entry after the first must be preceded by ','.
                if self.check_token(FlowEntryToken):
                    self.get_token()
                else:
                    token = self.peek_token()
                    raise ParserError("while parsing a flow sequence", self.marks[-1],
                            "expected ',' or ']', but got %r" % token.id, token.start_mark)

            if self.check_token(KeyToken):
                # A '?' entry starts a single-pair inline mapping; the KEY
                # token itself is consumed by the next state.
                token = self.peek_token()
                event = MappingStartEvent(None, None, True,
                        token.start_mark, token.end_mark,
                        flow_style=True)
                self.state = self.parse_flow_sequence_entry_mapping_key
                return event
            elif not self.check_token(FlowSequenceEndToken):
                self.states.append(self.parse_flow_sequence_entry)
                return self.parse_flow_node()
        token = self.get_token()
        event = SequenceEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_flow_sequence_entry_mapping_key(self):
        # Consume the KEY token left by parse_flow_sequence_entry.
        token = self.get_token()
        if not self.check_token(ValueToken,
                FlowEntryToken, FlowSequenceEndToken):
            self.states.append(self.parse_flow_sequence_entry_mapping_value)
            return self.parse_flow_node()
        else:
            self.state = self.parse_flow_sequence_entry_mapping_value
            return self.process_empty_scalar(token.end_mark)

    def parse_flow_sequence_entry_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
                self.states.append(self.parse_flow_sequence_entry_mapping_end)
                return self.parse_flow_node()
            else:
                self.state = self.parse_flow_sequence_entry_mapping_end
                return self.process_empty_scalar(token.end_mark)
        else:
            self.state = self.parse_flow_sequence_entry_mapping_end
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    def parse_flow_sequence_entry_mapping_end(self):
        # Close the single-pair inline mapping with a zero-width end event.
        self.state = self.parse_flow_sequence_entry
        token = self.peek_token()
        return MappingEndEvent(token.start_mark, token.start_mark)

    # flow_mapping  ::= FLOW-MAPPING-START
    #                   (flow_mapping_entry FLOW-ENTRY)*
    #                   flow_mapping_entry?
    #                   FLOW-MAPPING-END
    # flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?

    def parse_flow_mapping_first_key(self):
        # Consume FLOW-MAPPING-START and remember where '{' appeared.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_flow_mapping_key(first=True)

    def parse_flow_mapping_key(self, first=False):
        if not self.check_token(FlowMappingEndToken):
            if not first:
                # Every entry after the first must be preceded by ','.
                if self.check_token(FlowEntryToken):
                    self.get_token()
                else:
                    token = self.peek_token()
                    raise ParserError("while parsing a flow mapping", self.marks[-1],
                            "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.check_token(KeyToken):
                token = self.get_token()
                if not self.check_token(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    self.states.append(self.parse_flow_mapping_value)
                    return self.parse_flow_node()
                else:
                    self.state = self.parse_flow_mapping_value
                    return self.process_empty_scalar(token.end_mark)
            elif not self.check_token(FlowMappingEndToken):
                # A bare node with no '?'/':': a key whose value is empty.
                self.states.append(self.parse_flow_mapping_empty_value)
                return self.parse_flow_node()
        token = self.get_token()
        event = MappingEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_flow_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(FlowEntryToken, FlowMappingEndToken):
                self.states.append(self.parse_flow_mapping_key)
                return self.parse_flow_node()
            else:
                self.state = self.parse_flow_mapping_key
                return self.process_empty_scalar(token.end_mark)
        else:
            self.state = self.parse_flow_mapping_key
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    def parse_flow_mapping_empty_value(self):
        # The omitted value of a bare flow-mapping key.
        self.state = self.parse_flow_mapping_key
        return self.process_empty_scalar(self.peek_token().start_mark)

    def process_empty_scalar(self, mark):
        # A zero-length plain scalar standing in for omitted content;
        # (True, False) marks it as plain-implicit.
        return ScalarEvent(None, None, (True, False), '', mark, mark)
589
590