#default_value:foo
#include: other.manifest
#
#[test_name.js]
#  expected: ERROR
#
#  [subtest 1]
#    expected:
#      if os == win: FAIL #This is a comment
#      PASS
#

# TODO: keep comments in the tree

from io import BytesIO

from .node import (Node, AtomNode, BinaryExpressionNode, BinaryOperatorNode,
                   ConditionalNode, DataNode, IndexNode, KeyValueNode, ListNode,
                   NumberNode, StringNode, UnaryExpressionNode,
                   UnaryOperatorNode, ValueNode, VariableNode)


class ParseError(Exception):
    def __init__(self, filename, line, detail):
        self.line = line
        self.filename = filename
        self.detail = detail
        self.message = "%s: %s line %s" % (self.detail, self.filename, self.line)
        Exception.__init__(self, self.message)

# Unique sentinel objects used by the tokenizer.
eol = object()
group_start = object()
group_end = object()
digits = "0123456789"
open_parens = "[("
close_parens = "])"
parens = open_parens + close_parens
operator_chars = "=!"

unary_operators = ["not"]
binary_operators = ["==", "!=", "and", "or"]

operators = ["==", "!=", "not", "and", "or"]

atoms = {"True": True,
         "False": False,
         "Reset": object()}
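
# Values spelled with a leading "@" in a manifest (e.g. @True, @False, @Reset)
# are tokenized as atoms and mapped through the table above by Parser.atom().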

def decode(s):
    # No-op: callers already pass decoded str; presumably retained from the
    # Python 2 bytes/str transition.
    assert isinstance(s, str)
    return s


def precedence(operator_node):
    return len(operators) - operators.index(operator_node.data)
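
# Earlier entries in `operators` bind tighter: "==" and "!=" outrank "not",
# which outranks "and", which outranks "or", so "a == b and not c" groups as
# "(a == b) and (not c)".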


class TokenTypes(object):
    def __init__(self) -> None:
        for type in ["group_start", "group_end", "paren", "list_start",
                     "list_end", "separator", "ident", "string", "number",
                     "atom", "eof"]:
            setattr(self, type, type)

token_types = TokenTypes()
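
# Every token yielded by the tokenizer is a (token_type, value) pair, e.g.
# (token_types.string, "expected") or (token_types.eof, None).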


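# The tokenizer is a hand-rolled state machine: self.state always points at
# the method to run next, and each state method consumes characters from the
# current line, may yield (type, value) tokens, and sets the next state.
# Changes in indentation are reported as group_start/group_end tokens.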
class Tokenizer(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.indent_levels = [0]
        self.state = self.line_start_state
        self.next_state = self.data_line_state
        self.line_number = 0
        self.filename = ""

    def tokenize(self, stream):
        self.reset()
        assert not isinstance(stream, str)
        if isinstance(stream, bytes):
            stream = BytesIO(stream)
        if not hasattr(stream, "name"):
            self.filename = ""
        else:
            self.filename = stream.name

        self.next_line_state = self.line_start_state
        for i, line in enumerate(stream):
            assert isinstance(line, bytes)
            self.state = self.next_line_state
            assert self.state is not None
            self.next_line_state = None
            self.line_number = i + 1
            self.index = 0
            self.line = line.decode('utf-8').rstrip()
            assert isinstance(self.line, str)
            while self.state != self.eol_state:
                tokens = self.state()
                if tokens:
                    for token in tokens:
                        yield token
            self.state()
        while True:
            yield (token_types.eof, None)

    def char(self):
        if self.index == len(self.line):
            return eol
        return self.line[self.index]

    def consume(self):
        if self.index < len(self.line):
            self.index += 1

    def peek(self, length):
        return self.line[self.index:self.index + length]

    def skip_whitespace(self):
        while self.char() == " ":
            self.consume()

    def eol_state(self):
        if self.next_line_state is None:
            self.next_line_state = self.line_start_state

    def line_start_state(self):
        self.skip_whitespace()
        if self.char() == eol:
            self.state = self.eol_state
            return
        if self.index > self.indent_levels[-1]:
            self.indent_levels.append(self.index)
            yield (token_types.group_start, None)
        else:
            if self.index < self.indent_levels[-1]:
                while self.index < self.indent_levels[-1]:
                    self.indent_levels.pop()
                    yield (token_types.group_end, None)
                # This is terrible: if we were parsing an expression then
                # next_state will be expr_or_value_state, but after a dedent
                # the next line must be a heading or a key, so we go back to
                # data_line_state.
                self.next_state = self.data_line_state
            if self.index != self.indent_levels[-1]:
                raise ParseError(self.filename, self.line_number, "Unexpected indent")

        self.state = self.next_state

    def data_line_state(self):
        if self.char() == "[":
            yield (token_types.paren, self.char())
            self.consume()
            self.state = self.heading_state
        else:
            self.state = self.key_state

    def heading_state(self):
        rv = ""
        while True:
            c = self.char()
            if c == "\\":
                rv += self.consume_escape()
            elif c == "]":
                break
            elif c == eol:
                raise ParseError(self.filename, self.line_number, "EOL in heading")
            else:
                rv += c
                self.consume()

        yield (token_types.string, decode(rv))
        yield (token_types.paren, "]")
        self.consume()
        self.state = self.line_end_state
        self.next_state = self.data_line_state

    def key_state(self):
        rv = ""
        while True:
            c = self.char()
            if c == " ":
                self.skip_whitespace()
                if self.char() != ":":
                    raise ParseError(self.filename, self.line_number, "Space in key name")
                break
            elif c == ":":
                break
            elif c == eol:
                raise ParseError(self.filename, self.line_number, "EOL in key name (missing ':'?)")
            elif c == "\\":
                rv += self.consume_escape()
            else:
                rv += c
                self.consume()
        yield (token_types.string, decode(rv))
        yield (token_types.separator, ":")
        self.consume()
        self.state = self.after_key_state

    def after_key_state(self):
        self.skip_whitespace()
        c = self.char()
        if c == "#":
            self.next_state = self.expr_or_value_state
            self.state = self.comment_state
        elif c == eol:
            self.next_state = self.expr_or_value_state
            self.state = self.eol_state
        elif c == "[":
            self.state = self.list_start_state
        else:
            self.state = self.value_state

    def after_expr_state(self):
        self.skip_whitespace()
        c = self.char()
        if c == "#":
            self.next_state = self.after_expr_state
            self.state = self.comment_state
        elif c == eol:
            self.next_state = self.after_expr_state
            self.state = self.eol_state
        elif c == "[":
            self.state = self.list_start_state
        else:
            self.state = self.value_state

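    # List values look like `key: [a, "b, c", @True]`: items are separated by
    # commas, may be quoted, and an @-prefixed item becomes an atom token.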
    def list_start_state(self):
        yield (token_types.list_start, "[")
        self.consume()
        self.state = self.list_value_start_state

    def list_value_start_state(self):
        self.skip_whitespace()
        if self.char() == "]":
            self.state = self.list_end_state
        elif self.char() in ("'", '"'):
            quote_char = self.char()
            self.consume()
            yield (token_types.string, self.consume_string(quote_char))
            self.skip_whitespace()
            if self.char() == "]":
                self.state = self.list_end_state
            elif self.char() != ",":
                raise ParseError(self.filename, self.line_number, "Junk after quoted string")
            self.consume()
        elif self.char() == "#":
            self.state = self.comment_state
            self.next_line_state = self.list_value_start_state
        elif self.char() == eol:
            self.next_line_state = self.list_value_start_state
            self.state = self.eol_state
        elif self.char() == ",":
            raise ParseError(self.filename, self.line_number, "List item started with separator")
        elif self.char() == "@":
            self.state = self.list_value_atom_state
        else:
            self.state = self.list_value_state

    def list_value_state(self):
        rv = ""
        spaces = 0
        while True:
            c = self.char()
            if c == "\\":
                escape = self.consume_escape()
                rv += escape
            elif c == eol:
                raise ParseError(self.filename, self.line_number, "EOL in list value")
            elif c == "#":
                raise ParseError(self.filename, self.line_number, "EOL in list value (comment)")
            elif c == ",":
                self.state = self.list_value_start_state
                self.consume()
                break
            elif c == " ":
                spaces += 1
                self.consume()
            elif c == "]":
                self.state = self.list_end_state
                self.consume()
                break
            else:
                rv += " " * spaces
                spaces = 0
                rv += c
                self.consume()

        if rv:
            yield (token_types.string, decode(rv))

    def list_value_atom_state(self):
        self.consume()
        for _, value in self.list_value_state():
            yield token_types.atom, value

    def list_end_state(self):
        self.consume()
        yield (token_types.list_end, "]")
        self.state = self.line_end_state

    def value_state(self):
        self.skip_whitespace()
        c = self.char()
        if c in ("'", '"'):
            quote_char = self.char()
            self.consume()
            yield (token_types.string, self.consume_string(quote_char))
            if self.char() == "#":
                self.state = self.comment_state
            else:
                self.state = self.line_end_state
        elif c == "@":
            self.consume()
            for _, value in self.value_inner_state():
                yield token_types.atom, value
        elif c == "[":
            self.state = self.list_start_state
        else:
            self.state = self.value_inner_state

    def value_inner_state(self):
        rv = ""
        spaces = 0
        while True:
            c = self.char()
            if c == "\\":
                rv += self.consume_escape()
            elif c == "#":
                self.state = self.comment_state
                break
            elif c == " ":
                # Prevent whitespace before comments from being included in the value
                spaces += 1
                self.consume()
            elif c == eol:
                self.state = self.line_end_state
                break
            else:
                rv += " " * spaces
                spaces = 0
                rv += c
                self.consume()
        rv = decode(rv)
        if rv.startswith("if "):
            # Hack to avoid a problem where people write
            # disabled: if foo
            # and expect that to disable conditionally
            raise ParseError(self.filename, self.line_number, "Strings starting 'if ' must be quoted "
                             "(expressions must start on a newline and be indented)")
        yield (token_types.string, rv)

    def comment_state(self):
        while self.char() is not eol:
            self.consume()
        self.state = self.eol_state

    def line_end_state(self):
        self.skip_whitespace()
        c = self.char()
        if c == "#":
            self.state = self.comment_state
        elif c == eol:
            self.state = self.eol_state
        else:
            raise ParseError(self.filename, self.line_number, "Junk before EOL %s" % c)

    def consume_string(self, quote_char):
        rv = ""
        while True:
            c = self.char()
            if c == "\\":
                rv += self.consume_escape()
            elif c == quote_char:
                self.consume()
                break
            elif c == eol:
                raise ParseError(self.filename, self.line_number, "EOL in quoted string")
            else:
                rv += c
                self.consume()

        return decode(rv)

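    # Condition lines such as `if os == win: FAIL` are routed here:
    # expr_or_value_state peeks for a leading "if ", and expr_state then lexes
    # the expression into ident/number/string/paren tokens up to the ":"
    # separator.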
    def expr_or_value_state(self):
        if self.peek(3) == "if ":
            self.state = self.expr_state
        else:
            self.state = self.value_state

    def expr_state(self):
        self.skip_whitespace()
        c = self.char()
        if c == eol:
            raise ParseError(self.filename, self.line_number, "EOL in expression")
        elif c in "'\"":
            self.consume()
            yield (token_types.string, self.consume_string(c))
        elif c == "#":
            raise ParseError(self.filename, self.line_number, "Comment before end of expression")
        elif c == ":":
            yield (token_types.separator, c)
            self.consume()
            self.state = self.after_expr_state
        elif c in parens:
            self.consume()
            yield (token_types.paren, c)
        elif c in ("!", "="):
            self.state = self.operator_state
        elif c in digits:
            self.state = self.digit_state
        else:
            self.state = self.ident_state

    def operator_state(self):
        # Only symbolic operators
        index_0 = self.index
        while True:
            c = self.char()
            if c == eol:
                break
            elif c in operator_chars:
                self.consume()
            else:
                self.state = self.expr_state
                break
        yield (token_types.ident, self.line[index_0:self.index])

    def digit_state(self):
        index_0 = self.index
        seen_dot = False
        while True:
            c = self.char()
            if c == eol:
                break
            elif c in digits:
                self.consume()
            elif c == ".":
                if seen_dot:
                    raise ParseError(self.filename, self.line_number, "Invalid number")
                self.consume()
                seen_dot = True
            elif c in parens:
                break
            elif c in operator_chars:
                break
            elif c == " ":
                break
            elif c == ":":
                break
            else:
                raise ParseError(self.filename, self.line_number, "Invalid character in number")

        self.state = self.expr_state
        yield (token_types.number, self.line[index_0:self.index])

    def ident_state(self):
        index_0 = self.index
        while True:
            c = self.char()
            if c == eol:
                break
            elif c == ".":
                break
            elif c in parens:
                break
            elif c in operator_chars:
                break
            elif c == " ":
                break
            elif c == ":":
                break
            else:
                self.consume()
        self.state = self.expr_state
        yield (token_types.ident, self.line[index_0:self.index])

    def consume_escape(self):
        assert self.char() == "\\"
        self.consume()
        c = self.char()
        self.consume()
        if c == "x":
            return self.decode_escape(2)
        elif c == "u":
            return self.decode_escape(4)
        elif c == "U":
            return self.decode_escape(6)
        elif c in ["a", "b", "f", "n", "r", "t", "v"]:
            # Look the escape letter up directly instead of going through eval.
            return {"a": "\a", "b": "\b", "f": "\f", "n": "\n",
                    "r": "\r", "t": "\t", "v": "\v"}[c]
        elif c is eol:
            raise ParseError(self.filename, self.line_number, "EOL in escape")
        else:
            return c

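    # decode_escape reads `length` hex digits following \x, \u or \U and
    # returns the character they encode, e.g. "\x41" and "\u0041" both decode
    # to "A".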
    def decode_escape(self, length):
        value = 0
        for i in range(length):
            c = self.char()
            value *= 16
            value += self.escape_value(c)
            self.consume()

        return chr(value)

    def escape_value(self, c):
        if '0' <= c <= '9':
            return ord(c) - ord('0')
        elif 'a' <= c <= 'f':
            return ord(c) - ord('a') + 10
        elif 'A' <= c <= 'F':
            return ord(c) - ord('A') + 10
        else:
            raise ParseError(self.filename, self.line_number, "Invalid character escape")


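# The parser is a recursive-descent walk over the token stream: each grammar
# production (manifest, data_block, value_block, expr, ...) is a method, and
# the resulting tree is assembled via a Treebuilder.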
class Parser(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.token = None
        self.tokenizer = Tokenizer()
        self.token_generator = None
        self.tree = Treebuilder(DataNode(None))
        self.expr_builder = None
        self.expr_builders = []

    def parse(self, input):
        try:
            self.reset()
            self.token_generator = self.tokenizer.tokenize(input)
            self.consume()
            self.manifest()
            return self.tree.node
        except Exception as e:
            if not isinstance(e, ParseError):
                raise ParseError(self.tokenizer.filename,
                                 self.tokenizer.line_number,
                                 str(e))
            raise

    def consume(self):
        self.token = next(self.token_generator)

    def expect(self, type, value=None):
        if self.token[0] != type:
            raise ParseError(self.tokenizer.filename, self.tokenizer.line_number,
                             "Token type '{}' doesn't match expected type '{}'".format(self.token[0], type))
        if value is not None:
            if self.token[1] != value:
                raise ParseError(self.tokenizer.filename, self.tokenizer.line_number,
                                 "Token value '{}' doesn't match expected value '{}'".format(self.token[1], value))

        self.consume()

    def manifest(self):
        self.data_block()
        self.expect(token_types.eof)

    def data_block(self):
        while self.token[0] == token_types.string:
            self.tree.append(KeyValueNode(self.token[1]))
            self.consume()
            self.expect(token_types.separator)
            self.value_block()
            self.tree.pop()

        while self.token == (token_types.paren, "["):
            self.consume()
            if self.token[0] != token_types.string:
                raise ParseError(self.tokenizer.filename, self.tokenizer.line_number,
                                 "Token '{}' is not a string".format(self.token[0]))
            self.tree.append(DataNode(self.token[1]))
            self.consume()
            self.expect(token_types.paren, "]")
            if self.token[0] == token_types.group_start:
                self.consume()
                self.data_block()
                self.eof_or_end_group()
            self.tree.pop()

    def eof_or_end_group(self):
        if self.token[0] != token_types.eof:
            self.expect(token_types.group_end)

    def value_block(self):
        if self.token[0] == token_types.list_start:
            self.consume()
            self.list_value()
        elif self.token[0] == token_types.string:
            self.value()
        elif self.token[0] == token_types.group_start:
            self.consume()
            self.expression_values()
            if self.token[0] == token_types.string:
                self.value()
            elif self.token[0] == token_types.list_start:
                self.consume()
                self.list_value()
            self.eof_or_end_group()
        elif self.token[0] == token_types.atom:
            self.atom()
        else:
            raise ParseError(self.tokenizer.filename, self.tokenizer.line_number,
                             "Token '{}' is not a known type".format(self.token[0]))

    def list_value(self):
        self.tree.append(ListNode())
        while self.token[0] in (token_types.atom, token_types.string):
            if self.token[0] == token_types.atom:
                self.atom()
            else:
                self.value()
        self.expect(token_types.list_end)
        self.tree.pop()

    def expression_values(self):
        while self.token == (token_types.ident, "if"):
            self.consume()
            self.tree.append(ConditionalNode())
            self.expr_start()
            self.expect(token_types.separator)
            self.value_block()
            self.tree.pop()

    def value(self):
        self.tree.append(ValueNode(self.token[1]))
        self.consume()
        self.tree.pop()

    def atom(self):
        if self.token[1] not in atoms:
            raise ParseError(self.tokenizer.filename, self.tokenizer.line_number,
                             "Unrecognised symbol @%s" % self.token[1])
        self.tree.append(AtomNode(atoms[self.token[1]]))
        self.consume()
        self.tree.pop()

    def expr_start(self):
        self.expr_builder = ExpressionBuilder(self.tokenizer)
        self.expr_builders.append(self.expr_builder)
        self.expr()
        expression = self.expr_builder.finish()
        self.expr_builders.pop()
        self.expr_builder = self.expr_builders[-1] if self.expr_builders else None
        if self.expr_builder:
            self.expr_builder.operands[-1].children[-1].append(expression)
        else:
            self.tree.append(expression)
            self.tree.pop()

    def expr(self):
        self.expr_operand()
        while self.token[0] == token_types.ident and self.token[1] in binary_operators:
            self.expr_bin_op()
            self.expr_operand()

    def expr_operand(self):
        if self.token == (token_types.paren, "("):
            self.consume()
            self.expr_builder.left_paren()
            self.expr()
            self.expect(token_types.paren, ")")
            self.expr_builder.right_paren()
        elif self.token[0] == token_types.ident and self.token[1] in unary_operators:
            self.expr_unary_op()
            self.expr_operand()
        elif self.token[0] in [token_types.string, token_types.ident]:
            self.expr_value()
        elif self.token[0] == token_types.number:
            self.expr_number()
        else:
            raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Unrecognised operand")

    def expr_unary_op(self):
        if self.token[1] in unary_operators:
            self.expr_builder.push_operator(UnaryOperatorNode(self.token[1]))
            self.consume()
        else:
            raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Expected unary operator")

    def expr_bin_op(self):
        if self.token[1] in binary_operators:
            self.expr_builder.push_operator(BinaryOperatorNode(self.token[1]))
            self.consume()
        else:
            raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Expected binary operator")

    def expr_value(self):
        node_type = {token_types.string: StringNode,
                     token_types.ident: VariableNode}[self.token[0]]
        self.expr_builder.push_operand(node_type(self.token[1]))
        self.consume()
        if self.token == (token_types.paren, "["):
            self.consume()
            self.expr_builder.operands[-1].append(IndexNode())
            self.expr_start()
            self.expect(token_types.paren, "]")

    def expr_number(self):
        self.expr_builder.push_operand(NumberNode(self.token[1]))
        self.consume()


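# Treebuilder keeps a cursor into the tree under construction: append() adds a
# node and descends into it, pop() returns to the parent.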
class Treebuilder(object):
    def __init__(self, root):
        self.root = root
        self.node = root

    def append(self, node):
        assert isinstance(node, Node)
        self.node.append(node)
        self.node = node
        assert self.node is not None
        return node

    def pop(self):
        node = self.node
        self.node = self.node.parent
        assert self.node is not None
        return node


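# ExpressionBuilder implements a shunting-yard style operator-precedence
# algorithm: operands and operators live on separate stacks, and a None entry
# on the operator stack marks the bottom of a parenthesized group.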
class ExpressionBuilder(object):
    def __init__(self, tokenizer):
        self.operands = []
        self.operators = [None]
        self.tokenizer = tokenizer

    def finish(self):
        while self.operators[-1] is not None:
            self.pop_operator()
        rv = self.pop_operand()
        assert self.is_empty()
        return rv

    def left_paren(self):
        self.operators.append(None)

    def right_paren(self):
        while self.operators[-1] is not None:
            self.pop_operator()
            if not self.operators:
                raise ParseError(self.tokenizer.filename, self.tokenizer.line_number,
                                 "Unbalanced parens")

        assert self.operators.pop() is None

    def push_operator(self, operator):
        assert operator is not None
        while self.precedence(self.operators[-1]) > self.precedence(operator):
            self.pop_operator()

        self.operators.append(operator)

    def pop_operator(self):
        operator = self.operators.pop()
        if isinstance(operator, BinaryOperatorNode):
            operand_1 = self.operands.pop()
            operand_0 = self.operands.pop()
            self.operands.append(BinaryExpressionNode(operator, operand_0, operand_1))
        else:
            operand_0 = self.operands.pop()
            self.operands.append(UnaryExpressionNode(operator, operand_0))

    def push_operand(self, node):
        self.operands.append(node)

    def pop_operand(self):
        return self.operands.pop()

    def is_empty(self):
        return len(self.operands) == 0 and all(item is None for item in self.operators)

    def precedence(self, operator):
        if operator is None:
            return 0
        return precedence(operator)


def parse(stream):
    p = Parser()
    return p.parse(stream)

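# A minimal usage sketch (hypothetical input, mirroring the example at the top
# of this file; `parse` accepts bytes or a binary stream, not str):
#
#     tree = parse(b"[test_name.js]\n  expected: ERROR\n")
#
# `tree` is the root DataNode; headings become nested DataNodes and keys
# become KeyValueNodes beneath them.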