1#default_value:foo 2#include: other.manifest 3# 4#[test_name.js] 5# expected: ERROR 6# 7# [subtest 1] 8# expected: 9# os == win: FAIL #This is a comment 10# PASS 11# 12 13# TODO: keep comments in the tree 14 15from io import BytesIO 16 17from .node import (Node, AtomNode, BinaryExpressionNode, BinaryOperatorNode, 18 ConditionalNode, DataNode, IndexNode, KeyValueNode, ListNode, 19 NumberNode, StringNode, UnaryExpressionNode, 20 UnaryOperatorNode, ValueNode, VariableNode) 21 22 23class ParseError(Exception): 24 def __init__(self, filename, line, detail): 25 self.line = line 26 self.filename = filename 27 self.detail = detail 28 self.message = "%s: %s line %s" % (self.detail, self.filename, self.line) 29 Exception.__init__(self, self.message) 30 31eol = object 32group_start = object 33group_end = object 34digits = "0123456789" 35open_parens = "[(" 36close_parens = "])" 37parens = open_parens + close_parens 38operator_chars = "=!" 39 40unary_operators = ["not"] 41binary_operators = ["==", "!=", "and", "or"] 42 43operators = ["==", "!=", "not", "and", "or"] 44 45atoms = {"True": True, 46 "False": False, 47 "Reset": object()} 48 49def decode(s): 50 assert isinstance(s, str) 51 return s 52 53 54def precedence(operator_node): 55 return len(operators) - operators.index(operator_node.data) 56 57 58class TokenTypes(object): 59 def __init__(self) -> None: 60 for type in ["group_start", "group_end", "paren", "list_start", "list_end", "separator", "ident", "string", "number", "atom", "eof"]: 61 setattr(self, type, type) 62 63token_types = TokenTypes() 64 65 66class Tokenizer(object): 67 def __init__(self): 68 self.reset() 69 70 def reset(self): 71 self.indent_levels = [0] 72 self.state = self.line_start_state 73 self.next_state = self.data_line_state 74 self.line_number = 0 75 self.filename = "" 76 77 def tokenize(self, stream): 78 self.reset() 79 assert not isinstance(stream, str) 80 if isinstance(stream, bytes): 81 stream = BytesIO(stream) 82 if not hasattr(stream, "name"): 83 self.filename = "" 84 else: 85 self.filename = stream.name 86 87 self.next_line_state = self.line_start_state 88 for i, line in enumerate(stream): 89 assert isinstance(line, bytes) 90 self.state = self.next_line_state 91 assert self.state is not None 92 states = [] 93 self.next_line_state = None 94 self.line_number = i + 1 95 self.index = 0 96 self.line = line.decode('utf-8').rstrip() 97 assert isinstance(self.line, str) 98 while self.state != self.eol_state: 99 states.append(self.state) 100 tokens = self.state() 101 if tokens: 102 for token in tokens: 103 yield token 104 self.state() 105 while True: 106 yield (token_types.eof, None) 107 108 def char(self): 109 if self.index == len(self.line): 110 return eol 111 return self.line[self.index] 112 113 def consume(self): 114 if self.index < len(self.line): 115 self.index += 1 116 117 def peek(self, length): 118 return self.line[self.index:self.index + length] 119 120 def skip_whitespace(self): 121 while self.char() == " ": 122 self.consume() 123 124 def eol_state(self): 125 if self.next_line_state is None: 126 self.next_line_state = self.line_start_state 127 128 def line_start_state(self): 129 self.skip_whitespace() 130 if self.char() == eol: 131 self.state = self.eol_state 132 return 133 if self.index > self.indent_levels[-1]: 134 self.indent_levels.append(self.index) 135 yield (token_types.group_start, None) 136 else: 137 if self.index < self.indent_levels[-1]: 138 while self.index < self.indent_levels[-1]: 139 self.indent_levels.pop() 140 yield (token_types.group_end, None) 141 # This is terrible; if we were parsing an expression 142 # then the next_state will be expr_or_value but when we deindent 143 # it must always be a heading or key next so we go back to data_line_state 144 self.next_state = self.data_line_state 145 if self.index != self.indent_levels[-1]: 146 raise ParseError(self.filename, self.line_number, "Unexpected indent") 147 148 self.state = self.next_state 149 150 def data_line_state(self): 151 if self.char() == "[": 152 yield (token_types.paren, self.char()) 153 self.consume() 154 self.state = self.heading_state 155 else: 156 self.state = self.key_state 157 158 def heading_state(self): 159 rv = "" 160 while True: 161 c = self.char() 162 if c == "\\": 163 rv += self.consume_escape() 164 elif c == "]": 165 break 166 elif c == eol: 167 raise ParseError(self.filename, self.line_number, "EOL in heading") 168 else: 169 rv += c 170 self.consume() 171 172 yield (token_types.string, decode(rv)) 173 yield (token_types.paren, "]") 174 self.consume() 175 self.state = self.line_end_state 176 self.next_state = self.data_line_state 177 178 def key_state(self): 179 rv = "" 180 while True: 181 c = self.char() 182 if c == " ": 183 self.skip_whitespace() 184 if self.char() != ":": 185 raise ParseError(self.filename, self.line_number, "Space in key name") 186 break 187 elif c == ":": 188 break 189 elif c == eol: 190 raise ParseError(self.filename, self.line_number, "EOL in key name (missing ':'?)") 191 elif c == "\\": 192 rv += self.consume_escape() 193 else: 194 rv += c 195 self.consume() 196 yield (token_types.string, decode(rv)) 197 yield (token_types.separator, ":") 198 self.consume() 199 self.state = self.after_key_state 200 201 def after_key_state(self): 202 self.skip_whitespace() 203 c = self.char() 204 if c == "#": 205 self.next_state = self.expr_or_value_state 206 self.state = self.comment_state 207 elif c == eol: 208 self.next_state = self.expr_or_value_state 209 self.state = self.eol_state 210 elif c == "[": 211 self.state = self.list_start_state 212 else: 213 self.state = self.value_state 214 215 def after_expr_state(self): 216 self.skip_whitespace() 217 c = self.char() 218 if c == "#": 219 self.next_state = self.after_expr_state 220 self.state = self.comment_state 221 elif c == eol: 222 self.next_state = self.after_expr_state 223 self.state = self.eol_state 224 elif c == "[": 225 self.state = self.list_start_state 226 else: 227 self.state = self.value_state 228 229 def list_start_state(self): 230 yield (token_types.list_start, "[") 231 self.consume() 232 self.state = self.list_value_start_state 233 234 def list_value_start_state(self): 235 self.skip_whitespace() 236 if self.char() == "]": 237 self.state = self.list_end_state 238 elif self.char() in ("'", '"'): 239 quote_char = self.char() 240 self.consume() 241 yield (token_types.string, self.consume_string(quote_char)) 242 self.skip_whitespace() 243 if self.char() == "]": 244 self.state = self.list_end_state 245 elif self.char() != ",": 246 raise ParseError(self.filename, self.line_number, "Junk after quoted string") 247 self.consume() 248 elif self.char() == "#": 249 self.state = self.comment_state 250 self.next_line_state = self.list_value_start_state 251 elif self.char() == eol: 252 self.next_line_state = self.list_value_start_state 253 self.state = self.eol_state 254 elif self.char() == ",": 255 raise ParseError(self.filename, self.line_number, "List item started with separator") 256 elif self.char() == "@": 257 self.state = self.list_value_atom_state 258 else: 259 self.state = self.list_value_state 260 261 def list_value_state(self): 262 rv = "" 263 spaces = 0 264 while True: 265 c = self.char() 266 if c == "\\": 267 escape = self.consume_escape() 268 rv += escape 269 elif c == eol: 270 raise ParseError(self.filename, self.line_number, "EOL in list value") 271 elif c == "#": 272 raise ParseError(self.filename, self.line_number, "EOL in list value (comment)") 273 elif c == ",": 274 self.state = self.list_value_start_state 275 self.consume() 276 break 277 elif c == " ": 278 spaces += 1 279 self.consume() 280 elif c == "]": 281 self.state = self.list_end_state 282 self.consume() 283 break 284 else: 285 rv += " " * spaces 286 spaces = 0 287 rv += c 288 self.consume() 289 290 if rv: 291 yield (token_types.string, decode(rv)) 292 293 def list_value_atom_state(self): 294 self.consume() 295 for _, value in self.list_value_state(): 296 yield token_types.atom, value 297 298 def list_end_state(self): 299 self.consume() 300 yield (token_types.list_end, "]") 301 self.state = self.line_end_state 302 303 def value_state(self): 304 self.skip_whitespace() 305 c = self.char() 306 if c in ("'", '"'): 307 quote_char = self.char() 308 self.consume() 309 yield (token_types.string, self.consume_string(quote_char)) 310 if self.char() == "#": 311 self.state = self.comment_state 312 else: 313 self.state = self.line_end_state 314 elif c == "@": 315 self.consume() 316 for _, value in self.value_inner_state(): 317 yield token_types.atom, value 318 elif c == "[": 319 self.state = self.list_start_state 320 else: 321 self.state = self.value_inner_state 322 323 def value_inner_state(self): 324 rv = "" 325 spaces = 0 326 while True: 327 c = self.char() 328 if c == "\\": 329 rv += self.consume_escape() 330 elif c == "#": 331 self.state = self.comment_state 332 break 333 elif c == " ": 334 # prevent whitespace before comments from being included in the value 335 spaces += 1 336 self.consume() 337 elif c == eol: 338 self.state = self.line_end_state 339 break 340 else: 341 rv += " " * spaces 342 spaces = 0 343 rv += c 344 self.consume() 345 rv = decode(rv) 346 if rv.startswith("if "): 347 # Hack to avoid a problem where people write 348 # disabled: if foo 349 # and expect that to disable conditionally 350 raise ParseError(self.filename, self.line_number, "Strings starting 'if ' must be quoted " 351 "(expressions must start on a newline and be indented)") 352 yield (token_types.string, rv) 353 354 def comment_state(self): 355 while self.char() is not eol: 356 self.consume() 357 self.state = self.eol_state 358 359 def line_end_state(self): 360 self.skip_whitespace() 361 c = self.char() 362 if c == "#": 363 self.state = self.comment_state 364 elif c == eol: 365 self.state = self.eol_state 366 else: 367 raise ParseError(self.filename, self.line_number, "Junk before EOL %s" % c) 368 369 def consume_string(self, quote_char): 370 rv = "" 371 while True: 372 c = self.char() 373 if c == "\\": 374 rv += self.consume_escape() 375 elif c == quote_char: 376 self.consume() 377 break 378 elif c == eol: 379 raise ParseError(self.filename, self.line_number, "EOL in quoted string") 380 else: 381 rv += c 382 self.consume() 383 384 return decode(rv) 385 386 def expr_or_value_state(self): 387 if self.peek(3) == "if ": 388 self.state = self.expr_state 389 else: 390 self.state = self.value_state 391 392 def expr_state(self): 393 self.skip_whitespace() 394 c = self.char() 395 if c == eol: 396 raise ParseError(self.filename, self.line_number, "EOL in expression") 397 elif c in "'\"": 398 self.consume() 399 yield (token_types.string, self.consume_string(c)) 400 elif c == "#": 401 raise ParseError(self.filename, self.line_number, "Comment before end of expression") 402 elif c == ":": 403 yield (token_types.separator, c) 404 self.consume() 405 self.state = self.after_expr_state 406 elif c in parens: 407 self.consume() 408 yield (token_types.paren, c) 409 elif c in ("!", "="): 410 self.state = self.operator_state 411 elif c in digits: 412 self.state = self.digit_state 413 else: 414 self.state = self.ident_state 415 416 def operator_state(self): 417 # Only symbolic operators 418 index_0 = self.index 419 while True: 420 c = self.char() 421 if c == eol: 422 break 423 elif c in operator_chars: 424 self.consume() 425 else: 426 self.state = self.expr_state 427 break 428 yield (token_types.ident, self.line[index_0:self.index]) 429 430 def digit_state(self): 431 index_0 = self.index 432 seen_dot = False 433 while True: 434 c = self.char() 435 if c == eol: 436 break 437 elif c in digits: 438 self.consume() 439 elif c == ".": 440 if seen_dot: 441 raise ParseError(self.filename, self.line_number, "Invalid number") 442 self.consume() 443 seen_dot = True 444 elif c in parens: 445 break 446 elif c in operator_chars: 447 break 448 elif c == " ": 449 break 450 elif c == ":": 451 break 452 else: 453 raise ParseError(self.filename, self.line_number, "Invalid character in number") 454 455 self.state = self.expr_state 456 yield (token_types.number, self.line[index_0:self.index]) 457 458 def ident_state(self): 459 index_0 = self.index 460 while True: 461 c = self.char() 462 if c == eol: 463 break 464 elif c == ".": 465 break 466 elif c in parens: 467 break 468 elif c in operator_chars: 469 break 470 elif c == " ": 471 break 472 elif c == ":": 473 break 474 else: 475 self.consume() 476 self.state = self.expr_state 477 yield (token_types.ident, self.line[index_0:self.index]) 478 479 def consume_escape(self): 480 assert self.char() == "\\" 481 self.consume() 482 c = self.char() 483 self.consume() 484 if c == "x": 485 return self.decode_escape(2) 486 elif c == "u": 487 return self.decode_escape(4) 488 elif c == "U": 489 return self.decode_escape(6) 490 elif c in ["a", "b", "f", "n", "r", "t", "v"]: 491 return eval(r"'\%s'" % c) 492 elif c is eol: 493 raise ParseError(self.filename, self.line_number, "EOL in escape") 494 else: 495 return c 496 497 def decode_escape(self, length): 498 value = 0 499 for i in range(length): 500 c = self.char() 501 value *= 16 502 value += self.escape_value(c) 503 self.consume() 504 505 return chr(value) 506 507 def escape_value(self, c): 508 if '0' <= c <= '9': 509 return ord(c) - ord('0') 510 elif 'a' <= c <= 'f': 511 return ord(c) - ord('a') + 10 512 elif 'A' <= c <= 'F': 513 return ord(c) - ord('A') + 10 514 else: 515 raise ParseError(self.filename, self.line_number, "Invalid character escape") 516 517 518class Parser(object): 519 def __init__(self): 520 self.reset() 521 522 def reset(self): 523 self.token = None 524 self.unary_operators = "!" 525 self.binary_operators = frozenset(["&&", "||", "=="]) 526 self.tokenizer = Tokenizer() 527 self.token_generator = None 528 self.tree = Treebuilder(DataNode(None)) 529 self.expr_builder = None 530 self.expr_builders = [] 531 532 def parse(self, input): 533 try: 534 self.reset() 535 self.token_generator = self.tokenizer.tokenize(input) 536 self.consume() 537 self.manifest() 538 return self.tree.node 539 except Exception as e: 540 if not isinstance(e, ParseError): 541 raise ParseError(self.tokenizer.filename, 542 self.tokenizer.line_number, 543 str(e)) 544 raise 545 546 def consume(self): 547 self.token = next(self.token_generator) 548 549 def expect(self, type, value=None): 550 if self.token[0] != type: 551 raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, 552 "Token '{}' doesn't equal expected type '{}'".format(self.token[0], type)) 553 if value is not None: 554 if self.token[1] != value: 555 raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, 556 "Token '{}' doesn't equal expected value '{}'".format(self.token[1], value)) 557 558 self.consume() 559 560 def manifest(self): 561 self.data_block() 562 self.expect(token_types.eof) 563 564 def data_block(self): 565 while self.token[0] == token_types.string: 566 self.tree.append(KeyValueNode(self.token[1])) 567 self.consume() 568 self.expect(token_types.separator) 569 self.value_block() 570 self.tree.pop() 571 572 while self.token == (token_types.paren, "["): 573 self.consume() 574 if self.token[0] != token_types.string: 575 raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, 576 "Token '{}' is not a string".format(self.token[0])) 577 self.tree.append(DataNode(self.token[1])) 578 self.consume() 579 self.expect(token_types.paren, "]") 580 if self.token[0] == token_types.group_start: 581 self.consume() 582 self.data_block() 583 self.eof_or_end_group() 584 self.tree.pop() 585 586 def eof_or_end_group(self): 587 if self.token[0] != token_types.eof: 588 self.expect(token_types.group_end) 589 590 def value_block(self): 591 if self.token[0] == token_types.list_start: 592 self.consume() 593 self.list_value() 594 elif self.token[0] == token_types.string: 595 self.value() 596 elif self.token[0] == token_types.group_start: 597 self.consume() 598 self.expression_values() 599 if self.token[0] == token_types.string: 600 self.value() 601 elif self.token[0] == token_types.list_start: 602 self.consume() 603 self.list_value() 604 self.eof_or_end_group() 605 elif self.token[0] == token_types.atom: 606 self.atom() 607 else: 608 raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, 609 "Token '{}' is not a known type".format(self.token[0])) 610 611 def list_value(self): 612 self.tree.append(ListNode()) 613 while self.token[0] in (token_types.atom, token_types.string): 614 if self.token[0] == token_types.atom: 615 self.atom() 616 else: 617 self.value() 618 self.expect(token_types.list_end) 619 self.tree.pop() 620 621 def expression_values(self): 622 while self.token == (token_types.ident, "if"): 623 self.consume() 624 self.tree.append(ConditionalNode()) 625 self.expr_start() 626 self.expect(token_types.separator) 627 self.value_block() 628 self.tree.pop() 629 630 def value(self): 631 self.tree.append(ValueNode(self.token[1])) 632 self.consume() 633 self.tree.pop() 634 635 def atom(self): 636 if self.token[1] not in atoms: 637 raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Unrecognised symbol @%s" % self.token[1]) 638 self.tree.append(AtomNode(atoms[self.token[1]])) 639 self.consume() 640 self.tree.pop() 641 642 def expr_start(self): 643 self.expr_builder = ExpressionBuilder(self.tokenizer) 644 self.expr_builders.append(self.expr_builder) 645 self.expr() 646 expression = self.expr_builder.finish() 647 self.expr_builders.pop() 648 self.expr_builder = self.expr_builders[-1] if self.expr_builders else None 649 if self.expr_builder: 650 self.expr_builder.operands[-1].children[-1].append(expression) 651 else: 652 self.tree.append(expression) 653 self.tree.pop() 654 655 def expr(self): 656 self.expr_operand() 657 while (self.token[0] == token_types.ident and self.token[1] in binary_operators): 658 self.expr_bin_op() 659 self.expr_operand() 660 661 def expr_operand(self): 662 if self.token == (token_types.paren, "("): 663 self.consume() 664 self.expr_builder.left_paren() 665 self.expr() 666 self.expect(token_types.paren, ")") 667 self.expr_builder.right_paren() 668 elif self.token[0] == token_types.ident and self.token[1] in unary_operators: 669 self.expr_unary_op() 670 self.expr_operand() 671 elif self.token[0] in [token_types.string, token_types.ident]: 672 self.expr_value() 673 elif self.token[0] == token_types.number: 674 self.expr_number() 675 else: 676 raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Unrecognised operand") 677 678 def expr_unary_op(self): 679 if self.token[1] in unary_operators: 680 self.expr_builder.push_operator(UnaryOperatorNode(self.token[1])) 681 self.consume() 682 else: 683 raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Expected unary operator") 684 685 def expr_bin_op(self): 686 if self.token[1] in binary_operators: 687 self.expr_builder.push_operator(BinaryOperatorNode(self.token[1])) 688 self.consume() 689 else: 690 raise ParseError(self.tokenizer.filename, self.tokenizer.line_number, "Expected binary operator") 691 692 def expr_value(self): 693 node_type = {token_types.string: StringNode, 694 token_types.ident: VariableNode}[self.token[0]] 695 self.expr_builder.push_operand(node_type(self.token[1])) 696 self.consume() 697 if self.token == (token_types.paren, "["): 698 self.consume() 699 self.expr_builder.operands[-1].append(IndexNode()) 700 self.expr_start() 701 self.expect(token_types.paren, "]") 702 703 def expr_number(self): 704 self.expr_builder.push_operand(NumberNode(self.token[1])) 705 self.consume() 706 707 708class Treebuilder(object): 709 def __init__(self, root): 710 self.root = root 711 self.node = root 712 713 def append(self, node): 714 assert isinstance(node, Node) 715 self.node.append(node) 716 self.node = node 717 assert self.node is not None 718 return node 719 720 def pop(self): 721 node = self.node 722 self.node = self.node.parent 723 assert self.node is not None 724 return node 725 726 727class ExpressionBuilder(object): 728 def __init__(self, tokenizer): 729 self.operands = [] 730 self.operators = [None] 731 self.tokenizer = tokenizer 732 733 def finish(self): 734 while self.operators[-1] is not None: 735 self.pop_operator() 736 rv = self.pop_operand() 737 assert self.is_empty() 738 return rv 739 740 def left_paren(self): 741 self.operators.append(None) 742 743 def right_paren(self): 744 while self.operators[-1] is not None: 745 self.pop_operator() 746 if not self.operators: 747 raise ParseError(self.tokenizer.filename, self.tokenizer.line, 748 "Unbalanced parens") 749 750 assert self.operators.pop() is None 751 752 def push_operator(self, operator): 753 assert operator is not None 754 while self.precedence(self.operators[-1]) > self.precedence(operator): 755 self.pop_operator() 756 757 self.operators.append(operator) 758 759 def pop_operator(self): 760 operator = self.operators.pop() 761 if isinstance(operator, BinaryOperatorNode): 762 operand_1 = self.operands.pop() 763 operand_0 = self.operands.pop() 764 self.operands.append(BinaryExpressionNode(operator, operand_0, operand_1)) 765 else: 766 operand_0 = self.operands.pop() 767 self.operands.append(UnaryExpressionNode(operator, operand_0)) 768 769 def push_operand(self, node): 770 self.operands.append(node) 771 772 def pop_operand(self): 773 return self.operands.pop() 774 775 def is_empty(self): 776 return len(self.operands) == 0 and all(item is None for item in self.operators) 777 778 def precedence(self, operator): 779 if operator is None: 780 return 0 781 return precedence(operator) 782 783 784def parse(stream): 785 p = Parser() 786 return p.parse(stream) 787