1from __future__ import unicode_literals 2import re 3from . import ast 4from .stream import EOF, EOL, FluentParserStream 5from .errors import ParseError 6 7 8def with_span(fn): 9 def decorated(self, ps, *args, **kwargs): 10 if not self.with_spans: 11 return fn(self, ps, *args, **kwargs) 12 13 start = ps.index 14 node = fn(self, ps, *args, **kwargs) 15 16 # Don't re-add the span if the node already has it. This may happen 17 # when one decorated function calls another decorated function. 18 if node.span is not None: 19 return node 20 21 end = ps.index 22 node.add_span(start, end) 23 return node 24 25 return decorated 26 27 28class FluentParser(object): 29 def __init__(self, with_spans=True): 30 self.with_spans = with_spans 31 32 def parse(self, source): 33 ps = FluentParserStream(source) 34 ps.skip_blank_block() 35 36 entries = [] 37 last_comment = None 38 39 while ps.current_char: 40 entry = self.get_entry_or_junk(ps) 41 blank_lines = ps.skip_blank_block() 42 43 # Regular Comments require special logic. Comments may be attached 44 # to Messages or Terms if they are followed immediately by them. 45 # However they should parse as standalone when they're followed by 46 # Junk. Consequently, we only attach Comments once we know that the 47 # Message or the Term parsed successfully. 48 if isinstance(entry, ast.Comment) and len(blank_lines) == 0 \ 49 and ps.current_char: 50 # Stash the comment and decide what to do with it 51 # in the next pass. 52 last_comment = entry 53 continue 54 55 if last_comment is not None: 56 if isinstance(entry, (ast.Message, ast.Term)): 57 entry.comment = last_comment 58 if self.with_spans: 59 entry.span.start = entry.comment.span.start 60 else: 61 entries.append(last_comment) 62 # In either case, the stashed comment has been dealt with; 63 # clear it. 64 last_comment = None 65 66 if isinstance(entry, ast.Comment) \ 67 and ps.last_comment_zero_four_syntax \ 68 and len(entries) == 0: 69 comment = ast.ResourceComment(entry.content) 70 comment.span = entry.span 71 entries.append(comment) 72 else: 73 entries.append(entry) 74 75 ps.last_comment_zero_four_syntax = False 76 77 res = ast.Resource(entries) 78 79 if self.with_spans: 80 res.add_span(0, ps.index) 81 82 return res 83 84 def parse_entry(self, source): 85 """Parse the first Message or Term in source. 86 87 Skip all encountered comments and start parsing at the first Mesage 88 or Term start. Return Junk if the parsing is not successful. 89 90 Preceding comments are ignored unless they contain syntax errors 91 themselves, in which case Junk for the invalid comment is returned. 92 """ 93 ps = FluentParserStream(source) 94 ps.skip_blank_block() 95 96 while ps.current_char == '#': 97 skipped = self.get_entry_or_junk(ps) 98 if isinstance(skipped, ast.Junk): 99 # Don't skip Junk comments. 100 return skipped 101 ps.skip_blank_block() 102 103 return self.get_entry_or_junk(ps) 104 105 def get_entry_or_junk(self, ps): 106 entry_start_pos = ps.index 107 108 try: 109 entry = self.get_entry(ps) 110 ps.expect_line_end() 111 return entry 112 except ParseError as err: 113 error_index = ps.index 114 ps.skip_to_next_entry_start(entry_start_pos) 115 next_entry_start = ps.index 116 if next_entry_start < error_index: 117 # The position of the error must be inside of the Junk's span. 118 error_index = next_entry_start 119 120 # Create a Junk instance 121 slice = ps.string[entry_start_pos:next_entry_start] 122 junk = ast.Junk(slice) 123 if self.with_spans: 124 junk.add_span(entry_start_pos, next_entry_start) 125 annot = ast.Annotation(err.code, err.args, err.message) 126 annot.add_span(error_index, error_index) 127 junk.add_annotation(annot) 128 return junk 129 130 def get_entry(self, ps): 131 if ps.current_char == '#': 132 return self.get_comment(ps) 133 134 if ps.current_char == '/': 135 return self.get_zero_four_style_comment(ps) 136 137 if ps.current_char == '[': 138 return self.get_group_comment_from_section(ps) 139 140 if ps.current_char == '-': 141 return self.get_term(ps) 142 143 if ps.is_identifier_start(): 144 return self.get_message(ps) 145 146 raise ParseError('E0002') 147 148 @with_span 149 def get_zero_four_style_comment(self, ps): 150 ps.expect_char('/') 151 ps.expect_char('/') 152 ps.take_char(lambda x: x == ' ') 153 154 content = '' 155 156 while True: 157 ch = ps.take_char(lambda x: x != EOL) 158 while ch: 159 content += ch 160 ch = ps.take_char(lambda x: x != EOL) 161 162 if ps.is_next_line_zero_four_comment(): 163 content += ps.current_char 164 ps.next() 165 ps.expect_char('/') 166 ps.expect_char('/') 167 ps.take_char(lambda x: x == ' ') 168 else: 169 break 170 171 # Comments followed by Sections become GroupComments. 172 if ps.peek() == '[': 173 ps.skip_to_peek() 174 self.get_group_comment_from_section(ps) 175 return ast.GroupComment(content) 176 177 ps.reset_peek() 178 ps.last_comment_zero_four_syntax = True 179 return ast.Comment(content) 180 181 @with_span 182 def get_comment(self, ps): 183 # 0 - comment 184 # 1 - group comment 185 # 2 - resource comment 186 level = -1 187 content = '' 188 189 while True: 190 i = -1 191 while ps.current_char == '#' \ 192 and (i < (2 if level == -1 else level)): 193 ps.next() 194 i += 1 195 196 if level == -1: 197 level = i 198 199 if ps.current_char != EOL: 200 ps.expect_char(' ') 201 ch = ps.take_char(lambda x: x != EOL) 202 while ch: 203 content += ch 204 ch = ps.take_char(lambda x: x != EOL) 205 206 if ps.is_next_line_comment(level=level): 207 content += ps.current_char 208 ps.next() 209 else: 210 break 211 212 if level == 0: 213 return ast.Comment(content) 214 elif level == 1: 215 return ast.GroupComment(content) 216 elif level == 2: 217 return ast.ResourceComment(content) 218 219 @with_span 220 def get_group_comment_from_section(self, ps): 221 def until_closing_bracket_or_eol(ch): 222 return ch not in (']', EOL) 223 224 ps.expect_char('[') 225 ps.expect_char('[') 226 while ps.take_char(until_closing_bracket_or_eol): 227 pass 228 ps.expect_char(']') 229 ps.expect_char(']') 230 231 # A Section without a comment is like an empty Group Comment. 232 # Semantically it ends the previous group and starts a new one. 233 return ast.GroupComment('') 234 235 @with_span 236 def get_message(self, ps): 237 id = self.get_identifier(ps) 238 ps.skip_blank_inline() 239 240 # XXX Syntax 0.4 compat 241 if ps.current_char == '=': 242 ps.next() 243 value = self.maybe_get_pattern(ps) 244 else: 245 value = None 246 247 attrs = self.get_attributes(ps) 248 249 if value is None and len(attrs) == 0: 250 raise ParseError('E0005', id.name) 251 252 return ast.Message(id, value, attrs) 253 254 @with_span 255 def get_term(self, ps): 256 ps.expect_char('-') 257 id = self.get_identifier(ps) 258 259 ps.skip_blank_inline() 260 ps.expect_char('=') 261 262 # Syntax 0.8 compat: VariantLists are supported but deprecated. They 263 # can only be found as values of Terms. Nested VariantLists are not 264 # allowed. 265 value = self.maybe_get_variant_list(ps) or self.maybe_get_pattern(ps) 266 if value is None: 267 raise ParseError('E0006', id.name) 268 269 attrs = self.get_attributes(ps) 270 return ast.Term(id, value, attrs) 271 272 @with_span 273 def get_attribute(self, ps): 274 ps.expect_char('.') 275 276 key = self.get_identifier(ps) 277 278 ps.skip_blank_inline() 279 ps.expect_char('=') 280 281 value = self.maybe_get_pattern(ps) 282 if value is None: 283 raise ParseError('E0012') 284 285 return ast.Attribute(key, value) 286 287 288 def get_attributes(self, ps): 289 attrs = [] 290 ps.peek_blank() 291 292 while ps.is_attribute_start(): 293 ps.skip_to_peek() 294 attr = self.get_attribute(ps) 295 attrs.append(attr) 296 ps.peek_blank(); 297 298 return attrs 299 300 @with_span 301 def get_identifier(self, ps): 302 name = ps.take_id_start() 303 ch = ps.take_id_char() 304 while ch: 305 name += ch 306 ch = ps.take_id_char() 307 308 return ast.Identifier(name) 309 310 def get_variant_key(self, ps): 311 ch = ps.current_char 312 313 if ch is EOF: 314 raise ParseError('E0013') 315 316 cc = ord(ch) 317 if ((cc >= 48 and cc <= 57) or cc == 45): # 0-9, - 318 return self.get_number(ps) 319 320 return self.get_identifier(ps) 321 322 @with_span 323 def get_variant(self, ps, has_default): 324 default_index = False 325 326 if ps.current_char == '*': 327 if has_default: 328 raise ParseError('E0015') 329 ps.next() 330 default_index = True 331 332 ps.expect_char('[') 333 ps.skip_blank() 334 335 key = self.get_variant_key(ps) 336 337 ps.skip_blank() 338 ps.expect_char(']') 339 340 value = self.maybe_get_pattern(ps) 341 if value is None: 342 raise ParseError('E0012') 343 344 return ast.Variant(key, value, default_index) 345 346 347 def get_variants(self, ps): 348 variants = [] 349 has_default = False 350 351 ps.skip_blank() 352 while ps.is_variant_start(): 353 variant = self.get_variant(ps, has_default) 354 355 if variant.default: 356 has_default = True 357 358 variants.append(variant) 359 ps.expect_line_end() 360 ps.skip_blank() 361 362 if len(variants) == 0: 363 raise ParseError('E0011') 364 365 if not has_default: 366 raise ParseError('E0010') 367 368 return variants 369 370 def get_digits(self, ps): 371 num = '' 372 373 ch = ps.take_digit() 374 while ch: 375 num += ch 376 ch = ps.take_digit() 377 378 if len(num) == 0: 379 raise ParseError('E0004', '0-9') 380 381 return num 382 383 @with_span 384 def get_number(self, ps): 385 num = '' 386 387 if ps.current_char == '-': 388 num += '-' 389 ps.next() 390 391 num += self.get_digits(ps) 392 393 if ps.current_char == '.': 394 num += '.' 395 ps.next() 396 num += self.get_digits(ps) 397 398 return ast.NumberLiteral(num) 399 400 def maybe_get_pattern(self, ps): 401 '''Parse an inline or a block Pattern, or None 402 403 maybe_get_pattern distinguishes between patterns which start on the 404 same line as the indentifier (aka inline singleline patterns and inline 405 multiline patterns), and patterns which start on a new line (aka block 406 patterns). The distinction is important for the dedentation logic: the 407 indent of the first line of a block pattern must be taken into account 408 when calculating the maximum common indent. 409 ''' 410 ps.peek_blank_inline() 411 if ps.is_value_start(): 412 ps.skip_to_peek() 413 return self.get_pattern(ps, is_block=False) 414 415 ps.peek_blank_block() 416 if ps.is_value_continuation(): 417 ps.skip_to_peek() 418 return self.get_pattern(ps, is_block=True) 419 420 return None 421 422 def maybe_get_variant_list(self, ps): 423 '''Parse a VariantList, or None 424 425 Deprecated in Syntax 0.8. VariantLists are only allowed as values of 426 Terms. Values of Messages, Attributes and Variants must be Patterns. 427 This method is only used in get_term. 428 ''' 429 ps.peek_blank() 430 if ps.current_peek == '{': 431 start = ps.peek_offset 432 ps.peek() 433 ps.peek_blank_inline() 434 if ps.current_peek == EOL: 435 ps.peek_blank() 436 if ps.is_variant_start(): 437 ps.reset_peek(start) 438 ps.skip_to_peek() 439 return self.get_variant_list(ps) 440 441 ps.reset_peek() 442 return None 443 444 @with_span 445 def get_variant_list(self, ps): 446 ps.expect_char('{') 447 variants = self.get_variants(ps) 448 ps.expect_char('}') 449 return ast.VariantList(variants) 450 451 @with_span 452 def get_pattern(self, ps, is_block): 453 elements = [] 454 if is_block: 455 # A block pattern is a pattern which starts on a new line. Measure 456 # the indent of this first line for the dedentation logic. 457 blank_start = ps.index 458 first_indent = ps.skip_blank_inline() 459 elements.append(self.Indent(first_indent, blank_start, ps.index)) 460 common_indent_length = len(first_indent) 461 else: 462 common_indent_length = float('infinity') 463 464 465 while ps.current_char: 466 if ps.current_char == EOL: 467 blank_start = ps.index 468 blank_lines = ps.peek_blank_block() 469 if ps.is_value_continuation(): 470 ps.skip_to_peek() 471 indent = ps.skip_blank_inline() 472 common_indent_length = min(common_indent_length, len(indent)) 473 elements.append(self.Indent(blank_lines + indent, blank_start, ps.index)) 474 continue 475 476 # The end condition for get_pattern's while loop is a newline 477 # which is not followed by a valid pattern continuation. 478 ps.reset_peek() 479 break 480 481 if ps.current_char == '}': 482 raise ParseError('E0027') 483 484 if ps.current_char == '{': 485 element = self.get_placeable(ps) 486 else: 487 element = self.get_text_element(ps) 488 489 elements.append(element) 490 491 dedented = self.dedent(elements, common_indent_length) 492 return ast.Pattern(dedented) 493 494 class Indent(ast.SyntaxNode): 495 def __init__(self, value, start, end): 496 super(FluentParser.Indent, self).__init__() 497 self.value = value 498 self.add_span(start, end) 499 500 def dedent(self, elements, common_indent): 501 '''Dedent a list of elements by removing the maximum common indent from 502 the beginning of text lines. The common indent is calculated in 503 get_pattern. 504 ''' 505 trimmed = [] 506 507 for element in elements: 508 if isinstance(element, ast.Placeable): 509 trimmed.append(element) 510 continue 511 512 if isinstance(element, self.Indent): 513 # Strip the common indent. 514 element.value = element.value[:len(element.value) - common_indent] 515 if len(element.value) == 0: 516 continue 517 518 prev = trimmed[-1] if len(trimmed) > 0 else None 519 if isinstance(prev, ast.TextElement): 520 # Join adjacent TextElements by replacing them with their sum. 521 sum = ast.TextElement(prev.value + element.value) 522 if self.with_spans: 523 sum.add_span(prev.span.start, element.span.end) 524 trimmed[-1] = sum 525 continue 526 527 if isinstance(element, self.Indent): 528 # If the indent hasn't been merged into a preceding 529 # TextElements, convert it into a new TextElement. 530 text_element = ast.TextElement(element.value) 531 if self.with_spans: 532 text_element.add_span(element.span.start, element.span.end) 533 element = text_element 534 535 trimmed.append(element) 536 537 # Trim trailing whitespace from the Pattern. 538 last_element = trimmed[-1] if len(trimmed) > 0 else None 539 if isinstance(last_element, ast.TextElement): 540 last_element.value = last_element.value.rstrip(' \t\n\r') 541 if last_element.value == "": 542 trimmed.pop() 543 544 return trimmed 545 546 @with_span 547 def get_text_element(self, ps): 548 buf = '' 549 550 while ps.current_char: 551 ch = ps.current_char 552 553 if ch == '{' or ch == '}': 554 return ast.TextElement(buf) 555 556 if ch == EOL: 557 return ast.TextElement(buf) 558 559 buf += ch 560 ps.next() 561 562 return ast.TextElement(buf) 563 564 def get_escape_sequence(self, ps): 565 next = ps.current_char 566 567 if next == '\\' or next == '"': 568 ps.next() 569 return '\\{}'.format(next), next 570 571 if next == 'u': 572 return self.get_unicode_escape_sequence(ps, next, 4) 573 574 if next == 'U': 575 return self.get_unicode_escape_sequence(ps, next, 6) 576 577 raise ParseError('E0025', next) 578 579 def get_unicode_escape_sequence(self, ps, u, digits): 580 ps.expect_char(u) 581 sequence = '' 582 for _ in range(digits): 583 ch = ps.take_hex_digit() 584 if not ch: 585 raise ParseError('E0026', '\\{}{}{}'.format(u, sequence, ps.current_char)) 586 sequence += ch 587 588 codepoint = int(sequence, 16) 589 if codepoint <= 0xD7FF or 0xE000 <= codepoint: 590 # It's a Unicode scalar value. The escape sequence is 4 or 6 digits 591 # long. Convert it to a 8-digit-long \UHHHHHHHH sequence and encode 592 # it as bytes, because in Python 3 decode is not available on str. 593 byte_sequence = "\\U{:08x}".format(codepoint).encode('utf-8') 594 unescaped = byte_sequence.decode('unicode-escape') 595 else: 596 # Escape sequences reresenting surrogate code points are 597 # well-formed but invalid in Fluent. Replace them with U+FFFD 598 # REPLACEMENT CHARACTER. 599 unescaped = '\uFFFD' 600 601 return '\\{}{}'.format(u, sequence), unescaped 602 603 @with_span 604 def get_placeable(self, ps): 605 ps.expect_char('{') 606 ps.skip_blank() 607 expression = self.get_expression(ps) 608 ps.expect_char('}') 609 return ast.Placeable(expression) 610 611 @with_span 612 def get_expression(self, ps): 613 selector = self.get_inline_expression(ps) 614 615 ps.skip_blank() 616 617 if ps.current_char == '-': 618 if ps.peek() != '>': 619 ps.reset_peek() 620 return selector 621 622 if isinstance(selector, ast.MessageReference): 623 raise ParseError('E0016') 624 625 if isinstance(selector, ast.AttributeExpression) \ 626 and isinstance(selector.ref, ast.MessageReference): 627 raise ParseError('E0018') 628 629 if isinstance(selector, ast.TermReference) \ 630 or isinstance(selector, ast.VariantExpression): 631 raise ParseError('E0017') 632 633 if isinstance(selector, ast.CallExpression) \ 634 and isinstance(selector.callee, ast.TermReference): 635 raise ParseError('E0017') 636 637 ps.next() 638 ps.next() 639 640 ps.skip_blank_inline() 641 ps.expect_line_end() 642 643 variants = self.get_variants(ps) 644 return ast.SelectExpression(selector, variants) 645 646 if isinstance(selector, ast.AttributeExpression) \ 647 and isinstance(selector.ref, ast.TermReference): 648 raise ParseError('E0019') 649 650 if isinstance(selector, ast.CallExpression) \ 651 and isinstance(selector.callee, ast.AttributeExpression): 652 raise ParseError('E0019') 653 654 return selector 655 656 @with_span 657 def get_inline_expression(self, ps): 658 if ps.current_char == '{': 659 return self.get_placeable(ps) 660 661 expr = self.get_simple_expression(ps) 662 663 if isinstance(expr, (ast.NumberLiteral, ast.StringLiteral, 664 ast.VariableReference)): 665 return expr 666 667 if isinstance(expr, ast.MessageReference): 668 if ps.current_char == '.': 669 ps.next() 670 attr = self.get_identifier(ps) 671 return ast.AttributeExpression(expr, attr) 672 673 if ps.current_char == '(': 674 # It's a Function. Ensure it's all upper-case. 675 if not re.match('^[A-Z][A-Z_?-]*$', expr.id.name): 676 raise ParseError('E0008') 677 func = ast.FunctionReference(expr.id) 678 if self.with_spans: 679 func.add_span(expr.span.start, expr.span.end) 680 return ast.CallExpression(func, *self.get_call_arguments(ps)) 681 682 return expr 683 684 if isinstance(expr, ast.TermReference): 685 if (ps.current_char == '['): 686 ps.next() 687 key = self.get_variant_key(ps) 688 ps.expect_char(']') 689 return ast.VariantExpression(expr, key) 690 691 if (ps.current_char == '.'): 692 ps.next() 693 attr = self.get_identifier(ps) 694 expr = ast.AttributeExpression(expr, attr) 695 696 if (ps.current_char == '('): 697 return ast.CallExpression(expr, *self.get_call_arguments(ps)) 698 699 return expr 700 701 raise ParseError('E0028') 702 703 @with_span 704 def get_simple_expression(self, ps): 705 if ps.is_number_start(): 706 return self.get_number(ps) 707 if ps.current_char == '"': 708 return self.get_string(ps) 709 if ps.current_char == '$': 710 ps.next() 711 id = self.get_identifier(ps) 712 return ast.VariableReference(id) 713 if ps.current_char == '-': 714 ps.next() 715 id = self.get_identifier(ps) 716 return ast.TermReference(id) 717 if ps.is_identifier_start(): 718 id = self.get_identifier(ps) 719 return ast.MessageReference(id) 720 raise ParseError('E0028') 721 722 @with_span 723 def get_call_argument(self, ps): 724 exp = self.get_inline_expression(ps) 725 726 ps.skip_blank() 727 728 if ps.current_char != ':': 729 return exp 730 731 if not isinstance(exp, ast.MessageReference): 732 raise ParseError('E0009') 733 734 ps.next() 735 ps.skip_blank() 736 737 value = self.get_literal(ps) 738 return ast.NamedArgument(exp.id, value) 739 740 def get_call_arguments(self, ps): 741 positional = [] 742 named = [] 743 argument_names = set() 744 745 ps.expect_char('(') 746 ps.skip_blank() 747 748 while True: 749 if ps.current_char == ')': 750 break 751 752 arg = self.get_call_argument(ps) 753 if isinstance(arg, ast.NamedArgument): 754 if arg.name.name in argument_names: 755 raise ParseError('E0022') 756 named.append(arg) 757 argument_names.add(arg.name.name) 758 elif len(argument_names) > 0: 759 raise ParseError('E0021') 760 else: 761 positional.append(arg) 762 763 ps.skip_blank() 764 765 if ps.current_char == ',': 766 ps.next() 767 ps.skip_blank() 768 continue 769 770 break 771 772 ps.expect_char(')') 773 return positional, named 774 775 @with_span 776 def get_string(self, ps): 777 raw = '' 778 value = '' 779 780 ps.expect_char('"') 781 782 while True: 783 ch = ps.take_char(lambda x: x != '"' and x != EOL) 784 if not ch: 785 break 786 if ch == '\\': 787 sequence, unescaped = self.get_escape_sequence(ps) 788 raw += sequence 789 value += unescaped 790 else: 791 raw += ch 792 value += ch 793 794 if ps.current_char == EOL: 795 raise ParseError('E0020') 796 797 ps.expect_char('"') 798 799 return ast.StringLiteral(raw, value) 800 801 @with_span 802 def get_literal(self, ps): 803 if ps.is_number_start(): 804 return self.get_number(ps) 805 if ps.current_char == '"': 806 return self.get_string(ps) 807 raise ParseError('E0014') 808