1from __future__ import unicode_literals
2import re
3from . import ast
4from .stream import EOF, EOL, FluentParserStream
5from .errors import ParseError
6
7
def with_span(fn):
    """Decorate a parser method so that the node it returns gets a span.

    The decorated method must be called as ``fn(self, ps, ...)`` where
    ``self`` exposes a ``with_spans`` flag and ``ps`` exposes an ``index``.
    When ``self.with_spans`` is false, ``fn`` is called and its result is
    returned untouched. Otherwise ``ps.index`` is read before and after the
    call and handed to ``node.add_span(start, end)``.

    The wrapper keeps the name and docstring of ``fn`` (via functools.wraps)
    so that tracebacks and introspection show the real parser method.
    """
    import functools

    @functools.wraps(fn)
    def decorated(self, ps, *args, **kwargs):
        if not self.with_spans:
            return fn(self, ps, *args, **kwargs)

        start = ps.index
        node = fn(self, ps, *args, **kwargs)

        # Don't re-add the span if the node already has it. This may happen
        # when one decorated function calls another decorated function.
        if node.span is not None:
            return node

        node.add_span(start, ps.index)
        return node

    return decorated
26
27
28class FluentParser(object):
29    def __init__(self, with_spans=True):
30        self.with_spans = with_spans
31
32    def parse(self, source):
33        ps = FluentParserStream(source)
34        ps.skip_blank_block()
35
36        entries = []
37        last_comment = None
38
39        while ps.current_char:
40            entry = self.get_entry_or_junk(ps)
41            blank_lines = ps.skip_blank_block()
42
43            # Regular Comments require special logic. Comments may be attached
44            # to Messages or Terms if they are followed immediately by them.
45            # However they should parse as standalone when they're followed by
46            # Junk. Consequently, we only attach Comments once we know that the
47            # Message or the Term parsed successfully.
48            if isinstance(entry, ast.Comment) and len(blank_lines) == 0 \
49                    and ps.current_char:
50                # Stash the comment and decide what to do with it
51                # in the next pass.
52                last_comment = entry
53                continue
54
55            if last_comment is not None:
56                if isinstance(entry, (ast.Message, ast.Term)):
57                    entry.comment = last_comment
58                    if self.with_spans:
59                        entry.span.start = entry.comment.span.start
60                else:
61                    entries.append(last_comment)
62                # In either case, the stashed comment has been dealt with;
63                # clear it.
64                last_comment = None
65
66            if isinstance(entry, ast.Comment) \
67               and ps.last_comment_zero_four_syntax \
68               and len(entries) == 0:
69                comment = ast.ResourceComment(entry.content)
70                comment.span = entry.span
71                entries.append(comment)
72            else:
73                entries.append(entry)
74
75            ps.last_comment_zero_four_syntax = False
76
77        res = ast.Resource(entries)
78
79        if self.with_spans:
80            res.add_span(0, ps.index)
81
82        return res
83
84    def parse_entry(self, source):
85        """Parse the first Message or Term in source.
86
87        Skip all encountered comments and start parsing at the first Mesage
88        or Term start. Return Junk if the parsing is not successful.
89
90        Preceding comments are ignored unless they contain syntax errors
91        themselves, in which case Junk for the invalid comment is returned.
92        """
93        ps = FluentParserStream(source)
94        ps.skip_blank_block()
95
96        while ps.current_char == '#':
97            skipped = self.get_entry_or_junk(ps)
98            if isinstance(skipped, ast.Junk):
99                # Don't skip Junk comments.
100                return skipped
101            ps.skip_blank_block()
102
103        return self.get_entry_or_junk(ps)
104
105    def get_entry_or_junk(self, ps):
106        entry_start_pos = ps.index
107
108        try:
109            entry = self.get_entry(ps)
110            ps.expect_line_end()
111            return entry
112        except ParseError as err:
113            error_index = ps.index
114            ps.skip_to_next_entry_start(entry_start_pos)
115            next_entry_start = ps.index
116            if next_entry_start < error_index:
117                # The position of the error must be inside of the Junk's span.
118                error_index = next_entry_start
119
120            # Create a Junk instance
121            slice = ps.string[entry_start_pos:next_entry_start]
122            junk = ast.Junk(slice)
123            if self.with_spans:
124                junk.add_span(entry_start_pos, next_entry_start)
125            annot = ast.Annotation(err.code, err.args, err.message)
126            annot.add_span(error_index, error_index)
127            junk.add_annotation(annot)
128            return junk
129
130    def get_entry(self, ps):
131        if ps.current_char == '#':
132            return self.get_comment(ps)
133
134        if ps.current_char == '/':
135            return self.get_zero_four_style_comment(ps)
136
137        if ps.current_char == '[':
138            return self.get_group_comment_from_section(ps)
139
140        if ps.current_char == '-':
141            return self.get_term(ps)
142
143        if ps.is_identifier_start():
144            return self.get_message(ps)
145
146        raise ParseError('E0002')
147
148    @with_span
149    def get_zero_four_style_comment(self, ps):
150        ps.expect_char('/')
151        ps.expect_char('/')
152        ps.take_char(lambda x: x == ' ')
153
154        content = ''
155
156        while True:
157            ch = ps.take_char(lambda x: x != EOL)
158            while ch:
159                content += ch
160                ch = ps.take_char(lambda x: x != EOL)
161
162            if ps.is_next_line_zero_four_comment():
163                content += ps.current_char
164                ps.next()
165                ps.expect_char('/')
166                ps.expect_char('/')
167                ps.take_char(lambda x: x == ' ')
168            else:
169                break
170
171        # Comments followed by Sections become GroupComments.
172        if ps.peek() == '[':
173            ps.skip_to_peek()
174            self.get_group_comment_from_section(ps)
175            return ast.GroupComment(content)
176
177        ps.reset_peek()
178        ps.last_comment_zero_four_syntax = True
179        return ast.Comment(content)
180
181    @with_span
182    def get_comment(self, ps):
183        # 0 - comment
184        # 1 - group comment
185        # 2 - resource comment
186        level = -1
187        content = ''
188
189        while True:
190            i = -1
191            while ps.current_char == '#' \
192                    and (i < (2 if level == -1 else level)):
193                ps.next()
194                i += 1
195
196            if level == -1:
197                level = i
198
199            if ps.current_char != EOL:
200                ps.expect_char(' ')
201                ch = ps.take_char(lambda x: x != EOL)
202                while ch:
203                    content += ch
204                    ch = ps.take_char(lambda x: x != EOL)
205
206            if ps.is_next_line_comment(level=level):
207                content += ps.current_char
208                ps.next()
209            else:
210                break
211
212        if level == 0:
213            return ast.Comment(content)
214        elif level == 1:
215            return ast.GroupComment(content)
216        elif level == 2:
217            return ast.ResourceComment(content)
218
219    @with_span
220    def get_group_comment_from_section(self, ps):
221        def until_closing_bracket_or_eol(ch):
222            return ch not in (']', EOL)
223
224        ps.expect_char('[')
225        ps.expect_char('[')
226        while ps.take_char(until_closing_bracket_or_eol):
227            pass
228        ps.expect_char(']')
229        ps.expect_char(']')
230
231        # A Section without a comment is like an empty Group Comment.
232        # Semantically it ends the previous group and starts a new one.
233        return ast.GroupComment('')
234
235    @with_span
236    def get_message(self, ps):
237        id = self.get_identifier(ps)
238        ps.skip_blank_inline()
239
240        # XXX Syntax 0.4 compat
241        if ps.current_char == '=':
242            ps.next()
243            value = self.maybe_get_pattern(ps)
244        else:
245            value = None
246
247        attrs = self.get_attributes(ps)
248
249        if value is None and len(attrs) == 0:
250            raise ParseError('E0005', id.name)
251
252        return ast.Message(id, value, attrs)
253
254    @with_span
255    def get_term(self, ps):
256        ps.expect_char('-')
257        id = self.get_identifier(ps)
258
259        ps.skip_blank_inline()
260        ps.expect_char('=')
261
262        # Syntax 0.8 compat: VariantLists are supported but deprecated. They
263        # can only be found as values of Terms. Nested VariantLists are not
264        # allowed.
265        value = self.maybe_get_variant_list(ps) or self.maybe_get_pattern(ps)
266        if value is None:
267            raise ParseError('E0006', id.name)
268
269        attrs = self.get_attributes(ps)
270        return ast.Term(id, value, attrs)
271
272    @with_span
273    def get_attribute(self, ps):
274        ps.expect_char('.')
275
276        key = self.get_identifier(ps)
277
278        ps.skip_blank_inline()
279        ps.expect_char('=')
280
281        value = self.maybe_get_pattern(ps)
282        if value is None:
283            raise ParseError('E0012')
284
285        return ast.Attribute(key, value)
286
287
288    def get_attributes(self, ps):
289        attrs = []
290        ps.peek_blank()
291
292        while ps.is_attribute_start():
293            ps.skip_to_peek()
294            attr = self.get_attribute(ps)
295            attrs.append(attr)
296            ps.peek_blank();
297
298        return attrs
299
300    @with_span
301    def get_identifier(self, ps):
302        name = ps.take_id_start()
303        ch = ps.take_id_char()
304        while ch:
305            name += ch
306            ch = ps.take_id_char()
307
308        return ast.Identifier(name)
309
310    def get_variant_key(self, ps):
311        ch = ps.current_char
312
313        if ch is EOF:
314            raise ParseError('E0013')
315
316        cc = ord(ch)
317        if ((cc >= 48 and cc <= 57) or cc == 45):  # 0-9, -
318            return self.get_number(ps)
319
320        return self.get_identifier(ps)
321
322    @with_span
323    def get_variant(self, ps, has_default):
324        default_index = False
325
326        if ps.current_char == '*':
327            if has_default:
328                raise ParseError('E0015')
329            ps.next()
330            default_index = True
331
332        ps.expect_char('[')
333        ps.skip_blank()
334
335        key = self.get_variant_key(ps)
336
337        ps.skip_blank()
338        ps.expect_char(']')
339
340        value = self.maybe_get_pattern(ps)
341        if value is None:
342            raise ParseError('E0012')
343
344        return ast.Variant(key, value, default_index)
345
346
347    def get_variants(self, ps):
348        variants = []
349        has_default = False
350
351        ps.skip_blank()
352        while ps.is_variant_start():
353            variant = self.get_variant(ps, has_default)
354
355            if variant.default:
356                has_default = True
357
358            variants.append(variant)
359            ps.expect_line_end()
360            ps.skip_blank()
361
362        if len(variants) == 0:
363            raise ParseError('E0011')
364
365        if not has_default:
366            raise ParseError('E0010')
367
368        return variants
369
370    def get_digits(self, ps):
371        num = ''
372
373        ch = ps.take_digit()
374        while ch:
375            num += ch
376            ch = ps.take_digit()
377
378        if len(num) == 0:
379            raise ParseError('E0004', '0-9')
380
381        return num
382
383    @with_span
384    def get_number(self, ps):
385        num = ''
386
387        if ps.current_char == '-':
388            num += '-'
389            ps.next()
390
391        num += self.get_digits(ps)
392
393        if ps.current_char == '.':
394            num += '.'
395            ps.next()
396            num += self.get_digits(ps)
397
398        return ast.NumberLiteral(num)
399
400    def maybe_get_pattern(self, ps):
401        '''Parse an inline or a block Pattern, or None
402
403        maybe_get_pattern distinguishes between patterns which start on the
404        same line as the indentifier (aka inline singleline patterns and inline
405        multiline patterns), and patterns which start on a new line (aka block
406        patterns). The distinction is important for the dedentation logic: the
407        indent of the first line of a block pattern must be taken into account
408        when calculating the maximum common indent.
409        '''
410        ps.peek_blank_inline()
411        if ps.is_value_start():
412            ps.skip_to_peek()
413            return self.get_pattern(ps, is_block=False)
414
415        ps.peek_blank_block()
416        if ps.is_value_continuation():
417            ps.skip_to_peek()
418            return self.get_pattern(ps, is_block=True)
419
420        return None
421
422    def maybe_get_variant_list(self, ps):
423        '''Parse a VariantList, or None
424
425        Deprecated in Syntax 0.8. VariantLists are only allowed as values of
426        Terms. Values of Messages, Attributes and Variants must be Patterns.
427        This method is only used in get_term.
428        '''
429        ps.peek_blank()
430        if ps.current_peek == '{':
431            start = ps.peek_offset
432            ps.peek()
433            ps.peek_blank_inline()
434            if ps.current_peek == EOL:
435                ps.peek_blank()
436                if ps.is_variant_start():
437                    ps.reset_peek(start)
438                    ps.skip_to_peek()
439                    return self.get_variant_list(ps)
440
441        ps.reset_peek()
442        return None
443
444    @with_span
445    def get_variant_list(self, ps):
446        ps.expect_char('{')
447        variants = self.get_variants(ps)
448        ps.expect_char('}')
449        return ast.VariantList(variants)
450
451    @with_span
452    def get_pattern(self, ps, is_block):
453        elements = []
454        if is_block:
455            # A block pattern is a pattern which starts on a new line. Measure
456            # the indent of this first line for the dedentation logic.
457            blank_start = ps.index
458            first_indent = ps.skip_blank_inline()
459            elements.append(self.Indent(first_indent, blank_start, ps.index))
460            common_indent_length = len(first_indent)
461        else:
462            common_indent_length = float('infinity')
463
464
465        while ps.current_char:
466            if ps.current_char == EOL:
467                blank_start = ps.index
468                blank_lines = ps.peek_blank_block()
469                if ps.is_value_continuation():
470                    ps.skip_to_peek()
471                    indent = ps.skip_blank_inline()
472                    common_indent_length = min(common_indent_length, len(indent))
473                    elements.append(self.Indent(blank_lines + indent, blank_start, ps.index))
474                    continue
475
476                # The end condition for get_pattern's while loop is a newline
477                # which is not followed by a valid pattern continuation.
478                ps.reset_peek()
479                break
480
481            if ps.current_char == '}':
482                raise ParseError('E0027')
483
484            if ps.current_char == '{':
485                element = self.get_placeable(ps)
486            else:
487                element = self.get_text_element(ps)
488
489            elements.append(element)
490
491        dedented = self.dedent(elements, common_indent_length)
492        return ast.Pattern(dedented)
493
494    class Indent(ast.SyntaxNode):
495        def __init__(self, value, start, end):
496            super(FluentParser.Indent, self).__init__()
497            self.value = value
498            self.add_span(start, end)
499
500    def dedent(self, elements, common_indent):
501        '''Dedent a list of elements by removing the maximum common indent from
502        the beginning of text lines. The common indent is calculated in
503        get_pattern.
504        '''
505        trimmed = []
506
507        for element in elements:
508            if isinstance(element, ast.Placeable):
509                trimmed.append(element)
510                continue
511
512            if isinstance(element, self.Indent):
513                # Strip the common indent.
514                element.value = element.value[:len(element.value) - common_indent]
515                if len(element.value) == 0:
516                    continue
517
518            prev = trimmed[-1] if len(trimmed) > 0 else None
519            if isinstance(prev, ast.TextElement):
520                # Join adjacent TextElements by replacing them with their sum.
521                sum = ast.TextElement(prev.value + element.value)
522                if self.with_spans:
523                    sum.add_span(prev.span.start, element.span.end)
524                trimmed[-1] = sum
525                continue
526
527            if isinstance(element, self.Indent):
528                # If the indent hasn't been merged into a preceding
529                # TextElements, convert it into a new TextElement.
530                text_element = ast.TextElement(element.value)
531                if self.with_spans:
532                    text_element.add_span(element.span.start, element.span.end)
533                element = text_element
534
535            trimmed.append(element)
536
537        # Trim trailing whitespace from the Pattern.
538        last_element = trimmed[-1] if len(trimmed) > 0 else None
539        if isinstance(last_element, ast.TextElement):
540            last_element.value = last_element.value.rstrip(' \t\n\r')
541            if last_element.value == "":
542                trimmed.pop()
543
544        return trimmed
545
546    @with_span
547    def get_text_element(self, ps):
548        buf = ''
549
550        while ps.current_char:
551            ch = ps.current_char
552
553            if ch == '{' or ch == '}':
554                return ast.TextElement(buf)
555
556            if ch == EOL:
557                return ast.TextElement(buf)
558
559            buf += ch
560            ps.next()
561
562        return ast.TextElement(buf)
563
564    def get_escape_sequence(self, ps):
565        next = ps.current_char
566
567        if next == '\\' or next == '"':
568            ps.next()
569            return '\\{}'.format(next), next
570
571        if next == 'u':
572            return self.get_unicode_escape_sequence(ps, next, 4)
573
574        if next == 'U':
575            return self.get_unicode_escape_sequence(ps, next, 6)
576
577        raise ParseError('E0025', next)
578
579    def get_unicode_escape_sequence(self, ps, u, digits):
580        ps.expect_char(u)
581        sequence = ''
582        for _ in range(digits):
583            ch = ps.take_hex_digit()
584            if not ch:
585                raise ParseError('E0026', '\\{}{}{}'.format(u, sequence, ps.current_char))
586            sequence += ch
587
588        codepoint = int(sequence, 16)
589        if codepoint <= 0xD7FF or 0xE000 <= codepoint:
590            # It's a Unicode scalar value. The escape sequence is 4 or 6 digits
591            # long. Convert it to a 8-digit-long \UHHHHHHHH sequence and encode
592            # it as bytes, because in Python 3 decode is not available on str.
593            byte_sequence = "\\U{:08x}".format(codepoint).encode('utf-8')
594            unescaped = byte_sequence.decode('unicode-escape')
595        else:
596            # Escape sequences reresenting surrogate code points are
597            # well-formed but invalid in Fluent. Replace them with U+FFFD
598            # REPLACEMENT CHARACTER.
599            unescaped = '\uFFFD'
600
601        return '\\{}{}'.format(u, sequence), unescaped
602
603    @with_span
604    def get_placeable(self, ps):
605        ps.expect_char('{')
606        ps.skip_blank()
607        expression = self.get_expression(ps)
608        ps.expect_char('}')
609        return ast.Placeable(expression)
610
611    @with_span
612    def get_expression(self, ps):
613        selector = self.get_inline_expression(ps)
614
615        ps.skip_blank()
616
617        if ps.current_char == '-':
618            if ps.peek() != '>':
619                ps.reset_peek()
620                return selector
621
622            if isinstance(selector, ast.MessageReference):
623                raise ParseError('E0016')
624
625            if isinstance(selector, ast.AttributeExpression) \
626                   and isinstance(selector.ref, ast.MessageReference):
627                raise ParseError('E0018')
628
629            if isinstance(selector, ast.TermReference) \
630                    or isinstance(selector, ast.VariantExpression):
631                raise ParseError('E0017')
632
633            if isinstance(selector, ast.CallExpression) \
634                   and isinstance(selector.callee, ast.TermReference):
635                raise ParseError('E0017')
636
637            ps.next()
638            ps.next()
639
640            ps.skip_blank_inline()
641            ps.expect_line_end()
642
643            variants = self.get_variants(ps)
644            return ast.SelectExpression(selector, variants)
645
646        if isinstance(selector, ast.AttributeExpression) \
647                and isinstance(selector.ref, ast.TermReference):
648            raise ParseError('E0019')
649
650        if isinstance(selector, ast.CallExpression) \
651                and isinstance(selector.callee, ast.AttributeExpression):
652            raise ParseError('E0019')
653
654        return selector
655
656    @with_span
657    def get_inline_expression(self, ps):
658        if ps.current_char == '{':
659            return self.get_placeable(ps)
660
661        expr = self.get_simple_expression(ps)
662
663        if isinstance(expr, (ast.NumberLiteral, ast.StringLiteral,
664                ast.VariableReference)):
665            return expr
666
667        if isinstance(expr, ast.MessageReference):
668            if ps.current_char == '.':
669                ps.next()
670                attr = self.get_identifier(ps)
671                return ast.AttributeExpression(expr, attr)
672
673            if ps.current_char == '(':
674                # It's a Function. Ensure it's all upper-case.
675                if not re.match('^[A-Z][A-Z_?-]*$', expr.id.name):
676                    raise ParseError('E0008')
677                func = ast.FunctionReference(expr.id)
678                if self.with_spans:
679                    func.add_span(expr.span.start, expr.span.end)
680                return ast.CallExpression(func, *self.get_call_arguments(ps))
681
682            return expr
683
684        if isinstance(expr, ast.TermReference):
685            if (ps.current_char == '['):
686                ps.next()
687                key = self.get_variant_key(ps)
688                ps.expect_char(']')
689                return ast.VariantExpression(expr, key)
690
691            if (ps.current_char == '.'):
692                ps.next()
693                attr = self.get_identifier(ps)
694                expr = ast.AttributeExpression(expr, attr)
695
696            if (ps.current_char == '('):
697                return ast.CallExpression(expr, *self.get_call_arguments(ps))
698
699            return expr
700
701        raise ParseError('E0028')
702
703    @with_span
704    def get_simple_expression(self, ps):
705        if ps.is_number_start():
706            return self.get_number(ps)
707        if ps.current_char == '"':
708            return self.get_string(ps)
709        if ps.current_char == '$':
710            ps.next()
711            id = self.get_identifier(ps)
712            return ast.VariableReference(id)
713        if ps.current_char == '-':
714            ps.next()
715            id = self.get_identifier(ps)
716            return ast.TermReference(id)
717        if ps.is_identifier_start():
718            id = self.get_identifier(ps)
719            return ast.MessageReference(id)
720        raise ParseError('E0028')
721
722    @with_span
723    def get_call_argument(self, ps):
724        exp = self.get_inline_expression(ps)
725
726        ps.skip_blank()
727
728        if ps.current_char != ':':
729            return exp
730
731        if not isinstance(exp, ast.MessageReference):
732            raise ParseError('E0009')
733
734        ps.next()
735        ps.skip_blank()
736
737        value = self.get_literal(ps)
738        return ast.NamedArgument(exp.id, value)
739
740    def get_call_arguments(self, ps):
741        positional = []
742        named = []
743        argument_names = set()
744
745        ps.expect_char('(')
746        ps.skip_blank()
747
748        while True:
749            if ps.current_char == ')':
750                break
751
752            arg = self.get_call_argument(ps)
753            if isinstance(arg, ast.NamedArgument):
754                if arg.name.name in argument_names:
755                    raise ParseError('E0022')
756                named.append(arg)
757                argument_names.add(arg.name.name)
758            elif len(argument_names) > 0:
759                raise ParseError('E0021')
760            else:
761                positional.append(arg)
762
763            ps.skip_blank()
764
765            if ps.current_char == ',':
766                ps.next()
767                ps.skip_blank()
768                continue
769
770            break
771
772        ps.expect_char(')')
773        return positional, named
774
775    @with_span
776    def get_string(self, ps):
777        raw = ''
778        value = ''
779
780        ps.expect_char('"')
781
782        while True:
783            ch = ps.take_char(lambda x: x != '"' and x != EOL)
784            if not ch:
785                break
786            if ch == '\\':
787                sequence, unescaped = self.get_escape_sequence(ps)
788                raw += sequence
789                value += unescaped
790            else:
791                raw += ch
792                value += ch
793
794        if ps.current_char == EOL:
795            raise ParseError('E0020')
796
797        ps.expect_char('"')
798
799        return ast.StringLiteral(raw, value)
800
801    @with_span
802    def get_literal(self, ps):
803        if ps.is_number_start():
804            return self.get_number(ps)
805        if ps.current_char == '"':
806            return self.get_string(ps)
807        raise ParseError('E0014')
808