1# coding=utf8
2"""Migration Transforms.
3
4Transforms are AST nodes which describe how legacy translations should be
5migrated.  They are created inert and only return the migrated AST nodes when
6they are evaluated by a MigrationContext.
7
All Transforms evaluate to Fluent Patterns. This makes them suitable for
defining migrations of values of messages, attributes and variants.  The special
10CONCAT Transform is capable of joining multiple Patterns returned by evaluating
11other Transforms into a single Pattern.  It can also concatenate Pattern
12elements: TextElements and Placeables.
13
14The COPY, REPLACE and PLURALS Transforms inherit from Source which is a special
15AST Node defining the location (the file path and the id) of the legacy
16translation.  During the migration, the current MigrationContext scans the
17migration spec for Source nodes and extracts the information about all legacy
18translations being migrated. For instance,
19
20    COPY('file.dtd', 'hello')
21
22is equivalent to:
23
24    FTL.Pattern([
25        Source('file.dtd', 'hello')
26    ])
27
28Sometimes it's useful to work with text rather than (path, key) source
29definitions. This is the case when the migrated translation requires some
30hardcoded text, e.g. <a> and </a> when multiple translations become a single
31one with a DOM overlay. In such cases it's best to use FTL.TextElements:
32
33    FTL.Message(
34        id=FTL.Identifier('update-failed'),
35        value=CONCAT(
36            COPY('aboutDialog.dtd', 'update.failed.start'),
37            FTL.TextElement('<a>'),
38            COPY('aboutDialog.dtd', 'update.failed.linkText'),
39            FTL.TextElement('</a>'),
40            COPY('aboutDialog.dtd', 'update.failed.end'),
41        )
42    )
43
44The REPLACE_IN_TEXT Transform also takes TextElements as input, making it
45possible to pass it as the foreach function of the PLURALS Transform. In the
46example below, each slice of the plural string is converted into a
47TextElement by PLURALS and then run through the REPLACE_IN_TEXT transform.
48
49    FTL.Message(
50        FTL.Identifier('delete-all'),
51        value=PLURALS(
52            'aboutDownloads.dtd',
53            'deleteAll',
54            VARIABLE_REFERENCE('num'),
55            lambda text: REPLACE_IN_TEXT(
56                text,
57                {
58                    '#1': VARIABLE_REFERENCE('num')
59                }
60            )
61        )
62    )
63"""
64
65from __future__ import unicode_literals
66from __future__ import absolute_import
67import re
68
69from fluent.syntax import ast as FTL
70from fluent.syntax.visitor import Transformer
71from .errors import NotSupportedError
72
73
def chain_elements(elements):
    '''Yield the PatternElements contained in a sequence of FTL nodes.

    Patterns are unpacked into their child elements, PatternElements are
    passed through unchanged and Expressions are wrapped in a Placeable.
    Any other kind of node raises a RuntimeError.
    '''
    for node in elements:
        if isinstance(node, FTL.Pattern):
            # Unpack the Pattern (PY3 could use `yield from` here).
            for pattern_element in node.elements:
                yield pattern_element
            continue
        if isinstance(node, FTL.PatternElement):
            yield node
            continue
        if isinstance(node, FTL.Expression):
            # Bare expressions need a Placeable to live inside a Pattern.
            yield FTL.Placeable(node)
            continue
        raise RuntimeError(
            'Expected Pattern, PatternElement or Expression')
88
89
# Matches text with explicit leading whitespace. Either the text starts with
# a run of spaces (captured as `whitespace`, the rest as `text`), or it
# starts with a newline, in which case the whole value is `block_text`.
re_leading_ws = re.compile(
    r'\A(?:(?P<whitespace> +)(?P<text>.*?)|(?P<block_text>\n.*?))\Z',
    flags=re.DOTALL,
)
# Mirror image of the above for trailing whitespace: either the text ends
# with a run of spaces, or it ends with a newline (`block_text`).
re_trailing_ws = re.compile(
    r'\A(?:(?P<text>.*?)(?P<whitespace> +)|(?P<block_text>.*\n))\Z',
    flags=re.DOTALL,
)
98
99
def extract_whitespace(regex, element):
    '''Split explicit leading or trailing whitespace off a TextElement.

    `regex` is expected to define the named groups `whitespace`, `text`
    and `block_text`. The result is always a `(placeable, text_element)`
    tuple, with the whitespace part first:

      - no match: `(None, element)`, the element is returned untouched;
      - the element is whitespace only: `(Placeable, None)`;
      - otherwise: `(Placeable, TextElement)` where the Placeable holds
        the extracted whitespace as a StringLiteral and the TextElement
        holds the remaining text.

    When the element starts or ends with a newline instead of spaces, the
    Placeable holds an empty StringLiteral.
    '''
    match = re.search(regex, element.value)
    if not match:
        return None, element

    # The `whitespace` group is None when the newline alternative matched;
    # that case is encoded as an empty { "" }.
    whitespace = match.group('whitespace') or ''
    placeable = FTL.Placeable(FTL.StringLiteral(whitespace))
    if whitespace == element.value:
        return placeable, None

    # Whichever of text / block_text matched holds the rest of the value.
    remainder = match.group('text') or match.group('block_text')
    return placeable, FTL.TextElement(remainder)
124
125
class Transform(FTL.BaseNode):
    """Base class of all migration Transforms.

    Subclasses implement `__call__(ctx)`, which returns the migrated FTL
    node.
    """

    def __call__(self, ctx):
        raise NotImplementedError

    @staticmethod
    def pattern_of(*elements):
        """Build a normalized FTL.Pattern from the given nodes.

        The nodes are flattened with `chain_elements`. Text content, i.e.
        TextElements and StringLiterals consisting only of spaces, is
        merged into single TextElements and empty text is dropped. An
        empty result is stored as `{""}`. Explicit leading and trailing
        whitespace of the Pattern is moved into StringLiteral Placeables.
        """
        normalized = []

        # Text content may put leading and trailing whitespace back into
        # TextElements when the new Pattern is built from existing Patterns
        # (CONCAT(COPY...)). It is extracted again further below.
        for element in chain_elements(elements):
            is_text = isinstance(element, FTL.TextElement)
            is_blank_literal = (
                not is_text
                and isinstance(element, FTL.Placeable)
                and isinstance(element.expression, FTL.StringLiteral)
                and re.match(r'^ *$', element.expression.value)
            )

            if not (is_text or is_blank_literal):
                # A number, a reference, or a StringLiteral which must be
                # preserved in the Pattern as it is.
                normalized.append(element)
                continue

            if is_text:
                text_content = element.value
            else:
                text_content = element.expression.value

            if normalized and isinstance(normalized[-1], FTL.TextElement):
                # Merge with the TextElement right before this one.
                normalized[-1].value += text_content
            elif len(text_content) > 0:
                normalized.append(FTL.TextElement(text_content))
            # Otherwise the text is empty and gets pruned.

        if not normalized:
            # Store empty values explicitly as {""}.
            return FTL.Pattern([FTL.Placeable(FTL.StringLiteral(''))])

        first = normalized[0]
        if isinstance(first, FTL.TextElement):
            # Explicit leading whitespace becomes a StringLiteral.
            leading_ws, text = extract_whitespace(re_leading_ws, first)
            normalized[:1] = [leading_ws, text]

        last = normalized[-1]
        if isinstance(last, FTL.TextElement):
            # Explicit trailing whitespace becomes a StringLiteral.
            trailing_ws, text = extract_whitespace(re_trailing_ws, last)
            normalized[-1:] = [text, trailing_ws]

        # extract_whitespace may have produced None placeholders.
        kept = [node for node in normalized if node is not None]
        return FTL.Pattern(kept)
186
187
class Source(Transform):
    """Base class for Transforms which read translations from source files.

    By contract, the first argument is the path of the source file and the
    second is the key: a legacy string ID, or a Fluent id.attr.
    """
    def __init__(self, path, key):
        # Location of the source translation: (file path, string id).
        self.path, self.key = path, key
197
198
class FluentSource(Source):
    """Declare a Fluent source translation to be copied over.

    When evaluated, it clones the Pattern of the parsed source.

    Raises NotSupportedError when `path` does not end with `.ftl`, or when
    `key` refers to a Term attribute (a key starting with `-` which also
    contains a `.`).
    """
    def __init__(self, path, key):
        if not path.endswith('.ftl'):
            raise NotSupportedError(
                'Please use COPY to migrate from legacy files '
                '({})'.format(path)
            )
        if key[0] == '-' and '.' in key:
            # Adjacent string literals are concatenated verbatim, so the
            # first fragment has to end with a space.
            raise NotSupportedError(
                'Cannot migrate from Term Attributes, as they are '
                'locale-dependent ({})'.format(path)
            )
        super(FluentSource, self).__init__(path, key)

    def __call__(self, ctx):
        # Clone so that the migrated Pattern does not share nodes with the
        # Pattern handed out by the context.
        pattern = ctx.get_fluent_source_pattern(self.path, self.key)
        return pattern.clone()
220
221
class COPY_PATTERN(FluentSource):
    """Create a Pattern with the translation value from a Fluent source.

    The key may be a Message ID, a Message ID.attribute_name, or a Term ID.
    Term attributes cannot be accessed, as they're internal to the
    localization.
    """
230
231
class TransformPattern(FluentSource, Transformer):
    """Base class for modifying a Fluent pattern as part of a migration.

    Subclasses implement the visit_* methods of the Transformer pattern to
    perform the actual modifications.
    """
    def __call__(self, ctx):
        # Fetch the cloned source Pattern, then run the visitor over it.
        return self.visit(super(TransformPattern, self).__call__(ctx))

    def visit_Pattern(self, node):
        # Restructuring transforms may leave nested Patterns or adjacent
        # text behind; rebuild a valid Pattern from the visited elements.
        visited = self.generic_visit(node)
        return Transform.pattern_of(*visited.elements)

    def visit_Placeable(self, node):
        # A transform may have replaced the expression with a Pattern or
        # a PatternElement. Pass that through in place of the Placeable;
        # Patterns are flattened later on by visit_Pattern.
        visited = self.generic_visit(node)
        expression = visited.expression
        if not isinstance(expression, (FTL.Pattern, FTL.PatternElement)):
            # Still a Placeable holding an expression.
            return visited
        return expression
259
260
class LegacySource(Source):
    """Declare the source translation to be migrated with other transforms.

    When evaluated, `Source` returns a TextElement with the content from the
    source translation. Escaped characters are unescaped by the
    compare-locales parser according to the file format:

      - in properties files: \\uXXXX,
      - in DTD files: known named, decimal, and hexadecimal HTML entities.

    Consult the following files for the list of known named HTML entities:

    https://github.com/python/cpython/blob/2.7/Lib/htmlentitydefs.py
    https://github.com/python/cpython/blob/3.6/Lib/html/entities.py

    Unless `trim=False` is passed, leading and trailing whitespace on each
    line as well as leading and trailing empty lines are stripped from the
    content of the source translation.

    Raises NotSupportedError when `path` ends with `.ftl`.
    """

    def __init__(self, path, key, trim=None):
        if path.endswith('.ftl'):
            raise NotSupportedError(
                'Please use COPY_PATTERN to migrate from Fluent files '
                '({})'.format(path))

        super(LegacySource, self).__init__(path, key)
        # None means "not set explicitly"; it is treated like True here.
        self.trim = trim

    def get_text(self, ctx):
        return ctx.get_legacy_source(self.path, self.key)

    @staticmethod
    def trim_text(text):
        # Per line: first drop the leading, then the trailing spaces and
        # tabs.
        for pattern in ('^[ \t]+', '[ \t]+$'):
            text = re.sub(pattern, '', text, flags=re.M)
        # Finally drop leading and trailing empty lines.
        return text.strip('\r\n')

    def __call__(self, ctx):
        text = self.get_text(ctx)
        if self.trim is False:
            # Trimming was disabled explicitly.
            return FTL.TextElement(text)
        return FTL.TextElement(self.trim_text(text))
308
309
class COPY(LegacySource):
    """Create a Pattern with the translation value from the given source."""

    def __call__(self, ctx):
        # The parent evaluates to a TextElement; wrap it in a valid Pattern.
        return Transform.pattern_of(super(COPY, self).__call__(ctx))
316
317
# A printf-style placeholder. The `good` group holds everything after the
# leading percent sign: a second "%" for an escaped percent sign, or the
# whole argument specification.
PRINTF = re.compile(''.join((
    r'%(?P<good>%|',
    r'(?:(?P<number>[1-9][0-9]*)\$)?',  # optional argument number, "1$"
    r'(?P<width>\*|[0-9]+)?',  # optional width
    r'(?P<prec>\.(?:\*|[0-9]+)?)?',  # optional precision
    r'(?P<spec>[duxXosScpfg]))',  # conversion specifier
)))
325
326
def number():
    """Yield the integers 1, 2, 3, ... without end."""
    current = 0
    while True:
        current += 1
        yield current
332
333
def normalize_printf(text):
    """Return `text` with all printf arguments numbered.

    Gecko forbids mixing unnumbered and numbered arguments, so only the
    unnumbered ones need converting; they are numbered in the order in
    which they appear. Arguments with a width of zero are removed, as the
    localizer intends them to be left out of the output. Escaped percent
    signs (`%%`) become a single `%`.
    """
    counter = number()

    def normalized(match):
        if match.group('good') == '%':
            return '%'
        is_numbered = bool(match.group('number'))
        if not is_numbered:
            # Hidden arguments consume a number as well, so that the
            # arguments after them keep their position.
            position = next(counter)
        if match.group('width') == '0':
            return ''
        if is_numbered:
            return match.group()
        return '%{}${}'.format(position, match.group('spec'))

    return PRINTF.sub(normalized, text)
353
354
class REPLACE_IN_TEXT(Transform):
    """Create a Pattern from a TextElement and replace legacy placeables.

    The original placeables are defined as keys on the `replacements` dict.
    For each key the value must be defined as a FTL Pattern, Placeable,
    TextElement or Expression to be interpolated.

    When several keys match at the same position in the text (one key is
    a prefix of another, e.g. `#1` and `#10`), the longest key is used. A
    match which overlaps text consumed by an earlier replacement is
    ignored.

    If `normalize_printf` is true, the text is run through
    `normalize_printf()` before the replacements are looked up.
    """

    def __init__(self, element, replacements, normalize_printf=False):
        self.element = element
        self.replacements = replacements
        self.normalize_printf = normalize_printf

    def __call__(self, ctx):
        """Evaluate to a Pattern with all replacements applied.

        Raises ValueError if one of the replacement keys is the empty
        string.
        """
        value = self.element.value
        if self.normalize_printf:
            value = normalize_printf(value)

        # An empty key matches everywhere and cannot be replaced sensibly.
        if '' in self.replacements:
            raise ValueError('Replacement keys must not be empty')

        # Map each index at which an original placeable starts to its key.
        # Keys missing from the translation simply contribute no indices.
        # If several keys start at the same index, prefer the longest one
        # so that the outcome doesn't depend on the order of the dict.
        keys_indexed = {}
        for key in self.replacements:
            for match in re.finditer(re.escape(key), value):
                index = match.start()
                current = keys_indexed.get(index)
                if current is None or len(key) > len(current):
                    keys_indexed[index] = key

        # A list of PatternElements built from the legacy translation and the
        # FTL replacements. It may contain empty or adjacent TextElements;
        # `pattern_of` cleans those up.
        elements = []
        position = 0

        # Walk the original placeables in the order in which they appear in
        # the translation. The text between the previous placeable and this
        # one becomes an `FTL.TextElement`, and the placeable itself is
        # replaced with its evaluated replacement.
        for index, key in sorted(keys_indexed.items()):
            if index < position:
                # Overlaps the text of the previous placeable.
                continue
            elements.append(FTL.TextElement(value[position:index]))
            elements.append(ctx.evaluate(self.replacements[key]))
            position = index + len(key)

        # Don't forget about the tail after the last placeable.
        elements.append(FTL.TextElement(value[position:]))
        return Transform.pattern_of(*elements)
411
412
class REPLACE(LegacySource):
    """Create a Pattern with interpolations from given source.

    The translation value is read from the given source and its
    interpolations are replaced with FTL placeables by means of the
    `REPLACE_IN_TEXT` transform.
    """

    def __init__(
        self, path, key, replacements, **kwargs
    ):
        # Unless the caller passes normalize_printf explicitly, it is
        # enabled for .properties files only. The argument is consumed here
        # and not forwarded to LegacySource.
        normalize_printf = kwargs.pop(
            'normalize_printf', path.endswith('.properties')
        )

        super(REPLACE, self).__init__(path, key, **kwargs)
        self.replacements = replacements
        self.normalize_printf = normalize_printf

    def __call__(self, ctx):
        # Evaluate to the (optionally trimmed) TextElement first, then
        # delegate the actual replacing to REPLACE_IN_TEXT.
        text_element = super(REPLACE, self).__call__(ctx)
        transform = REPLACE_IN_TEXT(
            text_element,
            self.replacements,
            normalize_printf=self.normalize_printf
        )
        return transform(ctx)
442
443
class PLURALS(LegacySource):
    """Create a Pattern with plurals from given source.

    The source translation should be a semicolon-separated list of plural
    forms. They are turned into the variants of an `FTL.SelectExpression`
    using the supplied `selector`. Each form is converted into a
    TextElement and passed to the `foreach` function, which should return
    an `FTL.Node` or a `Transform`. The default `foreach` function creates
    a valid Pattern from the TextElement passed into it.
    """
    DEFAULT_ORDER = ('zero', 'one', 'two', 'few', 'many', 'other')

    def __init__(self, path, key, selector, foreach=Transform.pattern_of,
                 **kwargs):
        super(PLURALS, self).__init__(path, key, **kwargs)
        self.selector = selector
        self.foreach = foreach

    def __call__(self, ctx):
        element = super(PLURALS, self).__call__(ctx)
        selector = ctx.evaluate(self.selector)
        categories = ctx.plural_categories

        # The default CLDR form is the last category of DEFAULT_ORDER which
        # the locale has; usually `other`, but in some cases `many`.
        available = [
            category
            for category in self.DEFAULT_ORDER
            if category in categories
        ]
        default_key = available[-1]

        # Pair the categories with the legacy forms in the order in which
        # they are defined in Gecko's PluralForm.jsm, skipping empty forms.
        pairs = []
        for category, part in zip(categories, element.value.split(';')):
            if part:
                pairs.append((category, FTL.TextElement(part)))

        if not pairs:
            # The legacy translation doesn't define any plural forms.
            return Transform.pattern_of()

        if len(pairs) == 1:
            # One plural category or one legacy variant only: there is no
            # need for a SelectExpression.
            only_form = pairs[0][1]
            return Transform.pattern_of(
                ctx.evaluate(self.foreach(only_form))
            )

        # Make sure that the default key is defined. If it's missing, reuse
        # the last form (in CLDR order) found in the legacy translation;
        # that may or may not be the last variant of the translation.
        pairs.sort(key=lambda pair: self.DEFAULT_ORDER.index(pair[0]))
        last_key, last_form = pairs[-1]
        if last_key != default_key:
            pairs.append((default_key, last_form))

        variants = []
        for category, form in pairs:
            # `foreach` describes the transformation required for the
            # variant; evaluating it produces the migrated FTL node.
            value = ctx.evaluate(self.foreach(form))
            variants.append(FTL.Variant(
                key=FTL.Identifier(category),
                value=value,
                default=(category == default_key)
            ))

        select = FTL.SelectExpression(selector=selector, variants=variants)
        return Transform.pattern_of(select)
527
528
class CONCAT(Transform):
    """Create a new Pattern from Patterns, PatternElements and Expressions.

    When called with at least two elements, `CONCAT` disables the trimming
    behavior of the elements which are subclasses of `LegacySource` by
    setting `trim=False`, unless `trim` has already been set explicitly. The
    following two `CONCAT` calls are equivalent:

       CONCAT(
           FTL.TextElement("Hello"),
           COPY("file.properties", "hello")
       )

       CONCAT(
           FTL.TextElement("Hello"),
           COPY("file.properties", "hello", trim=False)
       )

    Set `trim=True` explicitly to force trimming:

       CONCAT(
           FTL.TextElement("Hello "),
           COPY("file.properties", "hello", trim=True)
       )

    When called with a single element and when the element is a subclass of
    `LegacySource`, the trimming behavior is not changed. The following two
    transforms are equivalent:

       CONCAT(COPY("file.properties", "hello"))

       COPY("file.properties", "hello")
    """

    def __init__(self, *elements, **kwargs):
        # Elements may be passed as *elements (migration specs) or as
        # elements=[]. The latter is used by FTL.BaseNode.traverse when it
        # recreates the traversed node using its attributes as kwargs.
        self.elements = list(kwargs.get('elements', elements))

        # CONCAT(COPY()) must stay equivalent to COPY() so that wrapping
        # transforms in a CONCAT is always a safe no-op. This is used by
        # the implementation of transforms_from.
        if len(self.elements) < 2:
            return

        legacy_sources = (
            elem for elem in self.elements
            if isinstance(elem, LegacySource)
        )
        for source in legacy_sources:
            # Leave an explicitly set trim alone.
            if source.trim is None:
                source.trim = False

    def __call__(self, ctx):
        return Transform.pattern_of(*self.elements)
581