1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2011-2021 Edgewall Software
4# All rights reserved.
5#
6# This software is licensed as described in the file COPYING, which
7# you should have received as part of this distribution. The terms
8# are also available at https://trac.edgewall.org/wiki/TracLicense.
9#
10# This software consists of voluntary contributions made by many
11# individuals. For the exact contribution history, see the revision
12# history and logs, available at https://trac.edgewall.org/log/.
13
14"""Extra commands for setup.py.
15
We provide a few extra command classes for localization tasks, made
available through `get_l10n_cmdclass` and its variants.  We also
subclass the standard `distutils.command.build` and
`setuptools.command.install_lib` commands in order to call the l10n
commands for compiling catalogs at the right time during install.
21
22"""
23
24from html.parser import HTMLParser
25import io
26import os
27import re
28from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
29
30from jinja2.ext import babel_extract as jinja2_extractor
31
32from distutils import log as distlog
33from distutils.cmd import Command
34from distutils.command.build import build as _build
35from distutils.errors import DistutilsOptionError
36from setuptools.command.install_lib import install_lib as _install_lib
37
38
def simplify_message(message):
    """Collapse every run of white-space of an extracted message into
    a single space.

    `message` is either a plain string or a tuple whose first item is
    the string to normalize.  A string is returned as a string; a
    non-empty tuple is returned as a tuple of the same length, made of
    the normalized first item followed by `None` placeholders.

    """
    if isinstance(message, tuple) and message:
        collapsed = ' '.join(message[0].split())
        return (collapsed,) + (None,) * (len(message) - 1)
    return ' '.join(message.split())
52
53
class ScriptExtractor(HTMLParser):
    """HTML parser copying the content found after a ``<script>`` start
    tag to the file-like object `out`, until the next end tag or
    self-closing tag.  Everything else is discarded.
    """

    def __init__(self, out):
        super().__init__()
        self.out = out
        self.in_javascript = False

    def _write_script(self, text):
        # Only text seen while inside a <script> element is kept.
        if self.in_javascript:
            self.out.write(text)

    # -- tag events: they only toggle the `in_javascript` flag

    def handle_starttag(self, tag, attrs):
        if tag == 'script':
            self.in_javascript = True

    def handle_startendtag(self, tag, attrs):
        self.in_javascript = False

    def handle_endtag(self, tag):
        self.in_javascript = False

    # -- content events: forwarded verbatim while inside a script

    def handle_data(self, data):
        self._write_script(data)

    def handle_charref(self, name):
        self._write_script('&#%s;' % name)

    def handle_entityref(self, name):
        self._write_script('&%s;' % name)

    # -- comments, declarations and processing instructions are ignored

    def no_op(*args, **kwargs):
        pass

    handle_comment = handle_decl = handle_pi = no_op
86
87
88try:
89    from babel.messages.catalog import TranslationError
90    from babel.messages.extract import extract_javascript
91    from babel.messages.frontend import extract_messages, init_catalog, \
92                                        compile_catalog, update_catalog
93    from babel.messages.pofile import read_po
94    from babel.support import Translations
95    from babel.util import parse_encoding
96
    # Default keyword argument maps used by `extract_python`: for each
    # keyword (function name), the name of a keyword argument is mapped
    # to the 1-based position its message takes among the extracted
    # messages.  Entries of the `kwargs_maps` extraction option are
    # merged into a copy of this mapping.
    _DEFAULT_KWARGS_MAPS = {
        'Option': {'doc': 4},
        'BoolOption': {'doc': 4},
        'IntOption': {'doc': 4},
        'FloatOption': {'doc': 4},
        'ListOption': {'doc': 6},
        'ChoiceOption': {'doc': 4},
        'PathOption': {'doc': 4},
        'ExtensionOption': {'doc': 5},
        'OrderedExtensionsOption': {'doc': 6},
    }

    # Keywords for which `extract_python` passes the extracted messages
    # through `cleandoc`.  The `cleandoc_keywords` extraction option adds
    # further keywords to this set.
    _DEFAULT_CLEANDOC_KEYWORDS = (
        'ConfigSection', 'Option', 'BoolOption', 'IntOption', 'FloatOption',
        'ListOption', 'ChoiceOption', 'PathOption', 'ExtensionOption',
        'OrderedExtensionsOption', 'cleandoc_',
    )
114
115    def extract_python(fileobj, keywords, comment_tags, options):
116        """Extract messages from Python source code, This is patched
117        extract_python from Babel to support keyword argument mapping.
118
119        `kwargs_maps` option: names of keyword arguments will be mapping to
120        index of messages array.
121
122        `cleandoc_keywords` option: a list of keywords to clean up the
123        extracted messages with `cleandoc`.
124        """
125        from trac.util.text import cleandoc
126
127        funcname = lineno = message_lineno = None
128        kwargs_maps = func_kwargs_map = None
129        call_stack = -1
130        buf = []
131        messages = []
132        messages_kwargs = {}
133        translator_comments = []
134        in_def = in_translator_comments = False
135        comment_tag = None
136
137        encoding = str(parse_encoding(fileobj) or
138                       options.get('encoding', 'iso-8859-1'))
139        kwargs_maps = _DEFAULT_KWARGS_MAPS.copy()
140        if 'kwargs_maps' in options:
141            kwargs_maps.update(options['kwargs_maps'])
142        cleandoc_keywords = set(_DEFAULT_CLEANDOC_KEYWORDS)
143        if 'cleandoc_keywords' in options:
144            cleandoc_keywords.update(options['cleandoc_keywords'])
145
146        tokens = generate_tokens(fileobj.readline)
147        tok = value = None
148        for _ in tokens:
149            prev_tok, prev_value = tok, value
150            tok, value, (lineno, _), _, _ = _
151            if call_stack == -1 and tok == NAME and value in ('def', 'class'):
152                in_def = True
153            elif tok == OP and value == '(':
154                if in_def:
155                    # Avoid false positives for declarations such as:
156                    # def gettext(arg='message'):
157                    in_def = False
158                    continue
159                if funcname:
160                    message_lineno = lineno
161                    call_stack += 1
162                kwarg_name = None
163            elif in_def and tok == OP and value == ':':
164                # End of a class definition without parens
165                in_def = False
166                continue
167            elif call_stack == -1 and tok == COMMENT:
168                # Strip the comment token from the line
169                value = value.decode(encoding)[1:].strip()
170                if in_translator_comments and \
171                        translator_comments[-1][0] == lineno - 1:
172                    # We're already inside a translator comment, continue
173                    # appending
174                    translator_comments.append((lineno, value))
175                    continue
176                # If execution reaches this point, let's see if comment line
177                # starts with one of the comment tags
178                for comment_tag in comment_tags:
179                    if value.startswith(comment_tag):
180                        in_translator_comments = True
181                        translator_comments.append((lineno, value))
182                        break
183            elif funcname and call_stack == 0:
184                if tok == OP and value == ')':
185                    if buf:
186                        message = ''.join(buf)
187                        if kwarg_name in func_kwargs_map:
188                            messages_kwargs[kwarg_name] = message
189                        else:
190                            messages.append(message)
191                        del buf[:]
192                    else:
193                        messages.append(None)
194
195                    for name, message in messages_kwargs.items():
196                        if name not in func_kwargs_map:
197                            continue
198                        index = func_kwargs_map[name]
199                        while index >= len(messages):
200                            messages.append(None)
201                        messages[index - 1] = message
202
203                    if funcname in cleandoc_keywords:
204                        messages = [m and cleandoc(m) for m in messages]
205                    if len(messages) > 1:
206                        messages = tuple(messages)
207                    else:
208                        messages = messages[0]
209                    # Comments don't apply unless they immediately preceed the
210                    # message
211                    if translator_comments and \
212                            translator_comments[-1][0] < message_lineno - 1:
213                        translator_comments = []
214
215                    yield (message_lineno, funcname, messages,
216                           [comment[1] for comment in translator_comments])
217
218                    funcname = lineno = message_lineno = None
219                    kwarg_name = func_kwargs_map = None
220                    call_stack = -1
221                    messages = []
222                    messages_kwargs = {}
223                    translator_comments = []
224                    in_translator_comments = False
225                elif tok == STRING:
226                    # Unwrap quotes in a safe manner, maintaining the string's
227                    # encoding
228                    # https://sourceforge.net/tracker/?func=detail&atid=355470&
229                    # aid=617979&group_id=5470
230                    value = eval('# coding=%s\n%s' % (encoding, value),
231                                 {'__builtins__':{}}, {})
232                    if isinstance(value, bytes):
233                        value = value.decode(encoding)
234                    buf.append(value)
235                elif tok == OP and value == '=' and prev_tok == NAME:
236                    kwarg_name = prev_value
237                elif tok == OP and value == ',':
238                    if buf:
239                        message = ''.join(buf)
240                        if kwarg_name in func_kwargs_map:
241                            messages_kwargs[kwarg_name] = message
242                        else:
243                            messages.append(message)
244                        del buf[:]
245                    else:
246                        messages.append(None)
247                    kwarg_name = None
248                    if translator_comments:
249                        # We have translator comments, and since we're on a
250                        # comma(,) user is allowed to break into a new line
251                        # Let's increase the last comment's lineno in order
252                        # for the comment to still be a valid one
253                        old_lineno, old_comment = translator_comments.pop()
254                        translator_comments.append((old_lineno+1, old_comment))
255            elif call_stack > 0 and tok == OP and value == ')':
256                call_stack -= 1
257            elif funcname and call_stack == -1:
258                funcname = func_kwargs_map = kwarg_name = None
259            elif tok == NAME and value in keywords:
260                funcname = value
261                func_kwargs_map = kwargs_maps.get(funcname, {})
262                kwarg_name = None
263
264
265    def extract_javascript_script(fileobj, keywords, comment_tags, options):
266        """Extract messages from Javascript embedded in <script> tags.
267
268        Select <script type="javascript/text"> tags and delegate to
269        `extract_javascript`.
270        """
271        if not fileobj.name:
272            return []
273        out = io.StringIO()
274        extractor = ScriptExtractor(out)
275        extractor.feed(str(fileobj.read(), 'utf-8'))
276        extractor.close()
277        out.seek(0)
278        return extract_javascript(out, keywords, comment_tags, options)
279
280
281    def extract_html(fileobj, keywords, comment_tags, options):
282        """Extracts translatable texts from templates.
283
284        We simplify white-space found in translatable texts collected
285        via the ``gettext`` function (which is what the ``trans``
286        directives use), otherwise we would have near duplicates
287        (e.g. admin.html, prefs.html).
288
289        We assume the template function ``gettext`` will do the same
290        before trying to fetch the translation from the catalog.
291
292        """
293        if fileobj:
294            extractor = jinja2_extractor
295            fileobj.seek(0)
296            for m in extractor(fileobj, keywords, comment_tags, options):
297                # lineno, func, message, comments = m
298                if m[1] in ('gettext', None):
299                    # Jinja2 trans
300                    yield m[0], m[1], simplify_message(m[2]), m[3]
301                else:
302                    yield m
303
304
305    extract_text = extract_html
306
307
308    class generate_messages_js(Command):
309        """Generating message javascripts command for use ``setup.py`` scripts.
310        """
311
312        description = 'generate message javascript files from binary MO files'
313        user_options = [
314            ('domain=', 'D',
315             "domain of PO file (default 'messages')"),
316            ('input-dir=', 'I',
317             'path to base directory containing the catalogs'),
318            ('input-file=', 'i',
319             'name of the input file'),
320            ('output-dir=', 'O',
321             "name of the output directory"),
322            ('output-file=', 'o',
323             "name of the output file (default "
324             "'<output_dir>/<locale>.js')"),
325            ('locale=', 'l',
326             'locale of the catalog to compile'),
327        ]
328
329        def initialize_options(self):
330            self.domain = 'messages'
331            self.input_dir = None
332            self.input_file = None
333            self.output_dir = None
334            self.output_file = None
335            self.locale = None
336
337        def finalize_options(self):
338            if not self.input_file and not self.input_dir:
339                raise DistutilsOptionError('you must specify either the input '
340                                           'file or directory')
341            if not self.output_file and not self.output_dir:
342                raise DistutilsOptionError('you must specify either the '
343                                           'output file or directory')
344
345        def run(self):
346            mo_files = []
347            js_files = []
348
349            def js_path(dir, locale):
350                return os.path.join(dir, locale + '.js')
351
352            if not self.input_file:
353                if self.locale:
354                    mo_files.append((self.locale,
355                                     os.path.join(self.input_dir, self.locale,
356                                                  'LC_MESSAGES',
357                                                  self.domain + '.mo')))
358                    js_files.append(js_path(self.output_dir, self.locale))
359                else:
360                    for locale in os.listdir(self.input_dir):
361                        mo_file = os.path.join(self.input_dir, locale,
362                                               'LC_MESSAGES',
363                                               self.domain + '.mo')
364                        if os.path.exists(mo_file):
365                            mo_files.append((locale, mo_file))
366                            js_files.append(js_path(self.output_dir, locale))
367            else:
368                mo_files.append((self.locale, self.input_file))
369                if self.output_file:
370                    js_files.append(self.output_file)
371                else:
372                    js_files.append(js_path(self.output_dir, self.locale))
373
374            if not mo_files:
375                raise DistutilsOptionError('no compiled catalogs found')
376
377            if not os.path.isdir(self.output_dir):
378                os.mkdir(self.output_dir)
379
380            for idx, (locale, mo_file) in enumerate(mo_files):
381                js_file = js_files[idx]
382                distlog.info('generating messages javascript %r to %r',
383                             mo_file, js_file)
384
385                with open(mo_file, 'rb') as infile:
386                    t = Translations(infile, self.domain)
387                    catalog = t._catalog
388
389                with open(js_file, 'w', encoding='utf-8') as outfile:
390                    write_js(outfile, catalog, self.domain, locale)
391
392
393    class check_catalog(Command):
394        """Check message catalog command for use ``setup.py`` scripts."""
395
396        description = 'check message catalog files, like `msgfmt --check`'
397        user_options = [
398            ('domain=', 'D',
399             "domain of PO file (default 'messages')"),
400            ('input-dir=', 'I',
401             'path to base directory containing the catalogs'),
402            ('input-file=', 'i',
403             'name of the input file'),
404            ('locale=', 'l',
405             'locale of the catalog to compile'),
406        ]
407
408        def initialize_options(self):
409            self.domain = 'messages'
410            self.input_dir = None
411            self.input_file = None
412            self.locale = None
413
414        def finalize_options(self):
415            if not self.input_file and not self.input_dir:
416                raise DistutilsOptionError('you must specify either the input '
417                                           'file or directory')
418
419        def run(self):
420            for filename in self._get_po_files():
421                distlog.info('checking catalog %s', filename)
422                with open(filename, 'rb') as f:
423                    catalog = read_po(f, domain=self.domain)
424                for message in catalog:
425                    for error in self._check_message(catalog, message):
426                        distlog.warn('%s:%d: %s', filename, message.lineno,
427                                     error)
428
429        def _get_po_files(self):
430            if self.input_file:
431                return [self.input_file]
432
433            if self.locale:
434                return [os.path.join(self.input_dir, self.locale,
435                                     'LC_MESSAGES', self.domain + '.po')]
436
437            files = []
438            for locale in os.listdir(self.input_dir):
439                filename = os.path.join(self.input_dir, locale, 'LC_MESSAGES',
440                                        self.domain + '.po')
441                if os.path.exists(filename):
442                    files.append(filename)
443            return sorted(files)
444
445        def _check_message(self, catalog, message):
446            for e in message.check(catalog):
447                yield e
448            for e in check_markup(catalog, message):
449                yield e
450
451    def check_markup(catalog, message):
452        """Verify markups in the translation."""
453        def to_array(value):
454            if not isinstance(value, (list, tuple)):
455                value = (value,)
456            return value
457        msgids = to_array(message.id)
458        msgstrs = to_array(message.string)
459        for msgid_idx, msgid in enumerate(msgids):
460            msgid_name = 'msgid' if msgid_idx == 0 else 'msgid_plural'
461            for msgstr_idx, msgstr in enumerate(msgstrs):
462                if msgid and msgstr and msgid != msgstr:
463                    msgstr_name = 'msgstr' if len(msgids) == 1 else \
464                                  'msgstr[%d]' % msgstr_idx
465                    for e in _check_markup_0(msgid, msgid_name, msgstr,
466                                             msgstr_name):
467                        yield e
468
469    def _check_markup_0(msgid, msgid_name, msgstr, msgstr_name):
470        from xml.etree import ElementTree
471
472        def count_tags(text):
473            text = '<html>\n%s\n</html>' % text.encode('utf-8')
474            counts = {}
475            for event in ElementTree.iterparse(io.BytesIO(text)):
476                tag = event[1].tag
477                counts.setdefault(tag, 0)
478                counts[tag] += 1
479            counts['html'] -= 1
480            return counts
481
482        try:
483            msgid_counts = count_tags(msgid)
484        except ElementTree.ParseError:
485            return
486        try:
487            msgstr_counts = count_tags(msgstr)
488        except ElementTree.ParseError as e:
489            yield TranslationError(e)
490            return
491
492        for tag in (set(msgid_counts) | set(msgstr_counts)):
493            msgid_count = msgid_counts.get(tag, 0)
494            msgstr_count = msgstr_counts.get(tag, 0)
495            if msgid_count != msgstr_count:
496                yield TranslationError(
497                    "mismatched '%s' tag between %s and %s (%d != %d)" %
498                    (tag, msgid_name, msgstr_name, msgid_count, msgstr_count))
499
500    def write_js(fileobj, catalog, domain, locale):
501        from trac.util.presentation import to_json
502        data = {'domain': domain, 'locale': locale}
503
504        messages = {}
505        for msgid, msgstr in catalog.items():
506            if isinstance(msgid, (list, tuple)):
507                messages.setdefault(msgid[0], {})
508                messages[msgid[0]][msgid[1]] = msgstr
509            elif msgid:
510                messages[msgid] = msgstr
511            else:
512                for line in msgstr.splitlines():
513                    line = line.strip()
514                    if not line:
515                        continue
516                    if ':' not in line:
517                        continue
518                    name, val = line.split(':', 1)
519                    name = name.strip().lower()
520                    if name == 'plural-forms':
521                        data['plural_expr'] = pluralexpr(val)
522                        break
523        data['messages'] = messages
524        data = to_json(data)
525        if isinstance(data, bytes):
526            data = str(data, 'utf-8')
527
528        fileobj.write('// Generated messages javascript file '
529                      'from compiled MO file\n')
530        fileobj.write('babel.Translations.load(')
531        fileobj.write(data)
532        fileobj.write(').install();\n')
533
534    def pluralexpr(forms):
535        match = re.search(r'\bplural\s*=\s*([^;]+)', forms)
536        if not match:
537            raise ValueError('Failed to parse plural_forms %r' % (forms,))
538        return match.group(1)
539
540
541    def get_command_overriders():
542        # 'bdist_wininst' runs a 'build', so make the latter
543        # run a 'compile_catalog' before 'build_py'
544        class build(_build):
545            sub_commands = [('compile_catalog', None)] + _build.sub_commands
546
547        # 'bdist_egg' isn't that nice, all it does is an 'install_lib'
548        class install_lib(_install_lib): # playing setuptools' own tricks ;-)
549            def l10n_run(self):
550                self.run_command('compile_catalog')
551            def run(self):
552                self.l10n_run()
553                # When bdist_egg is called on distribute 0.6.29 and later, the
554                # egg file includes no *.mo and *.js files which are generated
555                # in l10n_run() method.
556                # We remove build_py.data_files property to re-compute in order
557                # to avoid the issue (#11640).
558                build_py = self.get_finalized_command('build_py')
559                if 'data_files' in build_py.__dict__ and \
560                   not any(any(name.endswith('.mo') for name in filenames)
561                           for pkg, src_dir, build_dir, filenames
562                           in build_py.data_files):
563                    del build_py.__dict__['data_files']
564                _install_lib.run(self)
565        return build, install_lib
566
567    def get_l10n_cmdclass():
568        build, install_lib = get_command_overriders()
569        return {
570            'build': build, 'install_lib': install_lib,
571            'check_catalog': check_catalog,
572        }
573
574    def get_l10n_js_cmdclass():
575        build, _install_lib = get_command_overriders()
576        build.sub_commands.insert(0, ('generate_messages_js', None))
577        build.sub_commands.insert(0, ('compile_catalog_js', None))
578        class install_lib(_install_lib):
579            def l10n_run(self):
580                self.run_command('compile_catalog_js')
581                self.run_command('generate_messages_js')
582                self.run_command('compile_catalog')
583        return {
584            'build': build, 'install_lib': install_lib,
585            'check_catalog': check_catalog,
586            'extract_messages_js': extract_messages,
587            'init_catalog_js': init_catalog,
588            'compile_catalog_js': compile_catalog,
589            'update_catalog_js': update_catalog,
590            'generate_messages_js': generate_messages_js,
591            'check_catalog_js': check_catalog,
592        }
593
594    def get_l10n_trac_cmdclass():
595        build, _install_lib = get_command_overriders()
596        build.sub_commands.insert(0, ('generate_messages_js', None))
597        build.sub_commands.insert(0, ('compile_catalog_js', None))
598        build.sub_commands.insert(0, ('compile_catalog_tracini', None))
599        class install_lib(_install_lib):
600            def l10n_run(self):
601                self.run_command('compile_catalog_tracini')
602                self.run_command('compile_catalog_js')
603                self.run_command('generate_messages_js')
604                self.run_command('compile_catalog')
605        return {
606            'build': build, 'install_lib': install_lib,
607            'check_catalog': check_catalog,
608            'extract_messages_js': extract_messages,
609            'init_catalog_js': init_catalog,
610            'compile_catalog_js': compile_catalog,
611            'update_catalog_js': update_catalog,
612            'generate_messages_js': generate_messages_js,
613            'check_catalog_js': check_catalog,
614            'extract_messages_tracini': extract_messages,
615            'init_catalog_tracini': init_catalog,
616            'compile_catalog_tracini': compile_catalog,
617            'update_catalog_tracini': update_catalog,
618            'check_catalog_tracini': check_catalog,
619        }
620
621except ImportError:
622    def get_l10n_cmdclass():
623        return
624    def get_l10n_js_cmdclass():
625        return
626    def get_l10n_trac_cmdclass():
627        return
628