1# -*- coding: utf-8 -*-
2"""
3    babel.messages.extract
4    ~~~~~~~~~~~~~~~~~~~~~~
5
6    Basic infrastructure for extracting localizable messages from source files.
7
8    This module defines an extensible system for collecting localizable message
9    strings from a variety of sources. A native extractor for Python source
    files is built in; extractors for other sources can be added using very
11    simple plugins.
12
13    The main entry points into the extraction functionality are the functions
14    `extract_from_dir` and `extract_from_file`.
15
16    :copyright: (c) 2013-2021 by the Babel Team.
17    :license: BSD, see LICENSE for more details.
18"""
19
20import os
21from os.path import relpath
22import sys
23from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
24
25from babel.util import parse_encoding, parse_future_flags, pathmatch
26from babel._compat import PY2, text_type
27from textwrap import dedent
28
29
# Entry point group that is searched for extractor plugins when an extraction
# method is given by its simple name.
GROUP_NAME = 'babel.extractors'

# Maps the name of a translation function to its argument specification.
#
# * ``None`` means "use the default spec", i.e. the first argument is the
#   message.
# * An integer is the 1-based position of an argument holding a message
#   (singular first, then plural).
# * A ``(position, 'c')`` tuple marks the argument holding the message context.
DEFAULT_KEYWORDS = {
    # plain single-message functions
    '_': None,
    'gettext': None,
    # singular/plural pairs
    'ngettext': (1, 2),
    'ugettext': None,
    'ungettext': (1, 2),
    # domain-aware variants: the first argument is skipped
    'dgettext': (2,),
    'dngettext': (2, 3),
    'N_': None,
    # context-aware variants: the first argument is the context
    'pgettext': ((1, 'c'), 2),
    'npgettext': ((1, 'c'), 2, 3),
}

# Default ``(pattern, method)`` mapping: every ``.py`` file, at any depth, is
# handled by the "python" extraction method.
DEFAULT_MAPPING = [('**.py', 'python')]

# Warning template; ``%s`` receives the ``filename:lineno`` location.
empty_msgid_warning = (
    '%s: warning: Empty msgid.  It is reserved by GNU gettext: '
    'gettext("") returns the header entry with meta information, '
    'not the empty string.')
50
51
def _strip_comment_tags(comments, tags):
    """Remove leading comment tags from a list of comment lines, in place.

    For every line, the first tag in `tags` that the line starts with is cut
    off and the remainder is stripped of surrounding whitespace.  Lines that
    start with none of the tags are kept exactly as they are.

    :param comments: list of comment strings; its contents are replaced
    :param tags: iterable of tag strings to look for at the start of a line
    """
    def _without_tag(text):
        # Only the first matching tag (in iteration order) is considered.
        matching = next((tag for tag in tags if text.startswith(tag)), None)
        if matching is None:
            return text
        return text[len(matching):].strip()

    # Slice assignment keeps the caller's list object and swaps its contents.
    comments[:] = [_without_tag(text) for text in comments]
62
63
def extract_from_dir(dirname=None, method_map=DEFAULT_MAPPING,
                     options_map=None, keywords=DEFAULT_KEYWORDS,
                     comment_tags=(), callback=None, strip_comment_tags=False):
    """Walk a directory tree and extract messages from every mapped file.

    This is a generator yielding tuples of the form ``(filename, lineno,
    message, comments, context)``.

    Sub-directories whose name starts with ``.`` or ``_`` are not descended
    into; the remaining directories and all file names are visited in sorted
    order.

    The extraction method used for a file is chosen through `method_map`,
    a list that associates extended glob patterns with extraction method
    names.  The default mapping is:

    >>> method_map = [
    ...     ('**.py', 'python')
    ... ]

    In other words, any file ending in ".py", at any depth below the
    directory, is handled by the "python" extraction method.  Files matching
    none of the patterns are ignored.  The pattern syntax is described in the
    documentation of the `pathmatch` function.

    A mapping that additionally sends everything inside a "templates"
    sub-directory to the "genshi" extraction method looks like this:

    >>> method_map = [
    ...     ('**/templates/**.*', 'genshi'),
    ...     ('**.py', 'python')
    ... ]

    Per-file options are supplied through the optional `options_map`
    dictionary.  Its keys are extended glob patterns and its values are
    dictionaries mapping option names to option values (both strings).

    The patterns in `options_map` are independent of those in `method_map`.
    For instance, all files in the ``templates`` folders of an application
    may be Genshi templates, yet need different options depending on their
    extension:

    >>> options_map = {
    ...     '**/templates/**.txt': {
    ...         'template_class': 'genshi.template:TextTemplate',
    ...         'encoding': 'latin-1'
    ...     },
    ...     '**/templates/**.html': {
    ...         'include_attrs': ''
    ...     }
    ... }

    :param dirname: the path to the directory to extract messages from; the
                    current working directory is used when omitted
    :param method_map: a list of ``(pattern, method)`` tuples mapping
                       extended glob patterns to extraction method names
    :param options_map: a dictionary of additional options (optional)
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of tags of translator comments to search for
                         and include in the results
    :param callback: a function called for every file messages are extracted
                     from, right before the extraction is performed; it
                     receives the filename, the name of the extraction method
                     and the options dictionary as positional arguments, in
                     that order
    :param strip_comment_tags: a flag that, if set to `True`, causes all
                               comment tags to be removed from the collected
                               comments
    :see: `pathmatch`
    """
    base_dir = os.path.abspath(os.getcwd() if dirname is None else dirname)
    if options_map is None:
        options_map = {}

    for current_dir, subdirs, files in os.walk(base_dir):
        # Replace the list contents in place: os.walk only descends into the
        # sub-directories that are still listed, in the order given here.
        subdirs[:] = sorted(
            name for name in subdirs
            if not name.startswith(('.', '_'))
        )
        for basename in sorted(files):
            # Always hand over forward-slash paths, whatever the platform.
            path = os.path.join(current_dir, basename).replace(os.sep, '/')
            extracted = check_and_call_extract_file(
                path, method_map, options_map, callback, keywords,
                comment_tags, strip_comment_tags, dirpath=base_dir)
            for message_tuple in extracted:
                yield message_tuple
160
161
def check_and_call_extract_file(filepath, method_map, options_map,
                                callback, keywords, comment_tags,
                                strip_comment_tags, dirpath=None):
    """Extract messages from one file if an extraction mapping applies to it.

    Mapping and option patterns are written relative to `dirpath`, so they
    are tested against the path of `filepath` relative to `dirpath` rather
    than against `filepath` itself.  Only the first entry of `method_map`
    whose pattern matches is used; if none matches, nothing is yielded.

    Yields 5-tuples ``(filename, lineno, messages, comments, context)`` where
    ``filename`` is that relative path.

    :param filepath: An absolute path to a file that exists.
    :param method_map: a list of ``(pattern, method)`` tuples mapping
                       extended glob patterns to extraction method names
    :param options_map: a dictionary mapping extended glob patterns to
                        dictionaries of additional options
    :param callback: a function called right before the extraction is
                     performed; it receives the filename, the name of the
                     extraction method and the options dictionary as
                     positional arguments, in that order
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of tags of translator comments to search for
                         and include in the results
    :param strip_comment_tags: a flag that, if set to `True`, causes all
                               comment tags to be removed from the collected
                               comments
    :param dirpath: the path to the directory to extract messages from
    :return: iterable of 5-tuples (filename, lineno, messages, comments,
             context)
    :rtype: Iterable[tuple[str, int, str|tuple[str], list[str], str|None]]
    """
    # Path of the file as seen from dirpath; this is what patterns match.
    filename = relpath(filepath, dirpath)

    # Locate the first applicable mapping, or give up on this file.
    for pattern, method in method_map:
        if pathmatch(pattern, filename):
            break
    else:
        return

    # If several option patterns match, the one iterated last is used.
    options = {}
    for opattern, odict in options_map.items():
        if pathmatch(opattern, filename):
            options = odict

    if callback:
        callback(filename, method, options)

    extracted = extract_from_file(
        method, filepath,
        keywords=keywords,
        comment_tags=comment_tags,
        options=options,
        strip_comment_tags=strip_comment_tags
    )
    for message_tuple in extracted:
        yield (filename, ) + message_tuple
217
218
def extract_from_file(method, filename, keywords=DEFAULT_KEYWORDS,
                      comment_tags=(), options=None, strip_comment_tags=False):
    """Open a file in binary mode and extract its messages with `extract`.

    The result is a fully built list, so the file is already closed again by
    the time this function returns.  The special method name ``'ignore'``
    short-circuits to an empty list without opening the file at all.

    :param method: a string specifying the extraction method (e.g. "python")
    :param filename: the path to the file to extract messages from
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :param strip_comment_tags: a flag that, if set to `True`, causes all
                               comment tags to be removed from the collected
                               comments
    :returns: list of tuples of the form ``(lineno, message, comments,
              context)``
    :rtype: list[tuple[int, str|tuple[str], list[str], str|None]]
    """
    if method != 'ignore':
        with open(filename, 'rb') as fileobj:
            # Drain the generator while the file is still open.
            found = extract(method, fileobj, keywords, comment_tags,
                            options, strip_comment_tags)
            return list(found)
    return []
245
246
def extract(method, fileobj, keywords=DEFAULT_KEYWORDS, comment_tags=(),
            options=None, strip_comment_tags=False):
    """Extract messages from the given file-like object using the specified
    extraction method.

    This function is a generator yielding tuples of the form ``(lineno,
    message, comments, context)``.

    The implementation dispatches the actual extraction to plugins, based on the
    value of the ``method`` parameter.  Every raw result of the plugin is then
    checked against the keyword's argument specification; calls that do not
    provide all required arguments as string literals are silently skipped,
    and calls whose first message is empty are skipped with a warning written
    to ``sys.stderr``.

    >>> source = b'''# foo module
    ... def run(argv):
    ...    print(_('Hello, world!'))
    ... '''

    >>> from babel._compat import BytesIO
    >>> for message in extract('python', BytesIO(source)):
    ...     print(message)
    (3, u'Hello, world!', [], None)

    :param method: an extraction method (a callable), or
                   a string specifying the extraction method (e.g. "python");
                   if this is a simple name, the extraction function will be
                   looked up by entry point; if it is an explicit reference
                   to a function (of the form ``package.module:funcname`` or
                   ``package.module.funcname``), the corresponding function
                   will be imported and used
    :param fileobj: the file-like object the messages should be extracted from
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :raise ValueError: if the extraction method is not registered
    :returns: iterable of tuples of the form ``(lineno, message, comments, context)``
    :rtype: Iterable[tuple[int, str|tuple[str], list[str], str|None]]
    """
    func = None
    if callable(method):
        func = method
    elif ':' in method or '.' in method:
        # Explicit reference: "package.module:funcname" or
        # "package.module.funcname" (split at the last dot).
        if ':' in method:
            module, attrname = method.split(':', 1)
        else:
            module, attrname = method.rsplit('.', 1)
        func = getattr(__import__(module, {}, {}, [attrname]), attrname)
    else:
        try:
            from pkg_resources import working_set
        except ImportError:
            pass
        else:
            # Use the first entry point registered under this name, if any.
            for entry_point in working_set.iter_entry_points(GROUP_NAME,
                                                             method):
                func = entry_point.load(require=True)
                break
        if func is None:
            # if pkg_resources is not available or no usable egg-info was found
            # (see #230), we resort to looking up the builtin extractors
            # directly
            builtin = {
                'ignore': extract_nothing,
                'python': extract_python,
                'javascript': extract_javascript
            }
            func = builtin.get(method)

    if func is None:
        raise ValueError('Unknown extraction method %r' % method)

    results = func(fileobj, keywords.keys(), comment_tags,
                   options=options or {})

    for lineno, funcname, messages, comments in results:
        # A keyword mapped to None (or a result without a function name)
        # uses the default spec: the first argument is the message.
        if funcname:
            spec = keywords[funcname] or (1,)
        else:
            spec = (1,)
        if not isinstance(messages, (list, tuple)):
            messages = [messages]
        if not messages:
            continue

        # Validate the messages against the keyword's specification
        context = None
        msgs = []
        invalid = False
        # last_index is 1 based like the keyword spec
        last_index = len(messages)
        for index in spec:
            if isinstance(index, tuple):
                # ``(position, 'c')`` designates the context argument.  A call
                # with too few arguments to contain it is skipped just like a
                # call that lacks a message argument, instead of failing with
                # an IndexError that would abort the whole extraction.
                if last_index < index[0]:
                    invalid = True
                    break
                context = messages[index[0] - 1]
                continue
            if last_index < index:
                # Not enough arguments
                invalid = True
                break
            message = messages[index - 1]
            if message is None:
                # The argument was not given as a string literal
                invalid = True
                break
            msgs.append(message)
        if invalid or not msgs:
            # Nothing usable; ``not msgs`` covers a spec without any message
            # position.
            continue

        # msgs[0] is the argument at the first message position of the spec,
        # i.e. the msgid.
        if not msgs[0]:
            # An empty string msgid isn't valid, emit a warning
            where = '%s:%i' % (getattr(fileobj, 'name', None) or '(unknown)',
                               lineno)
            sys.stderr.write((empty_msgid_warning % where) + '\n')
            continue

        # A single message is yielded bare, several as a tuple.
        messages = tuple(msgs)
        if len(messages) == 1:
            messages = messages[0]

        if strip_comment_tags:
            _strip_comment_tags(comments, comment_tags)
        yield lineno, messages, comments, context
377
378
def extract_nothing(fileobj, keywords, comment_tags, options):
    """No-op extractor: report no messages, whatever the input is.

    It accepts the same four arguments as the other extractors in this module
    so that it can be used interchangeably with them; none of the arguments
    is read.

    :param fileobj: ignored
    :param keywords: ignored
    :param comment_tags: ignored
    :param options: ignored
    :return: a new, empty list
    """
    return []
384
385
def extract_python(fileobj, keywords, comment_tags, options):
    """Extract messages from Python source code.

    The source is tokenized with `tokenize.generate_tokens` and the token
    stream is walked by a small state machine that recognizes calls to the
    given keywords and collects the string literals passed to them.

    It returns an iterator yielding tuples in the following form ``(lineno,
    funcname, message, comments)``.  ``message`` is a single value when the
    call had one argument and a tuple otherwise; an argument for which no
    string literal was seen is reported as ``None``.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional); the
                    ``encoding`` option is used when `parse_encoding` yields
                    nothing, and defaults to ``'UTF-8'``
    :rtype: ``iterator``
    """
    # funcname: the keyword whose call is currently being tracked, if any.
    # message_lineno: line of the most recent "(" seen while tracking it.
    funcname = lineno = message_lineno = None
    # -1: not inside a tracked call; 0: directly inside the parentheses of
    # the tracked call; > 0: inside further parentheses nested in that call.
    call_stack = -1
    # String literal pieces seen for the argument currently being read;
    # they are joined when the argument ends.
    buf = []
    # Arguments collected so far for the tracked call.
    messages = []
    # (lineno, text) pairs of the translator comment lines collected so far.
    translator_comments = []
    in_def = in_translator_comments = False
    comment_tag = None

    # NOTE(review): parse_encoding / parse_future_flags come from babel.util;
    # presumably they inspect the file's coding declaration and
    # ``from __future__`` imports -- confirm against their documentation.
    encoding = parse_encoding(fileobj) or options.get('encoding', 'UTF-8')
    future_flags = parse_future_flags(fileobj, encoding)

    # On PY2 the lines are handed to the tokenizer as read; otherwise each
    # line is decoded with the detected encoding first.
    if PY2:
        next_line = fileobj.readline
    else:
        next_line = lambda: fileobj.readline().decode(encoding)

    tokens = generate_tokens(next_line)
    for tok, value, (lineno, _), _, _ in tokens:
        if call_stack == -1 and tok == NAME and value in ('def', 'class'):
            in_def = True
        elif tok == OP and value == '(':
            if in_def:
                # Avoid false positives for declarations such as:
                # def gettext(arg='message'):
                in_def = False
                continue
            if funcname:
                # Entering the tracked call, or a parenthesis nested in it
                message_lineno = lineno
                call_stack += 1
        elif in_def and tok == OP and value == ':':
            # End of a class definition without parens
            in_def = False
            continue
        elif call_stack == -1 and tok == COMMENT:
            # Strip the comment token from the line
            if PY2:
                value = value.decode(encoding)
            value = value[1:].strip()
            if in_translator_comments and \
                    translator_comments[-1][0] == lineno - 1:
                # We're already inside a translator comment, continue appending
                translator_comments.append((lineno, value))
                continue
            # If execution reaches this point, let's see if comment line
            # starts with one of the comment tags
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    in_translator_comments = True
                    translator_comments.append((lineno, value))
                    break
        elif funcname and call_stack == 0:
            # Directly inside the tracked call.  The call ends either at its
            # closing parenthesis or when another keyword name shows up, in
            # which case tracking switches over to that keyword.
            nested = (tok == NAME and value in keywords)
            if (tok == OP and value == ')') or nested:
                # Flush the argument that was being read
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)

                if len(messages) > 1:
                    messages = tuple(messages)
                else:
                    messages = messages[0]
                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                        translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                yield (message_lineno, funcname, messages,
                       [comment[1] for comment in translator_comments])

                # Reset the state for the next call
                funcname = lineno = message_lineno = None
                call_stack = -1
                messages = []
                translator_comments = []
                in_translator_comments = False
                if nested:
                    funcname = value
            elif tok == STRING:
                # Unwrap quotes in a safe manner, maintaining the string's
                # encoding
                # https://sourceforge.net/tracker/?func=detail&atid=355470&
                # aid=617979&group_id=5470
                # NOTE(review): the literal's source text is compiled and
                # eval'd with empty builtins.  It comes straight from the
                # file being scanned, so treat that file as trusted input;
                # a literal that references names when evaluated would
                # raise here.
                code = compile('# coding=%s\n%s' % (str(encoding), value),
                               '<string>', 'eval', future_flags)
                value = eval(code, {'__builtins__': {}}, {})
                if PY2 and not isinstance(value, text_type):
                    value = value.decode(encoding)
                buf.append(value)
            elif tok == OP and value == ',':
                # End of one argument
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)
                if translator_comments:
                    # We have translator comments, and since we're on a
                    # comma(,) user is allowed to break into a new line
                    # Let's increase the last comment's lineno in order
                    # for the comment to still be a valid one
                    old_lineno, old_comment = translator_comments.pop()
                    translator_comments.append((old_lineno + 1, old_comment))
        elif call_stack > 0 and tok == OP and value == ')':
            # Leaving a parenthesis nested inside the tracked call
            call_stack -= 1
        elif funcname and call_stack == -1:
            # The keyword name was not followed by "(": not a call
            funcname = None
        elif tok == NAME and value in keywords:
            funcname = value
510
511
def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    The whole file is read, decoded and tokenized with
    `babel.messages.jslexer.tokenize`; the token stream is then walked by a
    state machine that recognizes calls to the given keywords (including the
    tagged-template form ``keyword`text```) and collects their string
    arguments.

    This is a generator yielding tuples of the form ``(lineno, funcname,
    messages, comments)``.  ``messages`` is a single value when the call had
    one argument and a tuple otherwise; calls without any argument are not
    reported.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
                    Supported options are:
                    * `encoding` -- encoding used to decode the file,
                                    ``'utf-8'`` if not given.
                    * `jsx` -- set to false to disable JSX/E4X support.
                    * `template_string` -- set to false to disable ES6
                                           template string support.
    """
    from babel.messages.jslexer import Token, tokenize, unquote_string
    # funcname: the keyword whose call is currently being tracked, if any.
    # message_lineno: line of the most recent "(" seen while tracking it.
    funcname = message_lineno = None
    # Completed arguments of the tracked call
    messages = []
    # String value of the argument currently being read, or None
    last_argument = None
    # (lineno, text) pairs of the translator comment lines collected so far
    translator_comments = []
    # True right after a "+", so that the next string is appended to
    # last_argument instead of replacing it
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    # -1: not inside a tracked call; 0: directly inside the parentheses of
    # the tracked call; > 0: inside further parentheses nested in that call.
    call_stack = -1
    # NOTE(review): presumably makes the lexer keep dotted names such as
    # "i18n.gettext" together as one name token -- confirm in jslexer.
    dotted = any('.' in kw for kw in keywords)

    for token in tokenize(
        fileobj.read().decode(encoding),
        jsx=options.get("jsx", True),
        template_string=options.get("template_string", True),
        dotted=dotted
    ):
        if (  # Turn keyword`foo` expressions into keyword("foo") calls:
            funcname and  # have a keyword...
            (last_token and last_token.type == 'name') and  # we've seen nothing after the keyword...
            token.type == 'template_string'  # this is a template string
        ):
            # Record the template string as the only argument and replace the
            # token by a synthetic ")" so that the call-closing branch below
            # emits the message.
            message_lineno = token.lineno
            messages = [unquote_string(token.value)]
            call_stack = 0
            token = Token('operator', ')', token.lineno)

        if token.type == 'operator' and token.value == '(':
            if funcname:
                # Entering the tracked call, or a parenthesis nested in it
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            # Drop the first two characters (presumably the "//" marker)
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                # Directly continues the previous translator comment line
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            # Drop the first and last two characters (presumably the
            # "/*" and "*/" delimiters)
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        # The first line is stripped on its own; the
                        # remaining lines lose their common indentation.
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append((token.lineno + offset,
                                                        line))
                    break

        elif funcname and call_stack == 0:
            # Directly inside the tracked call
            if token.type == 'operator' and token.value == ')':
                # End of the call: flush the pending argument and emit
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                # Reset the state for the next call
                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type in ('string', 'template_string'):
                new_value = unquote_string(token.value)
                if concatenate_next:
                    # "a" + "b": extend the argument being read
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    # End of one argument; None stands for an argument for
                    # which no string was seen
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
                and token.value == ')':
            # Leaving a parenthesis nested inside the tracked call
            call_stack -= 1

        elif funcname and call_stack == -1:
            # The keyword name was not followed by "(": not a call
            funcname = None

        # A keyword name starts tracking unless it directly follows the
        # name token "function", i.e. it is the name in a declaration.
        elif call_stack == -1 and token.type == 'name' and \
            token.value in keywords and \
            (last_token is None or last_token.type != 'name' or
             last_token.value != 'function'):
            funcname = token.value

        last_token = token
646