# -*- coding: utf-8 -*-
"""
    babel.messages.extract
    ~~~~~~~~~~~~~~~~~~~~~~

    Basic infrastructure for extracting localizable messages from source files.

    This module defines an extensible system for collecting localizable message
    strings from a variety of sources. A native extractor for Python source
    files is builtin, extractors for other sources can be added using very
    simple plugins.

    The main entry points into the extraction functionality are the functions
    `extract_from_dir` and `extract_from_file`.

    :copyright: (c) 2013-2021 by the Babel Team.
    :license: BSD, see LICENSE for more details.
"""

import os
from os.path import relpath
import sys
from tokenize import generate_tokens, COMMENT, NAME, OP, STRING

from babel.util import parse_encoding, parse_future_flags, pathmatch
from babel._compat import PY2, text_type
from textwrap import dedent


# Entry point group under which extraction plugins are looked up by `extract`.
GROUP_NAME = 'babel.extractors'

# Maps a translation function name to its argument specification.  ``None``
# is treated by `extract` as ``(1,)``; integers are 1-based positions of
# message arguments; an ``(n, 'c')`` tuple marks the 1-based position of the
# message context argument.
DEFAULT_KEYWORDS = {
    '_': None,
    'gettext': None,
    'ngettext': (1, 2),
    'ugettext': None,
    'ungettext': (1, 2),
    'dgettext': (2,),
    'dngettext': (2, 3),
    'N_': None,
    'pgettext': ((1, 'c'), 2),
    'npgettext': ((1, 'c'), 2, 3)
}

DEFAULT_MAPPING = [('**.py', 'python')]

empty_msgid_warning = (
    '%s: warning: Empty msgid.  It is reserved by GNU gettext: gettext("") '
    'returns the header entry with meta information, not the empty string.')


def _strip_comment_tags(comments, tags):
    """Helper function for `extract` that strips comment tags from strings
    in a list of comment lines. This function operates in-place.

    :param comments: the list of comment lines; it is modified in place
    :param tags: the comment tags to remove from the start of each line
    """
    def _strip(line):
        # Only the first matching tag is removed; lines without a tag are
        # returned untouched (and unstripped).
        for tag in tags:
            if line.startswith(tag):
                return line[len(tag):].strip()
        return line
    comments[:] = [_strip(line) for line in comments]


def extract_from_dir(dirname=None, method_map=DEFAULT_MAPPING,
                     options_map=None, keywords=DEFAULT_KEYWORDS,
                     comment_tags=(), callback=None, strip_comment_tags=False):
    """Extract messages from any source files found in the given directory.

    This function generates tuples of the form ``(filename, lineno, message,
    comments, context)``.

    Which extraction method is used per file is determined by the `method_map`
    parameter, which maps extended glob patterns to extraction method names.
    For example, the following is the default mapping:

    >>> method_map = [
    ...     ('**.py', 'python')
    ... ]

    This basically says that files with the filename extension ".py" at any
    level inside the directory should be processed by the "python" extraction
    method. Files that don't match any of the mapping patterns are ignored. See
    the documentation of the `pathmatch` function for details on the pattern
    syntax.

    The following extended mapping would also use the "genshi" extraction
    method on any file in "templates" subdirectory:

    >>> method_map = [
    ...     ('**/templates/**.*', 'genshi'),
    ...     ('**.py', 'python')
    ... ]

    The dictionary provided by the optional `options_map` parameter augments
    these mappings. It uses extended glob patterns as keys, and the values are
    dictionaries mapping options names to option values (both strings).

    The glob patterns of the `options_map` do not necessarily need to be the
    same as those used in the method mapping. For example, while all files in
    the ``templates`` folders in an application may be Genshi applications, the
    options for those files may differ based on extension:

    >>> options_map = {
    ...     '**/templates/**.txt': {
    ...         'template_class': 'genshi.template:TextTemplate',
    ...         'encoding': 'latin-1'
    ...     },
    ...     '**/templates/**.html': {
    ...         'include_attrs': ''
    ...     }
    ... }

    :param dirname: the path to the directory to extract messages from.  If
                    not given the current working directory is used.
    :param method_map: a list of ``(pattern, method)`` tuples that maps
                       extended glob patterns to extraction method names
    :param options_map: a dictionary of additional options (optional)
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of tags of translator comments to search for
                         and include in the results
    :param callback: a function that is called for every file that messages
                     are extracted from, just before the extraction itself is
                     performed; the function is passed the filename, the name
                     of the extraction method and the options dictionary as
                     positional arguments, in that order
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :see: `pathmatch`
    """
    if dirname is None:
        dirname = os.getcwd()
    if options_map is None:
        options_map = {}

    absname = os.path.abspath(dirname)
    for root, dirnames, filenames in os.walk(absname):
        # Prune hidden and private directories in place so that os.walk does
        # not descend into them.
        dirnames[:] = [
            subdir for subdir in dirnames
            if not (subdir.startswith('.') or subdir.startswith('_'))
        ]
        # Sort for a deterministic traversal order.
        dirnames.sort()
        filenames.sort()
        for filename in filenames:
            filepath = os.path.join(root, filename).replace(os.sep, '/')

            for message_tuple in check_and_call_extract_file(
                filepath,
                method_map,
                options_map,
                callback,
                keywords,
                comment_tags,
                strip_comment_tags,
                dirpath=absname,
            ):
                yield message_tuple


def check_and_call_extract_file(filepath, method_map, options_map,
                                callback, keywords, comment_tags,
                                strip_comment_tags, dirpath=None):
    """Checks if the given file matches an extraction method mapping, and if so, calls extract_from_file.

    Note that the extraction method mappings are based relative to dirpath.
    So, given an absolute path to a file `filepath`, we want to check using
    just the relative path from `dirpath` to `filepath`.

    Yields 5-tuples (filename, lineno, messages, comments, context).

    :param filepath: An absolute path to a file that exists.
    :param method_map: a list of ``(pattern, method)`` tuples that maps
                       extended glob patterns to extraction method names
    :param options_map: a dictionary of additional options (optional)
    :param callback: a function that is called for every file that messages
                     are extracted from, just before the extraction itself is
                     performed; the function is passed the filename, the name
                     of the extraction method and the options dictionary as
                     positional arguments, in that order
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of tags of translator comments to search for
                         and include in the results
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :param dirpath: the path to the directory to extract messages from.
    :return: iterable of 5-tuples (filename, lineno, messages, comments, context)
    :rtype: Iterable[tuple[str, int, str|tuple[str], list[str], str|None]]
    """
    # filename is the relative path from dirpath to the actual file
    filename = relpath(filepath, dirpath)

    for pattern, method in method_map:
        if not pathmatch(pattern, filename):
            continue

        options = {}
        # No early exit here: when several option patterns match, the last
        # one in iteration order is the one that is used.
        for opattern, odict in options_map.items():
            if pathmatch(opattern, filename):
                options = odict
        if callback:
            callback(filename, method, options)
        for message_tuple in extract_from_file(
            method, filepath,
            keywords=keywords,
            comment_tags=comment_tags,
            options=options,
            strip_comment_tags=strip_comment_tags
        ):
            yield (filename, ) + message_tuple

        # Only the first matching method mapping is applied to a file.
        break


def extract_from_file(method, filename, keywords=DEFAULT_KEYWORDS,
                      comment_tags=(), options=None, strip_comment_tags=False):
    """Extract messages from a specific file.

    This function returns a list of tuples of the form ``(lineno, message, comments, context)``.

    :param filename: the path to the file to extract messages from
    :param method: a string specifying the extraction method (e.g. "python")
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :param options: a dictionary of additional options (optional)
    :returns: list of tuples of the form ``(lineno, message, comments, context)``
    :rtype: list[tuple[int, str|tuple[str], list[str], str|None]]
    """
    # Short-circuit so that ignored files are never even opened.
    if method == 'ignore':
        return []

    with open(filename, 'rb') as fileobj:
        # Materialize the generator while the file is still open.
        return list(extract(method, fileobj, keywords, comment_tags,
                            options, strip_comment_tags))


def extract(method, fileobj, keywords=DEFAULT_KEYWORDS, comment_tags=(),
            options=None, strip_comment_tags=False):
    """Extract messages from the given file-like object using the specified
    extraction method.

    This function returns tuples of the form ``(lineno, message, comments, context)``.

    The implementation dispatches the actual extraction to plugins, based on the
    value of the ``method`` parameter.

    >>> source = b'''# foo module
    ... def run(argv):
    ...     print(_('Hello, world!'))
    ... '''

    >>> from babel._compat import BytesIO
    >>> for message in extract('python', BytesIO(source)):
    ...     print(message)
    (3, u'Hello, world!', [], None)

    Calls that do not supply enough arguments for the keyword's
    specification (message or context positions) are skipped.  Calls whose
    first message argument is empty are skipped as well, after a warning
    has been written to ``sys.stderr``.

    :param method: an extraction method (a callable), or
                   a string specifying the extraction method (e.g. "python");
                   if this is a simple name, the extraction function will be
                   looked up by entry point; if it is an explicit reference
                   to a function (of the form ``package.module:funcname`` or
                   ``package.module.funcname``), the corresponding function
                   will be imported and used
    :param fileobj: the file-like object the messages should be extracted from
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :raise ValueError: if the extraction method is not registered
    :returns: iterable of tuples of the form ``(lineno, message, comments, context)``
    :rtype: Iterable[tuple[int, str|tuple[str], list[str], str|None]]
    """
    func = None
    if callable(method):
        func = method
    elif ':' in method or '.' in method:
        # Explicit reference: ``package.module:funcname`` or
        # ``package.module.funcname``
        if ':' not in method:
            lastdot = method.rfind('.')
            module, attrname = method[:lastdot], method[lastdot + 1:]
        else:
            module, attrname = method.split(':', 1)
        func = getattr(__import__(module, {}, {}, [attrname]), attrname)
    else:
        try:
            from pkg_resources import working_set
        except ImportError:
            pass
        else:
            # Use the first entry point registered under the given name
            for entry_point in working_set.iter_entry_points(GROUP_NAME,
                                                             method):
                func = entry_point.load(require=True)
                break
        if func is None:
            # if pkg_resources is not available or no usable egg-info was found
            # (see #230), we resort to looking up the builtin extractors
            # directly
            builtin = {
                'ignore': extract_nothing,
                'python': extract_python,
                'javascript': extract_javascript
            }
            func = builtin.get(method)

    if func is None:
        raise ValueError('Unknown extraction method %r' % method)

    results = func(fileobj, keywords.keys(), comment_tags,
                   options=options or {})

    for lineno, funcname, messages, comments in results:
        if funcname:
            spec = keywords[funcname] or (1,)
        else:
            spec = (1,)
        if not isinstance(messages, (list, tuple)):
            messages = [messages]
        if not messages:
            continue

        # Validate the messages against the keyword's specification
        context = None
        msgs = []
        invalid = False
        # last_index is 1 based like the keyword spec
        last_index = len(messages)
        for index in spec:
            if isinstance(index, tuple):
                # ``(n, 'c')`` entry: argument n carries the message context
                if last_index < index[0]:
                    # Not enough arguments to hold the context; skip the
                    # call instead of failing with an IndexError
                    invalid = True
                    break
                context = messages[index[0] - 1]
                continue
            if last_index < index:
                # Not enough arguments
                invalid = True
                break
            message = messages[index - 1]
            if message is None:
                invalid = True
                break
            msgs.append(message)
        if invalid:
            continue

        # keyword spec indexes are 1 based, therefore '-1'
        if isinstance(spec[0], tuple):
            # context-aware *gettext method
            first_msg_index = spec[1] - 1
        else:
            first_msg_index = spec[0] - 1
        if not messages[first_msg_index]:
            # An empty string msgid isn't valid, emit a warning
            where = '%s:%i' % (getattr(fileobj, 'name', None) or '(unknown)',
                               lineno)
            sys.stderr.write((empty_msgid_warning % where) + '\n')
            continue

        # A single message is yielded bare, several messages as a tuple
        messages = tuple(msgs)
        if len(messages) == 1:
            messages = messages[0]

        if strip_comment_tags:
            _strip_comment_tags(comments, comment_tags)
        yield lineno, messages, comments, context


def extract_nothing(fileobj, keywords, comment_tags, options):
    """Pseudo extractor that does not actually extract anything, but simply
    returns an empty list.

    All parameters are accepted for signature compatibility with the other
    extractors and are ignored.

    :param fileobj: unused
    :param keywords: unused
    :param comment_tags: unused
    :param options: unused
    :rtype: ``list``
    """
    return []


def extract_python(fileobj, keywords, comment_tags, options):
    """Extract messages from Python source code.

    It returns an iterator yielding tuples in the following form ``(lineno,
    funcname, message, comments)``.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :rtype: ``iterator``
    """
    funcname = lineno = message_lineno = None
    # -1: not inside a keyword call; 0: directly inside the argument list of
    # a keyword call; > 0: inside nested parentheses within that call
    call_stack = -1
    # String literal pieces of the argument currently being collected
    buf = []
    messages = []
    translator_comments = []
    in_def = in_translator_comments = False
    comment_tag = None

    encoding = parse_encoding(fileobj) or options.get('encoding', 'UTF-8')
    future_flags = parse_future_flags(fileobj, encoding)

    if PY2:
        next_line = fileobj.readline
    else:
        next_line = lambda: fileobj.readline().decode(encoding)

    tokens = generate_tokens(next_line)
    for tok, value, (lineno, _), _, _ in tokens:
        if call_stack == -1 and tok == NAME and value in ('def', 'class'):
            in_def = True
        elif tok == OP and value == '(':
            if in_def:
                # Avoid false positives for declarations such as:
                # def gettext(arg='message'):
                in_def = False
                continue
            if funcname:
                message_lineno = lineno
                call_stack += 1
        elif in_def and tok == OP and value == ':':
            # End of a class definition without parens
            in_def = False
            continue
        elif call_stack == -1 and tok == COMMENT:
            # Strip the comment token from the line
            if PY2:
                value = value.decode(encoding)
            value = value[1:].strip()
            if in_translator_comments and \
                    translator_comments[-1][0] == lineno - 1:
                # We're already inside a translator comment, continue appending
                translator_comments.append((lineno, value))
                continue
            # If execution reaches this point, let's see if comment line
            # starts with one of the comment tags
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    in_translator_comments = True
                    translator_comments.append((lineno, value))
                    break
        elif funcname and call_stack == 0:
            # A keyword name appearing inside the argument list ends the
            # current call and starts tracking the nested one
            nested = (tok == NAME and value in keywords)
            if (tok == OP and value == ')') or nested:
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)

                if len(messages) > 1:
                    messages = tuple(messages)
                else:
                    messages = messages[0]
                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                        translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                yield (message_lineno, funcname, messages,
                       [comment[1] for comment in translator_comments])

                # Reset the state machine for the next call
                funcname = lineno = message_lineno = None
                call_stack = -1
                messages = []
                translator_comments = []
                in_translator_comments = False
                if nested:
                    funcname = value
            elif tok == STRING:
                # Unwrap quotes in a safe manner, maintaining the string's
                # encoding
                # https://sourceforge.net/tracker/?func=detail&atid=355470&
                # aid=617979&group_id=5470
                # NOTE(review): this evaluates a STRING token taken from the
                # scanned source, with builtins disabled; treat scanned
                # files as trusted input.
                code = compile('# coding=%s\n%s' % (str(encoding), value),
                               '<string>', 'eval', future_flags)
                value = eval(code, {'__builtins__': {}}, {})
                if PY2 and not isinstance(value, text_type):
                    value = value.decode(encoding)
                buf.append(value)
            elif tok == OP and value == ',':
                # End of one argument: an argument without any string
                # literal is recorded as None
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)
                if translator_comments:
                    # We have translator comments, and since we're on a
                    # comma(,) user is allowed to break into a new line
                    # Let's increase the last comment's lineno in order
                    # for the comment to still be a valid one
                    old_lineno, old_comment = translator_comments.pop()
                    translator_comments.append((old_lineno + 1, old_comment))
        elif call_stack > 0 and tok == OP and value == ')':
            call_stack -= 1
        elif funcname and call_stack == -1:
            # The keyword name was not followed by an opening parenthesis
            funcname = None
        elif tok == NAME and value in keywords:
            funcname = value


def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    It yields tuples in the following form ``(lineno, funcname, messages,
    comments)``.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
                    Supported options are:
                    * `jsx` -- set to false to disable JSX/E4X support.
                    * `template_string` -- set to false to disable ES6
                                           template string support.
    """
    from babel.messages.jslexer import Token, tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    last_argument = None
    translator_comments = []
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    # -1: not inside a keyword call; 0: directly inside the argument list of
    # a keyword call; > 0: inside nested parentheses within that call
    call_stack = -1
    dotted = any('.' in kw for kw in keywords)

    for token in tokenize(
        fileobj.read().decode(encoding),
        jsx=options.get("jsx", True),
        template_string=options.get("template_string", True),
        dotted=dotted
    ):
        if (  # Turn keyword`foo` expressions into keyword("foo") calls:
            funcname and  # have a keyword...
            (last_token and last_token.type == 'name') and  # we've seen nothing after the keyword...
            token.type == 'template_string'  # this is a template string
        ):
            message_lineno = token.lineno
            messages = [unquote_string(token.value)]
            call_stack = 0
            # Pretend the call was closed so the branch below emits it
            token = Token('operator', ')', token.lineno)

        if token.type == 'operator' and token.value == '(':
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                # Continuation of a translator comment on the previous line
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append((token.lineno + offset,
                                                        line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                # Reset the state machine for the next call
                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type in ('string', 'template_string'):
                new_value = unquote_string(token.value)
                if concatenate_next:
                    # Join with the previous literal ('a' + 'b')
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    # End of one argument: an argument without any string
                    # literal is recorded as None
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
                and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            # The keyword name was not followed by an opening parenthesis
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
            token.value in keywords and \
            (last_token is None or last_token.type != 'name' or
             last_token.value != 'function'):
            # A keyword name that is not being declared via ``function``
            funcname = token.value

        last_token = token