1#! /usr/bin/env python3
2# -*- coding: iso-8859-1 -*-
3# Originally written by Barry Warsaw <barry@python.org>
4#
5# Minimally patched to make it even more xgettext compatible
6# by Peter Funk <pf@artcom-gmbh.de>
7#
# 2002-11-22 Jürgen Hermann <jh@web.de>
9# Added checks that _() only contains string literals, and
10# command line args are resolved to module lists, i.e. you
11# can now pass a filename, a module or package name, or a
12# directory (including globbing chars, important for Win32).
13# Made docstring fit in 80 chars wide displays using pydoc.
14#
15
# For self-testing: mark strings with fintl.gettext when the fintl module can
# be imported, otherwise with a marker that returns its argument unchanged.
try:
    import fintl
except ImportError:
    def _(s):
        return s
else:
    _ = fintl.gettext
22
# Help text for the command line, stored as the module docstring.  usage()
# prints it after %-formatting it with globals(), which is how
# %(DEFAULTKEYWORDS)s below gets filled in.  It is assigned through _()
# rather than written as a bare string literal.
# NOTE(review): "where ever" and "are not be extracted" below read like
# typos ("wherever", "are not to be extracted"); left untouched here because
# this is runtime text passed through _().
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well.  Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] https://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] https://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings.  These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above).  Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source.  These lines appear before
        each msgid.  The style of comments is controlled by the -S/--style
        option.  This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename.  If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments.  Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive.  GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files.  Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted.  This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")
157
158import os
159import importlib.machinery
160import importlib.util
161import sys
162import glob
163import time
164import getopt
165import ast
166import token
167import tokenize
168
# Version string printed by -V/--version and written into the pot header's
# "Generated-By" line.
__version__ = '1.5'

# Keywords treated as translation markers by default.  main() appends them
# to the keywords given with -k, and -K/--no-default-keywords empties them.
default_keywords = ['_']
# Comma-separated form of the defaults, interpolated into the help text via
# %(DEFAULTKEYWORDS)s.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used by TokenEater to join the string literals collected inside
# one keyword call.
EMPTYSTRING = ''
175
176
177
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
#
# TokenEater.write() fills in the %(time)s, %(charset)s, %(encoding)s and
# %(version)s placeholders.  The doubled backslashes make each header line
# end in a literal backslash-n sequence in the output, not a real newline.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
198
199
def usage(code, msg=''):
    """Print the help text, and *msg* if given, to stderr; then exit.

    The help text is the module docstring %-formatted with the module
    globals.  The process terminates through sys.exit(code).
    """
    err = sys.stderr
    help_text = __doc__ % globals()
    print(help_text, file=err)
    if msg:
        print(msg, file=err)
    sys.exit(code)
205
206
207
def make_escapes(pass_nonascii):
    """Build the global escape table and pick the global escape function.

    Sets two module globals:

    escapes -- list indexed by character/byte value giving its replacement
    escape  -- escape_ascii if pass_nonascii is true, else escape_nonascii
    """
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through so that e.g. 'msgid
        # "Höhe"' does not end up as 'msgid "H\366he"'.  Only the 128 ASCII
        # values need a table entry in this mode.
        size, escape = 128, escape_ascii
    else:
        # Every byte value gets an entry; anything outside the 32..126 range
        # is escaped.
        size, escape = 256, escape_nonascii

    table = []
    for code in range(size):
        if 32 <= code < 127:
            # printable ASCII maps to itself
            table.append(chr(code))
        else:
            # everything else defaults to a three-digit octal escape
            table.append(r"\%03o" % code)

    # Characters with a dedicated C-style escape override the defaults.
    special = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('"', r'\"'),
    )
    for char, replacement in special:
        table[ord(char)] = replacement
    escapes = table


def escape_ascii(s, encoding):
    """Escape only the ASCII characters of *s*; others pass through as-is.

    *encoding* is accepted for signature parity with escape_nonascii and is
    not used.
    """
    out = []
    for char in s:
        code = ord(char)
        out.append(char if code >= 128 else escapes[code])
    return ''.join(out)

def escape_nonascii(s, encoding):
    """Encode *s* with *encoding* and escape the result byte by byte."""
    encoded = s.encode(encoding)
    return ''.join([escapes[byte] for byte in encoded])
234
235
def is_literal_string(s):
    """Return True if token text *s* starts like a plain string literal.

    That is: *s* begins with a quote character, or with a single r/R/u/U
    prefix immediately followed by a quote character.
    """
    quotes = ("'", '"')
    head = s[0]
    if head in quotes:
        return True
    return head in 'rRuU' and s[1] in quotes
238
239
def safe_eval(s):
    """Return the value of the Python literal whose source text is *s*.

    Used to unwrap the quotes of string-literal tokens.  The text is
    evaluated with ast.literal_eval, which accepts literal syntax only, so
    names, attribute access and calls are rejected instead of executed
    (eval() with emptied builtins still evaluated such expressions).

    Raises ValueError or SyntaxError if *s* is not a valid literal.
    """
    return ast.literal_eval(s)
243
244
def normalize(s, encoding):
    """Render message *s* as a quoted, C-style string for a .po file.

    A message without newlines becomes one quoted, escaped string.  A
    message with newlines becomes an empty "" line followed by one quoted
    string per source line.  Escaping is done by the module-level escape()
    function, which receives *encoding*.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s, encoding) + '"'
    if not pieces[-1]:
        # The message ends with a newline: drop the empty tail and put the
        # newline back on the last real line so it is escaped with it.
        pieces.pop()
        pieces[-1] += '\n'
    escaped = [escape(piece, encoding) for piece in pieces]
    separator = '\\n"\n"'
    return '""\n"' + separator.join(escaped) + '"'
260
261
def containsAny(str, set):
    """Return True if any element of 'set' occurs in 'str', else False."""
    for candidate in set:
        if candidate in str:
            return True
    return False
265
266
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Resolution order for *name*:

    * an existing directory: every file below it whose extension equals
      the first entry of importlib.machinery.SOURCE_SUFFIXES, skipping
      directories named 'CVS';
    * any other existing path: that path alone;
    * a non-existing path containing one of the glob characters *?[]: the
      results for each glob match, combined;
    * otherwise *name* is looked up with importlib.util.find_spec() and the
      spec's origin is then treated like a path as above.

    Returns an empty list when nothing can be resolved.
    """
    if not os.path.exists(name):
        # check for glob chars
        if any(char in name for char in "*?[]"):
            matches = []
            for path in glob.glob(name):
                matches.extend(getFilesForName(path))
            return matches

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            # ImportError: a parent package cannot be imported.
            # ValueError: find_spec() rejects the name, e.g. an already
            # imported module whose __spec__ is None.
            spec = None
        # find_spec() returns None (it does not raise) when no module of
        # that name exists.
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        sources = []
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            sources.extend(
                os.path.join(root, filename) for filename in files
                if os.path.splitext(filename)[1] == py_ext)
        return sources

    if os.path.exists(name):
        # a single file
        return [name]

    return []
309
310
class TokenEater:
    """State machine that collects translatable strings from Python tokens.

    An instance is called once per token with the five fields of a tokenize
    token and forwards it to the handler for the current state:

        __waiting        -- looking for a keyword, an f-string or, with
                            docstring extraction enabled, a docstring spot
        __suiteseen      -- inside a class/def header, looking for its colon
        __suitedocstring -- right after that colon, looking for a docstring
        __keywordseen    -- a keyword name was seen; '(' must follow
        __openseen       -- inside keyword(...), collecting string literals

    Messages are stored as {msgid: {(filename, lineno): isdocstring}} and
    emitted in .pot format by write().

    The *options* object must provide: docstrings, nodocstrings, keywords,
    toexclude, writelocations, locationstyle, SOLARIS, GNU and width.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        # string literals collected for the keyword call being parsed
        self.__data = []
        # line on which the current keyword call was opened
        self.__lineno = -1
        # true until the first significant token of a file has been seen
        self.__freshmodule = 1
        self.__curfile = None
        # bracket nesting depth while scanning a class/def header
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state; handlers only need the token type,
        # its text and the row it starts on.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                # tokenize.tokenize() always yields an ENCODING token first;
                # like comments and blank lines, it may precede the module
                # docstring.
                if ttype in (tokenize.COMMENT, tokenize.NL,
                             tokenize.ENCODING):
                    return
                # Anything else means there is no module docstring.  Fall
                # through so this token is still examined below.
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen
            return
        if ttype == tokenize.STRING:
            self.__fstringseen(tstring, lineno)

    def __fstringseen(self, tstring, lineno):
        # Record keyword calls found in the replacement fields of an
        # f-string token.  Tokens that do not parse to an f-string are
        # ignored.
        maybe_fstring = ast.parse(tstring, mode='eval').body
        if not isinstance(maybe_fstring, ast.JoinedStr):
            return
        keywords = self.__options.keywords
        for value in maybe_fstring.values:
            if not isinstance(value, ast.FormattedValue):
                continue
            for call in ast.walk(value):
                if not isinstance(call, ast.Call):
                    continue
                func = call.func
                if isinstance(func, ast.Name):
                    func_name = func.id
                elif isinstance(func, ast.Attribute):
                    func_name = func.attr
                else:
                    continue

                if func_name not in keywords:
                    continue
                if len(call.args) != 1:
                    self.__warncall(_(
                        '*** %(file)s:%(lineno)s: Seen unexpected amount of'
                        ' positional arguments in gettext call: %(source_segment)s'
                        ), call, tstring, lineno)
                    continue
                if call.keywords:
                    self.__warncall(_(
                        '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments'
                        ' in gettext call: %(source_segment)s'
                        ), call, tstring, lineno)
                    continue
                arg = call.args[0]
                if not isinstance(arg, ast.Constant):
                    self.__warncall(_(
                        '*** %(file)s:%(lineno)s: Seen unexpected argument type'
                        ' in gettext call: %(source_segment)s'
                        ), call, tstring, lineno)
                    continue
                if isinstance(arg.value, str):
                    self.__addentry(arg.value, lineno)

    def __warncall(self, template, call, tstring, lineno):
        # Print a warning about a keyword call inside an f-string to stderr.
        # The call's source text is shown when it can be recovered from the
        # token, else the whole token.
        print(template % {
            'source_segment': ast.get_source_segment(tstring, call) or tstring,
            'file': self.__curfile,
            'lineno': lineno
            }, file=sys.stderr)

    def __suiteseen(self, ttype, tstring, lineno):
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in ('(', '[', '{'):
                self.__enclosurecount += 1
            elif tstring in (')', ']', '}'):
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # NL is skipped as well: a comment or blank line between the
            # header and the docstring produces one.
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in (tokenize.COMMENT, tokenize.INDENT,
                           tokenize.DEDENT, tokenize.NEWLINE, tokenize.NL):
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # Record one occurrence of msg unless it is on the exclude list.
        # lineno defaults to the line of the keyword call being parsed.
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file: remember its name for location comments
        and re-arm module docstring detection."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the pot header and all collected messages to *fp*.

        *fp* is a text file object.  Its encoding attribute (UTF-8 when that
        is empty or None) is named as the charset in the header and passed
        to normalize() for every msgid.
        """
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            reverse.setdefault(tuple(sorted(v)), []).append((k, v))
        for rkey in sorted(reverse):
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                locations = sorted(v)
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)
515
516
517
def main():
    """Command-line entry point.

    Parses sys.argv, expands the input arguments into a list of files (or
    '-' for standard input), feeds every file's tokens to a TokenEater and
    writes the collected messages to the output file (standard output when
    the output name is '-').

    Exits through usage() or sys.exit() on bad options, on -h/-V, and when
    the exclude file cannot be read.  Tokenizing errors in an input file are
    reported on stderr and processing continues with the next file.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # takes a filename, like its short form -X
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            with open(arg) as fp:
                for line in fp:
                    # Strip only the line terminator, so a final line
                    # without one keeps its last character.
                    options.nodocstrings[line.rstrip('\n')] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                # One string per line.  The line terminator is not part of
                # the string, otherwise no entry could match a message.
                options.toexclude = [line.rstrip('\n') for line in fp]
        except OSError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            # tokenize.tokenize() needs a readline that returns bytes
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
676
677
# Script entry point.
if __name__ == '__main__':
    main()
    # some more test strings
    # The two expressions below are evaluated after main() returns and their
    # results are discarded.
    # NOTE(review): presumably they exist as sample _() calls for running
    # pygettext over its own source (see the "for selftesting" import at the
    # top of the file) -- confirm.
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    # adjacent string literals inside a single _() call
    _('more' 'than' 'one' 'string')
684