1#! /usr/bin/env python3
2# -*- coding: iso-8859-1 -*-
3# Originally written by Barry Warsaw <barry@python.org>
4#
5# Minimally patched to make it even more xgettext compatible
6# by Peter Funk <pf@artcom-gmbh.de>
7#
# 2002-11-22 Jürgen Hermann <jh@web.de>
9# Added checks that _() only contains string literals, and
10# command line args are resolved to module lists, i.e. you
11# can now pass a filename, a module or package name, or a
12# directory (including globbing chars, important for Win32).
13# Made docstring fit in 80 chars wide displays using pydoc.
14#
15
# for selftesting: when the optional ``fintl`` module can be imported, its
# gettext() becomes the translation marker ``_``; otherwise ``_`` simply
# hands its argument back unchanged.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    def _(s):
        return s
22
23__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
24
25Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
26internationalization of C programs. Most of these tools are independent of
27the programming language and can be used from within Python programs.
28Martin von Loewis' work[1] helps considerably in this regard.
29
30There's one problem though; xgettext is the program that scans source code
31looking for message strings, but it groks only C (or C++). Python
32introduces a few wrinkles, such as dual quoting characters, triple quoted
33strings, and raw strings. xgettext understands none of this.
34
35Enter pygettext, which uses Python's standard tokenize module to scan
36Python source code, generating .pot files identical to what GNU xgettext[2]
37generates for C and C++ code. From there, the standard GNU tools can be
38used.
39
40A word about marking Python strings as candidates for translation. GNU
41xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
42and gettext_noop. But those can be a lot of text to include all over your
43code. C and C++ have a trick: they use the C preprocessor. Most
44internationalized C source includes a #define for gettext() to _() so that
45what has to be written in the source is much less. Thus these are both
46translatable strings:
47
48    gettext("Translatable String")
49    _("Translatable String")
50
51Python of course has no preprocessor so this doesn't work so well.  Thus,
52pygettext searches only for _() by default, but see the -k/--keyword flag
53below for how to augment this.
54
55 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
56 [2] http://www.gnu.org/software/gettext/gettext.html
57
58NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext wherever possible. However some options are still missing or are
60not fully implemented. Also, xgettext's use of command line switches with
61option arguments is broken, and in these cases, pygettext just defines
62additional switches.
63
64Usage: pygettext [options] inputfile ...
65
66Options:
67
68    -a
69    --extract-all
70        Extract all strings.
71
72    -d name
73    --default-domain=name
74        Rename the default output file from messages.pot to name.pot.
75
76    -E
77    --escape
78        Replace non-ASCII characters with octal escape sequences.
79
80    -D
81    --docstrings
82        Extract module, class, method, and function docstrings.  These do
83        not need to be wrapped in _() markers, and in fact cannot be for
84        Python to consider them docstrings. (See also the -X option).
85
86    -h
87    --help
88        Print this help message and exit.
89
90    -k word
91    --keyword=word
92        Keywords to look for in addition to the default set, which are:
93        %(DEFAULTKEYWORDS)s
94
95        You can have multiple -k flags on the command line.
96
97    -K
98    --no-default-keywords
99        Disable the default set of keywords (see above).  Any keywords
100        explicitly added with the -k/--keyword option are still recognized.
101
102    --no-location
103        Do not write filename/lineno location comments.
104
105    -n
106    --add-location
107        Write filename/lineno location comments indicating where each
108        extracted string is found in the source.  These lines appear before
109        each msgid.  The style of comments is controlled by the -S/--style
110        option.  This is the default.
111
112    -o filename
113    --output=filename
114        Rename the default output file from messages.pot to filename.  If
115        filename is `-' then the output is sent to standard out.
116
117    -p dir
118    --output-dir=dir
119        Output files will be placed in directory dir.
120
121    -S stylename
122    --style stylename
123        Specify which style to use for location comments.  Two styles are
124        supported:
125
126        Solaris  # File: filename, line: line-number
127        GNU      #: filename:line
128
129        The style name is case insensitive.  GNU style is the default.
130
131    -v
132    --verbose
133        Print the names of the files being processed.
134
135    -V
136    --version
137        Print the version of pygettext and exit.
138
139    -w columns
140    --width=columns
141        Set width of output to columns.
142
143    -x filename
144    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
146        extracted from the input files.  Each string to be excluded must
147        appear on a line by itself in the file.
148
149    -X filename
150    --no-docstrings=filename
151        Specify a file that contains a list of files (one per line) that
152        should not have their docstrings extracted.  This is only useful in
153        conjunction with the -D option above.
154
155If `inputfile' is -, standard input is read.
156""")
157
158import os
159import importlib.machinery
160import importlib.util
161import sys
162import glob
163import time
164import getopt
165import token
166import tokenize
167
# Version string; it is interpolated into the "Generated-By" line of the .pot
# header and printed by the -V/--version option.
__version__ = '1.5'

# Keywords recognized as translation markers unless -K/--no-default-keywords
# is given; main() appends them to any keywords added with -k.
default_keywords = ['_']
# Comma-separated form of the default keywords, substituted for
# %(DEFAULTKEYWORDS)s when usage() formats the help text with globals().
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used to join the adjacent string literals found inside one
# keyword call into a single message.
EMPTYSTRING = ''



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.  The %(time)s, %(charset)s, %(encoding)s and %(version)s placeholders
# are filled in by TokenEater.write().  Backslashes are doubled so that the
# written file contains a literal "\n" at the end of each header line.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
197
198
def usage(code, msg=''):
    """Print the help text (and an optional message) to stderr, then exit.

    The help text is the module-level ``__doc__`` template formatted with
    the module globals, which fills in ``%(DEFAULTKEYWORDS)s``.

    Arguments:
        code -- exit status handed to sys.exit().
        msg  -- extra text printed after the help; skipped when falsy.
    """
    stream = sys.stderr
    help_text = __doc__ % globals()
    print(help_text, file=stream)
    if msg:
        print(msg, file=stream)
    sys.exit(code)
204
205
206
def make_escapes(pass_nonascii):
    """(Re)build the module-level escape table and pick the escape function.

    Sets two globals:
        escapes -- list indexed by character/byte value giving the text to
                   write for that value.
        escape  -- escape_ascii when pass_nonascii is true, otherwise
                   escape_nonascii.
    """
    global escapes, escape
    if pass_nonascii:
        # Let non-ASCII characters pass through untouched, so that a msgid
        # containing e.g. an o-umlaut is not written as "\366".  Only the
        # 128 ASCII values need a table entry in this mode.
        size, escape = 128, escape_ascii
    else:
        # Every byte value outside the printable 32..126 range is escaped.
        size, escape = 256, escape_nonascii

    table = []
    for code in range(size):
        if 32 <= code < 127:
            # printable ASCII stands for itself
            table.append(chr(code))
        else:
            # everything else becomes a three-digit octal escape
            table.append(r"\%03o" % code)

    # Characters with a dedicated C-style escape override the defaults.
    special = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    for char, replacement in special:
        table[ord(char)] = replacement
    escapes = table
226
227
def escape_ascii(s, encoding):
    """Escape only the ASCII characters of s; others pass through as-is.

    Characters with a code point below 128 are looked up in the global
    ``escapes`` table.  The ``encoding`` argument is accepted for signature
    parity with escape_nonascii() and is not used.
    """
    pieces = []
    for char in s:
        code = ord(char)
        pieces.append(char if code >= 128 else escapes[code])
    return ''.join(pieces)
230
def escape_nonascii(s, encoding):
    """Encode s with ``encoding`` and escape it byte by byte.

    Each byte of the encoded text is replaced by its entry in the global
    ``escapes`` table.
    """
    encoded = s.encode(encoding)
    pieces = [escapes[byte] for byte in encoded]
    return ''.join(pieces)
233
234
def is_literal_string(s):
    """Return True if token text s is a plain, raw or u-prefixed string.

    Only the first one or two characters are inspected: s must start with
    a quote, or with one of r/R/u/U directly followed by a quote.  Other
    prefixes (for example b or f) therefore yield False.
    """
    quotes = ("'", '"')
    first = s[0]
    if first in quotes:
        return True
    return first in 'rRuU' and s[1] in quotes
237
238
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    NOTE(review): this relies on eval() with an emptied ``__builtins__``
    namespace.  Callers in this file only pass tokens that already passed
    is_literal_string(); do not feed it arbitrary expressions.
    """
    # unwrap quotes, safely
    empty_globals = {'__builtins__': {}}
    empty_locals = {}
    return eval(s, empty_globals, empty_locals)
242
243
def normalize(s, encoding):
    """Render s as the quoted, C-style text used after msgid in a .po file.

    A string without newlines becomes one quoted, escaped line.  A string
    with newlines becomes an empty "" line followed by one quoted line per
    source line.  Escaping is delegated to the global ``escape`` function
    selected by make_escapes().
    """
    parts = s.split('\n')
    if len(parts) == 1:
        return '"' + escape(s, encoding) + '"'

    if not parts[-1]:
        # s ended with a newline: drop the empty tail and put the newline
        # back on the new last line so it is escaped along with it.
        parts.pop()
        parts[-1] += '\n'
    escaped = [escape(part, encoding) for part in parts]
    # every line but the last ends with an escaped newline and a line break
    separator = '\\n"\n"'
    return '""\n"' + separator.join(escaped) + '"'
259
260
def containsAny(str, set):
    """Return True if 'str' contains at least one of the chars in 'set'."""
    for char in set:
        if char in str:
            return True
    return False
264
265
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    Resolution order:
      1. If name does not exist on disk and contains glob characters, it is
         expanded with glob and every match is resolved recursively.
      2. If name does not exist on disk otherwise, it is looked up as an
         importable module name and replaced by the origin of its spec.
      3. A directory is walked recursively (skipping CVS directories) and
         all Python source files in it are returned.
      4. An existing file is returned as a one-element list.

    Returns an empty list when nothing can be found.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            matches = []
            for path in glob.glob(name):
                matches.extend(getFilesForName(path))
            return matches

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            # ImportError: a parent package could not be imported;
            # ValueError: e.g. an empty module name.
            spec = None
        # find_spec() returns None (it does not raise) for unknown modules.
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        sources = []
        # get extension for python source files
        py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories; pruning dirs in place
            # tells os.walk() to skip them
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            sources.extend(
                os.path.join(root, filename) for filename in files
                if os.path.splitext(filename)[1] == py_ext
                )
        return sources

    if os.path.exists(name):
        # a single file
        return [name]

    return []
308
309
class TokenEater:
    """State machine that collects translatable strings from Python tokens.

    An instance is called once per token with the 5-tuple produced by the
    tokenize module.  The current state is a bound method stored in
    ``self.__state``; each state method looks at one token and may switch
    to another state:

        __waiting        -- idle; looks for docstrings, 'class'/'def' and
                            marker keywords
        __suiteseen      -- inside a class/def header, looking for its ':'
        __suitedocstring -- after the header, looking for a docstring
        __keywordseen    -- a marker keyword was seen, expecting '('
        __openseen       -- inside keyword(...), collecting string literals

    Collected messages are kept in a dict mapping the message text to a
    dict of {(filename, lineno): isdocstring} and are written in .pot
    format by write().
    """

    def __init__(self, options):
        """Create an eater driven by the given options object.

        The attributes read from options are: docstrings, nodocstrings,
        keywords and toexclude while scanning, and writelocations,
        locationstyle, GNU, SOLARIS and width when writing.
        """
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        """Feed one token; only its type, text and start row are used."""
        # dispatch
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for docstring candidates and marker keywords."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                # tokenize.tokenize() always emits an ENCODING token first;
                # it, comments and blank lines may precede the docstring.
                if ttype in (tokenize.COMMENT, tokenize.NL,
                             tokenize.ENCODING):
                    return
                # Any other token means the module has no docstring.  Fall
                # through so this token is still examined below instead of
                # being swallowed.
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """Skip a class/def header up to the colon that ends it."""
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in ('(', '[', '{'):
                self.__enclosurecount += 1
            elif tstring in (')', ']', '}'):
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        """Record the docstring that follows a class/def header, if any."""
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # NL is emitted for blank lines and after comment-only lines,
            # both of which may legally precede a docstring.
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """A marker keyword was seen; it only counts if '(' follows."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Collect the string literals inside keyword( ... )."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Remember msg at (current file, lineno) unless it is excluded.

        lineno defaults to the line of the opening parenthesis recorded by
        __keywordseen().
        """
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Announce that the following tokens come from a new file."""
        self.__curfile = filename
        self.__freshmodule = 1
        # Start every file from a clean state so that a file that ended
        # abruptly (e.g. after a tokenize error) cannot affect the next one.
        self.__state = self.__waiting
        self.__enclosurecount = 0

    def write(self, fp):
        """Write the header and all collected messages to fp in .pot format.

        fp is a text file object; its ``encoding`` attribute (or 'UTF-8' if
        that is empty) is named in the header and used for escaping.
        """
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)
462
463
464
def main():
    """Command-line entry point.

    Parses sys.argv, resolves the input arguments to a list of files, feeds
    the tokens of every file to a TokenEater and finally writes the
    collected messages to the output file (or to standard output when the
    output file name is '-').

    Exits through usage() on invalid options, and with status 1 when the
    exclude file or the no-docstrings file cannot be read.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # takes a filename, hence the trailing '='
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0

        def __init__(self):
            # mutable defaults live on the instance, not on the class
            self.keywords = []
            self.nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            try:
                with open(arg) as fp:
                    for line in fp:
                        # Strip only the line terminator, so that a last
                        # line without one keeps its final character.
                        options.nodocstrings[line.rstrip('\n')] = 1
            except OSError:
                print(_(
                    "Can't read --no-docstrings file: %s") % arg,
                    file=sys.stderr)
                sys.exit(1)

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                # One string per line; drop the line terminator so the
                # entries can compare equal to the extracted messages.
                options.toexclude = [line.rstrip('\n') for line in fp]
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            # tokenize.tokenize() needs a readline that returns bytes
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                # report the problem and carry on with the next file
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
623
624
# Script entry point.  The marker calls after main() serve as sample input
# when pygettext is run on its own source; their results are discarded.
if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    # NOTE(review): as written, the closing parenthesis directly follows the
    # string literal, so TokenEater would record this message without
    # printing a warning -- confirm what the comment above refers to.
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    # adjacent string literals inside one call are joined into one message
    _('more' 'than' 'one' 'string')
631