1#! /usr/bin/env python
2# -*- coding: iso-8859-1 -*-
3# Originally written by Barry Warsaw <barry@python.org>
4#
5# Minimally patched to make it even more xgettext compatible
6# by Peter Funk <pf@artcom-gmbh.de>
7#
# 2002-11-22 Jürgen Hermann <jh@web.de>
9# Added checks that _() only contains string literals, and
10# command line args are resolved to module lists, i.e. you
11# can now pass a filename, a module or package name, or a
12# directory (including globbing chars, important for Win32).
13# Made docstring fit in 80 chars wide displays using pydoc.
14#
15
# for selftesting
# Bind _ to fintl's gettext when that module can be imported; otherwise use
# an identity function so marked strings are returned unchanged.
try:
    import fintl
except ImportError:
    _ = lambda s: s
else:
    _ = fintl.gettext
22
# The module docstring doubles as the usage text printed by usage().  It is
# passed through _() so it can be translated, and usage() interpolates
# %(DEFAULTKEYWORDS)s from the module globals.
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well.  Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext wherever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings.  These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above).  Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source.  These lines appear before
        each msgid.  The style of comments is controlled by the -S/--style
        option.  This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename.  If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments.  Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive.  GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
        extracted from the input files.  Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted.  This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")
157
158import os
159import imp
160import sys
161import glob
162import time
163import getopt
164import token
165import tokenize
166import operator
167
# Version string; reported by -V/--version and written into the
# "Generated-By" line of the .pot header.
__version__ = '1.5'

# Keywords recognized by default.  DEFAULTKEYWORDS is the comma separated
# form that gets interpolated into the usage text.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments together.
EMPTYSTRING = ''
174
175
176
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# The %(time)s and %(version)s placeholders are filled in when the header is
# written out.  The doubled backslashes make a literal backslash-n appear
# inside the quoted header lines of the generated file.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
197
198
def usage(code, msg=''):
    """Write the usage text to stderr and exit with status `code'.

    The module docstring is interpolated with the module globals (this is
    what fills in %(DEFAULTKEYWORDS)s).  When `msg' is true it is printed
    on its own line after the usage text.
    """
    err = sys.stderr
    print >> err, __doc__ % globals()
    if msg:
        print >> err, msg
    sys.exit(code)
204
205
206
escapes = []

def make_escapes(pass_iso8859):
    """Rebuild the global 256-entry `escapes' table used by escape().

    Entry i holds the text that replaces the character with ordinal i.
    When `pass_iso8859' is true only ordinals below 128 are candidates for
    octal escaping, so characters 128..255 pass through unchanged (an
    o-umlaut stays as it is instead of becoming \\366).  Otherwise every
    ordinal outside the 32..126 range is replaced by a three digit octal
    escape.  Backslash, tab, CR, LF and the double quote always get their
    C style escapes.
    """
    global escapes
    if pass_iso8859:
        limit = 128
    else:
        limit = 256
    table = []
    for code in range(256):
        printable = 32 <= code <= 126
        if code < limit and not printable:
            table.append("\\%03o" % code)
        else:
            table.append(chr(code))
    # These override whatever the loop above stored for the same ordinals.
    overrides = (
        ('\\', '\\\\'),
        ('\t', '\\t'),
        ('\r', '\\r'),
        ('\n', '\\n'),
        ('\"', '\\"'),
        )
    for char, replacement in overrides:
        table[ord(char)] = replacement
    escapes = table
227
228
def escape(s):
    """Return `s' with every character replaced by its `escapes' entry.

    The table is indexed by ord(character), so make_escapes() must have
    populated it first.
    """
    return EMPTYSTRING.join([escapes[ord(char)] for char in s])
235
236
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    The expression is evaluated with an empty __builtins__ mapping and an
    empty local namespace, which unwraps the quotes and string prefixes.
    """
    # NOTE(review): emptying __builtins__ is not a real sandbox; this is
    # only appropriate for text the tokenizer reported as a STRING token.
    restricted_globals = {'__builtins__': {}}
    restricted_locals = {}
    return eval(s, restricted_globals, restricted_locals)
240
241
def normalize(s):
    """Return `s' as a quoted, escaped string for a .po file.

    A string without newlines becomes one double quoted line.  A string
    with newlines becomes an empty "" line followed by one quoted line per
    source line, each ending in a literal backslash-n.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The string ended in a newline: drop the empty tail and give the
        # newline back to the last real line so it still gets escaped.
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'
    quoted = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(quoted) + '"'
257
258
def containsAny(str, set):
    """Return true if at least one element of 'set' occurs in 'str'."""
    for char in set:
        if char in str:
            return True
    return False
262
263
def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a directory for
    a package. Return None if the name is not found, or is a builtin or
    extension module.
    """
    # Separate the top-most name from whatever follows the first dot.
    pieces = dotted_name.split('.', 1)
    try:
        fileobj, location, info = imp.find_module(pieces[0], pathlist)
    except ImportError:
        return None
    if fileobj:
        fileobj.close()
    kind = info[2]

    if len(pieces) == 1:
        # Plain name: only source modules and packages have a usable path.
        if kind in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
            return location
        return None

    # Dotted name: the top-level part has to be a package, and the remaining
    # parts are then looked up inside that package's directory.
    if kind != imp.PKG_DIRECTORY:
        return None
    return _get_modpkg_path(pieces[1], [location])
301
302
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # Not a path as given: first see whether it is a glob pattern.
        if containsAny(name, "*?[]"):
            matched = []
            for hit in glob.glob(name):
                matched.extend(getFilesForName(hit))
            return matched

        # Otherwise treat it as a module or package name.
        name = _get_modpkg_path(name)
        if not name:
            return []

    if not os.path.isdir(name):
        # Either a single file or nothing at all.
        if os.path.exists(name):
            return [name]
        return []

    # A directory: collect every Python source file below it.  The source
    # suffix is looked up once and cached in a module global.
    if '_py_ext' not in globals():
        global _py_ext
        _py_ext = [triple[0] for triple in imp.get_suffixes()
                   if triple[2] == imp.PY_SOURCE][0]
    collected = []
    for root, dirs, files in os.walk(name):
        # don't recurse into CVS directories
        if 'CVS' in dirs:
            dirs.remove('CVS')
        for entry in files:
            if os.path.splitext(entry)[1] == _py_ext:
                collected.append(os.path.join(root, entry))
    return collected
344
345
class TokenEater:
    """Token callback that collects translatable strings.

    An instance is called once per token with the five values delivered by
    the tokenize module.  It is a small state machine: self.__state always
    holds the bound method that handles the next token.  Collected messages
    live in self.__messages, which maps each message string to a dictionary
    whose keys are (filename, lineno) tuples and whose values are 1 when
    that occurrence came from a docstring and 0 otherwise.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        # nesting depth of ( [ { while scanning a class/def header
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state; only the start row is passed on
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for docstring positions and keywords."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL):
                    return
                # The first real token is not a string, so the module has
                # no docstring.  Fall through so that this token is still
                # examined below; otherwise a file that starts with a
                # class, a def or a keyword call would lose it.
                self.__freshmodule = 0
            # class or function docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__enclosurecount = 0
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """Skip a class/def header up to the colon that ends it."""
        # A colon inside brackets (dict display, slice, lambda in a default
        # argument) does not end the header, so track the nesting depth.
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                self.__state = self.__suitedocstring
            elif tstring in ('(', '[', '{'):
                self.__enclosurecount += 1
            elif tstring in (')', ']', '}'):
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        """Right after a class/def header: is the next real token a string?"""
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.NL, tokenize.INDENT,
                           tokenize.COMMENT):
            # There was no docstring.  Hand this token to the idle state
            # instead of dropping it, so that e.g. the first `def' inside a
            # class without a docstring is still recognized.
            self.__state = self.__waiting
            self.__waiting(ttype, tstring, lineno)
        # NEWLINE, NL (blank or comment-only lines), INDENT and COMMENT are
        # noise between the header and a possible docstring: keep waiting.

    def __keywordseen(self, ttype, tstring, lineno):
        """A keyword was seen; it only counts if '(' follows directly."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): gather string literals until ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print >> sys.stderr, _(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record one occurrence of `msg' unless it is on the exclude list.

        `lineno' defaults to the line of the opening parenthesis remembered
        by __keywordseen.
        """
        if lineno is None:
            lineno = self.__lineno
        if not msg in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; the next string may be its docstring."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the header and all collected messages to `fp'."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = v.keys()
            keys.sort()
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = reverse.keys()
        rkeys.sort()
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                isdocstring = 0
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                if reduce(operator.__add__, v.values()):
                    isdocstring = 1
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = v.keys()
                v.sort()
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print >>fp, _(
                            '# File: %(filename)s, line: %(lineno)d') % d
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print >> fp, locline
                            locline = "#:" + s
                    if len(locline) > 2:
                        print >> fp, locline
                if isdocstring:
                    print >> fp, '#, docstring'
                print >> fp, 'msgid', normalize(k)
                print >> fp, 'msgstr ""\n'
495
496
497
498def main():
499    global default_keywords
500    try:
501        opts, args = getopt.getopt(
502            sys.argv[1:],
503            'ad:DEhk:Kno:p:S:Vvw:x:X:',
504            ['extract-all', 'default-domain=', 'escape', 'help',
505             'keyword=', 'no-default-keywords',
506             'add-location', 'no-location', 'output=', 'output-dir=',
507             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
508             'docstrings', 'no-docstrings',
509             ])
510    except getopt.error, msg:
511        usage(1, msg)
512
513    # for holding option values
514    class Options:
515        # constants
516        GNU = 1
517        SOLARIS = 2
518        # defaults
519        extractall = 0 # FIXME: currently this option has no effect at all.
520        escape = 0
521        keywords = []
522        outpath = ''
523        outfile = 'messages.pot'
524        writelocations = 1
525        locationstyle = GNU
526        verbose = 0
527        width = 78
528        excludefilename = ''
529        docstrings = 0
530        nodocstrings = {}
531
532    options = Options()
533    locations = {'gnu' : options.GNU,
534                 'solaris' : options.SOLARIS,
535                 }
536
537    # parse options
538    for opt, arg in opts:
539        if opt in ('-h', '--help'):
540            usage(0)
541        elif opt in ('-a', '--extract-all'):
542            options.extractall = 1
543        elif opt in ('-d', '--default-domain'):
544            options.outfile = arg + '.pot'
545        elif opt in ('-E', '--escape'):
546            options.escape = 1
547        elif opt in ('-D', '--docstrings'):
548            options.docstrings = 1
549        elif opt in ('-k', '--keyword'):
550            options.keywords.append(arg)
551        elif opt in ('-K', '--no-default-keywords'):
552            default_keywords = []
553        elif opt in ('-n', '--add-location'):
554            options.writelocations = 1
555        elif opt in ('--no-location',):
556            options.writelocations = 0
557        elif opt in ('-S', '--style'):
558            options.locationstyle = locations.get(arg.lower())
559            if options.locationstyle is None:
560                usage(1, _('Invalid value for --style: %s') % arg)
561        elif opt in ('-o', '--output'):
562            options.outfile = arg
563        elif opt in ('-p', '--output-dir'):
564            options.outpath = arg
565        elif opt in ('-v', '--verbose'):
566            options.verbose = 1
567        elif opt in ('-V', '--version'):
568            print _('pygettext.py (xgettext for Python) %s') % __version__
569            sys.exit(0)
570        elif opt in ('-w', '--width'):
571            try:
572                options.width = int(arg)
573            except ValueError:
574                usage(1, _('--width argument must be an integer: %s') % arg)
575        elif opt in ('-x', '--exclude-file'):
576            options.excludefilename = arg
577        elif opt in ('-X', '--no-docstrings'):
578            fp = open(arg)
579            try:
580                while 1:
581                    line = fp.readline()
582                    if not line:
583                        break
584                    options.nodocstrings[line[:-1]] = 1
585            finally:
586                fp.close()
587
588    # calculate escapes
589    make_escapes(not options.escape)
590
591    # calculate all keywords
592    options.keywords.extend(default_keywords)
593
594    # initialize list of strings to exclude
595    if options.excludefilename:
596        try:
597            fp = open(options.excludefilename)
598            options.toexclude = fp.readlines()
599            fp.close()
600        except IOError:
601            print >> sys.stderr, _(
602                "Can't read --exclude-file: %s") % options.excludefilename
603            sys.exit(1)
604    else:
605        options.toexclude = []
606
607    # resolve args to module lists
608    expanded = []
609    for arg in args:
610        if arg == '-':
611            expanded.append(arg)
612        else:
613            expanded.extend(getFilesForName(arg))
614    args = expanded
615
616    # slurp through all the files
617    eater = TokenEater(options)
618    for filename in args:
619        if filename == '-':
620            if options.verbose:
621                print _('Reading standard input')
622            fp = sys.stdin
623            closep = 0
624        else:
625            if options.verbose:
626                print _('Working on %s') % filename
627            fp = open(filename)
628            closep = 1
629        try:
630            eater.set_filename(filename)
631            try:
632                tokenize.tokenize(fp.readline, eater)
633            except tokenize.TokenError, e:
634                print >> sys.stderr, '%s: %s, line %d, column %d' % (
635                    e[0], filename, e[1][0], e[1][1])
636        finally:
637            if closep:
638                fp.close()
639
640    # write the output
641    if options.outfile == '-':
642        fp = sys.stdout
643        closep = 0
644    else:
645        if options.outpath:
646            options.outfile = os.path.join(options.outpath, options.outfile)
647        fp = open(options.outfile, 'w')
648        closep = 1
649    try:
650        eater.write(fp)
651    finally:
652        if closep:
653            fp.close()
654
655
# Script entry point.  The _() calls after main() discard their results;
# per the comments they are extra test strings (presumably input for running
# the extractor over this very file -- confirm before removing them).
if __name__ == '__main__':
    main()
    # some more test strings
    _(u'a unicode string')
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')
663