1#! /usr/bin/env python
2# -*- coding: iso-8859-1 -*-
3# Originally written by Barry Warsaw <barry@zope.com>
4#
5# Minimally patched to make it even more xgettext compatible
6# by Peter Funk <pf@artcom-gmbh.de>
7#
# 2002-11-22 Jürgen Hermann <jh@web.de>
9# Added checks that _() only contains string literals, and
10# command line args are resolved to module lists, i.e. you
11# can now pass a filename, a module or package name, or a
12# directory (including globbing chars, important for Win32).
13# Made docstring fit in 80 chars wide displays using pydoc.
14#
15
# for selftesting: use fintl.gettext as the translation function when the
# fintl module can be imported; otherwise _() just returns its argument, so
# strings are marked but not translated.
try:
    import fintl
except ImportError:
    def _(s):
        return s
else:
    _ = fintl.gettext
22
# The help text is assigned to __doc__ explicitly (rather than written as a
# plain module docstring) so that it can be passed through _().  usage()
# interpolates it with globals(), which fills in %(DEFAULTKEYWORDS)s.
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well.  Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings.  These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above).  Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source.  These lines appear before
        each msgid.  The style of comments is controlled by the -S/--style
        option.  This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename.  If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments.  Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive.  GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files.  Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted.  This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")
157
158import os
159import imp
160import sys
161import glob
162import time
163import getopt
164import token
165import tokenize
166import operator
167
# Version string printed by -V/--version and substituted into the
# "Generated-By" line of the .pot header.
__version__ = '1.5'

# Keywords treated as translation markers unless -K/--no-default-keywords is
# given; DEFAULTKEYWORDS is the comma-separated form shown in the usage text.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments back together.
EMPTYSTRING = ''
174
175
176
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# The %(time)s and %(version)s placeholders are filled in by
# TokenEater.write(); the doubled backslashes leave a literal "\n" escape in
# the generated header lines.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
197
198
def usage(code, msg=''):
    """Print the help text to stderr and exit.

    The module's __doc__ is interpolated with globals() before printing.
    If `msg` is true it is printed on stderr after the help text.  The
    process then exits with status `code`.
    """
    help_text = __doc__ % globals()
    print >> sys.stderr, help_text
    if msg:
        print >> sys.stderr, msg
    sys.exit(code)
204
205
206
# Lookup table indexed by character code (0..255) giving the text written to
# the .pot file for that character.  Filled in by make_escapes().
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the global `escapes` table.

    pass_iso8859 -- if true, characters whose code modulo 128 lies in the
        printable range 32..126 are passed through unchanged, so e.g.
        'msgid "Höhe"' does not become 'msgid "H\\366he"'.  If false, every
        character outside 32..126 is written as a three-digit octal escape.

    Backslash, tab, carriage return, newline and double quote always get
    their C-style escape.  The table is replaced in place, so calling this
    function more than once rebuilds it instead of appending a second copy
    behind the first.
    """
    global escapes
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Slice assignment keeps the same list object alive, so existing
    # references to `escapes` see the new contents.
    escapes[:] = table
228
229
def escape(s):
    """Return `s` with each character replaced by its `escapes` table entry.

    The table is indexed by ord() of the character, so make_escapes() must
    have filled it in before this is called.
    """
    global escapes
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
236
237
def safe_eval(s):
    """Evaluate the source text `s` of a string literal and return its value.

    The expression is evaluated with an empty builtins mapping and empty
    locals, so it cannot reach any builtin function by name.
    """
    # unwrap quotes, safely
    restricted_globals = {'__builtins__': {}}
    return eval(s, restricted_globals, {})
241
242
def normalize(s):
    """Convert a Python string value into quoted .po-file syntax.

    A string without newlines becomes one double-quoted, escaped string.
    A string with newlines becomes an empty "" line followed by one quoted
    string per source line, each but the last ending in an escaped \\n.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The text ended with a newline: drop the empty tail and put the
        # newline back on the real last line so it is escaped with it.
        pieces.pop()
        pieces[-1] += '\n'
    escaped = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
258
259
def containsAny(str, set):
    """Return true if at least one of the chars in 'set' occurs in 'str'."""
    hits = [ch for ch in set if ch in str]
    return len(hits) > 0
263
264
def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName(); visitor callback for os.path.walk().

    list    -- result list; paths of the Python source files found in
               `dirname` are appended to it
    dirname -- directory currently being visited
    names   -- entries of `dirname`; modified in place to drop 'CVS'
    """
    global _py_ext
    # Look up the extension for python source files once and cache it in a
    # module global.
    if '_py_ext' not in globals():
        _py_ext = [triple[0] for triple in imp.get_suffixes()
                   if triple[2] == imp.PY_SOURCE][0]

    # don't recurse into CVS directories
    if 'CVS' in names:
        names.remove('CVS')

    # add all *.py files to list
    list.extend(
        [os.path.join(dirname, entry) for entry in names
         if os.path.splitext(entry)[1] == _py_ext]
        )
282
283
def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a directory for
    a package. Return None if the name is not found, or is a builtin or
    extension module.

    `pathlist` is handed to imp.find_module() as the search path; it is set
    to the parent package's directory when resolving dotted names.
    """
    # split off top-most name
    head_and_tail = dotted_name.split('.', 1)

    if len(head_and_tail) == 1:
        # plain name
        try:
            handle, location, info = imp.find_module(dotted_name, pathlist)
        except ImportError:
            return None
        if handle:
            handle.close()
        if info[2] in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
            return location
        return None

    # we have a dotted path, locate the top-level package first
    package, remainder = head_and_tail
    try:
        handle, location, info = imp.find_module(package, pathlist)
    except ImportError:
        return None
    if handle:
        handle.close()

    # only a package can contain the remaining name parts
    if info[2] != imp.PKG_DIRECTORY:
        return None
    return _get_modpkg_path(remainder, [location])
321
322
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    A name that does not exist on disk is first tried as a glob pattern
    (if it contains any of the characters *?[]) and then as a dotted
    module or package name.  An empty list is returned when nothing
    matches.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            matches = []
            for hit in glob.glob(name):
                matches.extend(getFilesForName(hit))
            return matches

        # try to find module or package
        name = _get_modpkg_path(name)
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        pyfiles = []
        os.path.walk(name, _visit_pyfiles, pyfiles)
        return pyfiles

    if os.path.exists(name):
        # a single file
        return [name]

    return []
351
352
class TokenEater:
    """Token consumer that collects translatable strings.

    An instance is used as the token callback of tokenize.tokenize().  It is
    a small state machine: the current state is a bound method stored in
    self.__state and every token is dispatched to it.  Collected messages
    are kept in a dictionary mapping the message text to a dictionary of
    {(filename, lineno): isdocstring}.  write() emits them as a .pot file.

    The `options` object must provide the attributes read here: docstrings,
    nodocstrings, keywords, toexclude, writelocations, locationstyle, width
    and the GNU / SOLARIS constants.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        # nesting depth of ( [ { while scanning a class/def header
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        """Dispatch one token to the current state; stup[0] is its line."""
        # dispatch
##        import token
##        print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
##              'tstring:', tstring
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for docstring positions and keyword names."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL):
                    # still before the first real token of the module
                    return
                # The first real token is not a string, so there is no
                # module docstring.  Fall through: this very token may be
                # 'class', 'def' or a keyword and must not be swallowed.
                self.__freshmodule = 0
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__enclosurecount = 0
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """Inside a class/def header: wait for the colon that ends it."""
        # Skip over any enclosure pairs until we see the colon.  A colon
        # inside brackets (dict display, slice or lambda in a default value)
        # does not end the header.
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                self.__state = self.__suitedocstring
            elif tstring in ('(', '[', '{'):
                self.__enclosurecount = self.__enclosurecount + 1
            elif tstring in (')', ']', '}'):
                self.__enclosurecount = self.__enclosurecount - 1

    def __suitedocstring(self, ttype, tstring, lineno):
        """Just after a class/def header: record the docstring, if any."""
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """A keyword name was seen: it only counts if '(' follows."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): gather string literals until ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print >> sys.stderr, _(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record `msg` at (current file, lineno) unless it is excluded.

        `lineno` defaults to the line of the opening parenthesis of the
        keyword call currently being processed.
        """
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; re-arms module docstring detection."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the header and all collected messages to file object `fp`."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = v.keys()
            keys.sort()
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = reverse.keys()
        rkeys.sort()
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = 0
                for flag in v.values():
                    if flag:
                        isdocstring = 1
                        break
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = v.keys()
                v.sort()
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print >>fp, _(
                            '# File: %(filename)s, line: %(lineno)d') % d
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceeds 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print >> fp, locline
                            locline = "#:" + s
                    if len(locline) > 2:
                        print >> fp, locline
                if isdocstring:
                    print >> fp, '#, docstring'
                print >> fp, 'msgid', normalize(k)
                print >> fp, 'msgstr ""\n'
502
503
504
505def main():
506    global default_keywords
507    try:
508        opts, args = getopt.getopt(
509            sys.argv[1:],
510            'ad:DEhk:Kno:p:S:Vvw:x:X:',
511            ['extract-all', 'default-domain=', 'escape', 'help',
512             'keyword=', 'no-default-keywords',
513             'add-location', 'no-location', 'output=', 'output-dir=',
514             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
515             'docstrings', 'no-docstrings',
516             ])
517    except getopt.error, msg:
518        usage(1, msg)
519
520    # for holding option values
521    class Options:
522        # constants
523        GNU = 1
524        SOLARIS = 2
525        # defaults
526        extractall = 0 # FIXME: currently this option has no effect at all.
527        escape = 0
528        keywords = []
529        outpath = ''
530        outfile = 'messages.pot'
531        writelocations = 1
532        locationstyle = GNU
533        verbose = 0
534        width = 78
535        excludefilename = ''
536        docstrings = 0
537        nodocstrings = {}
538
539    options = Options()
540    locations = {'gnu' : options.GNU,
541                 'solaris' : options.SOLARIS,
542                 }
543
544    # parse options
545    for opt, arg in opts:
546        if opt in ('-h', '--help'):
547            usage(0)
548        elif opt in ('-a', '--extract-all'):
549            options.extractall = 1
550        elif opt in ('-d', '--default-domain'):
551            options.outfile = arg + '.pot'
552        elif opt in ('-E', '--escape'):
553            options.escape = 1
554        elif opt in ('-D', '--docstrings'):
555            options.docstrings = 1
556        elif opt in ('-k', '--keyword'):
557            options.keywords.append(arg)
558        elif opt in ('-K', '--no-default-keywords'):
559            default_keywords = []
560        elif opt in ('-n', '--add-location'):
561            options.writelocations = 1
562        elif opt in ('--no-location',):
563            options.writelocations = 0
564        elif opt in ('-S', '--style'):
565            options.locationstyle = locations.get(arg.lower())
566            if options.locationstyle is None:
567                usage(1, _('Invalid value for --style: %s') % arg)
568        elif opt in ('-o', '--output'):
569            options.outfile = arg
570        elif opt in ('-p', '--output-dir'):
571            options.outpath = arg
572        elif opt in ('-v', '--verbose'):
573            options.verbose = 1
574        elif opt in ('-V', '--version'):
575            print _('pygettext.py (xgettext for Python) %s') % __version__
576            sys.exit(0)
577        elif opt in ('-w', '--width'):
578            try:
579                options.width = int(arg)
580            except ValueError:
581                usage(1, _('--width argument must be an integer: %s') % arg)
582        elif opt in ('-x', '--exclude-file'):
583            options.excludefilename = arg
584        elif opt in ('-X', '--no-docstrings'):
585            fp = open(arg)
586            try:
587                while 1:
588                    line = fp.readline()
589                    if not line:
590                        break
591                    options.nodocstrings[line[:-1]] = 1
592            finally:
593                fp.close()
594
595    # calculate escapes
596    make_escapes(options.escape)
597
598    # calculate all keywords
599    options.keywords.extend(default_keywords)
600
601    # initialize list of strings to exclude
602    if options.excludefilename:
603        try:
604            fp = open(options.excludefilename)
605            options.toexclude = fp.readlines()
606            fp.close()
607        except IOError:
608            print >> sys.stderr, _(
609                "Can't read --exclude-file: %s") % options.excludefilename
610            sys.exit(1)
611    else:
612        options.toexclude = []
613
614    # resolve args to module lists
615    expanded = []
616    for arg in args:
617        if arg == '-':
618            expanded.append(arg)
619        else:
620            expanded.extend(getFilesForName(arg))
621    args = expanded
622
623    # slurp through all the files
624    eater = TokenEater(options)
625    for filename in args:
626        if filename == '-':
627            if options.verbose:
628                print _('Reading standard input')
629            fp = sys.stdin
630            closep = 0
631        else:
632            if options.verbose:
633                print _('Working on %s') % filename
634            fp = open(filename)
635            closep = 1
636        try:
637            eater.set_filename(filename)
638            try:
639                tokenize.tokenize(fp.readline, eater)
640            except tokenize.TokenError, e:
641                print >> sys.stderr, '%s: %s, line %d, column %d' % (
642                    e[0], filename, e[1][0], e[1][1])
643        finally:
644            if closep:
645                fp.close()
646
647    # write the output
648    if options.outfile == '-':
649        fp = sys.stdout
650        closep = 0
651    else:
652        if options.outpath:
653            options.outfile = os.path.join(options.outpath, options.outfile)
654        fp = open(options.outfile, 'w')
655        closep = 1
656    try:
657        eater.write(fp)
658    finally:
659        if closep:
660            fp.close()
661
662
# Run the extractor when executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # NOTE(review): the bare _() calls below look like sample input for
    # running the extractor over this file itself; their return values are
    # discarded -- confirm before removing.
    _(u'a unicode string')
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    # adjacent literals inside a single _() call
    _('more' 'than' 'one' 'string')
670