1#! @PYTHON@
2# Originally written by Barry Warsaw <barry@zope.com>
3#
4# Minimally patched to make it even more xgettext compatible
5# by Peter Funk <pf@artcom-gmbh.de>
6
7"""pygettext -- Python equivalent of xgettext(1)
8
9Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
10internationalization of C programs.  Most of these tools are independent of
11the programming language and can be used from within Python programs.  Martin
12von Loewis' work[1] helps considerably in this regard.
13
14There's one problem though; xgettext is the program that scans source code
15looking for message strings, but it groks only C (or C++).  Python introduces
16a few wrinkles, such as dual quoting characters, triple quoted strings, and
17raw strings.  xgettext understands none of this.
18
19Enter pygettext, which uses Python's standard tokenize module to scan Python
20source code, generating .pot files identical to what GNU xgettext[2] generates
21for C and C++ code.  From there, the standard GNU tools can be used.
22
23A word about marking Python strings as candidates for translation.  GNU
24xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
25gettext_noop.  But those can be a lot of text to include all over your code.
26C and C++ have a trick: they use the C preprocessor.  Most internationalized C
27source includes a #define for gettext() to _() so that what has to be written
28in the source is much less.  Thus these are both translatable strings:
29
30    gettext("Translatable String")
31    _("Translatable String")
32
33Python of course has no preprocessor so this doesn't work so well.  Thus,
34pygettext searches only for _() by default, but see the -k/--keyword flag
35below for how to augment this.
36
37 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
38 [2] http://www.gnu.org/software/gettext/gettext.html
39
40NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
wherever possible.  However, some options are still missing or are not fully
42implemented.  Also, xgettext's use of command line switches with option
43arguments is broken, and in these cases, pygettext just defines additional
44switches.
45
46Usage: pygettext [options] inputfile ...
47
48Options:
49
50    -a
51    --extract-all
52        Extract all strings.
53
54    -d name
55    --default-domain=name
56        Rename the default output file from messages.pot to name.pot.
57
58    -E
59    --escape
60        Replace non-ASCII characters with octal escape sequences.
61
62    -D
63    --docstrings
64        Extract module, class, method, and function docstrings.  These do not
65        need to be wrapped in _() markers, and in fact cannot be for Python to
66        consider them docstrings. (See also the -X option).
67
68    -h
69    --help
70        Print this help message and exit.
71
72    -k word
73    --keyword=word
74        Keywords to look for in addition to the default set, which are:
75        %(DEFAULTKEYWORDS)s
76
77        You can have multiple -k flags on the command line.
78
79    -K
80    --no-default-keywords
81        Disable the default set of keywords (see above).  Any keywords
82        explicitly added with the -k/--keyword option are still recognized.
83
84    --no-location
85        Do not write filename/lineno location comments.
86
87    -n
88    --add-location
89        Write filename/lineno location comments indicating where each
90        extracted string is found in the source.  These lines appear before
91        each msgid.  The style of comments is controlled by the -S/--style
92        option.  This is the default.
93
94    -o filename
95    --output=filename
96        Rename the default output file from messages.pot to filename.  If
97        filename is `-' then the output is sent to standard out.
98
99    -p dir
100    --output-dir=dir
101        Output files will be placed in directory dir.
102
103    -S stylename
104    --style stylename
105        Specify which style to use for location comments.  Two styles are
106        supported:
107
108        Solaris  # File: filename, line: line-number
109        GNU      #: filename:line
110
111        The style name is case insensitive.  GNU style is the default.
112
113    -v
114    --verbose
115        Print the names of the files being processed.
116
117    -V
118    --version
119        Print the version of pygettext and exit.
120
121    -w columns
122    --width=columns
123        Set width of output to columns.
124
125    -x filename
126    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
128        extracted from the input files.  Each string to be excluded must
129        appear on a line by itself in the file.
130
131    -X filename
132    --no-docstrings=filename
133        Specify a file that contains a list of files (one per line) that
134        should not have their docstrings extracted.  This is only useful in
135        conjunction with the -D option above.
136
137If `inputfile' is -, standard input is read.
138"""
139
140import os
141import sys
142import time
143import getopt
144import tokenize
145import operator
146
# for selftesting
# If the optional `fintl` module is importable, its gettext() is used as the
# `_` translation marker; otherwise `_` falls back to an identity function so
# that marked strings are returned unchanged.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    def _(s): return s

# Reported by -V/--version and written into the "Generated-By" header line.
__version__ = '1.4'

# Keywords searched for unless -K/--no-default-keywords is given.
default_keywords = ['_']
# Comma-separated form, interpolated into the usage text via %(DEFAULTKEYWORDS)s.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used with .join() to concatenate string fragments.
EMPTYSTRING = ''
160
161
162
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
#
# This is a %-format template: TokenEater.write() fills in %(time)s and
# %(version)s.  The doubled backslashes produce a literal backslash-n inside
# the quoted header lines of the generated file.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
183
184
def usage(code, msg=''):
    """Print the module help text (and optionally `msg`), then exit.

    code -- process exit status; a non-zero value sends the output to
            stderr, zero sends it to stdout.
    msg  -- optional extra message printed after the help text.

    The help text is the module docstring, passed through _() and
    interpolated with the module globals.  Always ends in sys.exit(code).
    """
    # Default to stdout; errors (non-zero exit codes) go to stderr instead.
    stream = sys.stdout
    if code:
        stream = sys.stderr
    helptext = _(__doc__) % globals()
    print >> stream, helptext
    if msg:
        print >> stream, msg
    sys.exit(code)
194
195
196
# Lookup table indexed by character ordinal (0..255); each entry is the text
# that replaces that character in .po output.  Filled in by make_escapes().
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the module-level `escapes` table.

    pass_iso8859 -- when true, characters whose low seven bits fall in the
                    printable range 32..126 are passed through unchanged
                    (this includes ordinals 160..254); when false, every
                    character outside 32..126 becomes a three-digit octal
                    escape.

    Backslash, tab, carriage return, newline and the double quote always get
    their C-style escape.  The table is rebuilt from scratch on every call,
    so calling this function again replaces the previous table rather than
    appending a second one behind it.
    """
    global escapes
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "H[o-umlaut]he"' would not result in 'msgid "H\366he"'.
        # Otherwise we escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Replace the contents in place so that any other reference to the list
    # object sees the new table too.
    escapes[:] = table
218
219
def escape(s):
    """Return `s` with every character replaced by its `escapes` entry.

    Each character is looked up by ordinal in the module-level `escapes`
    table (built by make_escapes()) and the replacements are concatenated.
    """
    global escapes
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
226
227
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    Used to unwrap the quotes of STRING tokens.  The expression is evaluated
    with an empty `__builtins__` mapping and an empty local namespace, so no
    built-in names are reachable from it.

    NOTE(review): this is still eval(); an empty builtins dict is not a real
    sandbox, so only feed it token text from sources you are willing to run.
    """
    # unwrap quotes, safely
    restricted_globals = {'__builtins__': {}}
    restricted_locals = {}
    return eval(s, restricted_globals, restricted_locals)
231
232
def normalize(s):
    """Render `s` as a quoted, C-style string suitable for a .po file.

    A string without newlines becomes a single double-quoted, escaped line.
    A string containing newlines becomes an empty "" line followed by one
    quoted line per source line, each (except possibly the last) ending in
    an escaped newline.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The string ended with a newline: drop the empty tail and put the
        # newline back on the real last line so it is escaped with it.
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'
    quoted = [escape(piece) for piece in pieces]
    separator = '\\n"\n"'
    return '""\n"' + separator.join(quoted) + '"'
248
249
250
class TokenEater:
    """Token consumer that collects translatable strings from Python source.

    An instance is callable with the (type, string, start, end, line)
    arguments that tokenize.tokenize() hands to its token-eater callback.
    It is a small state machine: the current state is a bound method stored
    in self.__state, and each token is dispatched to it.

    Collected messages are kept in a dictionary mapping the message text to
    a dictionary of {(filename, lineno): isdocstring}.  write() emits them
    as a .pot file.

    `options` must provide the attributes read here: docstrings,
    nodocstrings, keywords, toexclude, writelocations, locationstyle,
    SOLARIS, GNU and width.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        # Nesting depth of (), [] and {} while scanning a class/def header.
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state; only the start row of the token is
        # needed by the state handlers.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        # Idle state: look for a module docstring, a class/def header (when
        # docstring extraction is on) or one of the marker keywords.
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL):
                    # still before the first significant token
                    return
                # The module has no docstring.  Do not swallow this token:
                # fall through so that a leading class/def or keyword is
                # still recognized.
                self.__freshmodule = 0
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__enclosurecount = 0
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # Skip the class/def header until the colon that ends it.  Colons
        # nested inside brackets (default-argument dicts, slices, lambdas)
        # belong to the header and must not end it.
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                self.__state = self.__suitedocstring
            elif tstring in ('(', '[', '{'):
                self.__enclosurecount = self.__enclosurecount + 1
            elif tstring in (')', ']', '}'):
                self.__enclosurecount = self.__enclosurecount - 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        # A keyword only counts when it is immediately followed by '('.
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we seen anything else?

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # Record one occurrence of `msg` unless it is on the exclude list.
        # When no line number is given, the line of the opening '(' is used.
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file: remember its name and expect a module
        docstring again."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the header and all collected messages to the file `fp`.

        Entries are ordered by their sorted list of (filename, lineno)
        locations.  Location comments are written according to the
        writelocations / locationstyle / width options, and entries that
        came from a docstring are flagged with '#, docstring'.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = v.keys()
            keys.sort()
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = reverse.keys()
        rkeys.sort()
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                isdocstring = 0
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                if reduce(operator.__add__, v.values()):
                    isdocstring = 1
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                locations = v.keys()
                locations.sort()
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        print >>fp, _(
                            '# File: %(filename)s, line: %(lineno)d') % d
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print >> fp, locline
                            locline = "#:" + s
                    if len(locline) > 2:
                        print >> fp, locline
                if isdocstring:
                    print >> fp, '#, docstring'
                print >> fp, 'msgid', normalize(k)
                print >> fp, 'msgstr ""\n'
390
391
392
393def main():
394    global default_keywords
395    try:
396        opts, args = getopt.getopt(
397            sys.argv[1:],
398            'ad:DEhk:Kno:p:S:Vvw:x:X:',
399            ['extract-all', 'default-domain=', 'escape', 'help',
400             'keyword=', 'no-default-keywords',
401             'add-location', 'no-location', 'output=', 'output-dir=',
402             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
403             'docstrings', 'no-docstrings',
404             ])
405    except getopt.error, msg:
406        usage(1, msg)
407
408    # for holding option values
409    class Options:
410        # constants
411        GNU = 1
412        SOLARIS = 2
413        # defaults
414        extractall = 0 # FIXME: currently this option has no effect at all.
415        escape = 0
416        keywords = []
417        outpath = ''
418        outfile = 'messages.pot'
419        writelocations = 1
420        locationstyle = GNU
421        verbose = 0
422        width = 78
423        excludefilename = ''
424        docstrings = 0
425        nodocstrings = {}
426
427    options = Options()
428    locations = {'gnu' : options.GNU,
429                 'solaris' : options.SOLARIS,
430                 }
431
432    # parse options
433    for opt, arg in opts:
434        if opt in ('-h', '--help'):
435            usage(0)
436        elif opt in ('-a', '--extract-all'):
437            options.extractall = 1
438        elif opt in ('-d', '--default-domain'):
439            options.outfile = arg + '.pot'
440        elif opt in ('-E', '--escape'):
441            options.escape = 1
442        elif opt in ('-D', '--docstrings'):
443            options.docstrings = 1
444        elif opt in ('-k', '--keyword'):
445            options.keywords.append(arg)
446        elif opt in ('-K', '--no-default-keywords'):
447            default_keywords = []
448        elif opt in ('-n', '--add-location'):
449            options.writelocations = 1
450        elif opt in ('--no-location',):
451            options.writelocations = 0
452        elif opt in ('-S', '--style'):
453            options.locationstyle = locations.get(arg.lower())
454            if options.locationstyle is None:
455                usage(1, _('Invalid value for --style: %s') % arg)
456        elif opt in ('-o', '--output'):
457            options.outfile = arg
458        elif opt in ('-p', '--output-dir'):
459            options.outpath = arg
460        elif opt in ('-v', '--verbose'):
461            options.verbose = 1
462        elif opt in ('-V', '--version'):
463            print _('pygettext.py (xgettext for Python) %s') % __version__
464            sys.exit(0)
465        elif opt in ('-w', '--width'):
466            try:
467                options.width = int(arg)
468            except ValueError:
469                usage(1, _('--width argument must be an integer: %s') % arg)
470        elif opt in ('-x', '--exclude-file'):
471            options.excludefilename = arg
472        elif opt in ('-X', '--no-docstrings'):
473            fp = open(arg)
474            try:
475                while 1:
476                    line = fp.readline()
477                    if not line:
478                        break
479                    options.nodocstrings[line[:-1]] = 1
480            finally:
481                fp.close()
482
483    # calculate escapes
484    make_escapes(options.escape)
485
486    # calculate all keywords
487    options.keywords.extend(default_keywords)
488
489    # initialize list of strings to exclude
490    if options.excludefilename:
491        try:
492            fp = open(options.excludefilename)
493            options.toexclude = fp.readlines()
494            fp.close()
495        except IOError:
496            print >> sys.stderr, _(
497                "Can't read --exclude-file: %s") % options.excludefilename
498            sys.exit(1)
499    else:
500        options.toexclude = []
501
502    # slurp through all the files
503    eater = TokenEater(options)
504    for filename in args:
505        if filename == '-':
506            if options.verbose:
507                print _('Reading standard input')
508            fp = sys.stdin
509            closep = 0
510        else:
511            if options.verbose:
512                print _('Working on %s') % filename
513            fp = open(filename)
514            closep = 1
515        try:
516            eater.set_filename(filename)
517            try:
518                tokenize.tokenize(fp.readline, eater)
519            except tokenize.TokenError, e:
520                print >> sys.stderr, '%s: %s, line %d, column %d' % (
521                    e[0], filename, e[1][0], e[1][1])
522        finally:
523            if closep:
524                fp.close()
525
526    # write the output
527    if options.outfile == '-':
528        fp = sys.stdout
529        closep = 0
530    else:
531        if options.outpath:
532            options.outfile = os.path.join(options.outpath, options.outfile)
533        fp = open(options.outfile, 'w')
534        closep = 1
535    try:
536        eater.write(fp)
537    finally:
538        if closep:
539            fp.close()
540
541
# Run the command-line tool only when executed as a script.
if __name__ == '__main__':
    main()
    # some more test strings
    # (a marked unicode literal, placed after main() has already run)
    _(u'a unicode string')
546