#! @PYTHON@
# Originally written by Barry Warsaw <barry@zope.com>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>

"""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs. Martin
von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python introduces
a few wrinkles, such as dual quoting characters, triple quoted strings, and
raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan Python
source code, generating .pot files identical to what GNU xgettext[2] generates
for C and C++ code. From there, the standard GNU tools can be used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
gettext_noop. But those can be a lot of text to include all over your code.
C and C++ have a trick: they use the C preprocessor. Most internationalized C
source includes a #define for gettext() to _() so that what has to be written
in the source is much less. Thus these are both translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
wherever possible. However some options are still missing or are not fully
implemented. Also, xgettext's use of command line switches with option
arguments is broken, and in these cases, pygettext just defines additional
switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do not
        need to be wrapped in _() markers, and in fact cannot be for Python to
        consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
"""

# NOTE: the code below deliberately sticks to constructs that are valid on
# both Python 2.6+ and Python 3 (no print statement, no "except E, v").

import os
import sys
import time
import getopt
import tokenize
import operator

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    def _(s): return s

__version__ = '1.4'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    """Print the module help text (and optionally `msg`), then exit.

    The text goes to stderr when `code` is non-zero and to stdout otherwise.
    The process always terminates via sys.exit(code).
    """
    if code:
        fd = sys.stderr
    else:
        fd = sys.stdout
    # The help text contains %(DEFAULTKEYWORDS)s, filled in from the module
    # globals.
    fd.write(_(__doc__) % globals() + '\n')
    if msg:
        # msg may be an exception instance (e.g. getopt.error), so format it
        # rather than concatenating.
        fd.write('%s\n' % (msg,))
    sys.exit(code)



# Escape table: index is the character code (0..255), value is the text that
# represents this character inside a .po string.  Filled by make_escapes().
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the global `escapes` table.

    pass_iso8859 -- when true, characters 128..255 whose low seven bits are
    printable are passed through unchanged; when false every character
    outside the printable ASCII range 32..126 becomes an octal escape.

    The table is rebuilt from scratch on every call and updated in place, so
    calling this function more than once is safe and the last call wins.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "H[o-umlaut]he"' would not result in 'msgid "H\366he"'.
        # Otherwise we escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    # Characters with a dedicated C-style escape sequence.
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # In-place update keeps the identity of the module-level list.
    escapes[:] = table


def escape(s):
    """Return `s` with every character replaced by its `escapes` entry.

    make_escapes() must have been called first.  Only character codes
    0..255 are covered by the table.
    """
    return EMPTYSTRING.join([escapes[ord(c)] for c in s])


def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    NOTE(review): this still relies on eval().  The builtins are removed
    from the evaluation namespace and the input is a STRING token produced
    by the tokenizer, but treat the scanned source files as trusted input.
    """
    # unwrap quotes, safely
    return eval(s, {'__builtins__':{}}, {})


def normalize(s):
    """Convert a Python string value into a quoted .po file string.

    A single-line value becomes one quoted string.  A multi-line value
    becomes an empty string followed by one quoted string per line, which is
    much closer to C style.
    """
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape(s) + '"'
    else:
        if not lines[-1]:
            # The value ended in a newline: drop the empty trailing piece
            # and put the newline back on the last real line.
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        lines = [escape(line) for line in lines]
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s



class TokenEater:
    """State machine that collects translatable strings from a token stream.

    An instance is called once per token with the 5-tuple produced by the
    tokenize module.  Collected messages are kept in a dictionary mapping
    the message text to a dictionary of (filename, lineno) -> isdocstring,
    and are written out as a .pot file by write().
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state; only the start row is needed.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        # Idle state: look for docstrings (if enabled) and keywords.
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    self.__freshmodule = 0
                return
            # class or function docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # ignore anything until we see the colon
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        # A keyword only counts when it is immediately followed by '('.
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we see anything else?

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # Record one occurrence of msg unless it is on the exclusion list.
        # lineno defaults to the line on which the current _( was opened.
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; its first string may be a module docstring."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the .pot header and every collected message to `fp`."""
        options = self.__options
        timestamp = time.ctime(time.time())
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        fp.write(pot_header % {'time': timestamp, 'version': __version__})
        fp.write('\n')
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v)
            reverse.setdefault(tuple(keys), []).append((k, v))
        for rkey in sorted(reverse):
            rentries = reverse[rkey]
            # Messages are unique dictionary keys, so ordering by the
            # message text alone is a total order.
            rentries.sort(key=operator.itemgetter(0))
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = 0
                for flag in v.values():
                    if flag:
                        isdocstring = 1
                        break
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                locations = sorted(v)
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        fp.write(_(
                            '# File: %(filename)s, line: %(lineno)d') % d
                                 + '\n')
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            fp.write(locline + '\n')
                            locline = "#:" + s
                    if len(locline) > 2:
                        fp.write(locline + '\n')
                if isdocstring:
                    fp.write('#, docstring\n')
                fp.write('msgid ' + normalize(k) + '\n')
                fp.write('msgstr ""\n\n')



def main():
    """Command line entry point: parse sys.argv, scan inputs, write the .pot.

    Exits through usage()/sys.exit() on bad options, --help and --version.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # takes a file name, hence the trailing '='
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            fp = open(arg)
            try:
                for line in fp.readlines():
                    # Strip only the line terminator, so that a last line
                    # without a trailing newline keeps its final character.
                    options.nodocstrings[line.rstrip('\r\n')] = 1
            finally:
                fp.close()

    # calculate escapes: -E/--escape asks for octal escapes, i.e. for 8-bit
    # characters NOT to be passed through.
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                # One string per line; the line terminator is not part of
                # the string to exclude.
                options.toexclude = [line.rstrip('\r\n')
                                     for line in fp.readlines()]
            finally:
                fp.close()
        except IOError:
            sys.stderr.write(_(
                "Can't read --exclude-file: %s") % options.excludefilename
                             + '\n')
            sys.exit(1)
    else:
        options.toexclude = []

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                # generate_tokens() exists with the same meaning on Python 2
                # and Python 3; feed each token tuple to the eater.
                for token in tokenize.generate_tokens(fp.readline):
                    eater(*token)
            except tokenize.TokenError as e:
                sys.stderr.write('%s: %s, line %d, column %d\n' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]))
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    _(u'a unicode string')