1# -* coding: utf-8 -*- 2# 3# License: MIT (see LICENSE file provided) 4# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: 58 6""" 7**polib** allows you to manipulate, create, modify gettext files (pot, po and 8mo files). You can load existing files, iterate through it's entries, add, 9modify entries, comments or metadata, etc. or create new po files from scratch. 10 11**polib** provides a simple and pythonic API via the :func:`~polib.pofile` and 12:func:`~polib.mofile` convenience functions. 13""" 14 15__author__ = 'David Jean Louis <izimobil@gmail.com>' 16__version__ = '1.0.8' 17__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', 18 'default_encoding', 'escape', 'unescape', 'detect_encoding', ] 19 20import array 21import codecs 22import os 23import re 24import struct 25import sys 26import textwrap 27import binascii 28 29try: 30 import io 31except ImportError: 32 # replacement of io.open() for python < 2.6 33 # we use codecs instead 34 class io(object): 35 @staticmethod 36 def open(fpath, mode='r', encoding=None): 37 return codecs.open(fpath, mode, encoding) 38 39 40# the default encoding to use when encoding cannot be detected 41default_encoding = 'utf-8' 42 43# python 2/3 compatibility helpers {{{ 44 45 46if sys.version_info[:2] < (3, 0): 47 PY3 = False 48 text_type = unicode 49 50 def b(s): 51 return s 52 53 def u(s): 54 return unicode(s, "unicode_escape") 55 56else: 57 PY3 = True 58 text_type = str 59 60 def b(s): 61 return s.encode("latin-1") 62 63 def u(s): 64 return s 65# }}} 66# _pofile_or_mofile {{{ 67 68 69def _pofile_or_mofile(f, type, **kwargs): 70 """ 71 Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to 72 honor the DRY concept. 73 """ 74 # get the file encoding 75 enc = kwargs.get('encoding') 76 if enc is None: 77 enc = detect_encoding(f, type == 'mofile') 78 79 # parse the file 80 kls = type == 'pofile' and _POFileParser or _MOFileParser 81 parser = kls( 82 f, 83 encoding=enc, 84 check_for_duplicates=kwargs.get('check_for_duplicates', False), 85 klass=kwargs.get('klass') 86 ) 87 instance = parser.parse() 88 instance.wrapwidth = kwargs.get('wrapwidth', 78) 89 return instance 90# }}} 91# _is_file {{{ 92 93 94def _is_file(filename_or_contents): 95 """ 96 Safely returns the value of os.path.exists(filename_or_contents). 97 98 Arguments: 99 100 ``filename_or_contents`` 101 either a filename, or a string holding the contents of some file. 102 In the latter case, this function will always return False. 103 """ 104 try: 105 return os.path.exists(filename_or_contents) 106 except (ValueError, UnicodeEncodeError): 107 return False 108# }}} 109# function pofile() {{{ 110 111 112def pofile(pofile, **kwargs): 113 """ 114 Convenience function that parses the po or pot file ``pofile`` and returns 115 a :class:`~polib.POFile` instance. 116 117 Arguments: 118 119 ``pofile`` 120 string, full or relative path to the po/pot file or its content (data). 121 122 ``wrapwidth`` 123 integer, the wrap width, only useful when the ``-w`` option was passed 124 to xgettext (optional, default: ``78``). 125 126 ``encoding`` 127 string, the encoding to use (e.g. "utf-8") (default: ``None``, the 128 encoding will be auto-detected). 129 130 ``check_for_duplicates`` 131 whether to check for duplicate entries when adding entries to the 132 file (optional, default: ``False``). 133 134 ``klass`` 135 class which is used to instantiate the return value (optional, 136 default: ``None``, the return value with be a :class:`~polib.POFile` 137 instance). 138 """ 139 return _pofile_or_mofile(pofile, 'pofile', **kwargs) 140# }}} 141# function mofile() {{{ 142 143 144def mofile(mofile, **kwargs): 145 """ 146 Convenience function that parses the mo file ``mofile`` and returns a 147 :class:`~polib.MOFile` instance. 148 149 Arguments: 150 151 ``mofile`` 152 string, full or relative path to the mo file or its content (data). 153 154 ``wrapwidth`` 155 integer, the wrap width, only useful when the ``-w`` option was passed 156 to xgettext to generate the po file that was used to format the mo file 157 (optional, default: ``78``). 158 159 ``encoding`` 160 string, the encoding to use (e.g. "utf-8") (default: ``None``, the 161 encoding will be auto-detected). 162 163 ``check_for_duplicates`` 164 whether to check for duplicate entries when adding entries to the 165 file (optional, default: ``False``). 166 167 ``klass`` 168 class which is used to instantiate the return value (optional, 169 default: ``None``, the return value with be a :class:`~polib.POFile` 170 instance). 171 """ 172 return _pofile_or_mofile(mofile, 'mofile', **kwargs) 173# }}} 174# function detect_encoding() {{{ 175 176 177def detect_encoding(file, binary_mode=False): 178 """ 179 Try to detect the encoding used by the ``file``. The ``file`` argument can 180 be a PO or MO file path or a string containing the contents of the file. 181 If the encoding cannot be detected, the function will return the value of 182 ``default_encoding``. 183 184 Arguments: 185 186 ``file`` 187 string, full or relative path to the po/mo file or its content. 188 189 ``binary_mode`` 190 boolean, set this to True if ``file`` is a mo file. 191 """ 192 PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)' 193 rxt = re.compile(u(PATTERN)) 194 rxb = re.compile(b(PATTERN)) 195 196 def charset_exists(charset): 197 """Check whether ``charset`` is valid or not.""" 198 try: 199 codecs.lookup(charset) 200 except LookupError: 201 return False 202 return True 203 204 if not _is_file(file): 205 match = rxt.search(file) 206 if match: 207 enc = match.group(1).strip() 208 if charset_exists(enc): 209 return enc 210 else: 211 # For PY3, always treat as binary 212 if binary_mode or PY3: 213 mode = 'rb' 214 rx = rxb 215 else: 216 mode = 'r' 217 rx = rxt 218 f = open(file, mode) 219 for l in f.readlines(): 220 match = rx.search(l) 221 if match: 222 f.close() 223 enc = match.group(1).strip() 224 if not isinstance(enc, text_type): 225 enc = enc.decode('utf-8') 226 if charset_exists(enc): 227 return enc 228 f.close() 229 return default_encoding 230# }}} 231# function escape() {{{ 232 233 234def escape(st): 235 """ 236 Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in 237 the given string ``st`` and returns it. 238 """ 239 return st.replace('\\', r'\\')\ 240 .replace('\t', r'\t')\ 241 .replace('\r', r'\r')\ 242 .replace('\n', r'\n')\ 243 .replace('\"', r'\"') 244# }}} 245# function unescape() {{{ 246 247 248def unescape(st): 249 """ 250 Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in 251 the given string ``st`` and returns it. 252 """ 253 def unescape_repl(m): 254 m = m.group(1) 255 if m == 'n': 256 return '\n' 257 if m == 't': 258 return '\t' 259 if m == 'r': 260 return '\r' 261 if m == '\\': 262 return '\\' 263 return m # handles escaped double quote 264 return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st) 265# }}} 266# function natural_sort() {{{ 267 268 269def natural_sort(lst): 270 """ 271 Sort naturally the given list. 272 Credits: http://stackoverflow.com/a/4836734 273 """ 274 convert = lambda text: int(text) if text.isdigit() else text.lower() 275 alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 276 return sorted(lst, key = alphanum_key) 277# }}} 278# class _BaseFile {{{ 279 280 281class _BaseFile(list): 282 """ 283 Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile` 284 classes. This class should **not** be instantiated directly. 285 """ 286 287 def __init__(self, *args, **kwargs): 288 """ 289 Constructor, accepts the following keyword arguments: 290 291 ``pofile`` 292 string, the path to the po or mo file, or its content as a string. 293 294 ``wrapwidth`` 295 integer, the wrap width, only useful when the ``-w`` option was 296 passed to xgettext (optional, default: ``78``). 297 298 ``encoding`` 299 string, the encoding to use, defaults to ``default_encoding`` 300 global variable (optional). 301 302 ``check_for_duplicates`` 303 whether to check for duplicate entries when adding entries to the 304 file, (optional, default: ``False``). 305 """ 306 list.__init__(self) 307 # the opened file handle 308 pofile = kwargs.get('pofile', None) 309 if pofile and _is_file(pofile): 310 self.fpath = pofile 311 else: 312 self.fpath = kwargs.get('fpath') 313 # the width at which lines should be wrapped 314 self.wrapwidth = kwargs.get('wrapwidth', 78) 315 # the file encoding 316 self.encoding = kwargs.get('encoding', default_encoding) 317 # whether to check for duplicate entries or not 318 self.check_for_duplicates = kwargs.get('check_for_duplicates', False) 319 # header 320 self.header = '' 321 # both po and mo files have metadata 322 self.metadata = {} 323 self.metadata_is_fuzzy = 0 324 325 def __unicode__(self): 326 """ 327 Returns the unicode representation of the file. 328 """ 329 ret = [] 330 entries = [self.metadata_as_entry()] + \ 331 [e for e in self if not e.obsolete] 332 for entry in entries: 333 ret.append(entry.__unicode__(self.wrapwidth)) 334 for entry in self.obsolete_entries(): 335 ret.append(entry.__unicode__(self.wrapwidth)) 336 ret = u('\n').join(ret) 337 338 assert isinstance(ret, text_type) 339 #if type(ret) != text_type: 340 # return unicode(ret, self.encoding) 341 return ret 342 343 if PY3: 344 def __str__(self): 345 return self.__unicode__() 346 else: 347 def __str__(self): 348 """ 349 Returns the string representation of the file. 350 """ 351 return unicode(self).encode(self.encoding) 352 353 def __contains__(self, entry): 354 """ 355 Overridden ``list`` method to implement the membership test (in and 356 not in). 357 The method considers that an entry is in the file if it finds an entry 358 that has the same msgid (the test is **case sensitive**) and the same 359 msgctxt (or none for both entries). 360 361 Argument: 362 363 ``entry`` 364 an instance of :class:`~polib._BaseEntry`. 365 """ 366 return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \ 367 is not None 368 369 def __eq__(self, other): 370 return str(self) == str(other) 371 372 def append(self, entry): 373 """ 374 Overridden method to check for duplicates entries, if a user tries to 375 add an entry that is already in the file, the method will raise a 376 ``ValueError`` exception. 377 378 Argument: 379 380 ``entry`` 381 an instance of :class:`~polib._BaseEntry`. 382 """ 383 # check_for_duplicates may not be defined (yet) when unpickling. 384 # But if pickling, we never want to check for duplicates anyway. 385 if getattr(self, 'check_for_duplicates', False) and entry in self: 386 raise ValueError('Entry "%s" already exists' % entry.msgid) 387 super(_BaseFile, self).append(entry) 388 389 def insert(self, index, entry): 390 """ 391 Overridden method to check for duplicates entries, if a user tries to 392 add an entry that is already in the file, the method will raise a 393 ``ValueError`` exception. 394 395 Arguments: 396 397 ``index`` 398 index at which the entry should be inserted. 399 400 ``entry`` 401 an instance of :class:`~polib._BaseEntry`. 402 """ 403 if self.check_for_duplicates and entry in self: 404 raise ValueError('Entry "%s" already exists' % entry.msgid) 405 super(_BaseFile, self).insert(index, entry) 406 407 def metadata_as_entry(self): 408 """ 409 Returns the file metadata as a :class:`~polib.POFile` instance. 410 """ 411 e = POEntry(msgid='') 412 mdata = self.ordered_metadata() 413 if mdata: 414 strs = [] 415 for name, value in mdata: 416 # Strip whitespace off each line in a multi-line entry 417 strs.append('%s: %s' % (name, value)) 418 e.msgstr = '\n'.join(strs) + '\n' 419 if self.metadata_is_fuzzy: 420 e.flags.append('fuzzy') 421 return e 422 423 def save(self, fpath=None, repr_method='__unicode__'): 424 """ 425 Saves the po file to ``fpath``. 426 If it is an existing file and no ``fpath`` is provided, then the 427 existing file is rewritten with the modified data. 428 429 Keyword arguments: 430 431 ``fpath`` 432 string, full or relative path to the file. 433 434 ``repr_method`` 435 string, the method to use for output. 436 """ 437 if self.fpath is None and fpath is None: 438 raise IOError('You must provide a file path to save() method') 439 contents = getattr(self, repr_method)() 440 if fpath is None: 441 fpath = self.fpath 442 if repr_method == 'to_binary': 443 fhandle = open(fpath, 'wb') 444 else: 445 fhandle = io.open(fpath, 'w', encoding=self.encoding) 446 if not isinstance(contents, text_type): 447 contents = contents.decode(self.encoding) 448 fhandle.write(contents) 449 fhandle.close() 450 # set the file path if not set 451 if self.fpath is None and fpath: 452 self.fpath = fpath 453 454 def find(self, st, by='msgid', include_obsolete_entries=False, 455 msgctxt=False): 456 """ 457 Find the entry which msgid (or property identified by the ``by`` 458 argument) matches the string ``st``. 459 460 Keyword arguments: 461 462 ``st`` 463 string, the string to search for. 464 465 ``by`` 466 string, the property to use for comparison (default: ``msgid``). 467 468 ``include_obsolete_entries`` 469 boolean, whether to also search in entries that are obsolete. 470 471 ``msgctxt`` 472 string, allows specifying a specific message context for the 473 search. 474 """ 475 if include_obsolete_entries: 476 entries = self[:] 477 else: 478 entries = [e for e in self if not e.obsolete] 479 for e in entries: 480 if getattr(e, by) == st: 481 if msgctxt is not False and e.msgctxt != msgctxt: 482 continue 483 return e 484 return None 485 486 def ordered_metadata(self): 487 """ 488 Convenience method that returns an ordered version of the metadata 489 dictionary. The return value is list of tuples (metadata name, 490 metadata_value). 491 """ 492 # copy the dict first 493 metadata = self.metadata.copy() 494 data_order = [ 495 'Project-Id-Version', 496 'Report-Msgid-Bugs-To', 497 'POT-Creation-Date', 498 'PO-Revision-Date', 499 'Last-Translator', 500 'Language-Team', 501 'Language', 502 'MIME-Version', 503 'Content-Type', 504 'Content-Transfer-Encoding', 505 'Plural-Forms' 506 ] 507 ordered_data = [] 508 for data in data_order: 509 try: 510 value = metadata.pop(data) 511 ordered_data.append((data, value)) 512 except KeyError: 513 pass 514 # the rest of the metadata will be alphabetically ordered since there 515 # are no specs for this AFAIK 516 for data in natural_sort(metadata.keys()): 517 value = metadata[data] 518 ordered_data.append((data, value)) 519 return ordered_data 520 521 def to_binary(self): 522 """ 523 Return the binary representation of the file. 524 """ 525 offsets = [] 526 entries = self.translated_entries() 527 528 # the keys are sorted in the .mo file 529 def cmp(_self, other): 530 # msgfmt compares entries with msgctxt if it exists 531 self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid 532 other_msgid = other.msgctxt and other.msgctxt or other.msgid 533 if self_msgid > other_msgid: 534 return 1 535 elif self_msgid < other_msgid: 536 return -1 537 else: 538 return 0 539 # add metadata entry 540 entries.sort(key=lambda o: o.msgctxt or o.msgid) 541 mentry = self.metadata_as_entry() 542 #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip() 543 entries = [mentry] + entries 544 entries_len = len(entries) 545 ids, strs = b(''), b('') 546 for e in entries: 547 # For each string, we need size and file offset. Each string is 548 # NUL terminated; the NUL does not count into the size. 549 msgid = b('') 550 if e.msgctxt: 551 # Contexts are stored by storing the concatenation of the 552 # context, a <EOT> byte, and the original string 553 msgid = self._encode(e.msgctxt + '\4') 554 if e.msgid_plural: 555 msgstr = [] 556 for index in sorted(e.msgstr_plural.keys()): 557 msgstr.append(e.msgstr_plural[index]) 558 msgid += self._encode(e.msgid + '\0' + e.msgid_plural) 559 msgstr = self._encode('\0'.join(msgstr)) 560 else: 561 msgid += self._encode(e.msgid) 562 msgstr = self._encode(e.msgstr) 563 offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) 564 ids += msgid + b('\0') 565 strs += msgstr + b('\0') 566 567 # The header is 7 32-bit unsigned integers. 568 keystart = 7 * 4 + 16 * entries_len 569 # and the values start after the keys 570 valuestart = keystart + len(ids) 571 koffsets = [] 572 voffsets = [] 573 # The string table first has the list of keys, then the list of values. 574 # Each entry has first the size of the string, then the file offset. 575 for o1, l1, o2, l2 in offsets: 576 koffsets += [l1, o1 + keystart] 577 voffsets += [l2, o2 + valuestart] 578 offsets = koffsets + voffsets 579 580 output = struct.pack( 581 "Iiiiiii", 582 # Magic number 583 MOFile.MAGIC, 584 # Version 585 0, 586 # number of entries 587 entries_len, 588 # start of key index 589 7 * 4, 590 # start of value index 591 7 * 4 + entries_len * 8, 592 # size and offset of hash table, we don't use hash tables 593 0, keystart 594 595 ) 596 if PY3 and sys.version_info.minor > 1: # python 3.2 or superior 597 output += array.array("i", offsets).tobytes() 598 else: 599 output += array.array("i", offsets).tostring() 600 output += ids 601 output += strs 602 return output 603 604 def _encode(self, mixed): 605 """ 606 Encodes the given ``mixed`` argument with the file encoding if and 607 only if it's a unicode string and returns the encoded string. 608 """ 609 if isinstance(mixed, text_type): 610 mixed = mixed.encode(self.encoding) 611 return mixed 612# }}} 613# class POFile {{{ 614 615 616class POFile(_BaseFile): 617 """ 618 Po (or Pot) file reader/writer. 619 This class inherits the :class:`~polib._BaseFile` class and, by extension, 620 the python ``list`` type. 621 """ 622 623 def __unicode__(self): 624 """ 625 Returns the unicode representation of the po file. 626 """ 627 ret, headers = '', self.header.split('\n') 628 for header in headers: 629 if not len(header): 630 ret += "#\n" 631 elif header[:1] in [',', ':']: 632 ret += '#%s\n' % header 633 else: 634 ret += '# %s\n' % header 635 636 if not isinstance(ret, text_type): 637 ret = ret.decode(self.encoding) 638 639 return ret + _BaseFile.__unicode__(self) 640 641 def save_as_mofile(self, fpath): 642 """ 643 Saves the binary representation of the file to given ``fpath``. 644 645 Keyword argument: 646 647 ``fpath`` 648 string, full or relative path to the mo file. 649 """ 650 _BaseFile.save(self, fpath, 'to_binary') 651 652 def percent_translated(self): 653 """ 654 Convenience method that returns the percentage of translated 655 messages. 656 """ 657 total = len([e for e in self if not e.obsolete]) 658 if total == 0: 659 return 100 660 translated = len(self.translated_entries()) 661 return int(translated * 100 / float(total)) 662 663 def translated_entries(self): 664 """ 665 Convenience method that returns the list of translated entries. 666 """ 667 return [e for e in self if e.translated()] 668 669 def untranslated_entries(self): 670 """ 671 Convenience method that returns the list of untranslated entries. 672 """ 673 return [e for e in self if not e.translated() and not e.obsolete 674 and not 'fuzzy' in e.flags] 675 676 def fuzzy_entries(self): 677 """ 678 Convenience method that returns the list of fuzzy entries. 679 """ 680 return [e for e in self if 'fuzzy' in e.flags] 681 682 def obsolete_entries(self): 683 """ 684 Convenience method that returns the list of obsolete entries. 685 """ 686 return [e for e in self if e.obsolete] 687 688 def merge(self, refpot): 689 """ 690 Convenience method that merges the current pofile with the pot file 691 provided. It behaves exactly as the gettext msgmerge utility: 692 693 * comments of this file will be preserved, but extracted comments and 694 occurrences will be discarded; 695 * any translations or comments in the file will be discarded, however, 696 dot comments and file positions will be preserved; 697 * the fuzzy flags are preserved. 698 699 Keyword argument: 700 701 ``refpot`` 702 object POFile, the reference catalog. 703 """ 704 # Store entries in dict/set for faster access 705 self_entries = dict((entry.msgid, entry) for entry in self) 706 refpot_msgids = set(entry.msgid for entry in refpot) 707 # Merge entries that are in the refpot 708 for entry in refpot: 709 e = self_entries.get(entry.msgid) 710 if e is None: 711 e = POEntry() 712 self.append(e) 713 e.merge(entry) 714 # ok, now we must "obsolete" entries that are not in the refpot anymore 715 for entry in self: 716 if entry.msgid not in refpot_msgids: 717 entry.obsolete = True 718# }}} 719# class MOFile {{{ 720 721 722class MOFile(_BaseFile): 723 """ 724 Mo file reader/writer. 725 This class inherits the :class:`~polib._BaseFile` class and, by 726 extension, the python ``list`` type. 727 """ 728 MAGIC = 0x950412de 729 MAGIC_SWAPPED = 0xde120495 730 731 def __init__(self, *args, **kwargs): 732 """ 733 Constructor, accepts all keywords arguments accepted by 734 :class:`~polib._BaseFile` class. 735 """ 736 _BaseFile.__init__(self, *args, **kwargs) 737 self.magic_number = None 738 self.version = 0 739 740 def save_as_pofile(self, fpath): 741 """ 742 Saves the mofile as a pofile to ``fpath``. 743 744 Keyword argument: 745 746 ``fpath`` 747 string, full or relative path to the file. 748 """ 749 _BaseFile.save(self, fpath) 750 751 def save(self, fpath=None): 752 """ 753 Saves the mofile to ``fpath``. 754 755 Keyword argument: 756 757 ``fpath`` 758 string, full or relative path to the file. 759 """ 760 _BaseFile.save(self, fpath, 'to_binary') 761 762 def percent_translated(self): 763 """ 764 Convenience method to keep the same interface with POFile instances. 765 """ 766 return 100 767 768 def translated_entries(self): 769 """ 770 Convenience method to keep the same interface with POFile instances. 771 """ 772 return self 773 774 def untranslated_entries(self): 775 """ 776 Convenience method to keep the same interface with POFile instances. 777 """ 778 return [] 779 780 def fuzzy_entries(self): 781 """ 782 Convenience method to keep the same interface with POFile instances. 783 """ 784 return [] 785 786 def obsolete_entries(self): 787 """ 788 Convenience method to keep the same interface with POFile instances. 789 """ 790 return [] 791# }}} 792# class _BaseEntry {{{ 793 794 795class _BaseEntry(object): 796 """ 797 Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes. 798 This class should **not** be instantiated directly. 799 """ 800 801 def __init__(self, *args, **kwargs): 802 """ 803 Constructor, accepts the following keyword arguments: 804 805 ``msgid`` 806 string, the entry msgid. 807 808 ``msgstr`` 809 string, the entry msgstr. 810 811 ``msgid_plural`` 812 string, the entry msgid_plural. 813 814 ``msgstr_plural`` 815 list, the entry msgstr_plural lines. 816 817 ``msgctxt`` 818 string, the entry context (msgctxt). 819 820 ``obsolete`` 821 bool, whether the entry is "obsolete" or not. 822 823 ``encoding`` 824 string, the encoding to use, defaults to ``default_encoding`` 825 global variable (optional). 826 """ 827 self.msgid = kwargs.get('msgid', '') 828 self.msgstr = kwargs.get('msgstr', '') 829 self.msgid_plural = kwargs.get('msgid_plural', '') 830 self.msgstr_plural = kwargs.get('msgstr_plural', {}) 831 self.msgctxt = kwargs.get('msgctxt', None) 832 self.obsolete = kwargs.get('obsolete', False) 833 self.encoding = kwargs.get('encoding', default_encoding) 834 835 def __unicode__(self, wrapwidth=78): 836 """ 837 Returns the unicode representation of the entry. 838 """ 839 if self.obsolete: 840 delflag = '#~ ' 841 else: 842 delflag = '' 843 ret = [] 844 # write the msgctxt if any 845 if self.msgctxt is not None: 846 ret += self._str_field("msgctxt", delflag, "", self.msgctxt, 847 wrapwidth) 848 # write the msgid 849 ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth) 850 # write the msgid_plural if any 851 if self.msgid_plural: 852 ret += self._str_field("msgid_plural", delflag, "", 853 self.msgid_plural, wrapwidth) 854 if self.msgstr_plural: 855 # write the msgstr_plural if any 856 msgstrs = self.msgstr_plural 857 keys = list(msgstrs) 858 keys.sort() 859 for index in keys: 860 msgstr = msgstrs[index] 861 plural_index = '[%s]' % index 862 ret += self._str_field("msgstr", delflag, plural_index, msgstr, 863 wrapwidth) 864 else: 865 # otherwise write the msgstr 866 ret += self._str_field("msgstr", delflag, "", self.msgstr, 867 wrapwidth) 868 ret.append('') 869 usedirect = True 870 if not PY3 and type(ret[0] != unicode): 871 try: 872 usedirect = False 873 ret = u('\n').join(x.decode('utf-8') for x in ret) 874 except: 875 usedirect = True 876 if usedirect: 877 ret = u('\n').join(ret) 878 return ret 879 880 if PY3: 881 def __str__(self): 882 return self.__unicode__() 883 else: 884 def __str__(self): 885 """ 886 Returns the string representation of the entry. 887 """ 888 return unicode(self).encode(self.encoding) 889 890 def __eq__(self, other): 891 return str(self) == str(other) 892 893 def _str_field(self, fieldname, delflag, plural_index, field, 894 wrapwidth=78): 895 lines = field.splitlines(True) 896 if len(lines) > 1: 897 lines = [''] + lines # start with initial empty line 898 else: 899 escaped_field = escape(field) 900 specialchars_count = 0 901 for c in ['\\', '\n', '\r', '\t', '"']: 902 specialchars_count += field.count(c) 903 # comparison must take into account fieldname length + one space 904 # + 2 quotes (eg. msgid "<string>") 905 flength = len(fieldname) + 3 906 if plural_index: 907 flength += len(plural_index) 908 real_wrapwidth = wrapwidth - flength + specialchars_count 909 if wrapwidth > 0 and len(field) > real_wrapwidth: 910 # Wrap the line but take field name into account 911 lines = [''] + [unescape(item) for item in wrap( 912 escaped_field, 913 wrapwidth - 2, # 2 for quotes "" 914 drop_whitespace=False, 915 break_long_words=False 916 )] 917 else: 918 lines = [field] 919 if fieldname.startswith('previous_'): 920 # quick and dirty trick to get the real field name 921 fieldname = fieldname[9:] 922 923 ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, 924 escape(lines.pop(0)))] 925 for line in lines: 926 ret.append('%s"%s"' % (delflag, escape(line))) 927 return ret 928# }}} 929# class POEntry {{{ 930 931 932class POEntry(_BaseEntry): 933 """ 934 Represents a po file entry. 935 """ 936 937 def __init__(self, *args, **kwargs): 938 """ 939 Constructor, accepts the following keyword arguments: 940 941 ``comment`` 942 string, the entry comment. 943 944 ``tcomment`` 945 string, the entry translator comment. 946 947 ``occurrences`` 948 list, the entry occurrences. 949 950 ``flags`` 951 list, the entry flags. 952 953 ``previous_msgctxt`` 954 string, the entry previous context. 955 956 ``previous_msgid`` 957 string, the entry previous msgid. 958 959 ``previous_msgid_plural`` 960 string, the entry previous msgid_plural. 961 962 ``linenum`` 963 integer, the line number of the entry 964 """ 965 _BaseEntry.__init__(self, *args, **kwargs) 966 self.comment = kwargs.get('comment', '') 967 self.tcomment = kwargs.get('tcomment', '') 968 self.occurrences = kwargs.get('occurrences', []) 969 self.flags = kwargs.get('flags', []) 970 self.previous_msgctxt = kwargs.get('previous_msgctxt', None) 971 self.previous_msgid = kwargs.get('previous_msgid', None) 972 self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) 973 self.linenum = kwargs.get('linenum', None) 974 975 def __unicode__(self, wrapwidth=0): 976 """ 977 Returns the unicode representation of the entry. 978 """ 979 ret = [] 980 # comments first, if any (with text wrapping as xgettext does) 981 if self.obsolete: 982 comments = [('tcomment', '# ')] 983 else: 984 comments = [('comment', '#. '), ('tcomment', '# ')] 985 for c in comments: 986 val = getattr(self, c[0]) 987 if val: 988 for comment in val.split('\n'): 989 if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth: 990 ret += wrap( 991 comment, 992 wrapwidth, 993 initial_indent=c[1], 994 subsequent_indent=c[1], 995 break_long_words=False 996 ) 997 else: 998 ret.append('%s%s' % (c[1], comment)) 999 1000 # occurrences (with text wrapping as xgettext does) 1001 if not self.obsolete and self.occurrences: 1002 filelist = [] 1003 for fpath, lineno in self.occurrences: 1004 if lineno: 1005 filelist.append('%s:%s' % (fpath, lineno)) 1006 else: 1007 filelist.append(fpath) 1008 filestr = ' '.join(filelist) 1009 if wrapwidth > 0 and len(filestr) + 3 > wrapwidth: 1010 # textwrap split words that contain hyphen, this is not 1011 # what we want for filenames, so the dirty hack is to 1012 # temporally replace hyphens with a char that a file cannot 1013 # contain, like "*" 1014 ret += [l.replace('*', '-') for l in wrap( 1015 filestr.replace('-', '*'), 1016 wrapwidth, 1017 initial_indent='#: ', 1018 subsequent_indent='#: ', 1019 break_long_words=False 1020 )] 1021 else: 1022 ret.append('#: ' + filestr) 1023 1024 # flags (TODO: wrapping ?) 1025 if self.flags: 1026 ret.append('#, %s' % ', '.join(self.flags)) 1027 1028 # previous context and previous msgid/msgid_plural 1029 fields = ['previous_msgctxt', 'previous_msgid', 1030 'previous_msgid_plural'] 1031 if self.obsolete: 1032 prefix = "#~| " 1033 else: 1034 prefix = "#| " 1035 for f in fields: 1036 val = getattr(self, f) 1037 if val: 1038 ret += self._str_field(f, prefix, "", val, wrapwidth) 1039 1040 ret.append(_BaseEntry.__unicode__(self, wrapwidth)) 1041 ret = u('\n').join(ret) 1042 return ret 1043 1044 def __cmp__(self, other): 1045 """ 1046 Called by comparison operations if rich comparison is not defined. 1047 """ 1048 1049 # First: Obsolete test 1050 if self.obsolete != other.obsolete: 1051 if self.obsolete: 1052 return -1 1053 else: 1054 return 1 1055 # Work on a copy to protect original 1056 occ1 = sorted(self.occurrences[:]) 1057 occ2 = sorted(other.occurrences[:]) 1058 pos = 0 1059 for entry1 in occ1: 1060 try: 1061 entry2 = occ2[pos] 1062 except IndexError: 1063 return 1 1064 pos = pos + 1 1065 if entry1[0] != entry2[0]: 1066 if entry1[0] > entry2[0]: 1067 return 1 1068 else: 1069 return -1 1070 if entry1[1] != entry2[1]: 1071 if entry1[1] > entry2[1]: 1072 return 1 1073 else: 1074 return -1 1075 # Compare msgid_plural if set 1076 if self.msgid_plural: 1077 if not other.msgid_plural: 1078 return 1 1079 for pos in self.msgid_plural: 1080 if pos not in other.msgid_plural: 1081 return 1 1082 if self.msgid_plural[pos] > other.msgid_plural[pos]: 1083 return 1 1084 if self.msgid_plural[pos] < other.msgid_plural[pos]: 1085 return -1 1086 # Finally: Compare message ID 1087 if self.msgid > other.msgid: 1088 return 1 1089 elif self.msgid < other.msgid: 1090 return -1 1091 return 0 1092 1093 def __gt__(self, other): 1094 return self.__cmp__(other) > 0 1095 1096 def __lt__(self, other): 1097 return self.__cmp__(other) < 0 1098 1099 def __ge__(self, other): 1100 return self.__cmp__(other) >= 0 1101 1102 def __le__(self, other): 1103 return self.__cmp__(other) <= 0 1104 1105 def __eq__(self, other): 1106 return self.__cmp__(other) == 0 1107 1108 def __ne__(self, other): 1109 return self.__cmp__(other) != 0 1110 1111 def translated(self): 1112 """ 1113 Returns ``True`` if the entry has been translated or ``False`` 1114 otherwise. 1115 """ 1116 if self.obsolete or 'fuzzy' in self.flags: 1117 return False 1118 if self.msgstr != '': 1119 return True 1120 if self.msgstr_plural: 1121 for pos in self.msgstr_plural: 1122 if self.msgstr_plural[pos] == '': 1123 return False 1124 return True 1125 return False 1126 1127 def merge(self, other): 1128 """ 1129 Merge the current entry with the given pot entry. 1130 """ 1131 self.msgid = other.msgid 1132 self.msgctxt = other.msgctxt 1133 self.occurrences = other.occurrences 1134 self.comment = other.comment 1135 fuzzy = 'fuzzy' in self.flags 1136 self.flags = other.flags[:] # clone flags 1137 if fuzzy: 1138 self.flags.append('fuzzy') 1139 self.msgid_plural = other.msgid_plural 1140 self.obsolete = other.obsolete 1141 self.previous_msgctxt = other.previous_msgctxt 1142 self.previous_msgid = other.previous_msgid 1143 self.previous_msgid_plural = other.previous_msgid_plural 1144 if other.msgstr_plural: 1145 for pos in other.msgstr_plural: 1146 try: 1147 # keep existing translation at pos if any 1148 self.msgstr_plural[pos] 1149 except KeyError: 1150 self.msgstr_plural[pos] = '' 1151 1152 def __hash__(self): 1153 return hash((self.msgid, self.msgstr)) 1154# }}} 1155# class MOEntry {{{ 1156 1157 1158class MOEntry(_BaseEntry): 1159 """ 1160 Represents a mo file entry. 1161 """ 1162 def __init__(self, *args, **kwargs): 1163 """ 1164 Constructor, accepts the following keyword arguments, 1165 for consistency with :class:`~polib.POEntry`: 1166 1167 ``comment`` 1168 ``tcomment`` 1169 ``occurrences`` 1170 ``flags`` 1171 ``previous_msgctxt`` 1172 ``previous_msgid`` 1173 ``previous_msgid_plural`` 1174 1175 Note: even though these keyword arguments are accepted, 1176 they hold no real meaning in the context of MO files 1177 and are simply ignored. 1178 """ 1179 _BaseEntry.__init__(self, *args, **kwargs) 1180 self.comment = '' 1181 self.tcomment = '' 1182 self.occurrences = [] 1183 self.flags = [] 1184 self.previous_msgctxt = None 1185 self.previous_msgid = None 1186 self.previous_msgid_plural = None 1187 1188 def __hash__(self): 1189 return hash((self.msgid, self.msgstr)) 1190 1191# }}} 1192# class _POFileParser {{{ 1193 1194 1195class _POFileParser(object): 1196 """ 1197 A finite state machine to parse efficiently and correctly po 1198 file format. 1199 """ 1200 1201 def __init__(self, pofile, *args, **kwargs): 1202 """ 1203 Constructor. 1204 1205 Keyword arguments: 1206 1207 ``pofile`` 1208 string, path to the po file or its content 1209 1210 ``encoding`` 1211 string, the encoding to use, defaults to ``default_encoding`` 1212 global variable (optional). 1213 1214 ``check_for_duplicates`` 1215 whether to check for duplicate entries when adding entries to the 1216 file (optional, default: ``False``). 1217 """ 1218 enc = kwargs.get('encoding', default_encoding) 1219 if _is_file(pofile): 1220 try: 1221 self.fhandle = io.open(pofile, 'rt', encoding=enc) 1222 except LookupError: 1223 enc = default_encoding 1224 self.fhandle = io.open(pofile, 'rt', encoding=enc) 1225 else: 1226 self.fhandle = pofile.splitlines() 1227 1228 klass = kwargs.get('klass') 1229 if klass is None: 1230 klass = POFile 1231 self.instance = klass( 1232 pofile=pofile, 1233 encoding=enc, 1234 check_for_duplicates=kwargs.get('check_for_duplicates', False) 1235 ) 1236 self.transitions = {} 1237 self.current_line = 0 1238 self.current_entry = POEntry(linenum=self.current_line) 1239 self.current_state = 'st' 1240 self.current_token = None 1241 # two memo flags used in handlers 1242 self.msgstr_index = 0 1243 self.entry_obsolete = 0 1244 # Configure the state machine, by adding transitions. 1245 # Signification of symbols: 1246 # * ST: Beginning of the file (start) 1247 # * HE: Header 1248 # * TC: a translation comment 1249 # * GC: a generated comment 1250 # * OC: a file/line occurrence 1251 # * FL: a flags line 1252 # * CT: a message context 1253 # * PC: a previous msgctxt 1254 # * PM: a previous msgid 1255 # * PP: a previous msgid_plural 1256 # * MI: a msgid 1257 # * MP: a msgid plural 1258 # * MS: a msgstr 1259 # * MX: a msgstr plural 1260 # * MC: a msgid or msgstr continuation line 1261 all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc', 1262 'ms', 'mp', 'mx', 'mi'] 1263 1264 self.add('tc', ['st', 'he'], 'he') 1265 self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 1266 'mp', 'mx', 'mi'], 'tc') 1267 self.add('gc', all, 'gc') 1268 self.add('oc', all, 'oc') 1269 self.add('fl', all, 'fl') 1270 self.add('pc', all, 'pc') 1271 self.add('pm', all, 'pm') 1272 self.add('pp', all, 'pp') 1273 self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 1274 'pp', 'ms', 'mx'], 'ct') 1275 self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 1276 'pm', 'pp', 'ms', 'mx'], 'mi') 1277 self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp') 1278 self.add('ms', ['mi', 'mp', 'tc'], 'ms') 1279 self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx') 1280 self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc') 1281 1282 def parse(self): 1283 """ 1284 Run the state machine, parse the file line by line and call process() 1285 with the current matched symbol. 1286 """ 1287 1288 keywords = { 1289 'msgctxt': 'ct', 1290 'msgid': 'mi', 1291 'msgstr': 'ms', 1292 'msgid_plural': 'mp', 1293 } 1294 prev_keywords = { 1295 'msgid_plural': 'pp', 1296 'msgid': 'pm', 1297 'msgctxt': 'pc', 1298 } 1299 tokens = [] 1300 for line in self.fhandle: 1301 self.current_line += 1 1302 line = line.strip() 1303 if line == '': 1304 continue 1305 1306 tokens = line.split(None, 2) 1307 nb_tokens = len(tokens) 1308 1309 if tokens[0] == '#~|': 1310 continue 1311 1312 if tokens[0] == '#~' and nb_tokens > 1: 1313 line = line[3:].strip() 1314 tokens = tokens[1:] 1315 nb_tokens -= 1 1316 self.entry_obsolete = 1 1317 else: 1318 self.entry_obsolete = 0 1319 1320 # Take care of keywords like 1321 # msgid, msgid_plural, msgctxt & msgstr. 1322 if tokens[0] in keywords and nb_tokens > 1: 1323 line = line[len(tokens[0]):].lstrip() 1324 if re.search(r'([^\\]|^)"', line[1:-1]): 1325 raise IOError('Syntax error in po file %s (line %s): ' 1326 'unescaped double quote found' % 1327 (self.instance.fpath, self.current_line)) 1328 self.current_token = line 1329 self.process(keywords[tokens[0]]) 1330 continue 1331 1332 self.current_token = line 1333 1334 if tokens[0] == '#:': 1335 if nb_tokens <= 1: 1336 continue 1337 # we are on an occurrences line 1338 self.process('oc') 1339 1340 elif line[:1] == '"': 1341 # we are on a continuation line 1342 if re.search(r'([^\\]|^)"', line[1:-1]): 1343 raise IOError('Syntax error in po file %s (line %s): ' 1344 'unescaped double quote found' % 1345 (self.instance.fpath, self.current_line)) 1346 self.process('mc') 1347 1348 elif line[:7] == 'msgstr[': 1349 # we are on a msgstr plural 1350 self.process('mx') 1351 1352 elif tokens[0] == '#,': 1353 if nb_tokens <= 1: 1354 continue 1355 # we are on a flags line 1356 self.process('fl') 1357 1358 elif tokens[0] == '#' or tokens[0].startswith('##'): 1359 if line == '#': 1360 line += ' ' 1361 # we are on a translator comment line 1362 self.process('tc') 1363 1364 elif tokens[0] == '#.': 1365 if nb_tokens <= 1: 1366 continue 1367 # we are on a generated comment line 1368 self.process('gc') 1369 1370 elif tokens[0] == '#|': 1371 if nb_tokens <= 1: 1372 raise IOError('Syntax error in po file %s (line %s)' % 1373 (self.instance.fpath, self.current_line)) 1374 1375 # Remove the marker and any whitespace right after that. 1376 line = line[2:].lstrip() 1377 self.current_token = line 1378 1379 if tokens[1].startswith('"'): 1380 # Continuation of previous metadata. 1381 self.process('mc') 1382 continue 1383 1384 if nb_tokens == 2: 1385 # Invalid continuation line. 1386 raise IOError('Syntax error in po file %s (line %s): ' 1387 'invalid continuation line' % 1388 (self.instance.fpath, self.current_line)) 1389 1390 # we are on a "previous translation" comment line, 1391 if tokens[1] not in prev_keywords: 1392 # Unknown keyword in previous translation comment. 1393 raise IOError('Syntax error in po file %s (line %s): ' 1394 'unknown keyword %s' % 1395 (self.instance.fpath, self.current_line, 1396 tokens[1])) 1397 1398 # Remove the keyword and any whitespace 1399 # between it and the starting quote. 1400 line = line[len(tokens[1]):].lstrip() 1401 self.current_token = line 1402 self.process(prev_keywords[tokens[1]]) 1403 1404 else: 1405 raise IOError('Syntax error in po file %s (line %s)' % 1406 (self.instance.fpath, self.current_line)) 1407 1408 if self.current_entry and len(tokens) > 0 and \ 1409 not tokens[0].startswith('#'): 1410 # since entries are added when another entry is found, we must add 1411 # the last entry here (only if there are lines). Trailing comments 1412 # are ignored 1413 self.instance.append(self.current_entry) 1414 1415 # before returning the instance, check if there's metadata and if 1416 # so extract it in a dict 1417 metadataentry = self.instance.find('') 1418 if metadataentry: # metadata found 1419 # remove the entry 1420 self.instance.remove(metadataentry) 1421 self.instance.metadata_is_fuzzy = metadataentry.flags 1422 key = None 1423 for msg in metadataentry.msgstr.splitlines(): 1424 try: 1425 key, val = msg.split(':', 1) 1426 self.instance.metadata[key] = val.strip() 1427 except (ValueError, KeyError): 1428 if key is not None: 1429 self.instance.metadata[key] += '\n' + msg.strip() 1430 # close opened file 1431 if not isinstance(self.fhandle, list): # must be file 1432 self.fhandle.close() 1433 return self.instance 1434 1435 def add(self, symbol, states, next_state): 1436 """ 1437 Add a transition to the state machine. 1438 1439 Keywords arguments: 1440 1441 ``symbol`` 1442 string, the matched token (two chars symbol). 1443 1444 ``states`` 1445 list, a list of states (two chars symbols). 1446 1447 ``next_state`` 1448 the next state the fsm will have after the action. 1449 """ 1450 for state in states: 1451 action = getattr(self, 'handle_%s' % next_state) 1452 self.transitions[(symbol, state)] = (action, next_state) 1453 1454 def process(self, symbol): 1455 """ 1456 Process the transition corresponding to the current state and the 1457 symbol provided. 1458 1459 Keywords arguments: 1460 1461 ``symbol`` 1462 string, the matched token (two chars symbol). 1463 1464 ``linenum`` 1465 integer, the current line number of the parsed file. 1466 """ 1467 try: 1468 (action, state) = self.transitions[(symbol, self.current_state)] 1469 if action(): 1470 self.current_state = state 1471 except Exception: 1472 raise IOError('Syntax error in po file (line %s)' % 1473 self.current_line) 1474 1475 # state handlers 1476 1477 def handle_he(self): 1478 """Handle a header comment.""" 1479 if self.instance.header != '': 1480 self.instance.header += '\n' 1481 self.instance.header += self.current_token[2:] 1482 return 1 1483 1484 def handle_tc(self): 1485 """Handle a translator comment.""" 1486 if self.current_state in ['mc', 'ms', 'mx']: 1487 self.instance.append(self.current_entry) 1488 self.current_entry = POEntry(linenum=self.current_line) 1489 if self.current_entry.tcomment != '': 1490 self.current_entry.tcomment += '\n' 1491 tcomment = self.current_token.lstrip('#') 1492 if tcomment.startswith(' '): 1493 tcomment = tcomment[1:] 1494 self.current_entry.tcomment += tcomment 1495 return True 1496 1497 def handle_gc(self): 1498 """Handle a generated comment.""" 1499 if self.current_state in ['mc', 'ms', 'mx']: 1500 self.instance.append(self.current_entry) 1501 self.current_entry = POEntry(linenum=self.current_line) 1502 if self.current_entry.comment != '': 1503 self.current_entry.comment += '\n' 1504 self.current_entry.comment += self.current_token[3:] 1505 return True 1506 1507 def handle_oc(self): 1508 """Handle a file:num occurrence.""" 1509 if self.current_state in ['mc', 'ms', 'mx']: 1510 self.instance.append(self.current_entry) 1511 self.current_entry = POEntry(linenum=self.current_line) 1512 occurrences = self.current_token[3:].split() 1513 for occurrence in occurrences: 1514 if occurrence != '': 1515 try: 1516 fil, line = occurrence.rsplit(':', 1) 1517 if not line.isdigit(): 1518 fil = fil + line 1519 line = '' 1520 self.current_entry.occurrences.append((fil, line)) 1521 except (ValueError, AttributeError): 1522 self.current_entry.occurrences.append((occurrence, '')) 1523 return True 1524 1525 def handle_fl(self): 1526 """Handle a flags line.""" 1527 if self.current_state in ['mc', 'ms', 'mx']: 1528 self.instance.append(self.current_entry) 1529 self.current_entry = POEntry(linenum=self.current_line) 1530 self.current_entry.flags += [c.strip() for c in 1531 self.current_token[3:].split(',')] 1532 return True 1533 1534 def handle_pp(self): 1535 """Handle a previous msgid_plural line.""" 1536 if self.current_state in ['mc', 'ms', 'mx']: 1537 self.instance.append(self.current_entry) 1538 self.current_entry = POEntry(linenum=self.current_line) 1539 self.current_entry.previous_msgid_plural = \ 1540 unescape(self.current_token[1:-1]) 1541 return True 1542 1543 def handle_pm(self): 1544 """Handle a previous msgid line.""" 1545 if self.current_state in ['mc', 'ms', 'mx']: 1546 self.instance.append(self.current_entry) 1547 self.current_entry = POEntry(linenum=self.current_line) 1548 self.current_entry.previous_msgid = \ 1549 unescape(self.current_token[1:-1]) 1550 return True 1551 1552 def handle_pc(self): 1553 """Handle a previous msgctxt line.""" 1554 if self.current_state in ['mc', 'ms', 'mx']: 1555 self.instance.append(self.current_entry) 1556 self.current_entry = POEntry(linenum=self.current_line) 1557 self.current_entry.previous_msgctxt = \ 1558 unescape(self.current_token[1:-1]) 1559 return True 1560 1561 def handle_ct(self): 1562 """Handle a msgctxt.""" 1563 if self.current_state in ['mc', 'ms', 'mx']: 1564 self.instance.append(self.current_entry) 1565 self.current_entry = POEntry(linenum=self.current_line) 1566 self.current_entry.msgctxt = unescape(self.current_token[1:-1]) 1567 return True 1568 1569 def handle_mi(self): 1570 """Handle a msgid.""" 1571 if self.current_state in ['mc', 'ms', 'mx']: 1572 self.instance.append(self.current_entry) 1573 self.current_entry = POEntry(linenum=self.current_line) 1574 self.current_entry.obsolete = self.entry_obsolete 1575 self.current_entry.msgid = unescape(self.current_token[1:-1]) 1576 return True 1577 1578 def handle_mp(self): 1579 """Handle a msgid plural.""" 1580 self.current_entry.msgid_plural = unescape(self.current_token[1:-1]) 1581 return True 1582 1583 def handle_ms(self): 1584 """Handle a msgstr.""" 1585 self.current_entry.msgstr = unescape(self.current_token[1:-1]) 1586 return True 1587 1588 def handle_mx(self): 1589 """Handle a msgstr plural.""" 1590 index = self.current_token[7] 1591 value = self.current_token[self.current_token.find('"') + 1:-1] 1592 self.current_entry.msgstr_plural[int(index)] = unescape(value) 1593 self.msgstr_index = int(index) 1594 return True 1595 1596 def handle_mc(self): 1597 """Handle a msgid or msgstr continuation line.""" 1598 token = unescape(self.current_token[1:-1]) 1599 if self.current_state == 'ct': 1600 self.current_entry.msgctxt += token 1601 elif self.current_state == 'mi': 1602 self.current_entry.msgid += token 1603 elif self.current_state == 'mp': 1604 self.current_entry.msgid_plural += token 1605 elif self.current_state == 'ms': 1606 self.current_entry.msgstr += token 1607 elif self.current_state == 'mx': 1608 self.current_entry.msgstr_plural[self.msgstr_index] += token 1609 elif self.current_state == 'pp': 1610 self.current_entry.previous_msgid_plural += token 1611 elif self.current_state == 'pm': 1612 self.current_entry.previous_msgid += token 1613 elif self.current_state == 'pc': 1614 self.current_entry.previous_msgctxt += token 1615 # don't change the current state 1616 return False 1617# }}} 1618# class _MOFileParser {{{ 1619 1620 1621class _MOFileParser(object): 1622 """ 1623 A class to parse binary mo files. 1624 """ 1625 1626 def __init__(self, mofile, *args, **kwargs): 1627 """ 1628 Constructor. 1629 1630 Keyword arguments: 1631 1632 ``mofile`` 1633 string, path to the mo file or its content 1634 1635 ``encoding`` 1636 string, the encoding to use, defaults to ``default_encoding`` 1637 global variable (optional). 1638 1639 ``check_for_duplicates`` 1640 whether to check for duplicate entries when adding entries to the 1641 file (optional, default: ``False``). 1642 """ 1643 self.fhandle = open(mofile, 'rb') 1644 1645 klass = kwargs.get('klass') 1646 if klass is None: 1647 klass = MOFile 1648 self.instance = klass( 1649 fpath=mofile, 1650 encoding=kwargs.get('encoding', default_encoding), 1651 check_for_duplicates=kwargs.get('check_for_duplicates', False) 1652 ) 1653 1654 def __del__(self): 1655 """ 1656 Make sure the file is closed, this prevents warnings on unclosed file 1657 when running tests with python >= 3.2. 1658 """ 1659 if self.fhandle: 1660 self.fhandle.close() 1661 1662 def parse(self): 1663 """ 1664 Build the instance with the file handle provided in the 1665 constructor. 1666 """ 1667 # parse magic number 1668 magic_number = self._readbinary('<I', 4) 1669 if magic_number == MOFile.MAGIC: 1670 ii = '<II' 1671 elif magic_number == MOFile.MAGIC_SWAPPED: 1672 ii = '>II' 1673 else: 1674 raise IOError('Invalid mo file, magic number is incorrect !') 1675 self.instance.magic_number = magic_number 1676 # parse the version number and the number of strings 1677 version, numofstrings = self._readbinary(ii, 8) 1678 # from MO file format specs: "A program seeing an unexpected major 1679 # revision number should stop reading the MO file entirely" 1680 if version not in (0, 1): 1681 raise IOError('Invalid mo file, unexpected major revision number') 1682 self.instance.version = version 1683 # original strings and translation strings hash table offset 1684 msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8) 1685 # move to msgid hash table and read length and offset of msgids 1686 self.fhandle.seek(msgids_hash_offset) 1687 msgids_index = [] 1688 for i in range(numofstrings): 1689 msgids_index.append(self._readbinary(ii, 8)) 1690 # move to msgstr hash table and read length and offset of msgstrs 1691 self.fhandle.seek(msgstrs_hash_offset) 1692 msgstrs_index = [] 1693 for i in range(numofstrings): 1694 msgstrs_index.append(self._readbinary(ii, 8)) 1695 # build entries 1696 encoding = self.instance.encoding 1697 for i in range(numofstrings): 1698 self.fhandle.seek(msgids_index[i][1]) 1699 msgid = self.fhandle.read(msgids_index[i][0]) 1700 1701 self.fhandle.seek(msgstrs_index[i][1]) 1702 msgstr = self.fhandle.read(msgstrs_index[i][0]) 1703 if i == 0 and not msgid: # metadata 1704 raw_metadata, metadata = msgstr.split(b('\n')), {} 1705 for line in raw_metadata: 1706 tokens = line.split(b(':'), 1) 1707 if tokens[0] != b(''): 1708 try: 1709 k = tokens[0].decode(encoding) 1710 v = tokens[1].decode(encoding) 1711 metadata[k] = v.strip() 1712 except IndexError: 1713 metadata[k] = u('') 1714 self.instance.metadata = metadata 1715 continue 1716 # test if we have a plural entry 1717 msgid_tokens = msgid.split(b('\0')) 1718 if len(msgid_tokens) > 1: 1719 entry = self._build_entry( 1720 msgid=msgid_tokens[0], 1721 msgid_plural=msgid_tokens[1], 1722 msgstr_plural=dict((k, v) for k, v in 1723 enumerate(msgstr.split(b('\0')))) 1724 ) 1725 else: 1726 entry = self._build_entry(msgid=msgid, msgstr=msgstr) 1727 self.instance.append(entry) 1728 # close opened file 1729 self.fhandle.close() 1730 return self.instance 1731 1732 def _build_entry(self, msgid, msgstr=None, msgid_plural=None, 1733 msgstr_plural=None): 1734 msgctxt_msgid = msgid.split(b('\x04')) 1735 encoding = self.instance.encoding 1736 if len(msgctxt_msgid) > 1: 1737 kwargs = { 1738 'msgctxt': msgctxt_msgid[0].decode(encoding), 1739 'msgid': msgctxt_msgid[1].decode(encoding), 1740 } 1741 else: 1742 kwargs = {'msgid': msgid.decode(encoding)} 1743 if msgstr: 1744 kwargs['msgstr'] = msgstr.decode(encoding) 1745 if msgid_plural: 1746 kwargs['msgid_plural'] = msgid_plural.decode(encoding) 1747 if msgstr_plural: 1748 for k in msgstr_plural: 1749 msgstr_plural[k] = msgstr_plural[k].decode(encoding) 1750 kwargs['msgstr_plural'] = msgstr_plural 1751 return MOEntry(**kwargs) 1752 1753 def _readbinary(self, fmt, numbytes): 1754 """ 1755 Private method that unpack n bytes of data using format <fmt>. 1756 It returns a tuple or a mixed value if the tuple length is 1. 1757 """ 1758 bytes = self.fhandle.read(numbytes) 1759 tup = struct.unpack(fmt, bytes) 1760 if len(tup) == 1: 1761 return tup[0] 1762 return tup 1763# }}} 1764# class TextWrapper {{{ 1765 1766 1767class TextWrapper(textwrap.TextWrapper): 1768 """ 1769 Subclass of textwrap.TextWrapper that backport the 1770 drop_whitespace option. 1771 """ 1772 def __init__(self, *args, **kwargs): 1773 drop_whitespace = kwargs.pop('drop_whitespace', True) 1774 textwrap.TextWrapper.__init__(self, *args, **kwargs) 1775 self.drop_whitespace = drop_whitespace 1776 1777 def _wrap_chunks(self, chunks): 1778 """_wrap_chunks(chunks : [string]) -> [string] 1779 1780 Wrap a sequence of text chunks and return a list of lines of 1781 length 'self.width' or less. (If 'break_long_words' is false, 1782 some lines may be longer than this.) Chunks correspond roughly 1783 to words and the whitespace between them: each chunk is 1784 indivisible (modulo 'break_long_words'), but a line break can 1785 come between any two chunks. Chunks should not have internal 1786 whitespace; ie. a chunk is either all whitespace or a "word". 1787 Whitespace chunks will be removed from the beginning and end of 1788 lines, but apart from that whitespace is preserved. 1789 """ 1790 lines = [] 1791 if self.width <= 0: 1792 raise ValueError("invalid width %r (must be > 0)" % self.width) 1793 1794 # Arrange in reverse order so items can be efficiently popped 1795 # from a stack of chucks. 1796 chunks.reverse() 1797 1798 while chunks: 1799 1800 # Start the list of chunks that will make up the current line. 1801 # cur_len is just the length of all the chunks in cur_line. 1802 cur_line = [] 1803 cur_len = 0 1804 1805 # Figure out which static string will prefix this line. 1806 if lines: 1807 indent = self.subsequent_indent 1808 else: 1809 indent = self.initial_indent 1810 1811 # Maximum width for this line. 1812 width = self.width - len(indent) 1813 1814 # First chunk on line is whitespace -- drop it, unless this 1815 # is the very beginning of the text (ie. no lines started yet). 1816 if self.drop_whitespace and chunks[-1].strip() == '' and lines: 1817 del chunks[-1] 1818 1819 while chunks: 1820 l = len(chunks[-1]) 1821 1822 # Can at least squeeze this chunk onto the current line. 1823 if cur_len + l <= width: 1824 cur_line.append(chunks.pop()) 1825 cur_len += l 1826 1827 # Nope, this line is full. 1828 else: 1829 break 1830 1831 # The current line is full, and the next chunk is too big to 1832 # fit on *any* line (not just this one). 1833 if chunks and len(chunks[-1]) > width: 1834 self._handle_long_word(chunks, cur_line, cur_len, width) 1835 1836 # If the last chunk on this line is all whitespace, drop it. 1837 if self.drop_whitespace and cur_line and not cur_line[-1].strip(): 1838 del cur_line[-1] 1839 1840 # Convert current line back to a string and store it in list 1841 # of all lines (return value). 1842 if cur_line: 1843 lines.append(indent + ''.join(cur_line)) 1844 1845 return lines 1846# }}} 1847# function wrap() {{{ 1848 1849 1850def wrap(text, width=70, **kwargs): 1851 """ 1852 Wrap a single paragraph of text, returning a list of wrapped lines. 1853 """ 1854 if sys.version_info < (2, 6): 1855 return TextWrapper(width=width, **kwargs).wrap(text) 1856 return textwrap.wrap(text, width=width, **kwargs) 1857 1858# }}} 1859 1860def genKeyId(inkey): 1861 crc = binascii.crc32(bytes(inkey, encoding="UTF-8")) & 0xffffffff 1862 # Use simple ASCII characters, exclude I, l, 1 and O, 0 to avoid confusing IDs 1863 symbols = "ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz23456789"; 1864 outkey = "" 1865 for keyind in range(0, 5): 1866 outkey += symbols[(crc & 63) % len(symbols)]; 1867 crc >>= 6; 1868 return outkey 1869