1# -* coding: utf-8 -*- 2# 3# License: MIT (see LICENSE file provided) 4# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: 5 6""" 7**polib** allows you to manipulate, create, modify gettext files (pot, po and 8mo files). You can load existing files, iterate through it's entries, add, 9modify entries, comments or metadata, etc. or create new po files from scratch. 10 11**polib** provides a simple and pythonic API via the :func:`~polib.pofile` and 12:func:`~polib.mofile` convenience functions. 13""" 14 15import array 16import codecs 17import os 18import re 19import struct 20import sys 21import textwrap 22 23try: 24 import io 25except ImportError: 26 # replacement of io.open() for python < 2.6 27 # we use codecs instead 28 class io(object): 29 @staticmethod 30 def open(fpath, mode='r', encoding=None): 31 return codecs.open(fpath, mode, encoding) 32 33 34__author__ = 'David Jean Louis <izimobil@gmail.com>' 35__version__ = '1.1.0' 36__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', 37 'default_encoding', 'escape', 'unescape', 'detect_encoding', ] 38 39 40# the default encoding to use when encoding cannot be detected 41default_encoding = 'utf-8' 42 43# python 2/3 compatibility helpers {{{ 44 45 46if sys.version_info[:2] < (3, 0): 47 PY3 = False 48 text_type = unicode 49 50 def b(s): 51 return s 52 53 def u(s): 54 return unicode(s, "unicode_escape") 55 56else: 57 PY3 = True 58 text_type = str 59 60 def b(s): 61 return s.encode("latin-1") 62 63 def u(s): 64 return s 65# }}} 66# _pofile_or_mofile {{{ 67 68 69def _pofile_or_mofile(f, type, **kwargs): 70 """ 71 Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to 72 honor the DRY concept. 73 """ 74 # get the file encoding 75 enc = kwargs.get('encoding') 76 if enc is None: 77 enc = detect_encoding(f, type == 'mofile') 78 79 # parse the file 80 kls = type == 'pofile' and _POFileParser or _MOFileParser 81 parser = kls( 82 f, 83 encoding=enc, 84 check_for_duplicates=kwargs.get('check_for_duplicates', False), 85 klass=kwargs.get('klass') 86 ) 87 instance = parser.parse() 88 instance.wrapwidth = kwargs.get('wrapwidth', 78) 89 return instance 90# }}} 91# _is_file {{{ 92 93 94def _is_file(filename_or_contents): 95 """ 96 Safely returns the value of os.path.exists(filename_or_contents). 97 98 Arguments: 99 100 ``filename_or_contents`` 101 either a filename, or a string holding the contents of some file. 102 In the latter case, this function will always return False. 103 """ 104 try: 105 return os.path.exists(filename_or_contents) 106 except (ValueError, UnicodeEncodeError): 107 return False 108# }}} 109# function pofile() {{{ 110 111 112def pofile(pofile, **kwargs): 113 """ 114 Convenience function that parses the po or pot file ``pofile`` and returns 115 a :class:`~polib.POFile` instance. 116 117 Arguments: 118 119 ``pofile`` 120 string, full or relative path to the po/pot file or its content (data). 121 122 ``wrapwidth`` 123 integer, the wrap width, only useful when the ``-w`` option was passed 124 to xgettext (optional, default: ``78``). 125 126 ``encoding`` 127 string, the encoding to use (e.g. "utf-8") (default: ``None``, the 128 encoding will be auto-detected). 129 130 ``check_for_duplicates`` 131 whether to check for duplicate entries when adding entries to the 132 file (optional, default: ``False``). 133 134 ``klass`` 135 class which is used to instantiate the return value (optional, 136 default: ``None``, the return value with be a :class:`~polib.POFile` 137 instance). 138 """ 139 return _pofile_or_mofile(pofile, 'pofile', **kwargs) 140# }}} 141# function mofile() {{{ 142 143 144def mofile(mofile, **kwargs): 145 """ 146 Convenience function that parses the mo file ``mofile`` and returns a 147 :class:`~polib.MOFile` instance. 148 149 Arguments: 150 151 ``mofile`` 152 string, full or relative path to the mo file or its content (data). 153 154 ``wrapwidth`` 155 integer, the wrap width, only useful when the ``-w`` option was passed 156 to xgettext to generate the po file that was used to format the mo file 157 (optional, default: ``78``). 158 159 ``encoding`` 160 string, the encoding to use (e.g. "utf-8") (default: ``None``, the 161 encoding will be auto-detected). 162 163 ``check_for_duplicates`` 164 whether to check for duplicate entries when adding entries to the 165 file (optional, default: ``False``). 166 167 ``klass`` 168 class which is used to instantiate the return value (optional, 169 default: ``None``, the return value with be a :class:`~polib.POFile` 170 instance). 171 """ 172 return _pofile_or_mofile(mofile, 'mofile', **kwargs) 173# }}} 174# function detect_encoding() {{{ 175 176 177def detect_encoding(file, binary_mode=False): 178 """ 179 Try to detect the encoding used by the ``file``. The ``file`` argument can 180 be a PO or MO file path or a string containing the contents of the file. 181 If the encoding cannot be detected, the function will return the value of 182 ``default_encoding``. 183 184 Arguments: 185 186 ``file`` 187 string, full or relative path to the po/mo file or its content. 188 189 ``binary_mode`` 190 boolean, set this to True if ``file`` is a mo file. 191 """ 192 PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)' 193 rxt = re.compile(u(PATTERN)) 194 rxb = re.compile(b(PATTERN)) 195 196 def charset_exists(charset): 197 """Check whether ``charset`` is valid or not.""" 198 try: 199 codecs.lookup(charset) 200 except LookupError: 201 return False 202 return True 203 204 if not _is_file(file): 205 match = rxt.search(file) 206 if match: 207 enc = match.group(1).strip() 208 if charset_exists(enc): 209 return enc 210 else: 211 # For PY3, always treat as binary 212 if binary_mode or PY3: 213 mode = 'rb' 214 rx = rxb 215 else: 216 mode = 'r' 217 rx = rxt 218 f = open(file, mode) 219 for l in f.readlines(): 220 match = rx.search(l) 221 if match: 222 f.close() 223 enc = match.group(1).strip() 224 if not isinstance(enc, text_type): 225 enc = enc.decode('utf-8') 226 if charset_exists(enc): 227 return enc 228 f.close() 229 return default_encoding 230# }}} 231# function escape() {{{ 232 233 234def escape(st): 235 """ 236 Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in 237 the given string ``st`` and returns it. 238 """ 239 return st.replace('\\', r'\\')\ 240 .replace('\t', r'\t')\ 241 .replace('\r', r'\r')\ 242 .replace('\n', r'\n')\ 243 .replace('\"', r'\"') 244# }}} 245# function unescape() {{{ 246 247 248def unescape(st): 249 """ 250 Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in 251 the given string ``st`` and returns it. 252 """ 253 def unescape_repl(m): 254 m = m.group(1) 255 if m == 'n': 256 return '\n' 257 if m == 't': 258 return '\t' 259 if m == 'r': 260 return '\r' 261 if m == '\\': 262 return '\\' 263 return m # handles escaped double quote 264 return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st) 265# }}} 266# function natural_sort() {{{ 267 268 269def natural_sort(lst): 270 """ 271 Sort naturally the given list. 272 Credits: http://stackoverflow.com/a/4836734 273 """ 274 def convert(text): 275 return int(text) if text.isdigit() else text.lower() 276 277 def alphanum_key(key): 278 return [convert(c) for c in re.split('([0-9]+)', key)] 279 280 return sorted(lst, key=alphanum_key) 281 282# }}} 283# class _BaseFile {{{ 284 285 286class _BaseFile(list): 287 """ 288 Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile` 289 classes. This class should **not** be instantiated directly. 290 """ 291 292 def __init__(self, *args, **kwargs): 293 """ 294 Constructor, accepts the following keyword arguments: 295 296 ``pofile`` 297 string, the path to the po or mo file, or its content as a string. 298 299 ``wrapwidth`` 300 integer, the wrap width, only useful when the ``-w`` option was 301 passed to xgettext (optional, default: ``78``). 302 303 ``encoding`` 304 string, the encoding to use, defaults to ``default_encoding`` 305 global variable (optional). 306 307 ``check_for_duplicates`` 308 whether to check for duplicate entries when adding entries to the 309 file, (optional, default: ``False``). 310 """ 311 list.__init__(self) 312 # the opened file handle 313 pofile = kwargs.get('pofile', None) 314 if pofile and _is_file(pofile): 315 self.fpath = pofile 316 else: 317 self.fpath = kwargs.get('fpath') 318 # the width at which lines should be wrapped 319 self.wrapwidth = kwargs.get('wrapwidth', 78) 320 # the file encoding 321 self.encoding = kwargs.get('encoding', default_encoding) 322 # whether to check for duplicate entries or not 323 self.check_for_duplicates = kwargs.get('check_for_duplicates', False) 324 # header 325 self.header = '' 326 # both po and mo files have metadata 327 self.metadata = {} 328 self.metadata_is_fuzzy = 0 329 330 def __unicode__(self): 331 """ 332 Returns the unicode representation of the file. 333 """ 334 ret = [] 335 entries = [self.metadata_as_entry()] + \ 336 [e for e in self if not e.obsolete] 337 for entry in entries: 338 ret.append(entry.__unicode__(self.wrapwidth)) 339 for entry in self.obsolete_entries(): 340 ret.append(entry.__unicode__(self.wrapwidth)) 341 ret = u('\n').join(ret) 342 return ret 343 344 if PY3: 345 def __str__(self): 346 return self.__unicode__() 347 else: 348 def __str__(self): 349 """ 350 Returns the string representation of the file. 351 """ 352 return unicode(self).encode(self.encoding) 353 354 def __contains__(self, entry): 355 """ 356 Overridden ``list`` method to implement the membership test (in and 357 not in). 358 The method considers that an entry is in the file if it finds an entry 359 that has the same msgid (the test is **case sensitive**) and the same 360 msgctxt (or none for both entries). 361 362 Argument: 363 364 ``entry`` 365 an instance of :class:`~polib._BaseEntry`. 366 """ 367 return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \ 368 is not None 369 370 def __eq__(self, other): 371 return str(self) == str(other) 372 373 def append(self, entry): 374 """ 375 Overridden method to check for duplicates entries, if a user tries to 376 add an entry that is already in the file, the method will raise a 377 ``ValueError`` exception. 378 379 Argument: 380 381 ``entry`` 382 an instance of :class:`~polib._BaseEntry`. 383 """ 384 # check_for_duplicates may not be defined (yet) when unpickling. 385 # But if pickling, we never want to check for duplicates anyway. 386 if getattr(self, 'check_for_duplicates', False) and entry in self: 387 raise ValueError('Entry "%s" already exists' % entry.msgid) 388 super(_BaseFile, self).append(entry) 389 390 def insert(self, index, entry): 391 """ 392 Overridden method to check for duplicates entries, if a user tries to 393 add an entry that is already in the file, the method will raise a 394 ``ValueError`` exception. 395 396 Arguments: 397 398 ``index`` 399 index at which the entry should be inserted. 400 401 ``entry`` 402 an instance of :class:`~polib._BaseEntry`. 403 """ 404 if self.check_for_duplicates and entry in self: 405 raise ValueError('Entry "%s" already exists' % entry.msgid) 406 super(_BaseFile, self).insert(index, entry) 407 408 def metadata_as_entry(self): 409 """ 410 Returns the file metadata as a :class:`~polib.POFile` instance. 411 """ 412 e = POEntry(msgid='') 413 mdata = self.ordered_metadata() 414 if mdata: 415 strs = [] 416 for name, value in mdata: 417 # Strip whitespace off each line in a multi-line entry 418 strs.append('%s: %s' % (name, value)) 419 e.msgstr = '\n'.join(strs) + '\n' 420 if self.metadata_is_fuzzy: 421 e.flags.append('fuzzy') 422 return e 423 424 def save(self, fpath=None, repr_method='__unicode__'): 425 """ 426 Saves the po file to ``fpath``. 427 If it is an existing file and no ``fpath`` is provided, then the 428 existing file is rewritten with the modified data. 429 430 Keyword arguments: 431 432 ``fpath`` 433 string, full or relative path to the file. 434 435 ``repr_method`` 436 string, the method to use for output. 437 """ 438 if self.fpath is None and fpath is None: 439 raise IOError('You must provide a file path to save() method') 440 contents = getattr(self, repr_method)() 441 if fpath is None: 442 fpath = self.fpath 443 if repr_method == 'to_binary': 444 fhandle = open(fpath, 'wb') 445 else: 446 fhandle = io.open(fpath, 'w', encoding=self.encoding) 447 if not isinstance(contents, text_type): 448 contents = contents.decode(self.encoding) 449 fhandle.write(contents) 450 fhandle.close() 451 # set the file path if not set 452 if self.fpath is None and fpath: 453 self.fpath = fpath 454 455 def find(self, st, by='msgid', include_obsolete_entries=False, 456 msgctxt=False): 457 """ 458 Find the entry which msgid (or property identified by the ``by`` 459 argument) matches the string ``st``. 460 461 Keyword arguments: 462 463 ``st`` 464 string, the string to search for. 465 466 ``by`` 467 string, the property to use for comparison (default: ``msgid``). 468 469 ``include_obsolete_entries`` 470 boolean, whether to also search in entries that are obsolete. 471 472 ``msgctxt`` 473 string, allows specifying a specific message context for the 474 search. 475 """ 476 if include_obsolete_entries: 477 entries = self[:] 478 else: 479 entries = [e for e in self if not e.obsolete] 480 matches = [] 481 for e in entries: 482 if getattr(e, by) == st: 483 if msgctxt is not False and e.msgctxt != msgctxt: 484 continue 485 matches.append(e) 486 if len(matches) == 1: 487 return matches[0] 488 elif len(matches) > 1: 489 if not msgctxt: 490 # find the entry with no msgctx 491 e = None 492 for m in matches: 493 if not m.msgctxt: 494 e = m 495 if e: 496 return e 497 # fallback to the first entry found 498 return matches[0] 499 return None 500 501 def ordered_metadata(self): 502 """ 503 Convenience method that returns an ordered version of the metadata 504 dictionary. The return value is list of tuples (metadata name, 505 metadata_value). 506 """ 507 # copy the dict first 508 metadata = self.metadata.copy() 509 data_order = [ 510 'Project-Id-Version', 511 'Report-Msgid-Bugs-To', 512 'POT-Creation-Date', 513 'PO-Revision-Date', 514 'Last-Translator', 515 'Language-Team', 516 'Language', 517 'MIME-Version', 518 'Content-Type', 519 'Content-Transfer-Encoding', 520 'Plural-Forms' 521 ] 522 ordered_data = [] 523 for data in data_order: 524 try: 525 value = metadata.pop(data) 526 ordered_data.append((data, value)) 527 except KeyError: 528 pass 529 # the rest of the metadata will be alphabetically ordered since there 530 # are no specs for this AFAIK 531 for data in natural_sort(metadata.keys()): 532 value = metadata[data] 533 ordered_data.append((data, value)) 534 return ordered_data 535 536 def to_binary(self): 537 """ 538 Return the binary representation of the file. 539 """ 540 offsets = [] 541 entries = self.translated_entries() 542 543 # the keys are sorted in the .mo file 544 def cmp(_self, other): 545 # msgfmt compares entries with msgctxt if it exists 546 self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid 547 other_msgid = other.msgctxt and other.msgctxt or other.msgid 548 if self_msgid > other_msgid: 549 return 1 550 elif self_msgid < other_msgid: 551 return -1 552 else: 553 return 0 554 # add metadata entry 555 entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8')) 556 mentry = self.metadata_as_entry() 557 entries = [mentry] + entries 558 entries_len = len(entries) 559 ids, strs = b(''), b('') 560 for e in entries: 561 # For each string, we need size and file offset. Each string is 562 # NUL terminated; the NUL does not count into the size. 563 msgid = b('') 564 if e.msgctxt: 565 # Contexts are stored by storing the concatenation of the 566 # context, a <EOT> byte, and the original string 567 msgid = self._encode(e.msgctxt + '\4') 568 if e.msgid_plural: 569 msgstr = [] 570 for index in sorted(e.msgstr_plural.keys()): 571 msgstr.append(e.msgstr_plural[index]) 572 msgid += self._encode(e.msgid + '\0' + e.msgid_plural) 573 msgstr = self._encode('\0'.join(msgstr)) 574 else: 575 msgid += self._encode(e.msgid) 576 msgstr = self._encode(e.msgstr) 577 offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) 578 ids += msgid + b('\0') 579 strs += msgstr + b('\0') 580 581 # The header is 7 32-bit unsigned integers. 582 keystart = 7 * 4 + 16 * entries_len 583 # and the values start after the keys 584 valuestart = keystart + len(ids) 585 koffsets = [] 586 voffsets = [] 587 # The string table first has the list of keys, then the list of values. 588 # Each entry has first the size of the string, then the file offset. 589 for o1, l1, o2, l2 in offsets: 590 koffsets += [l1, o1 + keystart] 591 voffsets += [l2, o2 + valuestart] 592 offsets = koffsets + voffsets 593 594 output = struct.pack( 595 "Iiiiiii", 596 # Magic number 597 MOFile.MAGIC, 598 # Version 599 0, 600 # number of entries 601 entries_len, 602 # start of key index 603 7 * 4, 604 # start of value index 605 7 * 4 + entries_len * 8, 606 # size and offset of hash table, we don't use hash tables 607 0, keystart 608 609 ) 610 if PY3 and sys.version_info.minor > 1: # python 3.2 or superior 611 output += array.array("i", offsets).tobytes() 612 else: 613 output += array.array("i", offsets).tostring() 614 output += ids 615 output += strs 616 return output 617 618 def _encode(self, mixed): 619 """ 620 Encodes the given ``mixed`` argument with the file encoding if and 621 only if it's an unicode string and returns the encoded string. 622 """ 623 if isinstance(mixed, text_type): 624 mixed = mixed.encode(self.encoding) 625 return mixed 626# }}} 627# class POFile {{{ 628 629 630class POFile(_BaseFile): 631 """ 632 Po (or Pot) file reader/writer. 633 This class inherits the :class:`~polib._BaseFile` class and, by extension, 634 the python ``list`` type. 635 """ 636 637 def __unicode__(self): 638 """ 639 Returns the unicode representation of the po file. 640 """ 641 ret, headers = '', self.header.split('\n') 642 for header in headers: 643 if not len(header): 644 ret += "#\n" 645 elif header[:1] in [',', ':']: 646 ret += '#%s\n' % header 647 else: 648 ret += '# %s\n' % header 649 650 if not isinstance(ret, text_type): 651 ret = ret.decode(self.encoding) 652 653 return ret + _BaseFile.__unicode__(self) 654 655 def save_as_mofile(self, fpath): 656 """ 657 Saves the binary representation of the file to given ``fpath``. 658 659 Keyword argument: 660 661 ``fpath`` 662 string, full or relative path to the mo file. 663 """ 664 _BaseFile.save(self, fpath, 'to_binary') 665 666 def percent_translated(self): 667 """ 668 Convenience method that returns the percentage of translated 669 messages. 670 """ 671 total = len([e for e in self if not e.obsolete]) 672 if total == 0: 673 return 100 674 translated = len(self.translated_entries()) 675 return int(translated * 100 / float(total)) 676 677 def translated_entries(self): 678 """ 679 Convenience method that returns the list of translated entries. 680 """ 681 return [e for e in self if e.translated()] 682 683 def untranslated_entries(self): 684 """ 685 Convenience method that returns the list of untranslated entries. 686 """ 687 return [e for e in self if not e.translated() and not e.obsolete 688 and not e.fuzzy] 689 690 def fuzzy_entries(self): 691 """ 692 Convenience method that returns the list of fuzzy entries. 693 """ 694 return [e for e in self if e.fuzzy] 695 696 def obsolete_entries(self): 697 """ 698 Convenience method that returns the list of obsolete entries. 699 """ 700 return [e for e in self if e.obsolete] 701 702 def merge(self, refpot): 703 """ 704 Convenience method that merges the current pofile with the pot file 705 provided. It behaves exactly as the gettext msgmerge utility: 706 707 * comments of this file will be preserved, but extracted comments and 708 occurrences will be discarded; 709 * any translations or comments in the file will be discarded, however, 710 dot comments and file positions will be preserved; 711 * the fuzzy flags are preserved. 712 713 Keyword argument: 714 715 ``refpot`` 716 object POFile, the reference catalog. 717 """ 718 # Store entries in dict/set for faster access 719 self_entries = dict( 720 (entry.msgid_with_context, entry) for entry in self 721 ) 722 refpot_msgids = set(entry.msgid_with_context for entry in refpot) 723 # Merge entries that are in the refpot 724 for entry in refpot: 725 e = self_entries.get(entry.msgid_with_context) 726 if e is None: 727 e = POEntry() 728 self.append(e) 729 e.merge(entry) 730 # ok, now we must "obsolete" entries that are not in the refpot anymore 731 for entry in self: 732 if entry.msgid_with_context not in refpot_msgids: 733 entry.obsolete = True 734# }}} 735# class MOFile {{{ 736 737 738class MOFile(_BaseFile): 739 """ 740 Mo file reader/writer. 741 This class inherits the :class:`~polib._BaseFile` class and, by 742 extension, the python ``list`` type. 743 """ 744 MAGIC = 0x950412de 745 MAGIC_SWAPPED = 0xde120495 746 747 def __init__(self, *args, **kwargs): 748 """ 749 Constructor, accepts all keywords arguments accepted by 750 :class:`~polib._BaseFile` class. 751 """ 752 _BaseFile.__init__(self, *args, **kwargs) 753 self.magic_number = None 754 self.version = 0 755 756 def save_as_pofile(self, fpath): 757 """ 758 Saves the mofile as a pofile to ``fpath``. 759 760 Keyword argument: 761 762 ``fpath`` 763 string, full or relative path to the file. 764 """ 765 _BaseFile.save(self, fpath) 766 767 def save(self, fpath=None): 768 """ 769 Saves the mofile to ``fpath``. 770 771 Keyword argument: 772 773 ``fpath`` 774 string, full or relative path to the file. 775 """ 776 _BaseFile.save(self, fpath, 'to_binary') 777 778 def percent_translated(self): 779 """ 780 Convenience method to keep the same interface with POFile instances. 781 """ 782 return 100 783 784 def translated_entries(self): 785 """ 786 Convenience method to keep the same interface with POFile instances. 787 """ 788 return self 789 790 def untranslated_entries(self): 791 """ 792 Convenience method to keep the same interface with POFile instances. 793 """ 794 return [] 795 796 def fuzzy_entries(self): 797 """ 798 Convenience method to keep the same interface with POFile instances. 799 """ 800 return [] 801 802 def obsolete_entries(self): 803 """ 804 Convenience method to keep the same interface with POFile instances. 805 """ 806 return [] 807# }}} 808# class _BaseEntry {{{ 809 810 811class _BaseEntry(object): 812 """ 813 Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes. 814 This class should **not** be instantiated directly. 815 """ 816 817 def __init__(self, *args, **kwargs): 818 """ 819 Constructor, accepts the following keyword arguments: 820 821 ``msgid`` 822 string, the entry msgid. 823 824 ``msgstr`` 825 string, the entry msgstr. 826 827 ``msgid_plural`` 828 string, the entry msgid_plural. 829 830 ``msgstr_plural`` 831 list, the entry msgstr_plural lines. 832 833 ``msgctxt`` 834 string, the entry context (msgctxt). 835 836 ``obsolete`` 837 bool, whether the entry is "obsolete" or not. 838 839 ``encoding`` 840 string, the encoding to use, defaults to ``default_encoding`` 841 global variable (optional). 842 """ 843 self.msgid = kwargs.get('msgid', '') 844 self.msgstr = kwargs.get('msgstr', '') 845 self.msgid_plural = kwargs.get('msgid_plural', '') 846 self.msgstr_plural = kwargs.get('msgstr_plural', {}) 847 self.msgctxt = kwargs.get('msgctxt', None) 848 self.obsolete = kwargs.get('obsolete', False) 849 self.encoding = kwargs.get('encoding', default_encoding) 850 851 def __unicode__(self, wrapwidth=78): 852 """ 853 Returns the unicode representation of the entry. 854 """ 855 if self.obsolete: 856 delflag = '#~ ' 857 else: 858 delflag = '' 859 ret = [] 860 # write the msgctxt if any 861 if self.msgctxt is not None: 862 ret += self._str_field("msgctxt", delflag, "", self.msgctxt, 863 wrapwidth) 864 # write the msgid 865 ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth) 866 # write the msgid_plural if any 867 if self.msgid_plural: 868 ret += self._str_field("msgid_plural", delflag, "", 869 self.msgid_plural, wrapwidth) 870 if self.msgstr_plural: 871 # write the msgstr_plural if any 872 msgstrs = self.msgstr_plural 873 keys = list(msgstrs) 874 keys.sort() 875 for index in keys: 876 msgstr = msgstrs[index] 877 plural_index = '[%s]' % index 878 ret += self._str_field("msgstr", delflag, plural_index, msgstr, 879 wrapwidth) 880 else: 881 # otherwise write the msgstr 882 ret += self._str_field("msgstr", delflag, "", self.msgstr, 883 wrapwidth) 884 ret.append('') 885 ret = u('\n').join(ret) 886 return ret 887 888 if PY3: 889 def __str__(self): 890 return self.__unicode__() 891 else: 892 def __str__(self): 893 """ 894 Returns the string representation of the entry. 895 """ 896 return unicode(self).encode(self.encoding) 897 898 def __eq__(self, other): 899 return str(self) == str(other) 900 901 def _str_field(self, fieldname, delflag, plural_index, field, 902 wrapwidth=78): 903 lines = field.splitlines(True) 904 if len(lines) > 1: 905 lines = [''] + lines # start with initial empty line 906 else: 907 escaped_field = escape(field) 908 specialchars_count = 0 909 for c in ['\\', '\n', '\r', '\t', '"']: 910 specialchars_count += field.count(c) 911 # comparison must take into account fieldname length + one space 912 # + 2 quotes (eg. msgid "<string>") 913 flength = len(fieldname) + 3 914 if plural_index: 915 flength += len(plural_index) 916 real_wrapwidth = wrapwidth - flength + specialchars_count 917 if wrapwidth > 0 and len(field) > real_wrapwidth: 918 # Wrap the line but take field name into account 919 lines = [''] + [unescape(item) for item in wrap( 920 escaped_field, 921 wrapwidth - 2, # 2 for quotes "" 922 drop_whitespace=False, 923 break_long_words=False 924 )] 925 else: 926 lines = [field] 927 if fieldname.startswith('previous_'): 928 # quick and dirty trick to get the real field name 929 fieldname = fieldname[9:] 930 931 ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, 932 escape(lines.pop(0)))] 933 for line in lines: 934 ret.append('%s"%s"' % (delflag, escape(line))) 935 return ret 936# }}} 937# class POEntry {{{ 938 939 940class POEntry(_BaseEntry): 941 """ 942 Represents a po file entry. 943 """ 944 945 def __init__(self, *args, **kwargs): 946 """ 947 Constructor, accepts the following keyword arguments: 948 949 ``comment`` 950 string, the entry comment. 951 952 ``tcomment`` 953 string, the entry translator comment. 954 955 ``occurrences`` 956 list, the entry occurrences. 957 958 ``flags`` 959 list, the entry flags. 960 961 ``previous_msgctxt`` 962 string, the entry previous context. 963 964 ``previous_msgid`` 965 string, the entry previous msgid. 966 967 ``previous_msgid_plural`` 968 string, the entry previous msgid_plural. 969 970 ``linenum`` 971 integer, the line number of the entry 972 """ 973 _BaseEntry.__init__(self, *args, **kwargs) 974 self.comment = kwargs.get('comment', '') 975 self.tcomment = kwargs.get('tcomment', '') 976 self.occurrences = kwargs.get('occurrences', []) 977 self.flags = kwargs.get('flags', []) 978 self.previous_msgctxt = kwargs.get('previous_msgctxt', None) 979 self.previous_msgid = kwargs.get('previous_msgid', None) 980 self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) 981 self.linenum = kwargs.get('linenum', None) 982 983 def __unicode__(self, wrapwidth=78): 984 """ 985 Returns the unicode representation of the entry. 986 """ 987 ret = [] 988 # comments first, if any (with text wrapping as xgettext does) 989 if self.obsolete: 990 comments = [('tcomment', '# ')] 991 else: 992 comments = [('comment', '#. '), ('tcomment', '# ')] 993 for c in comments: 994 val = getattr(self, c[0]) 995 if val: 996 for comment in val.split('\n'): 997 if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth: 998 ret += wrap( 999 comment, 1000 wrapwidth, 1001 initial_indent=c[1], 1002 subsequent_indent=c[1], 1003 break_long_words=False 1004 ) 1005 else: 1006 ret.append('%s%s' % (c[1], comment)) 1007 1008 # occurrences (with text wrapping as xgettext does) 1009 if not self.obsolete and self.occurrences: 1010 filelist = [] 1011 for fpath, lineno in self.occurrences: 1012 if lineno: 1013 filelist.append('%s:%s' % (fpath, lineno)) 1014 else: 1015 filelist.append(fpath) 1016 filestr = ' '.join(filelist) 1017 if wrapwidth > 0 and len(filestr) + 3 > wrapwidth: 1018 # textwrap split words that contain hyphen, this is not 1019 # what we want for filenames, so the dirty hack is to 1020 # temporally replace hyphens with a char that a file cannot 1021 # contain, like "*" 1022 ret += [l.replace('*', '-') for l in wrap( 1023 filestr.replace('-', '*'), 1024 wrapwidth, 1025 initial_indent='#: ', 1026 subsequent_indent='#: ', 1027 break_long_words=False 1028 )] 1029 else: 1030 ret.append('#: ' + filestr) 1031 1032 # flags (TODO: wrapping ?) 1033 if self.flags: 1034 ret.append('#, %s' % ', '.join(self.flags)) 1035 1036 # previous context and previous msgid/msgid_plural 1037 fields = ['previous_msgctxt', 'previous_msgid', 1038 'previous_msgid_plural'] 1039 if self.obsolete: 1040 prefix = "#~| " 1041 else: 1042 prefix = "#| " 1043 for f in fields: 1044 val = getattr(self, f) 1045 if val: 1046 ret += self._str_field(f, prefix, "", val, wrapwidth) 1047 1048 ret.append(_BaseEntry.__unicode__(self, wrapwidth)) 1049 ret = u('\n').join(ret) 1050 return ret 1051 1052 def __cmp__(self, other): 1053 """ 1054 Called by comparison operations if rich comparison is not defined. 1055 """ 1056 # First: Obsolete test 1057 if self.obsolete != other.obsolete: 1058 if self.obsolete: 1059 return -1 1060 else: 1061 return 1 1062 # Work on a copy to protect original 1063 occ1 = sorted(self.occurrences[:]) 1064 occ2 = sorted(other.occurrences[:]) 1065 pos = 0 1066 if occ1 > occ2: 1067 return 1 1068 if occ1 < occ2: 1069 return -1 1070 # Compare context 1071 msgctxt = self.msgctxt or 0 1072 othermsgctxt = other.msgctxt or 0 1073 if msgctxt > othermsgctxt: 1074 return 1 1075 elif msgctxt < othermsgctxt: 1076 return -1 1077 # Compare msgid_plural 1078 msgid_plural = self.msgid_plural or 0 1079 othermsgid_plural = other.msgid_plural or 0 1080 if msgid_plural > othermsgid_plural: 1081 return 1 1082 elif msgid_plural < othermsgid_plural: 1083 return -1 1084 # Compare msgstr_plural 1085 msgstr_plural = self.msgstr_plural or 0 1086 othermsgstr_plural = other.msgstr_plural or 0 1087 if msgstr_plural > othermsgstr_plural: 1088 return 1 1089 elif msgstr_plural < othermsgstr_plural: 1090 return -1 1091 # Compare msgid 1092 if self.msgid > other.msgid: 1093 return 1 1094 elif self.msgid < other.msgid: 1095 return -1 1096 return 0 1097 # Compare msgstr 1098 if self.msgstr > other.msgstr: 1099 return 1 1100 elif self.msgstr < other.msgstr: 1101 return -1 1102 return 0 1103 1104 def __gt__(self, other): 1105 return self.__cmp__(other) > 0 1106 1107 def __lt__(self, other): 1108 return self.__cmp__(other) < 0 1109 1110 def __ge__(self, other): 1111 return self.__cmp__(other) >= 0 1112 1113 def __le__(self, other): 1114 return self.__cmp__(other) <= 0 1115 1116 def __eq__(self, other): 1117 return self.__cmp__(other) == 0 1118 1119 def __ne__(self, other): 1120 return self.__cmp__(other) != 0 1121 1122 def translated(self): 1123 """ 1124 Returns ``True`` if the entry has been translated or ``False`` 1125 otherwise. 1126 """ 1127 if self.obsolete or self.fuzzy: 1128 return False 1129 if self.msgstr != '': 1130 return True 1131 if self.msgstr_plural: 1132 for pos in self.msgstr_plural: 1133 if self.msgstr_plural[pos] == '': 1134 return False 1135 return True 1136 return False 1137 1138 def merge(self, other): 1139 """ 1140 Merge the current entry with the given pot entry. 1141 """ 1142 self.msgid = other.msgid 1143 self.msgctxt = other.msgctxt 1144 self.occurrences = other.occurrences 1145 self.comment = other.comment 1146 fuzzy = self.fuzzy 1147 self.flags = other.flags[:] # clone flags 1148 if fuzzy: 1149 self.flags.append('fuzzy') 1150 self.msgid_plural = other.msgid_plural 1151 self.obsolete = other.obsolete 1152 self.previous_msgctxt = other.previous_msgctxt 1153 self.previous_msgid = other.previous_msgid 1154 self.previous_msgid_plural = other.previous_msgid_plural 1155 if other.msgstr_plural: 1156 for pos in other.msgstr_plural: 1157 try: 1158 # keep existing translation at pos if any 1159 self.msgstr_plural[pos] 1160 except KeyError: 1161 self.msgstr_plural[pos] = '' 1162 1163 @property 1164 def fuzzy(self): 1165 return 'fuzzy' in self.flags 1166 1167 @property 1168 def msgid_with_context(self): 1169 if self.msgctxt: 1170 return '%s%s%s' % (self.msgctxt, "\x04", self.msgid) 1171 return self.msgid 1172 1173 def __hash__(self): 1174 return hash((self.msgid, self.msgstr)) 1175# }}} 1176# class MOEntry {{{ 1177 1178 1179class MOEntry(_BaseEntry): 1180 """ 1181 Represents a mo file entry. 1182 """ 1183 def __init__(self, *args, **kwargs): 1184 """ 1185 Constructor, accepts the following keyword arguments, 1186 for consistency with :class:`~polib.POEntry`: 1187 1188 ``comment`` 1189 ``tcomment`` 1190 ``occurrences`` 1191 ``flags`` 1192 ``previous_msgctxt`` 1193 ``previous_msgid`` 1194 ``previous_msgid_plural`` 1195 1196 Note: even though these keyword arguments are accepted, 1197 they hold no real meaning in the context of MO files 1198 and are simply ignored. 1199 """ 1200 _BaseEntry.__init__(self, *args, **kwargs) 1201 self.comment = '' 1202 self.tcomment = '' 1203 self.occurrences = [] 1204 self.flags = [] 1205 self.previous_msgctxt = None 1206 self.previous_msgid = None 1207 self.previous_msgid_plural = None 1208 1209 def __hash__(self): 1210 return hash((self.msgid, self.msgstr)) 1211 1212# }}} 1213# class _POFileParser {{{ 1214 1215 1216class _POFileParser(object): 1217 """ 1218 A finite state machine to parse efficiently and correctly po 1219 file format. 1220 """ 1221 1222 def __init__(self, pofile, *args, **kwargs): 1223 """ 1224 Constructor. 1225 1226 Keyword arguments: 1227 1228 ``pofile`` 1229 string, path to the po file or its content 1230 1231 ``encoding`` 1232 string, the encoding to use, defaults to ``default_encoding`` 1233 global variable (optional). 1234 1235 ``check_for_duplicates`` 1236 whether to check for duplicate entries when adding entries to the 1237 file (optional, default: ``False``). 1238 """ 1239 enc = kwargs.get('encoding', default_encoding) 1240 if _is_file(pofile): 1241 try: 1242 self.fhandle = io.open(pofile, 'rt', encoding=enc) 1243 except LookupError: 1244 enc = default_encoding 1245 self.fhandle = io.open(pofile, 'rt', encoding=enc) 1246 else: 1247 self.fhandle = pofile.splitlines() 1248 1249 klass = kwargs.get('klass') 1250 if klass is None: 1251 klass = POFile 1252 self.instance = klass( 1253 pofile=pofile, 1254 encoding=enc, 1255 check_for_duplicates=kwargs.get('check_for_duplicates', False) 1256 ) 1257 self.transitions = {} 1258 self.current_line = 0 1259 self.current_entry = POEntry(linenum=self.current_line) 1260 self.current_state = 'st' 1261 self.current_token = None 1262 # two memo flags used in handlers 1263 self.msgstr_index = 0 1264 self.entry_obsolete = 0 1265 # Configure the state machine, by adding transitions. 1266 # Signification of symbols: 1267 # * ST: Beginning of the file (start) 1268 # * HE: Header 1269 # * TC: a translation comment 1270 # * GC: a generated comment 1271 # * OC: a file/line occurrence 1272 # * FL: a flags line 1273 # * CT: a message context 1274 # * PC: a previous msgctxt 1275 # * PM: a previous msgid 1276 # * PP: a previous msgid_plural 1277 # * MI: a msgid 1278 # * MP: a msgid plural 1279 # * MS: a msgstr 1280 # * MX: a msgstr plural 1281 # * MC: a msgid or msgstr continuation line 1282 all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc', 1283 'ms', 'mp', 'mx', 'mi'] 1284 1285 self.add('tc', ['st', 'he'], 'he') 1286 self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 1287 'mp', 'mx', 'mi'], 'tc') 1288 self.add('gc', all, 'gc') 1289 self.add('oc', all, 'oc') 1290 self.add('fl', all, 'fl') 1291 self.add('pc', all, 'pc') 1292 self.add('pm', all, 'pm') 1293 self.add('pp', all, 'pp') 1294 self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 1295 'pp', 'ms', 'mx'], 'ct') 1296 self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 1297 'pm', 'pp', 'ms', 'mx'], 'mi') 1298 self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp') 1299 self.add('ms', ['mi', 'mp', 'tc'], 'ms') 1300 self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx') 1301 self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc') 1302 1303 def parse(self): 1304 """ 1305 Run the state machine, parse the file line by line and call process() 1306 with the current matched symbol. 1307 """ 1308 1309 keywords = { 1310 'msgctxt': 'ct', 1311 'msgid': 'mi', 1312 'msgstr': 'ms', 1313 'msgid_plural': 'mp', 1314 } 1315 prev_keywords = { 1316 'msgid_plural': 'pp', 1317 'msgid': 'pm', 1318 'msgctxt': 'pc', 1319 } 1320 tokens = [] 1321 fpath = '%s ' % self.instance.fpath if self.instance.fpath else '' 1322 for line in self.fhandle: 1323 self.current_line += 1 1324 line = line.strip() 1325 if line == '': 1326 continue 1327 1328 tokens = line.split(None, 2) 1329 nb_tokens = len(tokens) 1330 1331 if tokens[0] == '#~|': 1332 continue 1333 1334 if tokens[0] == '#~' and nb_tokens > 1: 1335 line = line[3:].strip() 1336 tokens = tokens[1:] 1337 nb_tokens -= 1 1338 self.entry_obsolete = 1 1339 else: 1340 self.entry_obsolete = 0 1341 1342 # Take care of keywords like 1343 # msgid, msgid_plural, msgctxt & msgstr. 1344 if tokens[0] in keywords and nb_tokens > 1: 1345 line = line[len(tokens[0]):].lstrip() 1346 if re.search(r'([^\\]|^)"', line[1:-1]): 1347 raise IOError('Syntax error in po file %s(line %s): ' 1348 'unescaped double quote found' % 1349 (fpath, self.current_line)) 1350 self.current_token = line 1351 self.process(keywords[tokens[0]]) 1352 continue 1353 1354 self.current_token = line 1355 1356 if tokens[0] == '#:': 1357 if nb_tokens <= 1: 1358 continue 1359 # we are on a occurrences line 1360 self.process('oc') 1361 1362 elif line[:1] == '"': 1363 # we are on a continuation line 1364 if re.search(r'([^\\]|^)"', line[1:-1]): 1365 raise IOError('Syntax error in po file %s(line %s): ' 1366 'unescaped double quote found' % 1367 (fpath, self.current_line)) 1368 self.process('mc') 1369 1370 elif line[:7] == 'msgstr[': 1371 # we are on a msgstr plural 1372 self.process('mx') 1373 1374 elif tokens[0] == '#,': 1375 if nb_tokens <= 1: 1376 continue 1377 # we are on a flags line 1378 self.process('fl') 1379 1380 elif tokens[0] == '#' or tokens[0].startswith('##'): 1381 if line == '#': 1382 line += ' ' 1383 # we are on a translator comment line 1384 self.process('tc') 1385 1386 elif tokens[0] == '#.': 1387 if nb_tokens <= 1: 1388 continue 1389 # we are on a generated comment line 1390 self.process('gc') 1391 1392 elif tokens[0] == '#|': 1393 if nb_tokens <= 1: 1394 raise IOError('Syntax error in po file %s(line %s)' % 1395 (fpath, self.current_line)) 1396 1397 # Remove the marker and any whitespace right after that. 1398 line = line[2:].lstrip() 1399 self.current_token = line 1400 1401 if tokens[1].startswith('"'): 1402 # Continuation of previous metadata. 1403 self.process('mc') 1404 continue 1405 1406 if nb_tokens == 2: 1407 # Invalid continuation line. 1408 raise IOError('Syntax error in po file %s(line %s): ' 1409 'invalid continuation line' % 1410 (fpath, self.current_line)) 1411 1412 # we are on a "previous translation" comment line, 1413 if tokens[1] not in prev_keywords: 1414 # Unknown keyword in previous translation comment. 1415 raise IOError('Syntax error in po file %s(line %s): ' 1416 'unknown keyword %s' % 1417 (fpath, self.current_line, 1418 tokens[1])) 1419 1420 # Remove the keyword and any whitespace 1421 # between it and the starting quote. 1422 line = line[len(tokens[1]):].lstrip() 1423 self.current_token = line 1424 self.process(prev_keywords[tokens[1]]) 1425 1426 else: 1427 raise IOError('Syntax error in po file %s(line %s)' % 1428 (fpath, self.current_line)) 1429 1430 if self.current_entry and len(tokens) > 0 and \ 1431 not tokens[0].startswith('#'): 1432 # since entries are added when another entry is found, we must add 1433 # the last entry here (only if there are lines). Trailing comments 1434 # are ignored 1435 self.instance.append(self.current_entry) 1436 1437 # before returning the instance, check if there's metadata and if 1438 # so extract it in a dict 1439 metadataentry = self.instance.find('') 1440 if metadataentry: # metadata found 1441 # remove the entry 1442 self.instance.remove(metadataentry) 1443 self.instance.metadata_is_fuzzy = metadataentry.flags 1444 key = None 1445 for msg in metadataentry.msgstr.splitlines(): 1446 try: 1447 key, val = msg.split(':', 1) 1448 self.instance.metadata[key] = val.strip() 1449 except (ValueError, KeyError): 1450 if key is not None: 1451 self.instance.metadata[key] += '\n' + msg.strip() 1452 # close opened file 1453 if not isinstance(self.fhandle, list): # must be file 1454 self.fhandle.close() 1455 return self.instance 1456 1457 def add(self, symbol, states, next_state): 1458 """ 1459 Add a transition to the state machine. 1460 1461 Keywords arguments: 1462 1463 ``symbol`` 1464 string, the matched token (two chars symbol). 1465 1466 ``states`` 1467 list, a list of states (two chars symbols). 1468 1469 ``next_state`` 1470 the next state the fsm will have after the action. 1471 """ 1472 for state in states: 1473 action = getattr(self, 'handle_%s' % next_state) 1474 self.transitions[(symbol, state)] = (action, next_state) 1475 1476 def process(self, symbol): 1477 """ 1478 Process the transition corresponding to the current state and the 1479 symbol provided. 1480 1481 Keywords arguments: 1482 1483 ``symbol`` 1484 string, the matched token (two chars symbol). 1485 1486 ``linenum`` 1487 integer, the current line number of the parsed file. 1488 """ 1489 try: 1490 (action, state) = self.transitions[(symbol, self.current_state)] 1491 if action(): 1492 self.current_state = state 1493 except Exception: 1494 raise IOError('Syntax error in po file (line %s)' % 1495 self.current_line) 1496 1497 # state handlers 1498 1499 def handle_he(self): 1500 """Handle a header comment.""" 1501 if self.instance.header != '': 1502 self.instance.header += '\n' 1503 self.instance.header += self.current_token[2:] 1504 return 1 1505 1506 def handle_tc(self): 1507 """Handle a translator comment.""" 1508 if self.current_state in ['mc', 'ms', 'mx']: 1509 self.instance.append(self.current_entry) 1510 self.current_entry = POEntry(linenum=self.current_line) 1511 if self.current_entry.tcomment != '': 1512 self.current_entry.tcomment += '\n' 1513 tcomment = self.current_token.lstrip('#') 1514 if tcomment.startswith(' '): 1515 tcomment = tcomment[1:] 1516 self.current_entry.tcomment += tcomment 1517 return True 1518 1519 def handle_gc(self): 1520 """Handle a generated comment.""" 1521 if self.current_state in ['mc', 'ms', 'mx']: 1522 self.instance.append(self.current_entry) 1523 self.current_entry = POEntry(linenum=self.current_line) 1524 if self.current_entry.comment != '': 1525 self.current_entry.comment += '\n' 1526 self.current_entry.comment += self.current_token[3:] 1527 return True 1528 1529 def handle_oc(self): 1530 """Handle a file:num occurrence.""" 1531 if self.current_state in ['mc', 'ms', 'mx']: 1532 self.instance.append(self.current_entry) 1533 self.current_entry = POEntry(linenum=self.current_line) 1534 occurrences = self.current_token[3:].split() 1535 for occurrence in occurrences: 1536 if occurrence != '': 1537 try: 1538 fil, line = occurrence.rsplit(':', 1) 1539 if not line.isdigit(): 1540 fil = occurrence 1541 line = '' 1542 self.current_entry.occurrences.append((fil, line)) 1543 except (ValueError, AttributeError): 1544 self.current_entry.occurrences.append((occurrence, '')) 1545 return True 1546 1547 def handle_fl(self): 1548 """Handle a flags line.""" 1549 if self.current_state in ['mc', 'ms', 'mx']: 1550 self.instance.append(self.current_entry) 1551 self.current_entry = POEntry(linenum=self.current_line) 1552 self.current_entry.flags += [c.strip() for c in 1553 self.current_token[3:].split(',')] 1554 return True 1555 1556 def handle_pp(self): 1557 """Handle a previous msgid_plural line.""" 1558 if self.current_state in ['mc', 'ms', 'mx']: 1559 self.instance.append(self.current_entry) 1560 self.current_entry = POEntry(linenum=self.current_line) 1561 self.current_entry.previous_msgid_plural = \ 1562 unescape(self.current_token[1:-1]) 1563 return True 1564 1565 def handle_pm(self): 1566 """Handle a previous msgid line.""" 1567 if self.current_state in ['mc', 'ms', 'mx']: 1568 self.instance.append(self.current_entry) 1569 self.current_entry = POEntry(linenum=self.current_line) 1570 self.current_entry.previous_msgid = \ 1571 unescape(self.current_token[1:-1]) 1572 return True 1573 1574 def handle_pc(self): 1575 """Handle a previous msgctxt line.""" 1576 if self.current_state in ['mc', 'ms', 'mx']: 1577 self.instance.append(self.current_entry) 1578 self.current_entry = POEntry(linenum=self.current_line) 1579 self.current_entry.previous_msgctxt = \ 1580 unescape(self.current_token[1:-1]) 1581 return True 1582 1583 def handle_ct(self): 1584 """Handle a msgctxt.""" 1585 if self.current_state in ['mc', 'ms', 'mx']: 1586 self.instance.append(self.current_entry) 1587 self.current_entry = POEntry(linenum=self.current_line) 1588 self.current_entry.msgctxt = unescape(self.current_token[1:-1]) 1589 return True 1590 1591 def handle_mi(self): 1592 """Handle a msgid.""" 1593 if self.current_state in ['mc', 'ms', 'mx']: 1594 self.instance.append(self.current_entry) 1595 self.current_entry = POEntry(linenum=self.current_line) 1596 self.current_entry.obsolete = self.entry_obsolete 1597 self.current_entry.msgid = unescape(self.current_token[1:-1]) 1598 return True 1599 1600 def handle_mp(self): 1601 """Handle a msgid plural.""" 1602 self.current_entry.msgid_plural = unescape(self.current_token[1:-1]) 1603 return True 1604 1605 def handle_ms(self): 1606 """Handle a msgstr.""" 1607 self.current_entry.msgstr = unescape(self.current_token[1:-1]) 1608 return True 1609 1610 def handle_mx(self): 1611 """Handle a msgstr plural.""" 1612 index = self.current_token[7] 1613 value = self.current_token[self.current_token.find('"') + 1:-1] 1614 self.current_entry.msgstr_plural[int(index)] = unescape(value) 1615 self.msgstr_index = int(index) 1616 return True 1617 1618 def handle_mc(self): 1619 """Handle a msgid or msgstr continuation line.""" 1620 token = unescape(self.current_token[1:-1]) 1621 if self.current_state == 'ct': 1622 self.current_entry.msgctxt += token 1623 elif self.current_state == 'mi': 1624 self.current_entry.msgid += token 1625 elif self.current_state == 'mp': 1626 self.current_entry.msgid_plural += token 1627 elif self.current_state == 'ms': 1628 self.current_entry.msgstr += token 1629 elif self.current_state == 'mx': 1630 self.current_entry.msgstr_plural[self.msgstr_index] += token 1631 elif self.current_state == 'pp': 1632 self.current_entry.previous_msgid_plural += token 1633 elif self.current_state == 'pm': 1634 self.current_entry.previous_msgid += token 1635 elif self.current_state == 'pc': 1636 self.current_entry.previous_msgctxt += token 1637 # don't change the current state 1638 return False 1639# }}} 1640# class _MOFileParser {{{ 1641 1642 1643class _MOFileParser(object): 1644 """ 1645 A class to parse binary mo files. 1646 """ 1647 1648 def __init__(self, mofile, *args, **kwargs): 1649 """ 1650 Constructor. 1651 1652 Keyword arguments: 1653 1654 ``mofile`` 1655 string, path to the mo file or its content 1656 1657 ``encoding`` 1658 string, the encoding to use, defaults to ``default_encoding`` 1659 global variable (optional). 1660 1661 ``check_for_duplicates`` 1662 whether to check for duplicate entries when adding entries to the 1663 file (optional, default: ``False``). 1664 """ 1665 self.fhandle = open(mofile, 'rb') 1666 1667 klass = kwargs.get('klass') 1668 if klass is None: 1669 klass = MOFile 1670 self.instance = klass( 1671 fpath=mofile, 1672 encoding=kwargs.get('encoding', default_encoding), 1673 check_for_duplicates=kwargs.get('check_for_duplicates', False) 1674 ) 1675 1676 def __del__(self): 1677 """ 1678 Make sure the file is closed, this prevents warnings on unclosed file 1679 when running tests with python >= 3.2. 1680 """ 1681 if self.fhandle: 1682 self.fhandle.close() 1683 1684 def parse(self): 1685 """ 1686 Build the instance with the file handle provided in the 1687 constructor. 1688 """ 1689 # parse magic number 1690 magic_number = self._readbinary('<I', 4) 1691 if magic_number == MOFile.MAGIC: 1692 ii = '<II' 1693 elif magic_number == MOFile.MAGIC_SWAPPED: 1694 ii = '>II' 1695 else: 1696 raise IOError('Invalid mo file, magic number is incorrect !') 1697 self.instance.magic_number = magic_number 1698 # parse the version number and the number of strings 1699 version, numofstrings = self._readbinary(ii, 8) 1700 # from MO file format specs: "A program seeing an unexpected major 1701 # revision number should stop reading the MO file entirely" 1702 if version >> 16 not in (0, 1): 1703 raise IOError('Invalid mo file, unexpected major revision number') 1704 self.instance.version = version 1705 # original strings and translation strings hash table offset 1706 msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8) 1707 # move to msgid hash table and read length and offset of msgids 1708 self.fhandle.seek(msgids_hash_offset) 1709 msgids_index = [] 1710 for i in range(numofstrings): 1711 msgids_index.append(self._readbinary(ii, 8)) 1712 # move to msgstr hash table and read length and offset of msgstrs 1713 self.fhandle.seek(msgstrs_hash_offset) 1714 msgstrs_index = [] 1715 for i in range(numofstrings): 1716 msgstrs_index.append(self._readbinary(ii, 8)) 1717 # build entries 1718 encoding = self.instance.encoding 1719 for i in range(numofstrings): 1720 self.fhandle.seek(msgids_index[i][1]) 1721 msgid = self.fhandle.read(msgids_index[i][0]) 1722 1723 self.fhandle.seek(msgstrs_index[i][1]) 1724 msgstr = self.fhandle.read(msgstrs_index[i][0]) 1725 if i == 0 and not msgid: # metadata 1726 raw_metadata, metadata = msgstr.split(b('\n')), {} 1727 for line in raw_metadata: 1728 tokens = line.split(b(':'), 1) 1729 if tokens[0] != b(''): 1730 try: 1731 k = tokens[0].decode(encoding) 1732 v = tokens[1].decode(encoding) 1733 metadata[k] = v.strip() 1734 except IndexError: 1735 metadata[k] = u('') 1736 self.instance.metadata = metadata 1737 continue 1738 # test if we have a plural entry 1739 msgid_tokens = msgid.split(b('\0')) 1740 if len(msgid_tokens) > 1: 1741 entry = self._build_entry( 1742 msgid=msgid_tokens[0], 1743 msgid_plural=msgid_tokens[1], 1744 msgstr_plural=dict((k, v) for k, v in 1745 enumerate(msgstr.split(b('\0')))) 1746 ) 1747 else: 1748 entry = self._build_entry(msgid=msgid, msgstr=msgstr) 1749 self.instance.append(entry) 1750 # close opened file 1751 self.fhandle.close() 1752 return self.instance 1753 1754 def _build_entry(self, msgid, msgstr=None, msgid_plural=None, 1755 msgstr_plural=None): 1756 msgctxt_msgid = msgid.split(b('\x04')) 1757 encoding = self.instance.encoding 1758 if len(msgctxt_msgid) > 1: 1759 kwargs = { 1760 'msgctxt': msgctxt_msgid[0].decode(encoding), 1761 'msgid': msgctxt_msgid[1].decode(encoding), 1762 } 1763 else: 1764 kwargs = {'msgid': msgid.decode(encoding)} 1765 if msgstr: 1766 kwargs['msgstr'] = msgstr.decode(encoding) 1767 if msgid_plural: 1768 kwargs['msgid_plural'] = msgid_plural.decode(encoding) 1769 if msgstr_plural: 1770 for k in msgstr_plural: 1771 msgstr_plural[k] = msgstr_plural[k].decode(encoding) 1772 kwargs['msgstr_plural'] = msgstr_plural 1773 return MOEntry(**kwargs) 1774 1775 def _readbinary(self, fmt, numbytes): 1776 """ 1777 Private method that unpack n bytes of data using format <fmt>. 1778 It returns a tuple or a mixed value if the tuple length is 1. 1779 """ 1780 bytes = self.fhandle.read(numbytes) 1781 tup = struct.unpack(fmt, bytes) 1782 if len(tup) == 1: 1783 return tup[0] 1784 return tup 1785# }}} 1786# class TextWrapper {{{ 1787 1788 1789class TextWrapper(textwrap.TextWrapper): 1790 """ 1791 Subclass of textwrap.TextWrapper that backport the 1792 drop_whitespace option. 1793 """ 1794 def __init__(self, *args, **kwargs): 1795 drop_whitespace = kwargs.pop('drop_whitespace', True) 1796 textwrap.TextWrapper.__init__(self, *args, **kwargs) 1797 self.drop_whitespace = drop_whitespace 1798 1799 def _wrap_chunks(self, chunks): 1800 """_wrap_chunks(chunks : [string]) -> [string] 1801 1802 Wrap a sequence of text chunks and return a list of lines of 1803 length 'self.width' or less. (If 'break_long_words' is false, 1804 some lines may be longer than this.) Chunks correspond roughly 1805 to words and the whitespace between them: each chunk is 1806 indivisible (modulo 'break_long_words'), but a line break can 1807 come between any two chunks. Chunks should not have internal 1808 whitespace; ie. a chunk is either all whitespace or a "word". 1809 Whitespace chunks will be removed from the beginning and end of 1810 lines, but apart from that whitespace is preserved. 1811 """ 1812 lines = [] 1813 if self.width <= 0: 1814 raise ValueError("invalid width %r (must be > 0)" % self.width) 1815 1816 # Arrange in reverse order so items can be efficiently popped 1817 # from a stack of chucks. 1818 chunks.reverse() 1819 1820 while chunks: 1821 1822 # Start the list of chunks that will make up the current line. 1823 # cur_len is just the length of all the chunks in cur_line. 1824 cur_line = [] 1825 cur_len = 0 1826 1827 # Figure out which static string will prefix this line. 1828 if lines: 1829 indent = self.subsequent_indent 1830 else: 1831 indent = self.initial_indent 1832 1833 # Maximum width for this line. 1834 width = self.width - len(indent) 1835 1836 # First chunk on line is whitespace -- drop it, unless this 1837 # is the very beginning of the text (ie. no lines started yet). 1838 if self.drop_whitespace and chunks[-1].strip() == '' and lines: 1839 del chunks[-1] 1840 1841 while chunks: 1842 length = len(chunks[-1]) 1843 1844 # Can at least squeeze this chunk onto the current line. 1845 if cur_len + length <= width: 1846 cur_line.append(chunks.pop()) 1847 cur_len += length 1848 1849 # Nope, this line is full. 1850 else: 1851 break 1852 1853 # The current line is full, and the next chunk is too big to 1854 # fit on *any* line (not just this one). 1855 if chunks and len(chunks[-1]) > width: 1856 self._handle_long_word(chunks, cur_line, cur_len, width) 1857 1858 # If the last chunk on this line is all whitespace, drop it. 1859 if self.drop_whitespace and cur_line and not cur_line[-1].strip(): 1860 del cur_line[-1] 1861 1862 # Convert current line back to a string and store it in list 1863 # of all lines (return value). 1864 if cur_line: 1865 lines.append(indent + ''.join(cur_line)) 1866 1867 return lines 1868# }}} 1869# function wrap() {{{ 1870 1871 1872def wrap(text, width=70, **kwargs): 1873 """ 1874 Wrap a single paragraph of text, returning a list of wrapped lines. 1875 """ 1876 if sys.version_info < (2, 6): 1877 return TextWrapper(width=width, **kwargs).wrap(text) 1878 return textwrap.wrap(text, width=width, **kwargs) 1879 1880# }}} 1881