1# -* coding: utf-8 -*-
2#
3# License: MIT (see LICENSE file provided)
4# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
58
6"""
7**polib** allows you to manipulate, create, modify gettext files (pot, po and
8mo files).  You can load existing files, iterate through it's entries, add,
9modify entries, comments or metadata, etc. or create new po files from scratch.
10
11**polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
12:func:`~polib.mofile` convenience functions.
13"""
14
15__author__ = 'David Jean Louis <izimobil@gmail.com>'
16__version__ = '1.0.8'
17__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
18           'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
19
20import array
21import codecs
22import os
23import re
24import struct
25import sys
26import textwrap
27import binascii
28
29try:
30    import io
31except ImportError:
32    # replacement of io.open() for python < 2.6
33    # we use codecs instead
34    class io(object):
35        @staticmethod
36        def open(fpath, mode='r', encoding=None):
37            return codecs.open(fpath, mode, encoding)
38
39
40# the default encoding to use when encoding cannot be detected
41default_encoding = 'utf-8'
42
43# python 2/3 compatibility helpers {{{
44
45
46if sys.version_info[:2] < (3, 0):
47    PY3 = False
48    text_type = unicode
49
50    def b(s):
51        return s
52
53    def u(s):
54        return unicode(s, "unicode_escape")
55
56else:
57    PY3 = True
58    text_type = str
59
60    def b(s):
61        return s.encode("latin-1")
62
63    def u(s):
64        return s
65# }}}
66# _pofile_or_mofile {{{
67
68
69def _pofile_or_mofile(f, type, **kwargs):
70    """
71    Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
72    honor the DRY concept.
73    """
74    # get the file encoding
75    enc = kwargs.get('encoding')
76    if enc is None:
77        enc = detect_encoding(f, type == 'mofile')
78
79    # parse the file
80    kls = type == 'pofile' and _POFileParser or _MOFileParser
81    parser = kls(
82        f,
83        encoding=enc,
84        check_for_duplicates=kwargs.get('check_for_duplicates', False),
85        klass=kwargs.get('klass')
86    )
87    instance = parser.parse()
88    instance.wrapwidth = kwargs.get('wrapwidth', 78)
89    return instance
90# }}}
91# _is_file {{{
92
93
94def _is_file(filename_or_contents):
95    """
96    Safely returns the value of os.path.exists(filename_or_contents).
97
98    Arguments:
99
100    ``filename_or_contents``
101        either a filename, or a string holding the contents of some file.
102        In the latter case, this function will always return False.
103    """
104    try:
105        return os.path.exists(filename_or_contents)
106    except (ValueError, UnicodeEncodeError):
107        return False
108# }}}
109# function pofile() {{{
110
111
112def pofile(pofile, **kwargs):
113    """
114    Convenience function that parses the po or pot file ``pofile`` and returns
115    a :class:`~polib.POFile` instance.
116
117    Arguments:
118
119    ``pofile``
120        string, full or relative path to the po/pot file or its content (data).
121
122    ``wrapwidth``
123        integer, the wrap width, only useful when the ``-w`` option was passed
124        to xgettext (optional, default: ``78``).
125
126    ``encoding``
127        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
128        encoding will be auto-detected).
129
130    ``check_for_duplicates``
131        whether to check for duplicate entries when adding entries to the
132        file (optional, default: ``False``).
133
134    ``klass``
135        class which is used to instantiate the return value (optional,
136        default: ``None``, the return value with be a :class:`~polib.POFile`
137        instance).
138    """
139    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
140# }}}
141# function mofile() {{{
142
143
144def mofile(mofile, **kwargs):
145    """
146    Convenience function that parses the mo file ``mofile`` and returns a
147    :class:`~polib.MOFile` instance.
148
149    Arguments:
150
151    ``mofile``
152        string, full or relative path to the mo file or its content (data).
153
154    ``wrapwidth``
155        integer, the wrap width, only useful when the ``-w`` option was passed
156        to xgettext to generate the po file that was used to format the mo file
157        (optional, default: ``78``).
158
159    ``encoding``
160        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
161        encoding will be auto-detected).
162
163    ``check_for_duplicates``
164        whether to check for duplicate entries when adding entries to the
165        file (optional, default: ``False``).
166
167    ``klass``
168        class which is used to instantiate the return value (optional,
169        default: ``None``, the return value with be a :class:`~polib.POFile`
170        instance).
171    """
172    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
173# }}}
174# function detect_encoding() {{{
175
176
177def detect_encoding(file, binary_mode=False):
178    """
179    Try to detect the encoding used by the ``file``. The ``file`` argument can
180    be a PO or MO file path or a string containing the contents of the file.
181    If the encoding cannot be detected, the function will return the value of
182    ``default_encoding``.
183
184    Arguments:
185
186    ``file``
187        string, full or relative path to the po/mo file or its content.
188
189    ``binary_mode``
190        boolean, set this to True if ``file`` is a mo file.
191    """
192    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
193    rxt = re.compile(u(PATTERN))
194    rxb = re.compile(b(PATTERN))
195
196    def charset_exists(charset):
197        """Check whether ``charset`` is valid or not."""
198        try:
199            codecs.lookup(charset)
200        except LookupError:
201            return False
202        return True
203
204    if not _is_file(file):
205        match = rxt.search(file)
206        if match:
207            enc = match.group(1).strip()
208            if charset_exists(enc):
209                return enc
210    else:
211        # For PY3, always treat as binary
212        if binary_mode or PY3:
213            mode = 'rb'
214            rx = rxb
215        else:
216            mode = 'r'
217            rx = rxt
218        f = open(file, mode)
219        for l in f.readlines():
220            match = rx.search(l)
221            if match:
222                f.close()
223                enc = match.group(1).strip()
224                if not isinstance(enc, text_type):
225                    enc = enc.decode('utf-8')
226                if charset_exists(enc):
227                    return enc
228        f.close()
229    return default_encoding
230# }}}
231# function escape() {{{
232
233
234def escape(st):
235    """
236    Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
237    the given string ``st`` and returns it.
238    """
239    return st.replace('\\', r'\\')\
240             .replace('\t', r'\t')\
241             .replace('\r', r'\r')\
242             .replace('\n', r'\n')\
243             .replace('\"', r'\"')
244# }}}
245# function unescape() {{{
246
247
248def unescape(st):
249    """
250    Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
251    the given string ``st`` and returns it.
252    """
253    def unescape_repl(m):
254        m = m.group(1)
255        if m == 'n':
256            return '\n'
257        if m == 't':
258            return '\t'
259        if m == 'r':
260            return '\r'
261        if m == '\\':
262            return '\\'
263        return m  # handles escaped double quote
264    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
265# }}}
266# function natural_sort() {{{
267
268
269def natural_sort(lst):
270    """
271    Sort naturally the given list.
272    Credits: http://stackoverflow.com/a/4836734
273    """
274    convert = lambda text: int(text) if text.isdigit() else text.lower()
275    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
276    return sorted(lst, key = alphanum_key)
277# }}}
278# class _BaseFile {{{
279
280
281class _BaseFile(list):
282    """
283    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
284    classes. This class should **not** be instantiated directly.
285    """
286
287    def __init__(self, *args, **kwargs):
288        """
289        Constructor, accepts the following keyword arguments:
290
291        ``pofile``
292            string, the path to the po or mo file, or its content as a string.
293
294        ``wrapwidth``
295            integer, the wrap width, only useful when the ``-w`` option was
296            passed to xgettext (optional, default: ``78``).
297
298        ``encoding``
299            string, the encoding to use, defaults to ``default_encoding``
300            global variable (optional).
301
302        ``check_for_duplicates``
303            whether to check for duplicate entries when adding entries to the
304            file, (optional, default: ``False``).
305        """
306        list.__init__(self)
307        # the opened file handle
308        pofile = kwargs.get('pofile', None)
309        if pofile and _is_file(pofile):
310            self.fpath = pofile
311        else:
312            self.fpath = kwargs.get('fpath')
313        # the width at which lines should be wrapped
314        self.wrapwidth = kwargs.get('wrapwidth', 78)
315        # the file encoding
316        self.encoding = kwargs.get('encoding', default_encoding)
317        # whether to check for duplicate entries or not
318        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
319        # header
320        self.header = ''
321        # both po and mo files have metadata
322        self.metadata = {}
323        self.metadata_is_fuzzy = 0
324
325    def __unicode__(self):
326        """
327        Returns the unicode representation of the file.
328        """
329        ret = []
330        entries = [self.metadata_as_entry()] + \
331                  [e for e in self if not e.obsolete]
332        for entry in entries:
333            ret.append(entry.__unicode__(self.wrapwidth))
334        for entry in self.obsolete_entries():
335            ret.append(entry.__unicode__(self.wrapwidth))
336        ret = u('\n').join(ret)
337
338        assert isinstance(ret, text_type)
339        #if type(ret) != text_type:
340        #    return unicode(ret, self.encoding)
341        return ret
342
343    if PY3:
344        def __str__(self):
345            return self.__unicode__()
346    else:
347        def __str__(self):
348            """
349            Returns the string representation of the file.
350            """
351            return unicode(self).encode(self.encoding)
352
353    def __contains__(self, entry):
354        """
355        Overridden ``list`` method to implement the membership test (in and
356        not in).
357        The method considers that an entry is in the file if it finds an entry
358        that has the same msgid (the test is **case sensitive**) and the same
359        msgctxt (or none for both entries).
360
361        Argument:
362
363        ``entry``
364            an instance of :class:`~polib._BaseEntry`.
365        """
366        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
367            is not None
368
369    def __eq__(self, other):
370        return str(self) == str(other)
371
372    def append(self, entry):
373        """
374        Overridden method to check for duplicates entries, if a user tries to
375        add an entry that is already in the file, the method will raise a
376        ``ValueError`` exception.
377
378        Argument:
379
380        ``entry``
381            an instance of :class:`~polib._BaseEntry`.
382        """
383        # check_for_duplicates may not be defined (yet) when unpickling.
384        # But if pickling, we never want to check for duplicates anyway.
385        if getattr(self, 'check_for_duplicates', False) and entry in self:
386            raise ValueError('Entry "%s" already exists' % entry.msgid)
387        super(_BaseFile, self).append(entry)
388
389    def insert(self, index, entry):
390        """
391        Overridden method to check for duplicates entries, if a user tries to
392        add an entry that is already in the file, the method will raise a
393        ``ValueError`` exception.
394
395        Arguments:
396
397        ``index``
398            index at which the entry should be inserted.
399
400        ``entry``
401            an instance of :class:`~polib._BaseEntry`.
402        """
403        if self.check_for_duplicates and entry in self:
404            raise ValueError('Entry "%s" already exists' % entry.msgid)
405        super(_BaseFile, self).insert(index, entry)
406
407    def metadata_as_entry(self):
408        """
409        Returns the file metadata as a :class:`~polib.POFile` instance.
410        """
411        e = POEntry(msgid='')
412        mdata = self.ordered_metadata()
413        if mdata:
414            strs = []
415            for name, value in mdata:
416                # Strip whitespace off each line in a multi-line entry
417                strs.append('%s: %s' % (name, value))
418            e.msgstr = '\n'.join(strs) + '\n'
419        if self.metadata_is_fuzzy:
420            e.flags.append('fuzzy')
421        return e
422
423    def save(self, fpath=None, repr_method='__unicode__'):
424        """
425        Saves the po file to ``fpath``.
426        If it is an existing file and no ``fpath`` is provided, then the
427        existing file is rewritten with the modified data.
428
429        Keyword arguments:
430
431        ``fpath``
432            string, full or relative path to the file.
433
434        ``repr_method``
435            string, the method to use for output.
436        """
437        if self.fpath is None and fpath is None:
438            raise IOError('You must provide a file path to save() method')
439        contents = getattr(self, repr_method)()
440        if fpath is None:
441            fpath = self.fpath
442        if repr_method == 'to_binary':
443            fhandle = open(fpath, 'wb')
444        else:
445            fhandle = io.open(fpath, 'w', encoding=self.encoding)
446            if not isinstance(contents, text_type):
447                contents = contents.decode(self.encoding)
448        fhandle.write(contents)
449        fhandle.close()
450        # set the file path if not set
451        if self.fpath is None and fpath:
452            self.fpath = fpath
453
454    def find(self, st, by='msgid', include_obsolete_entries=False,
455             msgctxt=False):
456        """
457        Find the entry which msgid (or property identified by the ``by``
458        argument) matches the string ``st``.
459
460        Keyword arguments:
461
462        ``st``
463            string, the string to search for.
464
465        ``by``
466            string, the property to use for comparison (default: ``msgid``).
467
468        ``include_obsolete_entries``
469            boolean, whether to also search in entries that are obsolete.
470
471        ``msgctxt``
472            string, allows specifying a specific message context for the
473            search.
474        """
475        if include_obsolete_entries:
476            entries = self[:]
477        else:
478            entries = [e for e in self if not e.obsolete]
479        for e in entries:
480            if getattr(e, by) == st:
481                if msgctxt is not False and e.msgctxt != msgctxt:
482                    continue
483                return e
484        return None
485
486    def ordered_metadata(self):
487        """
488        Convenience method that returns an ordered version of the metadata
489        dictionary. The return value is list of tuples (metadata name,
490        metadata_value).
491        """
492        # copy the dict first
493        metadata = self.metadata.copy()
494        data_order = [
495            'Project-Id-Version',
496            'Report-Msgid-Bugs-To',
497            'POT-Creation-Date',
498            'PO-Revision-Date',
499            'Last-Translator',
500            'Language-Team',
501            'Language',
502            'MIME-Version',
503            'Content-Type',
504            'Content-Transfer-Encoding',
505            'Plural-Forms'
506        ]
507        ordered_data = []
508        for data in data_order:
509            try:
510                value = metadata.pop(data)
511                ordered_data.append((data, value))
512            except KeyError:
513                pass
514        # the rest of the metadata will be alphabetically ordered since there
515        # are no specs for this AFAIK
516        for data in natural_sort(metadata.keys()):
517            value = metadata[data]
518            ordered_data.append((data, value))
519        return ordered_data
520
521    def to_binary(self):
522        """
523        Return the binary representation of the file.
524        """
525        offsets = []
526        entries = self.translated_entries()
527
528        # the keys are sorted in the .mo file
529        def cmp(_self, other):
530            # msgfmt compares entries with msgctxt if it exists
531            self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid
532            other_msgid = other.msgctxt and other.msgctxt or other.msgid
533            if self_msgid > other_msgid:
534                return 1
535            elif self_msgid < other_msgid:
536                return -1
537            else:
538                return 0
539        # add metadata entry
540        entries.sort(key=lambda o: o.msgctxt or o.msgid)
541        mentry = self.metadata_as_entry()
542        #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip()
543        entries = [mentry] + entries
544        entries_len = len(entries)
545        ids, strs = b(''), b('')
546        for e in entries:
547            # For each string, we need size and file offset.  Each string is
548            # NUL terminated; the NUL does not count into the size.
549            msgid = b('')
550            if e.msgctxt:
551                # Contexts are stored by storing the concatenation of the
552                # context, a <EOT> byte, and the original string
553                msgid = self._encode(e.msgctxt + '\4')
554            if e.msgid_plural:
555                msgstr = []
556                for index in sorted(e.msgstr_plural.keys()):
557                    msgstr.append(e.msgstr_plural[index])
558                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
559                msgstr = self._encode('\0'.join(msgstr))
560            else:
561                msgid += self._encode(e.msgid)
562                msgstr = self._encode(e.msgstr)
563            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
564            ids += msgid + b('\0')
565            strs += msgstr + b('\0')
566
567        # The header is 7 32-bit unsigned integers.
568        keystart = 7 * 4 + 16 * entries_len
569        # and the values start after the keys
570        valuestart = keystart + len(ids)
571        koffsets = []
572        voffsets = []
573        # The string table first has the list of keys, then the list of values.
574        # Each entry has first the size of the string, then the file offset.
575        for o1, l1, o2, l2 in offsets:
576            koffsets += [l1, o1 + keystart]
577            voffsets += [l2, o2 + valuestart]
578        offsets = koffsets + voffsets
579
580        output = struct.pack(
581            "Iiiiiii",
582            # Magic number
583            MOFile.MAGIC,
584            # Version
585            0,
586            # number of entries
587            entries_len,
588            # start of key index
589            7 * 4,
590            # start of value index
591            7 * 4 + entries_len * 8,
592            # size and offset of hash table, we don't use hash tables
593            0, keystart
594
595        )
596        if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
597            output += array.array("i", offsets).tobytes()
598        else:
599            output += array.array("i", offsets).tostring()
600        output += ids
601        output += strs
602        return output
603
604    def _encode(self, mixed):
605        """
606        Encodes the given ``mixed`` argument with the file encoding if and
607        only if it's a unicode string and returns the encoded string.
608        """
609        if isinstance(mixed, text_type):
610            mixed = mixed.encode(self.encoding)
611        return mixed
612# }}}
613# class POFile {{{
614
615
616class POFile(_BaseFile):
617    """
618    Po (or Pot) file reader/writer.
619    This class inherits the :class:`~polib._BaseFile` class and, by extension,
620    the python ``list`` type.
621    """
622
623    def __unicode__(self):
624        """
625        Returns the unicode representation of the po file.
626        """
627        ret, headers = '', self.header.split('\n')
628        for header in headers:
629            if not len(header):
630                ret += "#\n"
631            elif header[:1] in [',', ':']:
632                ret += '#%s\n' % header
633            else:
634                ret += '# %s\n' % header
635
636        if not isinstance(ret, text_type):
637            ret = ret.decode(self.encoding)
638
639        return ret + _BaseFile.__unicode__(self)
640
641    def save_as_mofile(self, fpath):
642        """
643        Saves the binary representation of the file to given ``fpath``.
644
645        Keyword argument:
646
647        ``fpath``
648            string, full or relative path to the mo file.
649        """
650        _BaseFile.save(self, fpath, 'to_binary')
651
652    def percent_translated(self):
653        """
654        Convenience method that returns the percentage of translated
655        messages.
656        """
657        total = len([e for e in self if not e.obsolete])
658        if total == 0:
659            return 100
660        translated = len(self.translated_entries())
661        return int(translated * 100 / float(total))
662
663    def translated_entries(self):
664        """
665        Convenience method that returns the list of translated entries.
666        """
667        return [e for e in self if e.translated()]
668
669    def untranslated_entries(self):
670        """
671        Convenience method that returns the list of untranslated entries.
672        """
673        return [e for e in self if not e.translated() and not e.obsolete
674                and not 'fuzzy' in e.flags]
675
676    def fuzzy_entries(self):
677        """
678        Convenience method that returns the list of fuzzy entries.
679        """
680        return [e for e in self if 'fuzzy' in e.flags]
681
682    def obsolete_entries(self):
683        """
684        Convenience method that returns the list of obsolete entries.
685        """
686        return [e for e in self if e.obsolete]
687
688    def merge(self, refpot):
689        """
690        Convenience method that merges the current pofile with the pot file
691        provided. It behaves exactly as the gettext msgmerge utility:
692
693        * comments of this file will be preserved, but extracted comments and
694          occurrences will be discarded;
695        * any translations or comments in the file will be discarded, however,
696          dot comments and file positions will be preserved;
697        * the fuzzy flags are preserved.
698
699        Keyword argument:
700
701        ``refpot``
702            object POFile, the reference catalog.
703        """
704        # Store entries in dict/set for faster access
705        self_entries = dict((entry.msgid, entry) for entry in self)
706        refpot_msgids = set(entry.msgid for entry in refpot)
707        # Merge entries that are in the refpot
708        for entry in refpot:
709            e = self_entries.get(entry.msgid)
710            if e is None:
711                e = POEntry()
712                self.append(e)
713            e.merge(entry)
714        # ok, now we must "obsolete" entries that are not in the refpot anymore
715        for entry in self:
716            if entry.msgid not in refpot_msgids:
717                entry.obsolete = True
718# }}}
719# class MOFile {{{
720
721
722class MOFile(_BaseFile):
723    """
724    Mo file reader/writer.
725    This class inherits the :class:`~polib._BaseFile` class and, by
726    extension, the python ``list`` type.
727    """
728    MAGIC = 0x950412de
729    MAGIC_SWAPPED = 0xde120495
730
731    def __init__(self, *args, **kwargs):
732        """
733        Constructor, accepts all keywords arguments accepted by
734        :class:`~polib._BaseFile` class.
735        """
736        _BaseFile.__init__(self, *args, **kwargs)
737        self.magic_number = None
738        self.version = 0
739
740    def save_as_pofile(self, fpath):
741        """
742        Saves the mofile as a pofile to ``fpath``.
743
744        Keyword argument:
745
746        ``fpath``
747            string, full or relative path to the file.
748        """
749        _BaseFile.save(self, fpath)
750
751    def save(self, fpath=None):
752        """
753        Saves the mofile to ``fpath``.
754
755        Keyword argument:
756
757        ``fpath``
758            string, full or relative path to the file.
759        """
760        _BaseFile.save(self, fpath, 'to_binary')
761
762    def percent_translated(self):
763        """
764        Convenience method to keep the same interface with POFile instances.
765        """
766        return 100
767
768    def translated_entries(self):
769        """
770        Convenience method to keep the same interface with POFile instances.
771        """
772        return self
773
774    def untranslated_entries(self):
775        """
776        Convenience method to keep the same interface with POFile instances.
777        """
778        return []
779
780    def fuzzy_entries(self):
781        """
782        Convenience method to keep the same interface with POFile instances.
783        """
784        return []
785
786    def obsolete_entries(self):
787        """
788        Convenience method to keep the same interface with POFile instances.
789        """
790        return []
791# }}}
792# class _BaseEntry {{{
793
794
795class _BaseEntry(object):
796    """
797    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
798    This class should **not** be instantiated directly.
799    """
800
801    def __init__(self, *args, **kwargs):
802        """
803        Constructor, accepts the following keyword arguments:
804
805        ``msgid``
806            string, the entry msgid.
807
808        ``msgstr``
809            string, the entry msgstr.
810
811        ``msgid_plural``
812            string, the entry msgid_plural.
813
814        ``msgstr_plural``
815            list, the entry msgstr_plural lines.
816
817        ``msgctxt``
818            string, the entry context (msgctxt).
819
820        ``obsolete``
821            bool, whether the entry is "obsolete" or not.
822
823        ``encoding``
824            string, the encoding to use, defaults to ``default_encoding``
825            global variable (optional).
826        """
827        self.msgid = kwargs.get('msgid', '')
828        self.msgstr = kwargs.get('msgstr', '')
829        self.msgid_plural = kwargs.get('msgid_plural', '')
830        self.msgstr_plural = kwargs.get('msgstr_plural', {})
831        self.msgctxt = kwargs.get('msgctxt', None)
832        self.obsolete = kwargs.get('obsolete', False)
833        self.encoding = kwargs.get('encoding', default_encoding)
834
835    def __unicode__(self, wrapwidth=78):
836        """
837        Returns the unicode representation of the entry.
838        """
839        if self.obsolete:
840            delflag = '#~ '
841        else:
842            delflag = ''
843        ret = []
844        # write the msgctxt if any
845        if self.msgctxt is not None:
846            ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
847                                   wrapwidth)
848        # write the msgid
849        ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
850        # write the msgid_plural if any
851        if self.msgid_plural:
852            ret += self._str_field("msgid_plural", delflag, "",
853                                   self.msgid_plural, wrapwidth)
854        if self.msgstr_plural:
855            # write the msgstr_plural if any
856            msgstrs = self.msgstr_plural
857            keys = list(msgstrs)
858            keys.sort()
859            for index in keys:
860                msgstr = msgstrs[index]
861                plural_index = '[%s]' % index
862                ret += self._str_field("msgstr", delflag, plural_index, msgstr,
863                                       wrapwidth)
864        else:
865            # otherwise write the msgstr
866            ret += self._str_field("msgstr", delflag, "", self.msgstr,
867                                   wrapwidth)
868        ret.append('')
869        usedirect = True
870        if not PY3 and type(ret[0] != unicode):
871            try:
872                usedirect = False
873                ret = u('\n').join(x.decode('utf-8') for x in ret)
874            except:
875                usedirect = True
876        if usedirect:
877            ret = u('\n').join(ret)
878        return ret
879
880    if PY3:
881        def __str__(self):
882            return self.__unicode__()
883    else:
884        def __str__(self):
885            """
886            Returns the string representation of the entry.
887            """
888            return unicode(self).encode(self.encoding)
889
890    def __eq__(self, other):
891        return str(self) == str(other)
892
893    def _str_field(self, fieldname, delflag, plural_index, field,
894                   wrapwidth=78):
895        lines = field.splitlines(True)
896        if len(lines) > 1:
897            lines = [''] + lines  # start with initial empty line
898        else:
899            escaped_field = escape(field)
900            specialchars_count = 0
901            for c in ['\\', '\n', '\r', '\t', '"']:
902                specialchars_count += field.count(c)
903            # comparison must take into account fieldname length + one space
904            # + 2 quotes (eg. msgid "<string>")
905            flength = len(fieldname) + 3
906            if plural_index:
907                flength += len(plural_index)
908            real_wrapwidth = wrapwidth - flength + specialchars_count
909            if wrapwidth > 0 and len(field) > real_wrapwidth:
910                # Wrap the line but take field name into account
911                lines = [''] + [unescape(item) for item in wrap(
912                    escaped_field,
913                    wrapwidth - 2,  # 2 for quotes ""
914                    drop_whitespace=False,
915                    break_long_words=False
916                )]
917            else:
918                lines = [field]
919        if fieldname.startswith('previous_'):
920            # quick and dirty trick to get the real field name
921            fieldname = fieldname[9:]
922
923        ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
924                                escape(lines.pop(0)))]
925        for line in lines:
926            ret.append('%s"%s"' % (delflag, escape(line)))
927        return ret
928# }}}
929# class POEntry {{{
930
931
932class POEntry(_BaseEntry):
933    """
934    Represents a po file entry.
935    """
936
937    def __init__(self, *args, **kwargs):
938        """
939        Constructor, accepts the following keyword arguments:
940
941        ``comment``
942            string, the entry comment.
943
944        ``tcomment``
945            string, the entry translator comment.
946
947        ``occurrences``
948            list, the entry occurrences.
949
950        ``flags``
951            list, the entry flags.
952
953        ``previous_msgctxt``
954            string, the entry previous context.
955
956        ``previous_msgid``
957            string, the entry previous msgid.
958
959        ``previous_msgid_plural``
960            string, the entry previous msgid_plural.
961
962        ``linenum``
963            integer, the line number of the entry
964        """
965        _BaseEntry.__init__(self, *args, **kwargs)
966        self.comment = kwargs.get('comment', '')
967        self.tcomment = kwargs.get('tcomment', '')
968        self.occurrences = kwargs.get('occurrences', [])
969        self.flags = kwargs.get('flags', [])
970        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
971        self.previous_msgid = kwargs.get('previous_msgid', None)
972        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
973        self.linenum = kwargs.get('linenum', None)
974
975    def __unicode__(self, wrapwidth=0):
976        """
977        Returns the unicode representation of the entry.
978        """
979        ret = []
980        # comments first, if any (with text wrapping as xgettext does)
981        if self.obsolete:
982            comments = [('tcomment', '# ')]
983        else:
984            comments = [('comment', '#. '), ('tcomment', '# ')]
985        for c in comments:
986            val = getattr(self, c[0])
987            if val:
988                for comment in val.split('\n'):
989                    if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
990                        ret += wrap(
991                            comment,
992                            wrapwidth,
993                            initial_indent=c[1],
994                            subsequent_indent=c[1],
995                            break_long_words=False
996                        )
997                    else:
998                        ret.append('%s%s' % (c[1], comment))
999
1000        # occurrences (with text wrapping as xgettext does)
1001        if not self.obsolete and self.occurrences:
1002            filelist = []
1003            for fpath, lineno in self.occurrences:
1004                if lineno:
1005                    filelist.append('%s:%s' % (fpath, lineno))
1006                else:
1007                    filelist.append(fpath)
1008            filestr = ' '.join(filelist)
1009            if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
1010                # textwrap split words that contain hyphen, this is not
1011                # what we want for filenames, so the dirty hack is to
1012                # temporally replace hyphens with a char that a file cannot
1013                # contain, like "*"
1014                ret += [l.replace('*', '-') for l in wrap(
1015                    filestr.replace('-', '*'),
1016                    wrapwidth,
1017                    initial_indent='#: ',
1018                    subsequent_indent='#: ',
1019                    break_long_words=False
1020                )]
1021            else:
1022                ret.append('#: ' + filestr)
1023
1024        # flags (TODO: wrapping ?)
1025        if self.flags:
1026            ret.append('#, %s' % ', '.join(self.flags))
1027
1028        # previous context and previous msgid/msgid_plural
1029        fields = ['previous_msgctxt', 'previous_msgid',
1030                  'previous_msgid_plural']
1031        if self.obsolete:
1032            prefix = "#~| "
1033        else:
1034            prefix = "#| "
1035        for f in fields:
1036            val = getattr(self, f)
1037            if val:
1038                ret += self._str_field(f, prefix, "", val, wrapwidth)
1039
1040        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
1041        ret = u('\n').join(ret)
1042        return ret
1043
1044    def __cmp__(self, other):
1045        """
1046        Called by comparison operations if rich comparison is not defined.
1047        """
1048
1049        # First: Obsolete test
1050        if self.obsolete != other.obsolete:
1051            if self.obsolete:
1052                return -1
1053            else:
1054                return 1
1055        # Work on a copy to protect original
1056        occ1 = sorted(self.occurrences[:])
1057        occ2 = sorted(other.occurrences[:])
1058        pos = 0
1059        for entry1 in occ1:
1060            try:
1061                entry2 = occ2[pos]
1062            except IndexError:
1063                return 1
1064            pos = pos + 1
1065            if entry1[0] != entry2[0]:
1066                if entry1[0] > entry2[0]:
1067                    return 1
1068                else:
1069                    return -1
1070            if entry1[1] != entry2[1]:
1071                if entry1[1] > entry2[1]:
1072                    return 1
1073                else:
1074                    return -1
1075        # Compare msgid_plural if set
1076        if self.msgid_plural:
1077            if not other.msgid_plural:
1078                return 1
1079            for pos in self.msgid_plural:
1080                if pos not in other.msgid_plural:
1081                    return 1
1082                if self.msgid_plural[pos] > other.msgid_plural[pos]:
1083                    return 1
1084                if self.msgid_plural[pos] < other.msgid_plural[pos]:
1085                    return -1
1086        # Finally: Compare message ID
1087        if self.msgid > other.msgid:
1088            return 1
1089        elif self.msgid < other.msgid:
1090            return -1
1091        return 0
1092
1093    def __gt__(self, other):
1094        return self.__cmp__(other) > 0
1095
1096    def __lt__(self, other):
1097        return self.__cmp__(other) < 0
1098
1099    def __ge__(self, other):
1100        return self.__cmp__(other) >= 0
1101
1102    def __le__(self, other):
1103        return self.__cmp__(other) <= 0
1104
1105    def __eq__(self, other):
1106        return self.__cmp__(other) == 0
1107
1108    def __ne__(self, other):
1109        return self.__cmp__(other) != 0
1110
1111    def translated(self):
1112        """
1113        Returns ``True`` if the entry has been translated or ``False``
1114        otherwise.
1115        """
1116        if self.obsolete or 'fuzzy' in self.flags:
1117            return False
1118        if self.msgstr != '':
1119            return True
1120        if self.msgstr_plural:
1121            for pos in self.msgstr_plural:
1122                if self.msgstr_plural[pos] == '':
1123                    return False
1124            return True
1125        return False
1126
1127    def merge(self, other):
1128        """
1129        Merge the current entry with the given pot entry.
1130        """
1131        self.msgid = other.msgid
1132        self.msgctxt = other.msgctxt
1133        self.occurrences = other.occurrences
1134        self.comment = other.comment
1135        fuzzy = 'fuzzy' in self.flags
1136        self.flags = other.flags[:]  # clone flags
1137        if fuzzy:
1138            self.flags.append('fuzzy')
1139        self.msgid_plural = other.msgid_plural
1140        self.obsolete = other.obsolete
1141        self.previous_msgctxt = other.previous_msgctxt
1142        self.previous_msgid = other.previous_msgid
1143        self.previous_msgid_plural = other.previous_msgid_plural
1144        if other.msgstr_plural:
1145            for pos in other.msgstr_plural:
1146                try:
1147                    # keep existing translation at pos if any
1148                    self.msgstr_plural[pos]
1149                except KeyError:
1150                    self.msgstr_plural[pos] = ''
1151
1152    def __hash__(self):
1153        return hash((self.msgid, self.msgstr))
1154# }}}
1155# class MOEntry {{{
1156
1157
1158class MOEntry(_BaseEntry):
1159    """
1160    Represents a mo file entry.
1161    """
1162    def __init__(self, *args, **kwargs):
1163        """
1164        Constructor, accepts the following keyword arguments,
1165        for consistency with :class:`~polib.POEntry`:
1166
1167        ``comment``
1168        ``tcomment``
1169        ``occurrences``
1170        ``flags``
1171        ``previous_msgctxt``
1172        ``previous_msgid``
1173        ``previous_msgid_plural``
1174
1175        Note: even though these keyword arguments are accepted,
1176        they hold no real meaning in the context of MO files
1177        and are simply ignored.
1178        """
1179        _BaseEntry.__init__(self, *args, **kwargs)
1180        self.comment = ''
1181        self.tcomment = ''
1182        self.occurrences = []
1183        self.flags = []
1184        self.previous_msgctxt = None
1185        self.previous_msgid = None
1186        self.previous_msgid_plural = None
1187
1188    def __hash__(self):
1189        return hash((self.msgid, self.msgstr))
1190
1191# }}}
1192# class _POFileParser {{{
1193
1194
1195class _POFileParser(object):
1196    """
1197    A finite state machine to parse efficiently and correctly po
1198    file format.
1199    """
1200
1201    def __init__(self, pofile, *args, **kwargs):
1202        """
1203        Constructor.
1204
1205        Keyword arguments:
1206
1207        ``pofile``
1208            string, path to the po file or its content
1209
1210        ``encoding``
1211            string, the encoding to use, defaults to ``default_encoding``
1212            global variable (optional).
1213
1214        ``check_for_duplicates``
1215            whether to check for duplicate entries when adding entries to the
1216            file (optional, default: ``False``).
1217        """
1218        enc = kwargs.get('encoding', default_encoding)
1219        if _is_file(pofile):
1220            try:
1221                self.fhandle = io.open(pofile, 'rt', encoding=enc)
1222            except LookupError:
1223                enc = default_encoding
1224                self.fhandle = io.open(pofile, 'rt', encoding=enc)
1225        else:
1226            self.fhandle = pofile.splitlines()
1227
1228        klass = kwargs.get('klass')
1229        if klass is None:
1230            klass = POFile
1231        self.instance = klass(
1232            pofile=pofile,
1233            encoding=enc,
1234            check_for_duplicates=kwargs.get('check_for_duplicates', False)
1235        )
1236        self.transitions = {}
1237        self.current_line = 0
1238        self.current_entry = POEntry(linenum=self.current_line)
1239        self.current_state = 'st'
1240        self.current_token = None
1241        # two memo flags used in handlers
1242        self.msgstr_index = 0
1243        self.entry_obsolete = 0
1244        # Configure the state machine, by adding transitions.
1245        # Signification of symbols:
1246        #     * ST: Beginning of the file (start)
1247        #     * HE: Header
1248        #     * TC: a translation comment
1249        #     * GC: a generated comment
1250        #     * OC: a file/line occurrence
1251        #     * FL: a flags line
1252        #     * CT: a message context
1253        #     * PC: a previous msgctxt
1254        #     * PM: a previous msgid
1255        #     * PP: a previous msgid_plural
1256        #     * MI: a msgid
1257        #     * MP: a msgid plural
1258        #     * MS: a msgstr
1259        #     * MX: a msgstr plural
1260        #     * MC: a msgid or msgstr continuation line
1261        all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc',
1262               'ms', 'mp', 'mx', 'mi']
1263
1264        self.add('tc', ['st', 'he'],                                     'he')
1265        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
1266                        'mp', 'mx', 'mi'],                               'tc')
1267        self.add('gc', all,                                              'gc')
1268        self.add('oc', all,                                              'oc')
1269        self.add('fl', all,                                              'fl')
1270        self.add('pc', all,                                              'pc')
1271        self.add('pm', all,                                              'pm')
1272        self.add('pp', all,                                              'pp')
1273        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
1274                        'pp', 'ms', 'mx'],                               'ct')
1275        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
1276                 'pm', 'pp', 'ms', 'mx'],                                'mi')
1277        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'],             'mp')
1278        self.add('ms', ['mi', 'mp', 'tc'],                               'ms')
1279        self.add('mx', ['mi', 'mx', 'mp', 'tc'],                         'mx')
1280        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
1281
1282    def parse(self):
1283        """
1284        Run the state machine, parse the file line by line and call process()
1285        with the current matched symbol.
1286        """
1287
1288        keywords = {
1289            'msgctxt': 'ct',
1290            'msgid': 'mi',
1291            'msgstr': 'ms',
1292            'msgid_plural': 'mp',
1293        }
1294        prev_keywords = {
1295            'msgid_plural': 'pp',
1296            'msgid': 'pm',
1297            'msgctxt': 'pc',
1298        }
1299        tokens = []
1300        for line in self.fhandle:
1301            self.current_line += 1
1302            line = line.strip()
1303            if line == '':
1304                continue
1305
1306            tokens = line.split(None, 2)
1307            nb_tokens = len(tokens)
1308
1309            if tokens[0] == '#~|':
1310                continue
1311
1312            if tokens[0] == '#~' and nb_tokens > 1:
1313                line = line[3:].strip()
1314                tokens = tokens[1:]
1315                nb_tokens -= 1
1316                self.entry_obsolete = 1
1317            else:
1318                self.entry_obsolete = 0
1319
1320            # Take care of keywords like
1321            # msgid, msgid_plural, msgctxt & msgstr.
1322            if tokens[0] in keywords and nb_tokens > 1:
1323                line = line[len(tokens[0]):].lstrip()
1324                if re.search(r'([^\\]|^)"', line[1:-1]):
1325                    raise IOError('Syntax error in po file %s (line %s): '
1326                                  'unescaped double quote found' %
1327                                  (self.instance.fpath, self.current_line))
1328                self.current_token = line
1329                self.process(keywords[tokens[0]])
1330                continue
1331
1332            self.current_token = line
1333
1334            if tokens[0] == '#:':
1335                if nb_tokens <= 1:
1336                    continue
1337                # we are on an occurrences line
1338                self.process('oc')
1339
1340            elif line[:1] == '"':
1341                # we are on a continuation line
1342                if re.search(r'([^\\]|^)"', line[1:-1]):
1343                    raise IOError('Syntax error in po file %s (line %s): '
1344                                  'unescaped double quote found' %
1345                                  (self.instance.fpath, self.current_line))
1346                self.process('mc')
1347
1348            elif line[:7] == 'msgstr[':
1349                # we are on a msgstr plural
1350                self.process('mx')
1351
1352            elif tokens[0] == '#,':
1353                if nb_tokens <= 1:
1354                    continue
1355                # we are on a flags line
1356                self.process('fl')
1357
1358            elif tokens[0] == '#' or tokens[0].startswith('##'):
1359                if line == '#':
1360                    line += ' '
1361                # we are on a translator comment line
1362                self.process('tc')
1363
1364            elif tokens[0] == '#.':
1365                if nb_tokens <= 1:
1366                    continue
1367                # we are on a generated comment line
1368                self.process('gc')
1369
1370            elif tokens[0] == '#|':
1371                if nb_tokens <= 1:
1372                    raise IOError('Syntax error in po file %s (line %s)' %
1373                                  (self.instance.fpath, self.current_line))
1374
1375                # Remove the marker and any whitespace right after that.
1376                line = line[2:].lstrip()
1377                self.current_token = line
1378
1379                if tokens[1].startswith('"'):
1380                    # Continuation of previous metadata.
1381                    self.process('mc')
1382                    continue
1383
1384                if nb_tokens == 2:
1385                    # Invalid continuation line.
1386                    raise IOError('Syntax error in po file %s (line %s): '
1387                                  'invalid continuation line' %
1388                                  (self.instance.fpath, self.current_line))
1389
1390                # we are on a "previous translation" comment line,
1391                if tokens[1] not in prev_keywords:
1392                    # Unknown keyword in previous translation comment.
1393                    raise IOError('Syntax error in po file %s (line %s): '
1394                                  'unknown keyword %s' %
1395                                  (self.instance.fpath, self.current_line,
1396                                   tokens[1]))
1397
1398                # Remove the keyword and any whitespace
1399                # between it and the starting quote.
1400                line = line[len(tokens[1]):].lstrip()
1401                self.current_token = line
1402                self.process(prev_keywords[tokens[1]])
1403
1404            else:
1405                raise IOError('Syntax error in po file %s (line %s)' %
1406                              (self.instance.fpath, self.current_line))
1407
1408        if self.current_entry and len(tokens) > 0 and \
1409           not tokens[0].startswith('#'):
1410            # since entries are added when another entry is found, we must add
1411            # the last entry here (only if there are lines). Trailing comments
1412            # are ignored
1413            self.instance.append(self.current_entry)
1414
1415        # before returning the instance, check if there's metadata and if
1416        # so extract it in a dict
1417        metadataentry = self.instance.find('')
1418        if metadataentry:  # metadata found
1419            # remove the entry
1420            self.instance.remove(metadataentry)
1421            self.instance.metadata_is_fuzzy = metadataentry.flags
1422            key = None
1423            for msg in metadataentry.msgstr.splitlines():
1424                try:
1425                    key, val = msg.split(':', 1)
1426                    self.instance.metadata[key] = val.strip()
1427                except (ValueError, KeyError):
1428                    if key is not None:
1429                        self.instance.metadata[key] += '\n' + msg.strip()
1430        # close opened file
1431        if not isinstance(self.fhandle, list):  # must be file
1432            self.fhandle.close()
1433        return self.instance
1434
1435    def add(self, symbol, states, next_state):
1436        """
1437        Add a transition to the state machine.
1438
1439        Keywords arguments:
1440
1441        ``symbol``
1442            string, the matched token (two chars symbol).
1443
1444        ``states``
1445            list, a list of states (two chars symbols).
1446
1447        ``next_state``
1448            the next state the fsm will have after the action.
1449        """
1450        for state in states:
1451            action = getattr(self, 'handle_%s' % next_state)
1452            self.transitions[(symbol, state)] = (action, next_state)
1453
1454    def process(self, symbol):
1455        """
1456        Process the transition corresponding to the current state and the
1457        symbol provided.
1458
1459        Keywords arguments:
1460
1461        ``symbol``
1462            string, the matched token (two chars symbol).
1463
1464        ``linenum``
1465            integer, the current line number of the parsed file.
1466        """
1467        try:
1468            (action, state) = self.transitions[(symbol, self.current_state)]
1469            if action():
1470                self.current_state = state
1471        except Exception:
1472            raise IOError('Syntax error in po file (line %s)' %
1473                          self.current_line)
1474
1475    # state handlers
1476
1477    def handle_he(self):
1478        """Handle a header comment."""
1479        if self.instance.header != '':
1480            self.instance.header += '\n'
1481        self.instance.header += self.current_token[2:]
1482        return 1
1483
1484    def handle_tc(self):
1485        """Handle a translator comment."""
1486        if self.current_state in ['mc', 'ms', 'mx']:
1487            self.instance.append(self.current_entry)
1488            self.current_entry = POEntry(linenum=self.current_line)
1489        if self.current_entry.tcomment != '':
1490            self.current_entry.tcomment += '\n'
1491        tcomment = self.current_token.lstrip('#')
1492        if tcomment.startswith(' '):
1493            tcomment = tcomment[1:]
1494        self.current_entry.tcomment += tcomment
1495        return True
1496
1497    def handle_gc(self):
1498        """Handle a generated comment."""
1499        if self.current_state in ['mc', 'ms', 'mx']:
1500            self.instance.append(self.current_entry)
1501            self.current_entry = POEntry(linenum=self.current_line)
1502        if self.current_entry.comment != '':
1503            self.current_entry.comment += '\n'
1504        self.current_entry.comment += self.current_token[3:]
1505        return True
1506
1507    def handle_oc(self):
1508        """Handle a file:num occurrence."""
1509        if self.current_state in ['mc', 'ms', 'mx']:
1510            self.instance.append(self.current_entry)
1511            self.current_entry = POEntry(linenum=self.current_line)
1512        occurrences = self.current_token[3:].split()
1513        for occurrence in occurrences:
1514            if occurrence != '':
1515                try:
1516                    fil, line = occurrence.rsplit(':', 1)
1517                    if not line.isdigit():
1518                        fil = fil + line
1519                        line = ''
1520                    self.current_entry.occurrences.append((fil, line))
1521                except (ValueError, AttributeError):
1522                    self.current_entry.occurrences.append((occurrence, ''))
1523        return True
1524
1525    def handle_fl(self):
1526        """Handle a flags line."""
1527        if self.current_state in ['mc', 'ms', 'mx']:
1528            self.instance.append(self.current_entry)
1529            self.current_entry = POEntry(linenum=self.current_line)
1530        self.current_entry.flags += [c.strip() for c in
1531                                     self.current_token[3:].split(',')]
1532        return True
1533
1534    def handle_pp(self):
1535        """Handle a previous msgid_plural line."""
1536        if self.current_state in ['mc', 'ms', 'mx']:
1537            self.instance.append(self.current_entry)
1538            self.current_entry = POEntry(linenum=self.current_line)
1539        self.current_entry.previous_msgid_plural = \
1540            unescape(self.current_token[1:-1])
1541        return True
1542
1543    def handle_pm(self):
1544        """Handle a previous msgid line."""
1545        if self.current_state in ['mc', 'ms', 'mx']:
1546            self.instance.append(self.current_entry)
1547            self.current_entry = POEntry(linenum=self.current_line)
1548        self.current_entry.previous_msgid = \
1549            unescape(self.current_token[1:-1])
1550        return True
1551
1552    def handle_pc(self):
1553        """Handle a previous msgctxt line."""
1554        if self.current_state in ['mc', 'ms', 'mx']:
1555            self.instance.append(self.current_entry)
1556            self.current_entry = POEntry(linenum=self.current_line)
1557        self.current_entry.previous_msgctxt = \
1558            unescape(self.current_token[1:-1])
1559        return True
1560
1561    def handle_ct(self):
1562        """Handle a msgctxt."""
1563        if self.current_state in ['mc', 'ms', 'mx']:
1564            self.instance.append(self.current_entry)
1565            self.current_entry = POEntry(linenum=self.current_line)
1566        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
1567        return True
1568
1569    def handle_mi(self):
1570        """Handle a msgid."""
1571        if self.current_state in ['mc', 'ms', 'mx']:
1572            self.instance.append(self.current_entry)
1573            self.current_entry = POEntry(linenum=self.current_line)
1574        self.current_entry.obsolete = self.entry_obsolete
1575        self.current_entry.msgid = unescape(self.current_token[1:-1])
1576        return True
1577
1578    def handle_mp(self):
1579        """Handle a msgid plural."""
1580        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
1581        return True
1582
1583    def handle_ms(self):
1584        """Handle a msgstr."""
1585        self.current_entry.msgstr = unescape(self.current_token[1:-1])
1586        return True
1587
1588    def handle_mx(self):
1589        """Handle a msgstr plural."""
1590        index = self.current_token[7]
1591        value = self.current_token[self.current_token.find('"') + 1:-1]
1592        self.current_entry.msgstr_plural[int(index)] = unescape(value)
1593        self.msgstr_index = int(index)
1594        return True
1595
1596    def handle_mc(self):
1597        """Handle a msgid or msgstr continuation line."""
1598        token = unescape(self.current_token[1:-1])
1599        if self.current_state == 'ct':
1600            self.current_entry.msgctxt += token
1601        elif self.current_state == 'mi':
1602            self.current_entry.msgid += token
1603        elif self.current_state == 'mp':
1604            self.current_entry.msgid_plural += token
1605        elif self.current_state == 'ms':
1606            self.current_entry.msgstr += token
1607        elif self.current_state == 'mx':
1608            self.current_entry.msgstr_plural[self.msgstr_index] += token
1609        elif self.current_state == 'pp':
1610            self.current_entry.previous_msgid_plural += token
1611        elif self.current_state == 'pm':
1612            self.current_entry.previous_msgid += token
1613        elif self.current_state == 'pc':
1614            self.current_entry.previous_msgctxt += token
1615        # don't change the current state
1616        return False
1617# }}}
1618# class _MOFileParser {{{
1619
1620
1621class _MOFileParser(object):
1622    """
1623    A class to parse binary mo files.
1624    """
1625
1626    def __init__(self, mofile, *args, **kwargs):
1627        """
1628        Constructor.
1629
1630        Keyword arguments:
1631
1632        ``mofile``
1633            string, path to the mo file or its content
1634
1635        ``encoding``
1636            string, the encoding to use, defaults to ``default_encoding``
1637            global variable (optional).
1638
1639        ``check_for_duplicates``
1640            whether to check for duplicate entries when adding entries to the
1641            file (optional, default: ``False``).
1642        """
1643        self.fhandle = open(mofile, 'rb')
1644
1645        klass = kwargs.get('klass')
1646        if klass is None:
1647            klass = MOFile
1648        self.instance = klass(
1649            fpath=mofile,
1650            encoding=kwargs.get('encoding', default_encoding),
1651            check_for_duplicates=kwargs.get('check_for_duplicates', False)
1652        )
1653
1654    def __del__(self):
1655        """
1656        Make sure the file is closed, this prevents warnings on unclosed file
1657        when running tests with python >= 3.2.
1658        """
1659        if self.fhandle:
1660            self.fhandle.close()
1661
1662    def parse(self):
1663        """
1664        Build the instance with the file handle provided in the
1665        constructor.
1666        """
1667        # parse magic number
1668        magic_number = self._readbinary('<I', 4)
1669        if magic_number == MOFile.MAGIC:
1670            ii = '<II'
1671        elif magic_number == MOFile.MAGIC_SWAPPED:
1672            ii = '>II'
1673        else:
1674            raise IOError('Invalid mo file, magic number is incorrect !')
1675        self.instance.magic_number = magic_number
1676        # parse the version number and the number of strings
1677        version, numofstrings = self._readbinary(ii, 8)
1678        # from MO file format specs: "A program seeing an unexpected major
1679        # revision number should stop reading the MO file entirely"
1680        if version not in (0, 1):
1681            raise IOError('Invalid mo file, unexpected major revision number')
1682        self.instance.version = version
1683        # original strings and translation strings hash table offset
1684        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
1685        # move to msgid hash table and read length and offset of msgids
1686        self.fhandle.seek(msgids_hash_offset)
1687        msgids_index = []
1688        for i in range(numofstrings):
1689            msgids_index.append(self._readbinary(ii, 8))
1690        # move to msgstr hash table and read length and offset of msgstrs
1691        self.fhandle.seek(msgstrs_hash_offset)
1692        msgstrs_index = []
1693        for i in range(numofstrings):
1694            msgstrs_index.append(self._readbinary(ii, 8))
1695        # build entries
1696        encoding = self.instance.encoding
1697        for i in range(numofstrings):
1698            self.fhandle.seek(msgids_index[i][1])
1699            msgid = self.fhandle.read(msgids_index[i][0])
1700
1701            self.fhandle.seek(msgstrs_index[i][1])
1702            msgstr = self.fhandle.read(msgstrs_index[i][0])
1703            if i == 0 and not msgid:  # metadata
1704                raw_metadata, metadata = msgstr.split(b('\n')), {}
1705                for line in raw_metadata:
1706                    tokens = line.split(b(':'), 1)
1707                    if tokens[0] != b(''):
1708                        try:
1709                            k = tokens[0].decode(encoding)
1710                            v = tokens[1].decode(encoding)
1711                            metadata[k] = v.strip()
1712                        except IndexError:
1713                            metadata[k] = u('')
1714                self.instance.metadata = metadata
1715                continue
1716            # test if we have a plural entry
1717            msgid_tokens = msgid.split(b('\0'))
1718            if len(msgid_tokens) > 1:
1719                entry = self._build_entry(
1720                    msgid=msgid_tokens[0],
1721                    msgid_plural=msgid_tokens[1],
1722                    msgstr_plural=dict((k, v) for k, v in
1723                                       enumerate(msgstr.split(b('\0'))))
1724                )
1725            else:
1726                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
1727            self.instance.append(entry)
1728        # close opened file
1729        self.fhandle.close()
1730        return self.instance
1731
1732    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
1733                     msgstr_plural=None):
1734        msgctxt_msgid = msgid.split(b('\x04'))
1735        encoding = self.instance.encoding
1736        if len(msgctxt_msgid) > 1:
1737            kwargs = {
1738                'msgctxt': msgctxt_msgid[0].decode(encoding),
1739                'msgid': msgctxt_msgid[1].decode(encoding),
1740            }
1741        else:
1742            kwargs = {'msgid': msgid.decode(encoding)}
1743        if msgstr:
1744            kwargs['msgstr'] = msgstr.decode(encoding)
1745        if msgid_plural:
1746            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
1747        if msgstr_plural:
1748            for k in msgstr_plural:
1749                msgstr_plural[k] = msgstr_plural[k].decode(encoding)
1750            kwargs['msgstr_plural'] = msgstr_plural
1751        return MOEntry(**kwargs)
1752
1753    def _readbinary(self, fmt, numbytes):
1754        """
1755        Private method that unpack n bytes of data using format <fmt>.
1756        It returns a tuple or a mixed value if the tuple length is 1.
1757        """
1758        bytes = self.fhandle.read(numbytes)
1759        tup = struct.unpack(fmt, bytes)
1760        if len(tup) == 1:
1761            return tup[0]
1762        return tup
1763# }}}
1764# class TextWrapper {{{
1765
1766
1767class TextWrapper(textwrap.TextWrapper):
1768    """
1769    Subclass of textwrap.TextWrapper that backport the
1770    drop_whitespace option.
1771    """
1772    def __init__(self, *args, **kwargs):
1773        drop_whitespace = kwargs.pop('drop_whitespace', True)
1774        textwrap.TextWrapper.__init__(self, *args, **kwargs)
1775        self.drop_whitespace = drop_whitespace
1776
1777    def _wrap_chunks(self, chunks):
1778        """_wrap_chunks(chunks : [string]) -> [string]
1779
1780        Wrap a sequence of text chunks and return a list of lines of
1781        length 'self.width' or less.  (If 'break_long_words' is false,
1782        some lines may be longer than this.)  Chunks correspond roughly
1783        to words and the whitespace between them: each chunk is
1784        indivisible (modulo 'break_long_words'), but a line break can
1785        come between any two chunks.  Chunks should not have internal
1786        whitespace; ie. a chunk is either all whitespace or a "word".
1787        Whitespace chunks will be removed from the beginning and end of
1788        lines, but apart from that whitespace is preserved.
1789        """
1790        lines = []
1791        if self.width <= 0:
1792            raise ValueError("invalid width %r (must be > 0)" % self.width)
1793
1794        # Arrange in reverse order so items can be efficiently popped
1795        # from a stack of chucks.
1796        chunks.reverse()
1797
1798        while chunks:
1799
1800            # Start the list of chunks that will make up the current line.
1801            # cur_len is just the length of all the chunks in cur_line.
1802            cur_line = []
1803            cur_len = 0
1804
1805            # Figure out which static string will prefix this line.
1806            if lines:
1807                indent = self.subsequent_indent
1808            else:
1809                indent = self.initial_indent
1810
1811            # Maximum width for this line.
1812            width = self.width - len(indent)
1813
1814            # First chunk on line is whitespace -- drop it, unless this
1815            # is the very beginning of the text (ie. no lines started yet).
1816            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
1817                del chunks[-1]
1818
1819            while chunks:
1820                l = len(chunks[-1])
1821
1822                # Can at least squeeze this chunk onto the current line.
1823                if cur_len + l <= width:
1824                    cur_line.append(chunks.pop())
1825                    cur_len += l
1826
1827                # Nope, this line is full.
1828                else:
1829                    break
1830
1831            # The current line is full, and the next chunk is too big to
1832            # fit on *any* line (not just this one).
1833            if chunks and len(chunks[-1]) > width:
1834                self._handle_long_word(chunks, cur_line, cur_len, width)
1835
1836            # If the last chunk on this line is all whitespace, drop it.
1837            if self.drop_whitespace and cur_line and not cur_line[-1].strip():
1838                del cur_line[-1]
1839
1840            # Convert current line back to a string and store it in list
1841            # of all lines (return value).
1842            if cur_line:
1843                lines.append(indent + ''.join(cur_line))
1844
1845        return lines
1846# }}}
1847# function wrap() {{{
1848
1849
1850def wrap(text, width=70, **kwargs):
1851    """
1852    Wrap a single paragraph of text, returning a list of wrapped lines.
1853    """
1854    if sys.version_info < (2, 6):
1855        return TextWrapper(width=width, **kwargs).wrap(text)
1856    return textwrap.wrap(text, width=width, **kwargs)
1857
1858# }}}
1859
1860def genKeyId(inkey):
1861    crc = binascii.crc32(bytes(inkey, encoding="UTF-8")) & 0xffffffff
1862    # Use simple ASCII characters, exclude I, l, 1 and O, 0 to avoid confusing IDs
1863    symbols = "ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz23456789";
1864    outkey = ""
1865    for keyind in range(0, 5):
1866        outkey += symbols[(crc & 63) % len(symbols)];
1867        crc >>= 6;
1868    return outkey
1869