# -*- coding: utf-8 -*-
2#
3# License: MIT (see LICENSE file provided)
4# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
5
6"""
**polib** allows you to manipulate, create, modify gettext files (pot, po and
mo files).  You can load existing files, iterate through their entries, add,
modify entries, comments or metadata, etc. or create new po files from scratch.
10
11**polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
12:func:`~polib.mofile` convenience functions.
13"""
14
15import array
16import codecs
17import os
18import re
19import struct
20import sys
21import textwrap
22
try:
    import io
except ImportError:
    # Fallback for interpreters without the ``io`` module (python < 2.6):
    # expose a minimal stand-in named ``io`` whose ``open()`` delegates to
    # codecs.open(), so the rest of the module can call io.open() either way.
    class io(object):
        @staticmethod
        def open(fpath, mode='r', encoding=None):
            # Only the (fpath, mode, encoding) arguments are forwarded.
            return codecs.open(fpath, mode, encoding)
32
33
# module metadata
__author__ = 'David Jean Louis <izimobil@gmail.com>'
__version__ = '1.1.0'
# names exported by ``from polib import *``
__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
           'default_encoding', 'escape', 'unescape', 'detect_encoding', ]


# the default encoding to use when encoding cannot be detected
default_encoding = 'utf-8'
42
43# python 2/3 compatibility helpers {{{
44
45
# True when running on python 3 or later
PY3 = sys.version_info[0] >= 3

if PY3:
    # native strings are already unicode
    text_type = str

    def b(s):
        """Turn the native string literal ``s`` into a byte string."""
        return s.encode("latin-1")

    def u(s):
        """Return ``s`` unchanged, native strings being unicode already."""
        return s

else:
    text_type = unicode

    def b(s):
        """Return ``s`` unchanged, native strings being byte strings."""
        return s

    def u(s):
        """Build a unicode string from the native string literal ``s``."""
        return unicode(s, "unicode_escape")
65# }}}
66# _pofile_or_mofile {{{
67
68
def _pofile_or_mofile(f, type, **kwargs):
    """
    Shared implementation behind :func:`polib.pofile` and
    :func:`polib.mofile`.

    Arguments:

    ``f``
        the value handed to the parser and, when no encoding is given, to
        :func:`detect_encoding` (a file path or the file contents).

    ``type``
        string, ``'pofile'`` selects the po parser, any other value selects
        the mo parser.

    ``kwargs``
        the optional ``encoding``, ``check_for_duplicates``, ``klass`` and
        ``wrapwidth`` settings.
    """
    # only auto-detect the encoding when the caller did not impose one
    encoding = kwargs.get('encoding')
    if encoding is None:
        encoding = detect_encoding(f, type == 'mofile')

    # choose the parser matching the requested file type
    if type == 'pofile':
        parser_class = _POFileParser
    else:
        parser_class = _MOFileParser

    options = {
        'encoding': encoding,
        'check_for_duplicates': kwargs.get('check_for_duplicates', False),
        'klass': kwargs.get('klass'),
    }
    instance = parser_class(f, **options).parse()
    instance.wrapwidth = kwargs.get('wrapwidth', 78)
    return instance
90# }}}
91# _is_file {{{
92
93
94def _is_file(filename_or_contents):
95    """
96    Safely returns the value of os.path.exists(filename_or_contents).
97
98    Arguments:
99
100    ``filename_or_contents``
101        either a filename, or a string holding the contents of some file.
102        In the latter case, this function will always return False.
103    """
104    try:
105        return os.path.exists(filename_or_contents)
106    except (ValueError, UnicodeEncodeError):
107        return False
108# }}}
109# function pofile() {{{
110
111
def pofile(pofile, **kwargs):
    """
    Parse the po or pot file ``pofile`` and return the resulting
    :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width stored on the returned instance, only useful
        when the ``-w`` option was passed to xgettext (optional, default:
        ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    file_type = 'pofile'
    return _pofile_or_mofile(pofile, file_type, **kwargs)
140# }}}
141# function mofile() {{{
142
143
def mofile(mofile, **kwargs):
    """
    Parse the mo file ``mofile`` and return the resulting
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (data).

    ``wrapwidth``
        integer, the wrap width stored on the returned instance, only useful
        when the ``-w`` option was passed to xgettext to generate the po file
        that was used to format the mo file (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    file_type = 'mofile'
    return _pofile_or_mofile(mofile, file_type, **kwargs)
173# }}}
174# function detect_encoding() {{{
175
176
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    The encoding is taken from the first ``Content-Type: ... charset=<name>``
    occurrence whose ``<name>`` is a codec known to python.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content
        (text or bytes).

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    def charset_from(match):
        """Return the charset captured by ``match`` as text, or None when it
        is not a codec known to python."""
        enc = match.group(1).strip()
        if not isinstance(enc, text_type):
            enc = enc.decode('utf-8')
        if charset_exists(enc):
            return enc
        return None

    if not _is_file(file):
        # ``file`` holds the contents: pick the pattern matching its type, a
        # text pattern cannot be used to search bytes on python 3
        rx = rxt if isinstance(file, text_type) else rxb
        match = rx.search(file)
        if match:
            enc = charset_from(match)
            if enc is not None:
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode, rx = 'rb', rxb
        else:
            mode, rx = 'r', rxt
        f = open(file, mode)
        try:
            # read lazily: the charset usually sits in the first lines, and
            # try/finally closes the handle whatever happens while reading
            for line in f:
                match = rx.search(line)
                if match:
                    enc = charset_from(match)
                    if enc is not None:
                        return enc
        finally:
            f.close()
    return default_encoding
230# }}}
231# function escape() {{{
232
233
def escape(st):
    """
    Returns the string ``st`` in which the characters ``\\\\``, ``\\t``,
    ``\\r``, ``\\n`` and ``"`` have been replaced by their escape sequence.
    """
    # the backslash must come first, otherwise the backslashes introduced by
    # the other substitutions would be escaped a second time
    substitutions = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    for char, sequence in substitutions:
        st = st.replace(char, sequence)
    return st
244# }}}
245# function unescape() {{{
246
247
def unescape(st):
    """
    Returns the string ``st`` in which the escape sequences for ``\\\\``,
    ``\\t``, ``\\n``, ``\\r`` and ``"`` have been replaced by the character
    they stand for.
    """
    control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

    def expand(match):
        char = match.group(1)
        # a backslash or a double quote just loses its leading backslash
        return control_chars.get(char, char)

    # a single pass, so that the result of a substitution is never
    # interpreted as the beginning of another escape sequence
    return re.sub(r'\\(\\|n|t|r|")', expand, st)
265# }}}
266# function natural_sort() {{{
267
268
def natural_sort(lst):
    """
    Sort naturally the given list: runs of ASCII digits are compared as
    numbers, everything else is compared as lowercased text, so that
    ``a2`` sorts before ``a10``.
    Credits: http://stackoverflow.com/a/4836734

    Argument:

    ``lst``
        iterable of strings.

    Returns a new sorted list, ``lst`` is left untouched.
    """
    digits_rx = re.compile('([0-9]+)')

    def alphanum_key(key):
        # re.split() with a capturing group alternates text and digit runs:
        # the odd positions are exactly the captured ``[0-9]+`` runs.
        # Relying on the position rather than on str.isdigit() matters
        # because isdigit() is also true for characters the pattern never
        # matches (e.g. superscript digits), which int() may reject.
        return [
            int(chunk) if index % 2 else chunk.lower()
            for index, chunk in enumerate(digits_rx.split(key))
        ]

    return sorted(lst, key=alphanum_key)
281
282# }}}
283# class _BaseFile {{{
284
285
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.

    The instance is the list of its entries; the header, the metadata and
    the output settings are kept as attributes.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``fpath``
            string, the file path to use when ``pofile`` is not the path of
            an existing file (optional).

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # the path of the file: ``pofile`` when it names an existing file,
        # the ``fpath`` keyword argument (possibly None) otherwise
        pofile = kwargs.get('pofile', None)
        if pofile and _is_file(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file: the metadata entry,
        then the entries that are not obsolete, then the entries returned by
        ``obsolete_entries()``.
        """
        ret = []
        entries = [self.metadata_as_entry()] + \
                  [e for e in self if not e.obsolete]
        for entry in entries:
            ret.append(entry.__unicode__(self.wrapwidth))
        for entry in self.obsolete_entries():
            ret.append(entry.__unicode__(self.wrapwidth))
        ret = u('\n').join(ret)
        return ret

    if PY3:
        def __str__(self):
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the file.
            """
            return unicode(self).encode(self.encoding)

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**) and the same
        msgctxt (or none for both entries).

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
            is not None

    def __eq__(self, other):
        # two files are equal when their string representations are equal
        return str(self) == str(other)

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # check_for_duplicates may not be defined (yet) when unpickling.
        # But if pickling, we never want to check for duplicates anyway.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        if self.check_for_duplicates and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POEntry` instance whose
        msgid is empty and whose msgstr holds one ``name: value`` line per
        metadata item. The entry is flagged fuzzy when ``metadata_is_fuzzy``
        is set.
        """
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            strs = []
            for name, value in mdata:
                # one "name: value" line per metadata item
                strs.append('%s: %s' % (name, value))
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__unicode__'):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output. With ``'to_binary'`` the
            file is written in binary mode, otherwise it is written as text
            with the file encoding.

        Raises ``IOError`` when neither ``fpath`` nor ``self.fpath`` is set.
        """
        if self.fpath is None and fpath is None:
            raise IOError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        if repr_method == 'to_binary':
            fhandle = open(fpath, 'wb')
        else:
            # decode before opening the file, so that a decoding error does
            # not leave a truncated file behind
            if not isinstance(contents, text_type):
                contents = contents.decode(self.encoding)
            fhandle = io.open(fpath, 'w', encoding=self.encoding)
        try:
            fhandle.write(contents)
        finally:
            # always release the handle, even when the write fails
            fhandle.close()
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False,
             msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows specifying a specific message context for the
            search (default: ``False``, the context is not compared).

        Returns the matching entry or ``None``. When several entries match
        and ``msgctxt`` is empty, None or not given, an entry without
        context is preferred, falling back to the first match; when several
        entries match a non-empty ``msgctxt``, ``None`` is returned.
        """
        if include_obsolete_entries:
            entries = self[:]
        else:
            entries = [e for e in self if not e.obsolete]
        matches = []
        for e in entries:
            if getattr(e, by) == st:
                if msgctxt is not False and e.msgctxt != msgctxt:
                    continue
                matches.append(e)
        if len(matches) == 1:
            return matches[0]
        elif len(matches) > 1:
            if not msgctxt:
                # find the entry with no msgctx
                e = None
                for m in matches:
                    if not m.msgctxt:
                        e = m
                if e:
                    return e
                # fallback to the first entry found
                return matches[0]
        return None

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value): the well-known names first, in a fixed order, then
        the remaining names in natural order.
        """
        # copy the dict first
        metadata = self.metadata.copy()
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'Language',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding',
            'Plural-Forms'
        ]
        ordered_data = []
        for data in data_order:
            try:
                value = metadata.pop(data)
                ordered_data.append((data, value))
            except KeyError:
                pass
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK
        for data in natural_sort(metadata.keys()):
            value = metadata[data]
            ordered_data.append((data, value))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file: a 7 integers header,
        the table of the keys, the table of the values, then the NUL
        terminated keys and values. The entries of the file are left
        untouched.
        """
        offsets = []
        # the keys are sorted in the .mo file; sort a copy since
        # translated_entries() may return the file object itself and
        # serializing must not reorder it
        entries = sorted(
            self.translated_entries(),
            key=lambda o: o.msgid_with_context.encode('utf-8')
        )
        # add metadata entry
        mentry = self.metadata_as_entry()
        entries = [mentry] + entries
        entries_len = len(entries)
        ids, strs = b(''), b('')
        for e in entries:
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = b('')
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                msgstr = []
                for index in sorted(e.msgstr_plural.keys()):
                    msgstr.append(e.msgstr_plural[index])
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(msgstr))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
            ids += msgid + b('\0')
            strs += msgstr + b('\0')

        # The header is 7 32-bit unsigned integers.
        keystart = 7 * 4 + 16 * entries_len
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets

        output = struct.pack(
            "Iiiiiii",
            # Magic number
            MOFile.MAGIC,
            # Version
            0,
            # number of entries
            entries_len,
            # start of key index
            7 * 4,
            # start of value index
            7 * 4 + entries_len * 8,
            # size and offset of hash table, we don't use hash tables
            0, keystart

        )
        packed_offsets = array.array("i", offsets)
        # tobytes() replaces tostring() on recent pythons: pick whichever
        # method this interpreter provides
        if hasattr(packed_offsets, 'tobytes'):
            output += packed_offsets.tobytes()
        else:
            output += packed_offsets.tostring()
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's an unicode string and returns the encoded string.
        """
        if isinstance(mixed, text_type):
            mixed = mixed.encode(self.encoding)
        return mixed
626# }}}
627# class POFile {{{
628
629
class POFile(_BaseFile):
    """
    Reader/writer for po (or pot) files.
    It extends :class:`~polib._BaseFile` and therefore behaves like a python
    ``list`` of entries.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file: the header as
        comment lines, followed by the representation of the entries.
        """
        comment_lines = []
        for line in self.header.split('\n'):
            if not len(line):
                comment_lines.append("#\n")
            elif line[:1] in [',', ':']:
                # no space after the hash for "#," and "#:" lines
                comment_lines.append('#%s\n' % line)
            else:
                comment_lines.append('# %s\n' % line)
        header_text = ''.join(comment_lines)

        if not isinstance(header_text, text_type):
            header_text = header_text.decode(self.encoding)

        return header_text + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Writes the binary (mo) representation of the file to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Returns, as an integer, the percentage of translated messages among
        the entries that are not obsolete (``100`` when there is no such
        entry).
        """
        live_entries = [e for e in self if not e.obsolete]
        if not live_entries:
            return 100
        translated_count = len(self.translated_entries())
        return int(translated_count * 100 / float(len(live_entries)))

    def translated_entries(self):
        """
        Returns the list of the entries that are translated.
        """
        return [e for e in self if e.translated()]

    def untranslated_entries(self):
        """
        Returns the list of the entries that are neither translated, nor
        obsolete, nor fuzzy.
        """
        return [e for e in self
                if not (e.translated() or e.obsolete or e.fuzzy)]

    def fuzzy_entries(self):
        """
        Returns the list of the entries that are fuzzy.
        """
        return [e for e in self if e.fuzzy]

    def obsolete_entries(self):
        """
        Returns the list of the entries that are obsolete.
        """
        return [e for e in self if e.obsolete]

    def merge(self, refpot):
        """
        Merges the current pofile with the reference pot file provided, in
        the spirit of the gettext msgmerge utility:

        * every entry of ``refpot`` is merged (``POEntry.merge``) into the
          entry of this file that has the same msgid and context, a new
          entry being appended to this file when there is none;
        * the entries of this file that are not in ``refpot`` are marked
          obsolete.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        # index both catalogs by msgid + context for faster access
        own_entries = {}
        for own in self:
            own_entries[own.msgid_with_context] = own
        reference_ids = set(ref.msgid_with_context for ref in refpot)
        # update (or create) the entries that are in the refpot
        for ref in refpot:
            target = own_entries.get(ref.msgid_with_context)
            if target is None:
                target = POEntry()
                self.append(target)
            target.merge(ref)
        # ok, now we must "obsolete" entries that are not in the refpot anymore
        for own in self:
            if own.msgid_with_context not in reference_ids:
                own.obsolete = True
734# }}}
735# class MOFile {{{
736
737
class MOFile(_BaseFile):
    """
    Reader/writer for mo files.
    It extends :class:`~polib._BaseFile` and therefore behaves like a python
    ``list`` of entries.
    """
    # magic number written at the beginning of the binary output
    MAGIC = 0x950412de
    # the same magic number with its bytes in reverse order
    MAGIC_SWAPPED = 0xde120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, takes the same keyword arguments as the
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.version = 0
        self.magic_number = None

    def save_as_pofile(self, fpath):
        """
        Writes the text (po) representation of the mofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        # the base class writes the unicode representation by default
        _BaseFile.save(self, fpath)

    def save(self, fpath=None):
        """
        Writes the binary representation of the mofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, repr_method='to_binary')

    def percent_translated(self):
        """
        Always ``100``; provided to keep the same interface with POFile
        instances.
        """
        return 100

    def translated_entries(self):
        """
        The file itself; provided to keep the same interface with POFile
        instances.
        """
        return self

    def untranslated_entries(self):
        """
        Always an empty list; provided to keep the same interface with
        POFile instances.
        """
        return []

    def fuzzy_entries(self):
        """
        Always an empty list; provided to keep the same interface with
        POFile instances.
        """
        return []

    def obsolete_entries(self):
        """
        Always an empty list; provided to keep the same interface with
        POFile instances.
        """
        return []
807# }}}
808# class _BaseEntry {{{
809
810
class _BaseEntry(object):
    """
    Common base of the :class:`~polib.POEntry` and :class:`~polib.MOEntry`
    classes. This class should **not** be instantiated directly.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid (default: empty string).

        ``msgstr``
            string, the entry msgstr (default: empty string).

        ``msgid_plural``
            string, the entry msgid_plural (default: empty string).

        ``msgstr_plural``
            the entry msgstr_plural lines, keyed by plural index (default:
            an empty dict).

        ``msgctxt``
            string, the entry context (msgctxt) (default: ``None``).

        ``obsolete``
            bool, whether the entry is "obsolete" or not (default:
            ``False``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        option = kwargs.get
        self.msgid = option('msgid', '')
        self.msgstr = option('msgstr', '')
        self.msgid_plural = option('msgid_plural', '')
        self.msgstr_plural = option('msgstr_plural', {})
        self.msgctxt = option('msgctxt', None)
        self.obsolete = option('obsolete', False)
        self.encoding = option('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry: the msgctxt (if
        any), the msgid, the msgid_plural (if any), then either the indexed
        plural msgstr lines or the msgstr. Every line of an obsolete entry
        is prefixed with ``#~ ``.
        """
        delflag = '#~ ' if self.obsolete else ''
        output = []
        # the context comes first, when there is one
        if self.msgctxt is not None:
            output.extend(self._str_field("msgctxt", delflag, "",
                                          self.msgctxt, wrapwidth))
        # then the msgid
        output.extend(self._str_field("msgid", delflag, "", self.msgid,
                                      wrapwidth))
        # then the msgid_plural, when there is one
        if self.msgid_plural:
            output.extend(self._str_field("msgid_plural", delflag, "",
                                          self.msgid_plural, wrapwidth))
        if self.msgstr_plural:
            # one msgstr[index] field per plural form, in index order
            for index in sorted(self.msgstr_plural):
                output.extend(self._str_field(
                    "msgstr", delflag, '[%s]' % index,
                    self.msgstr_plural[index], wrapwidth))
        else:
            # otherwise the plain msgstr
            output.extend(self._str_field("msgstr", delflag, "",
                                          self.msgstr, wrapwidth))
        # the trailing empty item ends the representation with a newline
        output.append('')
        return u('\n').join(output)

    if PY3:
        def __str__(self):
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return unicode(self).encode(self.encoding)

    def __eq__(self, other):
        # two entries are equal when their string representations are equal
        return str(self) == str(other)

    def _str_field(self, fieldname, delflag, plural_index, field,
                   wrapwidth=78):
        """
        Returns the list of lines representing the field ``fieldname``.

        Arguments:

        ``fieldname``
            string, the name of the field; a ``previous_`` prefix is removed
            from the name that is written.

        ``delflag``
            string, the prefix put in front of every line.

        ``plural_index``
            string, the ``[n]`` suffix of the field name, or an empty string.

        ``field``
            string, the value of the field.

        ``wrapwidth``
            integer, the width at which a single-line value is wrapped; no
            wrapping occurs when it is not positive.
        """
        chunks = field.splitlines(True)
        if len(chunks) > 1:
            # multi-line value: start with initial empty line
            chunks.insert(0, '')
        else:
            special_count = sum(
                field.count(char) for char in ('\\', '\n', '\r', '\t', '"')
            )
            # room taken by the field name + one space + 2 quotes
            # (eg. msgid "<string>")
            name_length = len(fieldname) + 3
            if plural_index:
                name_length += len(plural_index)
            available_width = wrapwidth - name_length + special_count
            if wrapwidth > 0 and len(field) > available_width:
                # wrap the escaped value, leaving room for the 2 quotes
                wrapped = wrap(
                    escape(field),
                    wrapwidth - 2,
                    drop_whitespace=False,
                    break_long_words=False
                )
                chunks = [''] + [unescape(piece) for piece in wrapped]
            else:
                chunks = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[len('previous_'):]

        first_chunk, other_chunks = chunks[0], chunks[1:]
        result = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
                                   escape(first_chunk))]
        result.extend(
            '%s"%s"' % (delflag, escape(chunk)) for chunk in other_chunks
        )
        return result
936# }}}
937# class POEntry {{{
938
939
class POEntry(_BaseEntry):
    """
    Represents a po file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``comment``
            string, the entry comment.

        ``tcomment``
            string, the entry translator comment.

        ``occurrences``
            list, the entry occurrences.

        ``flags``
            list, the entry flags.

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.

        ``wrapwidth``
            integer, the maximum line width; a value <= 0 disables the
            wrapping of comments and occurrences.
        """
        ret = []
        # comments first, if any (with text wrapping as xgettext does);
        # the ``comment`` attribute is not written for obsolete entries
        if self.obsolete:
            comments = [('tcomment', '# ')]
        else:
            comments = [('comment', '#. '), ('tcomment', '# ')]
        for attr, marker in comments:
            val = getattr(self, attr)
            if val:
                for comment in val.split('\n'):
                    too_long = len(comment) + len(marker) > wrapwidth
                    if wrapwidth > 0 and too_long:
                        ret += wrap(
                            comment,
                            wrapwidth,
                            initial_indent=marker,
                            subsequent_indent=marker,
                            break_long_words=False
                        )
                    else:
                        ret.append('%s%s' % (marker, comment))

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append('%s:%s' % (fpath, lineno))
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
                # textwrap split words that contain hyphen, this is not
                # what we want for filenames, so the dirty hack is to
                # temporally replace hyphens with a char that a file cannot
                # contain, like "*"
                ret += [line.replace('*', '-') for line in wrap(
                    filestr.replace('-', '*'),
                    wrapwidth,
                    initial_indent='#: ',
                    subsequent_indent='#: ',
                    break_long_words=False
                )]
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        fields = ['previous_msgctxt', 'previous_msgid',
                  'previous_msgid_plural']
        if self.obsolete:
            prefix = "#~| "
        else:
            prefix = "#| "
        for f in fields:
            val = getattr(self, f)
            if val:
                ret += self._str_field(f, prefix, "", val, wrapwidth)

        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        ret = u('\n').join(ret)
        return ret

    @staticmethod
    def _plural_forms_key(entry):
        """
        Returns the plural translations of ``entry`` as a sorted list of
        ``(index, msgstr)`` pairs (an empty list when there are none).

        Dictionaries cannot be ordered with ``<``/``>`` on python 3, lists
        of pairs can.
        """
        plural = entry.msgstr_plural
        if plural and isinstance(plural, dict):
            return sorted(plural.items())
        return []

    def __cmp__(self, other):
        """
        Called by comparison operations if rich comparison is not defined.

        Returns ``-1``, ``0`` or ``1``.  Entries are ordered by obsolete
        state (obsolete entries first), then by occurrences, msgctxt,
        msgid_plural, msgstr_plural, msgid and finally msgstr.
        """
        # First: Obsolete test
        if self.obsolete != other.obsolete:
            if self.obsolete:
                return -1
            return 1
        # Missing values fall back to an empty value *of the same type* as
        # a real one: comparing a string with the integer 0 (or two dicts)
        # raises TypeError on python 3.  An empty string still sorts before
        # any real string, as the integer fallback did on python 2.
        pairs = (
            # sorted() works on a copy, the original lists are untouched
            (sorted(self.occurrences), sorted(other.occurrences)),
            (self.msgctxt or '', other.msgctxt or ''),
            (self.msgid_plural or '', other.msgid_plural or ''),
            (self._plural_forms_key(self), self._plural_forms_key(other)),
            (self.msgid, other.msgid),
            # msgstr takes part in the comparison so that entries comparing
            # equal also share the same hash (see __hash__)
            (self.msgstr, other.msgstr),
        )
        for mine, theirs in pairs:
            if mine > theirs:
                return 1
            if mine < theirs:
                return -1
        return 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise.

        Obsolete and fuzzy entries are never considered translated; a
        plural entry is translated only when none of its plural forms is
        empty.
        """
        if self.obsolete or self.fuzzy:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            return all(value != '' for value in self.msgstr_plural.values())
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry.

        Everything but the translations (and the translator comment) is
        taken from ``other``; the fuzzy state of the current entry is kept.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        self.occurrences = other.occurrences
        self.comment = other.comment
        fuzzy = self.fuzzy
        self.flags = other.flags[:]  # clone flags
        if fuzzy and 'fuzzy' not in self.flags:
            # do not write the flag twice when ``other`` is fuzzy too
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                # keep existing translation at pos if any
                self.msgstr_plural.setdefault(pos, '')

    @property
    def fuzzy(self):
        """``True`` when the entry carries the ``fuzzy`` flag."""
        return 'fuzzy' in self.flags

    @property
    def msgid_with_context(self):
        """
        The msgid prefixed, when the entry has a context, by the msgctxt
        and the ``\\x04`` separator.
        """
        if self.msgctxt:
            return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
        return self.msgid

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1175# }}}
1176# class MOEntry {{{
1177
1178
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """
    def __init__(self, *args, **kwargs):
        """
        Constructor.  For consistency with :class:`~polib.POEntry` the
        following keyword arguments are accepted:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: they have no meaning for MO files, whatever is passed for
        them is ignored and the matching attributes are set to empty
        values.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # po-only attributes always hold an empty value on mo entries
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None
        self.comment = self.tcomment = ''
        # two distinct lists, the attributes must not share one object
        self.occurrences, self.flags = [], []

    def __hash__(self):
        key = (self.msgid, self.msgstr)
        return hash(key)

1211
1212# }}}
1213# class _POFileParser {{{
1214
1215
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    # States checked before starting a new entry: 'ms' and 'mx' mean that a
    # msgstr has been read for the current entry.  'mc' is kept for
    # completeness although handle_mc() never switches the state.
    _ENTRY_END_STATES = ('mc', 'ms', 'mx')

    def __init__(self, pofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries, defaults to
            ``POFile`` (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name, fall back to the default one
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp',
                      'tc', 'ms', 'mp', 'mx', 'mi']

        self.add('tc', ['st', 'he'],                                     'he')
        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
                        'mp', 'mx', 'mi'],                               'tc')
        self.add('gc', all_states,                                       'gc')
        self.add('oc', all_states,                                       'oc')
        self.add('fl', all_states,                                       'fl')
        self.add('pc', all_states,                                       'pc')
        self.add('pm', all_states,                                       'pm')
        self.add('pp', all_states,                                       'pp')
        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
                        'pp', 'ms', 'mx'],                               'ct')
        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
                        'pm', 'pp', 'ms', 'mx'],                         'mi')
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'],             'mp')
        self.add('ms', ['mi', 'mp', 'tc'],                               'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'],                         'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the instance filled with the parsed entries and metadata.
        Raises ``IOError`` when a syntax error is found; the file opened by
        the constructor is closed in every case.
        """
        try:
            self._parse_lines()
            self._extract_metadata()
        finally:
            # close opened file, also when a syntax error interrupts parsing
            if not isinstance(self.fhandle, list):  # must be file
                self.fhandle.close()
        return self.instance

    def _check_quotes(self, line, fpath):
        """
        Raise ``IOError`` if the quoted string ``line`` contains an
        unescaped double quote between its delimiters.
        """
        if re.search(r'([^\\]|^)"', line[1:-1]):
            raise IOError('Syntax error in po file %s(line %s): '
                          'unescaped double quote found' %
                          (fpath, self.current_line))

    def _parse_lines(self):
        """
        Feed every non empty line to the state machine and add the last
        entry to the instance.
        """
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        for line in self.fhandle:
            self.current_line += 1
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            # previous msgid/msgctxt of obsolete entries are skipped
            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete entry: drop the marker and parse what follows
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]):].lstrip()
                self._check_quotes(line, fpath)
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                self._check_quotes(line, fpath)
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise IOError('Syntax error in po file %s(line %s)' %
                                  (fpath, self.current_line))

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise IOError('Syntax error in po file %s(line %s): '
                                  'invalid continuation line' %
                                  (fpath, self.current_line))

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise IOError('Syntax error in po file %s(line %s): '
                                  'unknown keyword %s' %
                                  (fpath, self.current_line,
                                   tokens[1]))

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]):].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise IOError('Syntax error in po file %s(line %s)' %
                              (fpath, self.current_line))

        if self.current_entry and len(tokens) > 0 and \
           not tokens[0].startswith('#'):
            # since entries are added when another entry is found, we must add
            # the last entry here (only if there are lines). Trailing comments
            # are ignored
            self.instance.append(self.current_entry)

    def _extract_metadata(self):
        """
        Move the entry with an empty msgid, if any, out of the entries and
        store its msgstr lines in the metadata dict of the instance.
        """
        metadataentry = self.instance.find('')
        if not metadataentry:
            return
        # remove the entry
        self.instance.remove(metadataentry)
        self.instance.metadata_is_fuzzy = metadataentry.flags
        key = None
        for msg in metadataentry.msgstr.splitlines():
            try:
                key, val = msg.split(':', 1)
                self.instance.metadata[key] = val.strip()
            except (ValueError, KeyError):
                # no "key: value" pair on this line, it continues the
                # value of the previous key
                if key is not None:
                    self.instance.metadata[key] += '\n' + msg.strip()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        Raises ``IOError`` when no transition exists for the symbol in the
        current state or when the handler fails.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception:
            # any failure (unknown transition, malformed token...) is
            # reported as a syntax error at the current line
            raise IOError('Syntax error in po file (line %s)' %
                          self.current_line)

    # state handlers

    def _start_new_entry_if_needed(self):
        """
        Add the current entry to the instance and start a new one if the
        state machine is in one of the ``_ENTRY_END_STATES``.
        """
        if self.current_state in self._ENTRY_END_STATES:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    @staticmethod
    def _plural_index(token):
        """
        Return the integer index of a ``msgstr[N] "..."`` token.

        The digits between the brackets are used (so indexes above 9 are
        read entirely); when they cannot be isolated, the single character
        following the opening bracket is converted instead.
        """
        closing = token.find(']', 7)
        index = token[7:closing] if closing > 7 else ''
        if not index.isdigit():
            index = token[7]
        return int(index)

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return True

    def handle_tc(self):
        """Handle a translator comment."""
        self._start_new_entry_if_needed()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._start_new_entry_if_needed()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._start_new_entry_if_needed()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the colon is part of the path, not a line number
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._start_new_entry_if_needed()
        self.current_entry.flags += [c.strip() for c in
                                     self.current_token[3:].split(',')]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._start_new_entry_if_needed()
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._start_new_entry_if_needed()
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._start_new_entry_if_needed()
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._start_new_entry_if_needed()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._start_new_entry_if_needed()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        index = self._plural_index(token)
        value = token[token.find('"') + 1:-1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        # remembered for the continuation lines of this plural form
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1639# }}}
1640# class _MOFileParser {{{
1641
1642
class _MOFileParser(object):
    """
    A class to parse binary mo files.
    """

    def __init__(self, mofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries, defaults to
            ``MOFile`` (optional).
        """
        self.fhandle = open(mofile, 'rb')

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # the attribute does not exist if open() failed in the constructor
        fhandle = getattr(self, 'fhandle', None)
        if fhandle:
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the instance filled with the entries and metadata read from
        the file.  Raises ``IOError`` when the magic number or the major
        revision number is not a supported one; the file is closed in every
        case.
        """
        try:
            # parse magic number
            magic_number = self._readbinary('<I', 4)
            if magic_number == MOFile.MAGIC:
                ii = '<II'
            elif magic_number == MOFile.MAGIC_SWAPPED:
                ii = '>II'
            else:
                raise IOError('Invalid mo file, magic number is incorrect !')
            self.instance.magic_number = magic_number
            # parse the version number and the number of strings
            version, numofstrings = self._readbinary(ii, 8)
            # from MO file format specs: "A program seeing an unexpected major
            # revision number should stop reading the MO file entirely"
            if version >> 16 not in (0, 1):
                raise IOError(
                    'Invalid mo file, unexpected major revision number')
            self.instance.version = version
            # original strings and translation strings hash table offset
            msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
            # move to msgid hash table and read length and offset of msgids
            self.fhandle.seek(msgids_hash_offset)
            msgids_index = [self._readbinary(ii, 8)
                            for _ in range(numofstrings)]
            # move to msgstr hash table and read length and offset of msgstrs
            self.fhandle.seek(msgstrs_hash_offset)
            msgstrs_index = [self._readbinary(ii, 8)
                             for _ in range(numofstrings)]
            # build entries
            for i in range(numofstrings):
                msgid_length, msgid_offset = msgids_index[i]
                self.fhandle.seek(msgid_offset)
                msgid = self.fhandle.read(msgid_length)

                msgstr_length, msgstr_offset = msgstrs_index[i]
                self.fhandle.seek(msgstr_offset)
                msgstr = self.fhandle.read(msgstr_length)
                if i == 0 and not msgid:  # metadata
                    self.instance.metadata = self._parse_metadata(msgstr)
                    continue
                # test if we have a plural entry
                msgid_tokens = msgid.split(b('\0'))
                if len(msgid_tokens) > 1:
                    entry = self._build_entry(
                        msgid=msgid_tokens[0],
                        msgid_plural=msgid_tokens[1],
                        msgstr_plural=dict(enumerate(msgstr.split(b('\0'))))
                    )
                else:
                    entry = self._build_entry(msgid=msgid, msgstr=msgstr)
                self.instance.append(entry)
        finally:
            # close opened file, also when the file turns out to be invalid
            self.fhandle.close()
        return self.instance

    def _parse_metadata(self, msgstr):
        """
        Return the dict of the ``key: value`` lines found in the raw
        (bytes) msgstr of the metadata entry.
        """
        encoding = self.instance.encoding
        metadata = {}
        for line in msgstr.split(b('\n')):
            tokens = line.split(b(':'), 1)
            if tokens[0] == b(''):
                continue
            key = tokens[0].decode(encoding)
            if len(tokens) > 1:
                metadata[key] = tokens[1].decode(encoding).strip()
            else:
                # line without colon: the key gets an empty value
                metadata[key] = u('')
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                     msgstr_plural=None):
        """
        Return a ``MOEntry`` built from the raw (bytes) values read from
        the file, decoded with the encoding of the instance.

        ``msgid`` may start with a context followed by ``\\x04``;
        ``msgstr_plural`` is a dict mapping plural indexes to raw values.
        """
        encoding = self.instance.encoding
        # split on the first separator only, the msgid itself is kept whole
        msgctxt_msgid = msgid.split(b('\x04'), 1)
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            # decode into a new dict, the caller's one is left untouched
            kwargs['msgstr_plural'] = dict(
                (index, value.decode(encoding))
                for index, value in msgstr_plural.items()
            )
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.
        """
        data = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, data)
        if len(tup) == 1:
            return tup[0]
        return tup
1785# }}}
1786# class TextWrapper {{{
1787
1788
class TextWrapper(textwrap.TextWrapper):
    """
    textwrap.TextWrapper subclass that backports the ``drop_whitespace``
    option by honouring it in its own line-filling loop.
    """
    def __init__(self, *args, **kwargs):
        # The option is taken out of kwargs first, so the parent
        # constructor is never handed the keyword.
        drop_option = kwargs.pop('drop_whitespace', True)
        textwrap.TextWrapper.__init__(self, *args, **kwargs)
        self.drop_whitespace = drop_option

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Assemble the given chunks into lines and return the list of lines.

        Every line is made of its indent (``initial_indent`` for the
        first line, ``subsequent_indent`` for the others) followed by as
        many chunks as fit in ``self.width``.  A chunk wider than a whole
        line is delegated to ``_handle_long_word``.  When
        ``drop_whitespace`` is true, a whitespace-only chunk is removed
        from the end of every line and from the start of every line but
        the first.

        The ``chunks`` list is consumed (reversed, then emptied) by this
        method.  ValueError is raised when ``self.width`` is not positive.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Once reversed the list works as a stack: the next chunk to place
        # is always the last item and can be popped cheaply.
        chunks.reverse()

        while chunks:

            # Only the very first line uses the initial indent.
            if not lines:
                prefix = self.initial_indent
            else:
                prefix = self.subsequent_indent

            # Room left for chunks once the prefix is accounted for.
            room = self.width - len(prefix)

            # A line other than the first one must not open with a
            # whitespace chunk.
            if lines and self.drop_whitespace and not chunks[-1].strip():
                chunks.pop()

            # Pile chunks onto the line for as long as the next one fits;
            # `used` is the total length of the chunks in `pieces`.
            pieces = []
            used = 0
            while chunks and used + len(chunks[-1]) <= room:
                piece = chunks.pop()
                pieces.append(piece)
                used += len(piece)

            # The pending chunk is wider than a whole line, so it would
            # not fit on the next line either.
            if chunks and len(chunks[-1]) > room:
                self._handle_long_word(chunks, pieces, used, room)

            # A line must not end with a whitespace chunk.
            if self.drop_whitespace and pieces and not pieces[-1].strip():
                pieces.pop()

            # Lines left without any chunk are not emitted.
            if pieces:
                lines.append(prefix + ''.join(pieces))

        return lines
1868# }}}
1869# function wrap() {{{
1870
1871
def wrap(text, width=70, **kwargs):
    """
    Wrap the single paragraph ``text`` and return the list of wrapped lines.

    ``width`` and any extra keyword argument are forwarded to the wrapper.
    The standard :func:`textwrap.wrap` is used from python 2.6 on; older
    interpreters go through the module's own :class:`TextWrapper`.
    """
    if sys.version_info >= (2, 6):
        return textwrap.wrap(text, width=width, **kwargs)
    wrapper = TextWrapper(width=width, **kwargs)
    return wrapper.wrap(text)
1879
1880# }}}
1881