1#
2# Copyright 2007 Zuza Software Foundation
3#
4# the function "serialize" was derived from Python v2.4
5#       (Tools/i18n/msgfmt.py - function "generate"):
6#   Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
7#   Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
8#   All rights reserved.
9#   original license: Python Software Foundation (version 2)
10#
11#
12# This file is part of translate.
13#
14# translate is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# translate is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, see <http://www.gnu.org/licenses/>.
26#
27
28"""Module for parsing Gettext .mo files for translation.
29
The coding of .mo files was produced from `Gettext documentation
<http://www.gnu.org/software/gettext/manual/gettext.html#MO-Files>`_,
Python's msgfmt.py and by observing and testing existing .mo files in the wild.

The hash algorithm is implemented for MO files; this should result in
faster access of the MO file.  The hash is optional for Gettext
and is not needed for reading or writing MO files.  In this implementation
it is always on, and it sometimes produces different results from Gettext
in very small files.
39"""
40
import array
import codecs
import re
import struct

from translate.misc.multistring import multistring
from translate.storage import base, poheader
47
48
# Magic number stored in the first four bytes of an MO file.  ``mofile.parse``
# compares it against both byte orders to detect the endianness of the file,
# and ``mofile.serialize`` writes it as the first header field.
MO_MAGIC_NUMBER = 0x950412DE
# Matches a complete "POT-Creation-Date:" header line (including its trailing
# newline, if any).  ``mofile.serialize`` removes it from the header unit.
POT_HEADER = re.compile(r"^POT-Creation-Date:.*(\n|$)", re.IGNORECASE | re.MULTILINE)
51
52
def mounpack(filename="messages.mo"):
    """Helper to print a Gettext MO file as an escaped Python byte string.

    Every byte of the file is printed as a ``\\xNN`` escape (two lowercase
    hex digits), all on a single line.

    :param filename: path of the MO file to dump.
    """
    with open(filename, "rb") as fh:
        data = fh.read()
    # Iterating over ``bytes`` already yields integers, so no ``ord`` call is
    # needed (calling ``ord`` on an int raises TypeError).
    print("".join("\\x%02x" % byte for byte in data))
58
59
def my_swap4(result):
    """Return the low 32 bits of ``result`` with their four bytes reversed.

    Bits above the lowest 32 are ignored.
    """
    swapped = 0
    for _ in range(4):
        # Peel off the lowest byte and push it onto the right of the output;
        # the first byte taken therefore ends up as the most significant one.
        swapped = (swapped << 8) | (result & 0xFF)
        result >>= 8
    return swapped
67
68
def hashpjw(str_param):
    """Compute the hash value of a byte string for the MO hash table.

    ``str_param`` is iterated element by element and each element is added
    as an integer (so it is expected to be ``bytes``).  Hashing stops at the
    first zero byte.
    """
    word_bits = 32
    # Mask selecting the top four bits of a 32-bit word.
    top_nibble = 0xF << (word_bits - 4)
    digest = 0
    for byte in str_param:
        if not byte:
            break
        digest = (digest << 4) + byte
        overflow = digest & top_nibble
        if overflow != 0:
            # Fold the top nibble back into the low bits, then clear it.
            digest ^= overflow >> (word_bits - 8)
            digest ^= overflow
    return digest
82
83
def get_next_prime_number(start):
    """Return the first odd candidate at or above ``start`` that passes the
    trial-division test below.

    This is based on the hash lib implementation in gettext.  The test is
    only meaningful for odd numbers of 10 or more; smaller inputs follow the
    same arithmetic without special-casing.
    """

    def passes_trial_division(number):
        # No even number and none less than 10 is expected here.
        divisor = 3
        square = divisor * divisor

        # Try odd divisors while divisor**2 < number.  ``square`` tracks
        # divisor**2 incrementally: (d + 2)**2 == d**2 + 4 * (d + 1).
        while square < number and number % divisor != 0:
            square += 4 * (divisor + 1)
            divisor += 2

        return number % divisor != 0

    # Force the lowest bit so that only odd numbers are tested.
    candidate = start | 1
    while True:
        if passes_trial_division(candidate):
            return candidate
        candidate += 2
107
108
class mounit(base.TranslationUnit):
    """A single translation message of a .mo file."""

    def __init__(self, source=None, **kwargs):
        # Context and msgid-comment fragments both start out empty; extra
        # keyword arguments are accepted but not forwarded.
        self.msgctxt, self.msgidcomments = [], []
        super().__init__(source)

    def getcontext(self):
        """Return the message context as one string (None if unset)."""
        # Still need to handle KDE comments
        fragments = self.msgctxt
        if fragments is not None:
            return "".join(fragments)
        return None

    def setcontext(self, context):
        """Replace the message context with the single given value."""
        self.msgctxt = [context]

    def isheader(self):
        """Tell whether this unit is the header entry (empty source)."""
        return self.source == ""

    def istranslatable(self):
        """Tell whether this unit has a non-empty source to translate."""
        return True if self.source else False
134
135
class mofile(poheader.poheader, base.TranslationStore):
    """A class representing a .mo file."""

    UnitClass = mounit
    Name = "Gettext MO file"
    Mimetypes = ["application/x-gettext-catalog", "application/x-mo"]
    Extensions = ["mo", "gmo"]
    _binary = True

    def __init__(self, inputfile=None, **kwargs):
        """Create a store, optionally filled from ``inputfile``.

        :param inputfile: MO data as bytes, or a binary file-like object;
            anything accepted by :meth:`parse`.
        """
        super().__init__(**kwargs)
        self.filename = ""
        if inputfile is not None:
            # Parse straight into this instance so the units end up here.
            self.parse(inputfile)

    def serialize(self, out):
        """Write the binary MO representation of this store to ``out``.

        Only units that report ``istranslated()`` and have a non-empty target
        are written.  Strings are always encoded as UTF-8 and the integers
        are written in the byte order of the running platform.

        :param out: binary file-like object providing ``write``.
        """
        # check the header of this file for the copyright note of this function

        def add_to_hash_table(string, i):
            # Start at a slot derived from the hash and step forward by a
            # key-dependent increment until a free slot is found.  Slots hold
            # the 1-based key index because 0 marks an empty slot.
            hash_value = hashpjw(string)
            hash_cursor = hash_value % hash_size
            increment = 1 + (hash_value % (hash_size - 2))
            while hash_table[hash_cursor] != 0:
                hash_cursor = (hash_cursor + increment) % hash_size
            hash_table[hash_cursor] = i + 1

        def lst_encode(lst, join_char=b""):
            return join_char.join([i.encode("utf-8") for i in lst])

        # hash_size should be the smallest prime number that is greater
        # or equal (4 / 3 * N) - where N is the number of keys/units.
        # see gettext-0.17:gettext-tools/src/write-mo.c:406
        hash_size = get_next_prime_number((len(self.units) * 4) // 3)
        if hash_size <= 2:
            hash_size = 3
        messages = {}
        for unit in self.units:
            # If the unit is not translated, we should rather omit it entirely
            if not unit.istranslated():
                continue
            msgid_prefix = lst_encode(unit.msgidcomments)
            if isinstance(unit.source, multistring):
                # Plural forms are separated by NUL inside one entry.
                source = msgid_prefix + lst_encode(unit.source.strings, b"\0")
            else:
                source = msgid_prefix + unit.source.encode("utf-8")
            if unit.msgctxt:
                # The context is prepended to the key, separated by EOT.
                source = lst_encode(unit.msgctxt) + b"\x04" + source
            if isinstance(unit.target, multistring):
                target = lst_encode(unit.target.strings, b"\0")
            elif unit.isheader():
                # Support for "reproducible builds": Delete information that
                # may vary between builds in the same conditions.
                target = POT_HEADER.sub("", unit.target).encode("utf-8")
            else:
                target = unit.target.encode("utf-8")
            if unit.target:
                messages[source] = target
        # using "I" works for 32- and 64-bit systems, but not for 16-bit!
        hash_table = array.array("I", [0] * hash_size)
        # the keys are sorted in the .mo file
        keys = sorted(messages)
        offsets = []
        ids_length = strs_length = 0
        for index, key in enumerate(keys):
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            # TODO: We don't do any encoding detection from the PO Header
            add_to_hash_table(key, index)
            value = messages[key]  # key already encoded for use as dictionary key
            offsets.append((ids_length, len(key), strs_length, len(value)))
            ids_length += len(key) + 1
            strs_length += len(value) + 1
        # Join once instead of growing the byte strings inside the loop.
        ids = b"".join(key + b"\0" for key in keys)
        strs = b"".join(messages[key] + b"\0" for key in keys)
        # The header is 7 32-bit unsigned integers
        keystart = 7 * 4 + 16 * len(keys) + hash_size * 4
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets.extend((l1, o1 + keystart))
            voffsets.extend((l2, o2 + valuestart))
        out.write(
            struct.pack(
                "Iiiiiii",
                MO_MAGIC_NUMBER,  # Magic
                0,  # Version
                len(keys),  # # of entries
                7 * 4,  # start of key index
                7 * 4 + len(keys) * 8,  # start of value index
                hash_size,  # size of hash table
                7 * 4 + 2 * (len(keys) * 8),  # offset of hash table
            )
        )
        # additional data is not necessary for empty mo files
        if keys:
            out.write(array.array("i", koffsets + voffsets).tobytes())
            out.write(hash_table.tobytes())
            out.write(ids)
            out.write(strs)

    def parse(self, input):
        """Parse the given MO data and add its entries as units.

        :param input: MO data as bytes, or a binary file-like object.  A
            file-like object is read completely and then closed; its
            ``name`` attribute, if present, is stored as ``self.filename``.
        :raises ValueError: if the data does not start with the MO magic
            number in either byte order.
        :raises base.ParseError: if the major revision of the file is 1 or
            higher.
        """
        if hasattr(input, "name"):
            self.filename = input.name
        elif not getattr(self, "filename", ""):
            self.filename = ""
        if hasattr(input, "read"):
            try:
                mosrc = input.read()
            finally:
                # Close the handle even if reading fails.
                input.close()
            input = mosrc
        (little,) = struct.unpack("<L", input[:4])
        (big,) = struct.unpack(">L", input[:4])
        if little == MO_MAGIC_NUMBER:
            endian = "<"
        elif big == MO_MAGIC_NUMBER:
            endian = ">"
        else:
            raise ValueError("This is not an MO file")
        (
            _magic,
            revision,
            lenkeys,
            startkey,
            startvalue,
            _sizehash,
            _offsethash,
        ) = struct.unpack("%sLLiiiii" % endian, input[: (7 * 4)])
        # The revision is one 32-bit value: major number in the upper 16
        # bits, minor number in the lower 16 bits.  Reading it as a whole
        # keeps the two halves in the right order for both byte orders.
        version_maj = revision >> 16
        version_min = revision & 0xFFFF
        if version_maj >= 1:
            raise base.ParseError(
                """Unable to process version %d.%d MO files"""
                % (version_maj, version_min)
            )
        entry = struct.Struct("%sii" % endian)
        for i in range(lenkeys):
            # Each table entry is a (length, offset) pair of 32-bit integers.
            klength, koffset = entry.unpack_from(input, startkey + (i * 2 * 4))
            vlength, voffset = entry.unpack_from(input, startvalue + (i * 2 * 4))
            source = input[koffset : koffset + klength]
            value = input[voffset : voffset + vlength]
            context = None
            if b"\x04" in source:
                # Split on the first EOT only, so a stray EOT in the msgid
                # cannot break the two-value unpacking.
                context, source = source.split(b"\x04", 1)
            # Still need to handle KDE comments
            if not source:
                # The entry with the empty msgid is the header; take the
                # charset from it so the entries are decoded correctly.
                charset = re.search(b"charset=([^\\s]+)", value)
                if charset:
                    name = charset.group(1).decode("ascii", "replace")
                    try:
                        codecs.lookup(name)
                    except (LookupError, ValueError):
                        # Unknown name (e.g. a placeholder): keep the
                        # current encoding.
                        pass
                    else:
                        self.encoding = name
            source = multistring([s.decode(self.encoding) for s in source.split(b"\0")])
            target = multistring([s.decode(self.encoding) for s in value.split(b"\0")])
            newunit = mounit(source)
            newunit.target = target
            if context is not None:
                newunit.msgctxt.append(context.decode(self.encoding))
            self.addunit(newunit)
307