#
# Copyright 2007 Zuza Software Foundation
#
# the function "serialize" was derived from Python v2.4
#       (Tools/i18n/msgfmt.py - function "generate"):
#   Written by Martin v. Löwis <loewis@informatik.hu-berlin.de>
#   Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
#   All rights reserved.
#   original license: Python Software Foundation (version 2)
#
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
#

"""Module for parsing Gettext .mo files for translation.

The coding of .mo files was produced from `Gettext documentation
<http://www.gnu.org/software/gettext/manual/gettext.html#MO-Files>`_,
Python's msgfmt.py and by observing and testing existing .mo files in the wild.

The hash algorithm is implemented for MO files, this should result in
faster access of the MO file.  The hash is optional for Gettext
and is not needed for reading or writing MO files, in this implementation
it is always on and does produce sometimes different results to Gettext
in very small files.
"""

import array
import codecs
import re
import struct

from translate.misc.multistring import multistring
from translate.storage import base, poheader


MO_MAGIC_NUMBER = 0x950412DE
POT_HEADER = re.compile(r"^POT-Creation-Date:.*(\n|$)", re.IGNORECASE | re.MULTILINE)
# Extracts the charset name from the raw (still encoded) header entry.
_CHARSET_RE = re.compile(b"charset=([^\\s]+)")


def mounpack(filename="messages.mo"):
    """Helper to unpack Gettext MO files into a Python string.

    Reads ``filename`` in binary mode and prints every byte as a ``\\xNN``
    escape sequence.
    """
    with open(filename, "rb") as fh:
        s = fh.read()
        # Iterating over bytes already yields integers, so no ord() is needed
        # (calling ord() on an int raises TypeError).
        print("\\x%02x" * len(s) % tuple(s))


def my_swap4(result):
    """Return ``result`` with the order of its four low bytes reversed."""
    c0 = (result >> 0) & 0xFF
    c1 = (result >> 8) & 0xFF
    c2 = (result >> 16) & 0xFF
    c3 = (result >> 24) & 0xFF

    return (c0 << 24) | (c1 << 16) | (c2 << 8) | c3


def hashpjw(str_param):
    """Return the 32-bit "hashpjw" hash of a byte string.

    :param str_param: an iterable of integer byte values (e.g. ``bytes``).
        Hashing stops at the first zero byte, so only the part of a key
        before its first NUL contributes to the hash.
    :return: the hash value as a non-negative integer.
    """
    HASHWORDBITS = 32
    # Mask selecting the four most significant bits of a 32-bit word.
    top_nibble_mask = 0xF << (HASHWORDBITS - 4)
    hval = 0
    for s in str_param:
        if not s:
            break
        hval = hval << 4
        hval += s
        g = hval & top_nibble_mask
        if g != 0:
            # Fold the top nibble back into the low bits, then clear it so
            # the value stays within 32 bits after the next shift.
            hval = hval ^ (g >> (HASHWORDBITS - 8))
            hval = hval ^ g
    return hval


def get_next_prime_number(start):
    """Return the first odd candidate >= ``start`` accepted by ``is_prime``.

    This is based on the hash lib implementation in gettext.  The inner
    primality test only trial-divides by odd numbers from 3 upwards, so for
    very small inputs the result is not a true "next prime" (for example 1 is
    accepted and 3 is rejected); ``mofile.serialize`` clamps results <= 2.
    """

    def is_prime(num):
        # No even number and none less than 10 will be passed here
        divn = 3
        sq = divn * divn

        # ``sq`` tracks ``divn ** 2`` incrementally:
        # (d + 2) ** 2 == d ** 2 + 4 * (d + 1)
        while sq < num and num % divn != 0:
            divn += 1
            sq += 4 * divn
            divn += 1

        return num % divn != 0

    # Make it definitely odd
    candidate = start | 1

    while not is_prime(candidate):
        candidate += 2

    return candidate


class mounit(base.TranslationUnit):
    """A class representing a .mo translation message."""

    def __init__(self, source=None, **kwargs):
        """Create a unit for ``source``; extra keyword arguments are ignored."""
        # Both are lists of string fragments that are joined when used.
        self.msgctxt = []
        self.msgidcomments = []
        super().__init__(source)

    def getcontext(self):
        """Get the message context"""
        # Still need to handle KDE comments
        if self.msgctxt is None:
            return None
        return "".join(self.msgctxt)

    def setcontext(self, context):
        """Replace the message context with ``context``."""
        self.msgctxt = [context]

    def isheader(self):
        """Is this a header entry?"""
        return self.source == ""

    def istranslatable(self):
        """Is this message translateable?"""
        return bool(self.source)


class mofile(poheader.poheader, base.TranslationStore):
    """A class representing a .mo file."""

    UnitClass = mounit
    Name = "Gettext MO file"
    Mimetypes = ["application/x-gettext-catalog", "application/x-mo"]
    Extensions = ["mo", "gmo"]
    _binary = True

    def __init__(self, inputfile=None, **kwargs):
        """Create the store and, if ``inputfile`` is given, parse it."""
        super().__init__(**kwargs)
        self.filename = ""
        if inputfile is not None:
            self.parsestring(inputfile)

    def serialize(self, out):
        """Write the binary MO representation of this store to ``out``.

        Only translated units with a non-empty target are written.  Keys and
        values are always encoded as UTF-8, and the integer tables are written
        in the machine's native byte order.

        :param out: a binary file-like object providing ``write()``.
        """
        # check the header of this file for the copyright note of this function

        def add_to_hash_table(string, i):
            # Open addressing with double hashing; slot value 0 means "empty",
            # so the stored index is offset by one.
            hash_value = hashpjw(string)
            hash_cursor = hash_value % hash_size
            increment = 1 + (hash_value % (hash_size - 2))
            while hash_table[hash_cursor] != 0:
                hash_cursor += increment
                hash_cursor = hash_cursor % hash_size
            hash_table[hash_cursor] = i + 1

        def lst_encode(lst, join_char=b""):
            return join_char.join([i.encode("utf-8") for i in lst])

        # hash_size should be the smallest prime number that is greater
        # or equal (4 / 3 * N) - where N is the number of keys/units.
        # see gettext-0.17:gettext-tools/src/write-mo.c:406
        hash_size = get_next_prime_number((len(self.units) * 4) // 3)
        if hash_size <= 2:
            hash_size = 3
        MESSAGES = {}
        for unit in self.units:
            # If the unit is not translated, we should rather omit it entirely
            if not unit.istranslated():
                continue
            if isinstance(unit.source, multistring):
                source = lst_encode(unit.msgidcomments) + lst_encode(
                    unit.source.strings, b"\0"
                )
            else:
                source = lst_encode(unit.msgidcomments) + unit.source.encode("utf-8")
            if unit.msgctxt:
                # The context is stored in front of the msgid, separated by EOT.
                source = lst_encode(unit.msgctxt) + b"\x04" + source
            if isinstance(unit.target, multistring):
                target = lst_encode(unit.target.strings, b"\0")
            elif unit.isheader():
                # Support for "reproducible builds": Delete information that
                # may vary between builds in the same conditions.
                target = POT_HEADER.sub("", unit.target).encode("utf-8")
            else:
                target = unit.target.encode("utf-8")
            if unit.target:
                MESSAGES[source] = target
        # using "I" works for 32- and 64-bit systems, but not for 16-bit!
        hash_table = array.array("I", [0] * hash_size)
        # the keys are sorted in the .mo file
        keys = sorted(MESSAGES)
        offsets = []
        ids_length = strs_length = 0
        for i, key in enumerate(keys):
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            # TODO: We don't do any encoding detection from the PO Header
            add_to_hash_table(key, i)
            value = MESSAGES[key]  # key already encoded for use as dictionary key
            offsets.append((ids_length, len(key), strs_length, len(value)))
            ids_length += len(key) + 1
            strs_length += len(value) + 1
        # Assemble the string blocks in one pass instead of repeatedly
        # concatenating bytes objects.
        ids = b"".join(key + b"\0" for key in keys)
        strs = b"".join(MESSAGES[key] + b"\0" for key in keys)
        # The header is 7 32-bit unsigned integers
        keystart = 7 * 4 + 16 * len(keys) + hash_size * 4
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets.extend((l1, o1 + keystart))
            voffsets.extend((l2, o2 + valuestart))
        offsets = koffsets + voffsets
        out.write(
            struct.pack(
                "Iiiiiii",
                MO_MAGIC_NUMBER,  # Magic
                0,  # Version
                len(keys),  # # of entries
                7 * 4,  # start of key index
                7 * 4 + len(keys) * 8,  # start of value index
                hash_size,  # size of hash table
                7 * 4 + 2 * (len(keys) * 8),  # offset of hash table
            )
        )
        # additional data is not necessary for empty mo files
        if len(keys) > 0:
            out.write(array.array("i", offsets).tobytes())
            out.write(hash_table.tobytes())
        out.write(ids)
        out.write(strs)

    def parse(self, input):
        """Parse the given file or file source string.

        :param input: the MO data as ``bytes``, or a binary file-like object
            (it is read completely and then closed).
        :raises ValueError: if the data does not start with the MO magic number.
        :raises base.ParseError: if the major format revision is not 0.
        """
        if hasattr(input, "name"):
            self.filename = input.name
        elif not getattr(self, "filename", ""):
            self.filename = ""
        if hasattr(input, "read"):
            mosrc = input.read()
            input.close()
            input = mosrc
        # The byte order of the file is detected from the magic number.
        (little,) = struct.unpack("<L", input[:4])
        (big,) = struct.unpack(">L", input[:4])
        if little == MO_MAGIC_NUMBER:
            endian = "<"
        elif big == MO_MAGIC_NUMBER:
            endian = ">"
        else:
            raise ValueError("This is not an MO file")
        (
            _magic,
            revision,
            lenkeys,
            startkey,
            startvalue,
            _sizehash,
            _offsethash,
        ) = struct.unpack("%sLLiiiii" % endian, input[: (7 * 4)])
        # The revision is one 32-bit number: major in the upper 16 bits, minor
        # in the lower 16 bits.  Splitting it arithmetically is independent of
        # the file's byte order (reading two 16-bit fields is not).
        version_maj = revision >> 16
        version_min = revision & 0xFFFF
        if version_maj >= 1:
            raise base.ParseError(
                """Unable to process version %d.%d MO files"""
                % (version_maj, version_min)
            )
        # Each index entry is a (length, offset) pair of 32-bit integers.
        entry = struct.Struct("%sii" % endian)
        for i in range(lenkeys):
            nextkey = startkey + (i * 2 * 4)
            nextvalue = startvalue + (i * 2 * 4)
            klength, koffset = entry.unpack(input[nextkey : nextkey + (2 * 4)])
            vlength, voffset = entry.unpack(input[nextvalue : nextvalue + (2 * 4)])
            source = input[koffset : koffset + klength]
            rawtarget = input[voffset : voffset + vlength]
            context = None
            if b"\x04" in source:
                # Only the first EOT separates context and msgid.
                context, source = source.split(b"\x04", 1)
            # Still need to handle KDE comments
            if not source:
                # Header entry: pick up the charset so that this and all
                # following entries are decoded correctly.
                charset = _CHARSET_RE.search(rawtarget)
                if charset:
                    try:
                        encoding = charset.group(1).decode("ascii")
                        codecs.lookup(encoding)
                    except (UnicodeDecodeError, LookupError):
                        # Unknown or placeholder charset (e.g. "CHARSET"):
                        # keep the current encoding.
                        pass
                    else:
                        self.encoding = encoding
            source = multistring([s.decode(self.encoding) for s in source.split(b"\0")])
            target = multistring(
                [s.decode(self.encoding) for s in rawtarget.split(b"\0")]
            )
            newunit = mounit(source)
            newunit.target = target
            if context is not None:
                newunit.msgctxt.append(context.decode(self.encoding))
            self.addunit(newunit)