#
# Copyright 2002-2011 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Classes for the support of Gettext .po and .pot files.

This implementation assumes that cpo is working. This should not be used
directly, but can be used once cpo has been established to work.
"""

# TODO:
# - handle headerless PO files better
# - previous msgid and msgctxt
# - accept only unicodes everywhere

import copy
import logging
import re

from translate.misc.multistring import multistring
from translate.storage import base, cpo, pocommon


logger = logging.getLogger(__name__)


lsep = " "
"""Separator for #: entries"""

basic_header = r"""msgid ""
msgstr ""
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"""


class pounit(pocommon.pounit):
    """A single PO entry whose comments, context, source and target are
    kept as plain Python attributes.

    Conversion to text is delegated to ``cpo`` (see :meth:`__str__`).
    """

    # othercomments = []      #   # this is another comment
    # automaticcomments = []  #   #. comment extracted from the source code
    # sourcecomments = []     #   #: sourcefile.xxx:35
    # prev_msgctxt = []       #   #| The previous values that msgctxt and msgid held
    # prev_msgid = []         #
    # prev_msgid_plural = []  #
    # typecomments = []       #   #, fuzzy
    # msgidcomment = ""       #   _: within msgid
    # msgctxt
    # msgid = []
    # msgstr = []

    # Our homegrown way to indicate what must be copied in a shallow
    # fashion
    __shallow__ = ["_store"]

    def __init__(self, source=None, **kwargs):
        """Create a unit for *source* with empty comments, context and target.

        Extra keyword arguments are accepted and ignored.
        """
        super().__init__(source)
        self._initallcomments(blankall=True)
        self._msgctxt = ""

        self.target = ""

    def _initallcomments(self, blankall=False):
        """Initialises allcomments

        Nothing is changed unless *blankall* is true.
        """
        if blankall:
            self.othercomments = []
            self.automaticcomments = []
            self.sourcecomments = []
            self.typecomments = []
            self.msgidcomment = ""

    @property
    def source(self):
        """Returns the stored msgid (a ``str`` or a ``multistring``)."""
        return self._source

    @source.setter
    def source(self, source):
        """Sets the msgid; falsy values become ``""`` and non-string values
        are wrapped in a ``multistring``.
        """
        self._rich_source = None
        source = source or ""
        if isinstance(source, (multistring, str)):
            self._source = source
        else:  # If it is a list or dict.
            self._source = multistring(source)

    @property
    def target(self):
        """Returns the unescaped msgstr"""
        return self._target

    @target.setter
    def target(self, target):
        """Sets the msgstr to the given (unescaped) value

        :raises ValueError: if the unit has no plural source but *target* is
            a list or dict that does not hold exactly one element.
        """
        self._rich_target = None
        if self.hasplural():
            if isinstance(target, multistring):
                self._target = target
            else:  # If it is unicode, list or dict.
                self._target = multistring(target)
        elif isinstance(target, (dict, list)):
            if len(target) == 1:
                self._target = target[0]
            else:
                # The original adjacent literals lacked a separating space
                # and rendered as "msgstrhas".
                raise ValueError(
                    "po msgid element has no plural but msgstr "
                    "has %d elements (%s)" % (len(target), target)
                )
        else:
            self._target = target

    def getnotes(self, origin=None):
        """Return comments based on origin value (programmer, developer, source code and translator)

        With ``origin=None`` the translator comments are followed by the
        automatic comments, one comment per line.

        :raises ValueError: if *origin* is not one of the recognised values.
        """
        if origin is None:
            # Join both groups in one pass so that the last translator line
            # and the first automatic line are not fused together.
            return "\n".join(self.othercomments + self.automaticcomments)
        if origin == "translator":
            return "\n".join(self.othercomments)
        if origin in ["programmer", "developer", "source code"]:
            return "\n".join(self.automaticcomments)
        raise ValueError("Comment type not valid")

    def addnote(self, text, origin=None, position="append"):
        """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote

        *text* is split on newlines into individual comment lines. Origins
        "programmer", "developer" and "source code" go to the automatic
        comments; anything else goes to the translator comments. Any
        *position* other than "append" or "prepend" replaces the existing
        comments of that kind.
        """
        # ignore empty strings and strings without non-space characters
        if not (text and text.strip()):
            return
        autocomments = origin in ["programmer", "developer", "source code"]
        commentlist = self.automaticcomments if autocomments else self.othercomments
        if text.endswith("\n"):
            text = text[:-1]
        newcomments = text.split("\n")
        if position == "append":
            newcomments = commentlist + newcomments
        elif position == "prepend":
            newcomments = newcomments + commentlist

        if autocomments:
            self.automaticcomments = newcomments
        else:
            self.othercomments = newcomments

    def removenotes(self, origin=None):
        """Remove all the translator's notes (other comments)"""
        # NOTE(review): *origin* is accepted but not consulted; only the
        # translator comments are ever cleared.
        self.othercomments = []

    def __deepcopy__(self, memo=None):
        """Return a copy of this unit.

        Attributes named in ``__shallow__`` are shared with the copy; every
        other instance attribute is deep-copied.
        """
        # A mutable default would be shared between direct calls and would
        # keep accumulating references to units.
        if memo is None:
            memo = {}
        # Make an instance to serve as the copy
        new_unit = self.__class__()
        # We'll be testing membership frequently, so make a set from
        # self.__shallow__
        shallow = set(self.__shallow__)
        # Make deep copies of all members which are not in shallow
        for key, value in self.__dict__.items():
            if key not in shallow:
                setattr(new_unit, key, copy.deepcopy(value))
        # Make shallow copies of all members which are in shallow
        for key in shallow:
            setattr(new_unit, key, getattr(self, key))
        # Mark memo with ourself, so that we won't get deep copied
        # again
        memo[id(self)] = self
        # Return our copied unit
        return new_unit

    def copy(self):
        """Return a deep copy of this unit (see :meth:`__deepcopy__`)."""
        return copy.deepcopy(self)

    def _msgidlen(self):
        """Return the total length of the source, summed over all plural forms."""
        if self.hasplural():
            # The original computed this value but never returned it.
            return len("".join(self.source.strings))
        return len(self.source)

    def _msgstrlen(self):
        """Return the total length of the target, summed over all plural forms."""
        if self.hasplural():
            # The original computed this value but never returned it.
            return len("".join(self.target.strings))
        return len(self.target)

    def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
        """Merges the otherpo (with the same msgid) into this one.

        Overwrite non-blank self.msgstr only if overwrite is True
        merge comments only if comments is True

        If *otherpo* is not a :class:`pounit` the merge is delegated to the
        parent class. Automatic and source comments are merged only when
        *authoritative* is false. The unit is marked fuzzy when source,
        context or target disagree, as detailed in the body.
        """

        def mergelists(list1, list2, split=False):
            """Extend *list1* in place with the entries of *list2* it lacks."""
            # Determine the newline style of list2 (the last matching
            # candidate wins)
            lineend = ""
            if list2 and list2[0]:
                for candidate in ["\n", "\r", "\n\r"]:
                    if list2[0].endswith(candidate):
                        lineend = candidate

            # Split if directed to do so:
            if split:
                splitlist1 = []
                splitlist2 = []
                for item in list1:
                    splitlist1.extend(item.split())
                for item in list2:
                    splitlist2.extend(item.split())
                list1.extend([item for item in splitlist2 if item not in splitlist1])
            else:
                # Normal merge, but conform to list1 newline style
                if list1 != list2:
                    for item in list2:
                        item = item.rstrip(lineend)
                        # avoid duplicate comment lines (this might cause some problems)
                        if item not in list1 or len(item) < 5:
                            list1.append(item)

        if not isinstance(otherpo, pounit):
            super().merge(otherpo, overwrite, comments)
            return
        if comments:
            mergelists(self.othercomments, otherpo.othercomments)
            mergelists(self.typecomments, otherpo.typecomments)
            if not authoritative:
                # We don't bring across otherpo.automaticcomments as we consider ourself
                # to be the authority.  Same applies to otherpo.msgidcomments
                mergelists(self.automaticcomments, otherpo.automaticcomments)
                # mergelists(self.msgidcomments, otherpo.msgidcomments) #XXX?
                mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
        if not self.istranslated() or overwrite:
            # Remove kde-style comments from the translation (if any). XXX - remove
            if pocommon.extract_msgid_comment(otherpo.target):
                otherpo.target = otherpo.target.replace(
                    "_: " + otherpo._extract_msgidcomments() + "\n", ""
                )
            self.target = otherpo.target
            if (
                self.source != otherpo.source
                or self.getcontext() != otherpo.getcontext()
            ):
                self.markfuzzy()
            else:
                self.markfuzzy(otherpo.isfuzzy())
        elif not otherpo.istranslated():
            if self.source != otherpo.source:
                self.markfuzzy()
        else:
            if self.target != otherpo.target:
                self.markfuzzy()

    def isheader(self):
        """Return whether the unit has an empty id but a non-empty target."""
        # TODO: fix up nicely
        return not self.getid() and len(self.target) > 0

    def isblank(self):
        """Return whether source, target and msgctxt are all empty.

        Headers and units carrying a msgid comment are never blank.
        """
        if self.isheader() or self.msgidcomment:
            return False
        return (
            self._msgidlen() == 0
            and self._msgstrlen() == 0
            and len(self._msgctxt) == 0
        )

    def hastypecomment(self, typecomment):
        """Check whether the given type comment is present"""
        # check for word boundaries properly by using a regular expression...
        pattern = "\\b%s\\b" % typecomment
        return any(re.search(pattern, tcline) for tcline in self.typecomments)

    def hasmarkedcomment(self, commentmarker):
        """Check whether the given comment marker is present as # (commentmarker) ..."""
        commentmarker = "(%s)" % commentmarker
        return any(
            comment.startswith(commentmarker) for comment in self.othercomments
        )

    def settypecomment(self, typecomment, present=True):
        """Alters whether a given typecomment is present"""
        if self.hastypecomment(typecomment) == present:
            return
        if present:
            self.typecomments.append("#, %s\n" % typecomment)
            return
        # this should handle word boundaries properly ...
        pattern = "\\b%s\\b[ \t,]*" % typecomment
        stripped = (re.sub(pattern, "", tcline) for tcline in self.typecomments)
        # Build a real list: a lazy filter object has no append() and can
        # only be iterated once, which breaks later use of typecomments.
        self.typecomments = [
            tcline for tcline in stripped if tcline.strip() != "#,"
        ]

    def istranslated(self):
        """Return whether the parent class reports a translation and the
        unit is not obsolete.
        """
        return super().istranslated() and not self.isobsolete()

    def istranslatable(self):
        """Return whether the unit is neither a header, blank nor obsolete."""
        return not (self.isheader() or self.isblank() or self.isobsolete())

    def isfuzzy(self):
        """Return whether the "fuzzy" type comment is present."""
        return self.hastypecomment("fuzzy")

    def _domarkfuzzy(self, present=True):
        """Add or remove the "fuzzy" type comment."""
        self.settypecomment("fuzzy", present)

    def makeobsolete(self):
        """Makes this unit obsolete"""
        self.sourcecomments = []
        self.automaticcomments = []
        super().makeobsolete()

    def hasplural(self):
        """returns whether this pounit contains plural strings..."""
        source = self.source
        return isinstance(source, multistring) and len(source.strings) > 1

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        _cpo_unit = cpo.pounit.buildfromunit(self)
        return str(_cpo_unit)

    def getlocations(self):
        """Get a list of locations from sourcecomments in the PO unit.

        rtype: List
        return: A list of the locations with '#: ' stripped

        """
        # TODO: rename to .locations
        return self.sourcecomments

    def addlocation(self, location):
        """Add a location to sourcecomments in the PO unit.

        :param location: Text location e.g. 'file.c:23' does not include #:
        :type location: String
        """
        self.sourcecomments.append(location)

    def _extract_msgidcomments(self, text=None):
        """Extract KDE style msgid comments from the unit.

        :rtype: String
        :return: Returns the extracted msgidcomments found in this unit's msgid.
        """
        if text:
            return pocommon.extract_msgid_comment(text)
        return self.msgidcomment

    def getcontext(self):
        """Get the message context."""
        return self._msgctxt + self.msgidcomment

    def setcontext(self, context):
        """Set the msgctxt; a falsy *context* is stored as ``""``."""
        self._msgctxt = context or ""

    def getid(self):
        """Returns a unique identifier for this unit."""
        context = self.getcontext()
        # Gettext does not consider the plural to determine duplicates, only
        # the msgid. For generation of .mo files, we might want to use this
        # code to generate the entry for the hash table, but for now, it is
        # commented out for conformance to gettext.
        #        id = '\0'.join(self.source.strings)
        unit_id = self.source
        if self.msgidcomment:
            unit_id = f"_: {context}\n{unit_id}"
        elif context:
            unit_id = f"{context}\04{unit_id}"
        return unit_id

    @classmethod
    def buildfromunit(cls, unit):
        """Build a native unit from a foreign unit, preserving as much
        information as possible.
        """
        if type(unit) == cls and hasattr(unit, "copy") and callable(unit.copy):
            return unit.copy()
        elif isinstance(unit, pocommon.pounit):
            newunit = cls(unit.source)
            newunit.target = unit.target
            # context
            newunit.msgidcomment = unit._extract_msgidcomments()
            if not newunit.msgidcomment:
                newunit.setcontext(unit.getcontext())

            locations = unit.getlocations()
            if locations:
                newunit.addlocations(locations)
            notes = unit.getnotes("developer")
            if notes:
                newunit.addnote(notes, "developer")
            notes = unit.getnotes("translator")
            if notes:
                newunit.addnote(notes, "translator")
            newunit.markfuzzy(unit.isfuzzy())
            if unit.isobsolete():
                newunit.makeobsolete()
            # Only the first matching format flag is carried over.
            for tc in ["python-format", "c-format", "php-format"]:
                if unit.hastypecomment(tc):
                    newunit.settypecomment(tc)
                    break
            return newunit
        else:
            return base.TranslationUnit.buildfromunit(unit)


class pofile(pocommon.pofile):
    """A .po file containing various units"""

    UnitClass = pounit

    def _build_self_from_cpo(self):
        """Builds up this store from the internal cpo store.

        A user must ensure that self._cpo_store already exists, and that it is
        deleted afterwards.
        """
        for unit in self._cpo_store.units:
            self.addunit(self.UnitClass.buildfromunit(unit))
        self.encoding = self._cpo_store.encoding

    def _build_cpo_from_self(self):
        """Builds the internal cpo store from the data in self.

        A user must ensure that self._cpo_store does not exist, and should
        delete it after using it.
        """
        self._cpo_store = cpo.pofile(noheader=True)
        for unit in self.units:
            if not unit.isblank():
                self._cpo_store.addunit(
                    cpo.pofile.UnitClass.buildfromunit(unit, self.encoding)
                )
        if not self._cpo_store.header():
            # only add a temporary header
            self._cpo_store.makeheader(charset=self.encoding, encoding="8bit")

    def parse(self, input):
        """Parses the given file or file source string.

        :raises base.ParseError: wrapping any exception raised while parsing.
        """
        try:
            if hasattr(input, "name"):
                self.filename = input.name
            elif not getattr(self, "filename", ""):
                self.filename = ""
            self.units = []
            self._cpo_store = cpo.pofile(input, noheader=True)
            self._build_self_from_cpo()
            del self._cpo_store
        except Exception as e:
            raise base.ParseError(e) from e

    def removeduplicates(self, duplicatestyle="merge"):
        """Make sure each msgid is unique ; merge comments etc from duplicates into original

        With ``duplicatestyle="merge"`` duplicates are merged into the first
        unit with the same id; with ``"msgctxt"`` the joined locations are
        appended to each duplicate's msgctxt instead. For any other value a
        duplicate is simply dropped.
        """
        # TODO: can we handle consecutive calls to removeduplicates()? What
        # about files already containing msgctxt? - test
        id_dict = {}
        uniqueunits = []
        # TODO: this is using a list as the pos aren't hashable, but this is slow.
        # probably not used frequently enough to worry about it, though.
        markedpos = []

        def addcomment(thepo):
            thepo.msgidcomment = " ".join(thepo.getlocations())
            markedpos.append(thepo)

        for thepo in self.units:
            unit_id = thepo.getid()
            if thepo.isheader() and not thepo.getlocations():
                # header msgids shouldn't be merged...
                uniqueunits.append(thepo)
            elif unit_id in id_dict:
                if duplicatestyle == "merge":
                    if unit_id:
                        id_dict[unit_id].merge(thepo)
                    else:
                        addcomment(thepo)
                        uniqueunits.append(thepo)
                elif duplicatestyle == "msgctxt":
                    origpo = id_dict[unit_id]
                    if origpo not in markedpos and unit_id:
                        # if it doesn't have an id, we already added msgctxt
                        origpo._msgctxt += " ".join(origpo.getlocations())
                        # NOTE(review): this records thepo, not origpo, as
                        # marked - confirm that is intended.
                        markedpos.append(thepo)
                    thepo._msgctxt += " ".join(thepo.getlocations())
                    if not thepo._msgctxt == id_dict[unit_id]._msgctxt:
                        uniqueunits.append(thepo)
                    else:
                        logger.warning(
                            "Duplicate unit found with msgctx of '%s' and source '%s'",
                            thepo._msgctxt,
                            thepo.source,
                        )
            else:
                if not unit_id:
                    if duplicatestyle == "merge":
                        addcomment(thepo)
                    else:
                        thepo._msgctxt += " ".join(thepo.getlocations())
                id_dict[unit_id] = thepo
                uniqueunits.append(thepo)
        self.units = uniqueunits

    def serialize(self, out):
        """Write content to file

        If the units cannot be encoded in ``self.encoding`` the store falls
        back to UTF-8 and updates the Content-Type header accordingly.
        """
        self._cpo_store = cpo.pofile(encoding=self.encoding, noheader=True)
        try:
            try:
                self._build_cpo_from_self()
            except UnicodeEncodeError:
                self.encoding = "utf-8"
                self.updateheader(add=True, Content_Type="text/plain; charset=UTF-8")
                self._build_cpo_from_self()
            self._cpo_store.serialize(out)
        finally:
            # Never leave the temporary cpo store behind, even on failure.
            del self._cpo_store