#
# Copyright 2002-2013 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

r"""Classes that hold units of .dtd files (:class:`dtdunit`) or entire files
(:class:`dtdfile`).

These are specific .dtd files for localisation used by mozilla.

Specifications
    The following information is provided by Mozilla:

    `Specification <http://www.w3.org/TR/REC-xml/#sec-entexpand>`_

    There is a grammar for entity definitions, which isn't really precise,
    as the spec says.  There's no formal specification for DTD files, it's
    just "whatever makes this work" basically. The whole piece is clearly not
    the strongest point of the xml spec

    XML elements are allowed in entity values. A number of things that are
    allowed will just break the resulting document, Mozilla forbids these
    in their DTD parser.

Dialects
    There are two dialects:

    - Regular DTD
    - Android DTD

    Both dialects are similar, but the Android DTD uses some particular escapes
    that regular DTDs don't have.

Escaping in regular DTD
    In DTD usually there are characters escaped in the entities. In order to
    ease the translation some of those escaped characters are unescaped when
    reading from, or converting, the DTD, and are escaped again when
    saving, or converting to a DTD.

    In regular DTD the following characters are usually or sometimes escaped:

    - The % character is escaped using &#037; or &#37; or &#x25;
    - The " character is escaped using &quot;
    - The ' character is escaped using &apos; (partial roundtrip)
    - The & character is escaped using &amp;
    - The < character is escaped using &lt; (not yet implemented)
    - The > character is escaped using &gt; (not yet implemented)

    Besides the previous ones there are a lot of escapes for a huge number of
    characters. These escapes usually have the form of &#NUMBER; where NUMBER
    represents the numerical code for the character.

    There are a few particularities in DTD escaping. Some of the escapes are
    not yet implemented since they are not really necessary, or because their
    implementation is too hard.

    A special case is the ' escaping using &apos; which doesn't provide a full
    roundtrip conversion in order to support some special Mozilla DTD files.

    Also the " character is never escaped in the case that the previous
    character is = (the sequence =" is present on the string) in order to avoid
    escaping the " character indicating an attribute assignment, for example in
    a href attribute for an a tag in HTML (anchor tag).

Escaping in Android DTD
    It has the same escapes as in regular DTD, plus these:

    - The ' character is escaped using \&apos; or \' or \u0027
    - The " character is escaped using \&quot;
"""

import re
import warnings
from io import BytesIO

from lxml import etree

from translate.misc import quote
from translate.storage import base


labelsuffixes = (".label", ".title")
"""Label suffixes: entries with this suffix are able to be combined with accesskeys
found in entries ending with :attr:`.accesskeysuffixes`"""
accesskeysuffixes = (".accesskey", ".accessKey", ".akey")
"""Accesskey Suffixes: entries with this suffix may be combined with labels
ending in :attr:`.labelsuffixes` into accelerator notation"""


def quoteforandroid(source):
    """Escape and quote a string for an Android DTD file.

    :param source: Unescaped entity text
    :return: The quoted definition, as produced by :func:`quotefordtd` after
        the Android-specific escapes have been applied
    """
    # Replace "'" character with the \u0027 escape. Other possible replaces are
    # "\\&apos;" or "\\'".
    source = source.replace("'", "\\u0027")
    source = source.replace('"', "\\&quot;")
    return quotefordtd(source)


def unquotefromandroid(source):
    """Unquote a quoted Android DTD definition.

    :param source: Quoted definition as stored in the DTD file
    :return: The unescaped entity text
    """
    value = unquotefromdtd(source)
    # All three Android spellings of the apostrophe are accepted.
    value = value.replace("\\&apos;", "'")
    value = value.replace("\\'", "'")
    value = value.replace("\\u0027", "'")
    # unquotefromdtd() has already decoded &quot;, so \&quot; arrives here
    # as \" and is converted to a plain ".
    value = value.replace('\\"', '"')
    return value


# Code points that are written out as entities when quoting for DTD.
_DTD_CODEPOINT2NAME = {
    ord("%"): "#037",  # Always escape % sign as &#037;.
    ord("&"): "amp",
    # ord("<"): "lt",  # Not really so useful.
    # ord(">"): "gt",  # Not really so useful.
}


def quotefordtd(source):
    """Quote and escape a string for a regular DTD file.

    :param source: Unescaped entity text
    :return: The escaped text wrapped in double quotes, or in single quotes
        when the text contains the sequence ``="`` (an attribute assignment)
    """
    source = quote.entityencode(source, _DTD_CODEPOINT2NAME)
    if '"' not in source:
        return '"' + source + '"'  # Quote using double quotes.

    # Apostrophes are escaped whenever a double quote is present, since the
    # value may end up single-quoted below. (The original note on this line
    # read "This seems not to runned" -- kept here as a hint for review.)
    source = source.replace("'", "&apos;")
    if '="' in source:
        # Avoid escaping " chars in href attributes.
        return "'" + source + "'"  # Quote using single quotes.
    source = source.replace('"', "&quot;")
    return '"' + source + '"'  # Quote using double quotes.


# Entity names that are decoded back to characters when unquoting from DTD.
_DTD_NAME2CODEPOINT = {
    "quot": ord('"'),
    "amp": ord("&"),
    # "lt": ord("<"),  # Not really so useful.
    # "gt": ord(">"),  # Not really so useful.
    # FIXME these should probably be handled in a more general way
    "#x0022": ord('"'),
    "#187": ord("»"),
    "#037": ord("%"),
    "#37": ord("%"),
    "#x25": ord("%"),
}


def unquotefromdtd(source):
    """Unquote a quoted DTD definition.

    :param source: Quoted definition; an empty string is treated as ``""``
    :return: The text with its surrounding quotes removed and the entities in
        ``_DTD_NAME2CODEPOINT`` decoded
    """
    # extract the string, get rid of quoting
    if len(source) == 0:
        source = '""'
    # The quote characters should be the first and last characters in the
    # string. Of course there could also be quote characters within the string.
    quotechar = source[0]
    extracted, _ = quote.extractwithoutquotes(
        source, quotechar, quotechar, allowreentry=False
    )
    if quotechar == "'":
        # Mirror of quotefordtd(): single-quoted values carry &apos; escapes.
        extracted = extracted.replace("&apos;", "'")
    return quote.entitydecode(extracted, _DTD_NAME2CODEPOINT)


def removeinvalidamps(name, value):
    """Find and remove ampersands that are not part of an entity definition.

    A stray & in a DTD file can break an application's ability to parse the
    file. In Mozilla localisation this is very important and these can break the
    parsing of files used in XUL and thus break interface rendering. Tracking
    down the problem is very difficult, thus by removing potential broken
    ampersand and warning the users we can ensure that the output DTD will
    always be parsable.

    :type name: String
    :param name: Entity name
    :type value: String
    :param value: Entity text value
    :rtype: String
    :return: Entity value without bad ampersands
    """

    def is_valid_entity_name(name):
        """Check that supplied *name* is a valid entity name."""
        if name.replace(".", "").replace("_", "").isalnum():
            return True
        # startswith() rather than name[0] so that an empty name ("&;") is
        # reported as invalid instead of raising IndexError.
        return name.startswith("#") and name[1:].isalnum()

    invalid_amps = []
    amppos = value.find("&")
    while amppos != -1:
        semipos = value.find(";", amppos + 1)
        # An ampersand is kept only when it is followed by a ";" and the text
        # between them is a valid entity name.
        if semipos == -1 or not is_valid_entity_name(value[amppos + 1 : semipos]):
            invalid_amps.append(amppos)
        amppos = value.find("&", amppos + 1)

    if invalid_amps:
        warnings.warn("invalid ampersands in dtd entity %s" % (name))
        dropped = set(invalid_amps)
        value = "".join(char for pos, char in enumerate(value) if pos not in dropped)
    return value


class dtdunit(base.TranslationUnit):
    """An entity definition from a DTD file (and any associated comments)."""

    def __init__(self, source="", android=False):
        """Construct the dtdunit and prepare it for parsing.

        :param source: Initial (unquoted) entity text
        :param android: Use the Android DTD escaping rules when True
        """
        # Must be set before the base constructor runs: it is read by the
        # source setter, which this constructor also invokes below.
        self.android = android

        super().__init__(source)
        # (commenttype, text) tuples collected while parsing.
        self.comments = []
        # Lines outside comments that contain no <!ENTITY declaration.
        self.unparsedlines = []
        # Parser state carried from one input line to the next.
        self.incomment = False
        self.inentity = False
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        self.source = source
        # Whitespace and closing text used when the unit is written back out.
        self.space_pre_entity = " "
        self.space_pre_definition = " "
        self.closing = ">"

    def _quote(self, text):
        """Return *text* quoted according to this unit's dialect."""
        if self.android:
            return quoteforandroid(text)
        return quotefordtd(text)

    def _unquote(self):
        """Return the stored definition unquoted according to the dialect."""
        if self.android:
            return unquotefromandroid(self.definition)
        return unquotefromdtd(self.definition)

    # Note that source and target are equivalent for monolingual units
    @property
    def source(self):
        """Gets the unquoted source string."""
        return self._unquote()

    @source.setter
    def source(self, source):
        """Sets the definition to the quoted value of source."""
        if source is None:
            # Same treatment as the target setter.
            source = ""
        self.definition = self._quote(source)
        self._rich_source = None

    @property
    def target(self):
        """Gets the unquoted target string."""
        return self._unquote()

    @target.setter
    def target(self, target):
        """Sets the definition to the quoted value of target."""
        if target is None:
            target = ""
        self.definition = self._quote(target)
        self._rich_target = None

    def getid(self):
        """Return the entity name, which serves as the unit id."""
        return self.entity

    def setid(self, new_id):
        """Set the entity name to *new_id*."""
        self.entity = new_id

    def getlocations(self):
        """Return the entity as location (identifier)."""
        assert quote.rstripeol(self.entity) == self.entity
        return [self.entity]

    def addlocation(self, location):
        """Set the entity to the given "location"."""
        self.entity = location

    def isblank(self):
        """Return whether this dtdunit has no entity definition at all."""
        # for dtds, we currently return a blank string if there is no .entity
        # (==location in other files)
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def istranslatable(self):
        """Return False for blank units and for SYSTEM parameter entities."""
        is_system = getattr(self, "entityparameter", None) == "SYSTEM"
        return not (is_system or self.isblank())

    def parse(self, dtdsrc):
        """Read the first DTD element from *dtdsrc* into this object.

        Leading comments and unparsed lines are collected along the way.
        Parsing stops right after the first complete entity definition.

        :param dtdsrc: DTD source text (str)
        :return: Number of input lines processed; 0 when *dtdsrc* is empty
        :raises ValueError: if the definition does not start with a quote
        """
        self.comments = []
        # All comment categories deliberately share one list so that the
        # original ordering of the comments is preserved on output.
        self._locfilenotes = self.comments
        self._locgroupstarts = self.comments
        self._locgroupends = self.comments
        self._locnotes = self.comments
        self.entity = None
        self.definition = ""
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            if not self.incomment:
                if line.find("<!--") != -1:
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it
                    # (remember we're not in the comment yet)
                    (comment, _) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find("LOCALIZATION NOTE") != -1:
                        pos = quote.findend(comment, "LOCALIZATION NOTE")
                        while comment[pos] == " ":
                            pos += 1
                        if comment.find("FILE", pos) == pos:
                            self.commenttype = "locfile"
                        elif comment.find("BEGIN", pos) == pos:
                            self.commenttype = "locgroupstart"
                        elif comment.find("END", pos) == pos:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                # FIXME: bloody entity might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # A parameter-entity reference (%name;) outside an entity
                    # is stored verbatim as a plain comment.
                    self.comments.append(("comment", line))
                    continue

            if self.incomment:
                # some kind of comment
                (comment, self.incomment) = quote.extract(
                    line, "<!--", "-->", None, self.continuecomment
                )
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ""
                    else:
                        comment += "\n"
                # TODO: check if there's actually an entity definition that's
                # commented out; parse these, store as obsolete messages
                # depending on the type of comment (worked out at the start),
                # put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self._locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self._locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self._locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self._locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find("<!ENTITY")
                if entitypos != -1:
                    self.inentity = True
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        # e.g. the Mozilla "#expand" prefix
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line, "<!ENTITY")
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    s = 0
                    e = 0
                    while e < len(line) and line[e].isspace():
                        e += 1
                    self.space_pre_entity = " " * (e - s)
                    s = e
                    self.entity = ""
                    if e < len(line) and line[e] == "%":
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while e < len(line) and line[e].isspace():
                            e += 1
                    while e < len(line) and not line[e].isspace():
                        self.entity += line[e]
                        e += 1
                    s = e

                    assert quote.rstripeol(self.entity) == self.entity
                    while e < len(line) and line[e].isspace():
                        e += 1
                    self.space_pre_definition = " " * (e - s)
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            self.entityhelp = None
                            # The next line is scanned from its first column
                            # (the "parameter" branch reads e on entry).
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while e < len(line) and line[e].isspace():
                        e += 1
                    paramstart = e
                    while e < len(line) and line[e].isalnum():
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while e < len(line) and line[e].isspace():
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        e = 0
                        while e < len(line) and line[e].isspace():
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # actually the lines below should remember instring,
                    # rather than using it as dummy
                    e = self.entityhelp[0]
                    if self.entityhelp[1] == "'":
                        (defpart, self.instring) = quote.extract(
                            line[e:],
                            "'",
                            "'",
                            startinstring=self.instring,
                            allowreentry=False,
                        )
                    elif self.entityhelp[1] == '"':
                        (defpart, self.instring) = quote.extract(
                            line[e:],
                            '"',
                            '"',
                            startinstring=self.instring,
                            allowreentry=False,
                        )
                    else:
                        raise ValueError(
                            "Unexpected quote character... %r" % (self.entityhelp[1])
                        )
                    # for any following lines, start at the beginning of the
                    # line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # Whatever follows the closing quote (normally ">")
                        # is kept so it can be written back unchanged.
                        self.closing = line[e + len(defpart) :].rstrip("\n\r")
                        self.inentity = False
                        break

        return linesprocessed

    def __str__(self):
        """Convert to a string."""
        return self.getoutput()

    def getoutput(self):
        """Convert the DTD entity back to string form.

        :return: The comments and unparsed lines followed by the entity
            declaration; for a blank unit only the former, right-stripped and
            terminated by a single newline
        """
        # The localisation-note lists alias self.comments (see parse()), so
        # emitting self.comments covers all of them.
        lines = [comment for _commenttype, comment in self.comments]
        lines.extend(self.unparsedlines)
        if self.isblank():
            return "".join(lines).rstrip() + "\n"
        if len(self.entity) > 0:
            if getattr(self, "entitytype", None) == "external":
                entityline = (
                    "<!ENTITY % "
                    + self.entity
                    + " "
                    + self.entityparameter
                    + " "
                    + self.definition
                    + self.closing
                )
            else:
                entityline = (
                    "<!ENTITY"
                    + self.space_pre_entity
                    + self.entity
                    + self.space_pre_definition
                    + self.definition
                    + self.closing
                )
            if getattr(self, "hashprefix", None):
                entityline = self.hashprefix + " " + entityline
            lines.append(entityline + "\n")
        return "".join(lines)


class dtdfile(base.TranslationStore):
    """A .dtd file made up of dtdunits."""

    UnitClass = dtdunit

    def __init__(self, inputfile=None, android=False):
        """Construct a dtdfile, optionally reading in from inputfile.

        :param inputfile: Object with a ``read()`` method, or None
        :param android: Use the Android DTD escaping rules when True
        """
        super().__init__()
        self.filename = getattr(inputfile, "name", "")
        self.android = android
        if inputfile is not None:
            dtdsrc = inputfile.read()
            self.parse(dtdsrc)

    def parse(self, dtdsrc):
        """Read DTD source and append the units found to ``self.units``.

        Errors raised while parsing a chunk are reported as warnings and
        parsing carries on.

        :param dtdsrc: DTD source as bytes; str is accepted and encoded with
            ``self.encoding`` first
        """
        if isinstance(dtdsrc, str):
            dtdsrc = dtdsrc.encode(self.encoding)
        start = 0
        end = 0
        lines = dtdsrc.split(b"\n")
        while end < len(lines):
            if start == end:
                end += 1
            # Grow the window until the line that closes an entity: one that
            # starts with a quote followed by ">" once "<!ENTITY" was seen.
            foundentity = False
            while end < len(lines):
                if lines[end].find(b"<!ENTITY") > -1:
                    foundentity = True
                if foundentity and re.match(rb"[\"']\s*>", lines[end]):
                    end += 1
                    break
                end += 1

            linesprocessed = 1  # to initialise loop
            while linesprocessed >= 1:
                newdtd = dtdunit(android=self.android)
                try:
                    linesprocessed = newdtd.parse(
                        (b"\n".join(lines[start:end])).decode(self.encoding)
                    )
                    if linesprocessed >= 1 and (
                        not newdtd.isblank() or newdtd.unparsedlines
                    ):
                        self.units.append(newdtd)
                except Exception as e:
                    # Deliberately broad: a bad chunk must not abort the file.
                    # linesprocessed keeps its previous value here, so the
                    # window still advances below.
                    warnings.warn(
                        "%s\nError occured between lines %d and %d:\n%s"
                        % (e, start + 1, end, b"\n".join(lines[start:end]))
                    )
                start += linesprocessed

    def serialize(self, out):
        """Write the store to *out*.

        The content is validated before anything is written. When validation
        fails a warning is issued and *out* is truncated to zero length.

        :param out: Binary file-like object supporting ``write`` and
            ``truncate``
        """
        content = b"".join(str(dtd).encode(self.encoding) for dtd in self.units)
        if self._valid_store(content):
            out.write(content)
        else:
            warnings.warn("DTD file '%s' does not validate" % self.filename)
            out.truncate(0)

    def _valid_store(self, content):
        """Validate the store to determine if it is valid

        This uses lxml's ``etree.DTD`` to parse the DTD

        :param content: Serialized store as bytes
        :return: If the store passes validation
        :rtype: Boolean
        """
        # Android files are invalid DTDs, so they are never checked.
        if self.android:
            return True
        # #expand is a Mozilla hack and is removed as it is not valid in DTDs
        _input = content.replace(b"#expand", b"")
        try:
            etree.DTD(BytesIO(_input))
        except etree.DTDParseError as e:
            warnings.warn("DTD parse error: %s" % e.error_log)
            return False
        return True