# -*- encoding: utf-8 -*-
# Copyright (c) 2004, 2005, 2006 Danilo Šegan <danilo@gnome.org>.
# Copyright (c) 2009 Claude Paroz <claude@2xlibre.net>.
#
# This file is part of xml2po.
#
# xml2po is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# xml2po is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with xml2po; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
import os
import sys
import re
import subprocess
import tempfile
import gettext
import libxml2

# Path handed to msgfmt as a throw-away output file; 'NUL' is used when
# '/dev/null' does not exist on this system.
NULL_STRING = '/dev/null'
if not os.path.exists('/dev/null'):
    NULL_STRING = 'NUL'


# Utility functions
def escapePoString(text):
    """Escape backslashes, double quotes, newlines and tabs for a PO string."""
    # Backslashes must be doubled first so the escapes added afterwards are
    # not themselves re-escaped.
    return text.replace('\\', '\\\\').replace('"', "\\\"").replace("\n", "\\n").replace("\t", "\\t")


def unEscapePoString(text):
    """Undo the double-quote and backslash escaping of a PO string.

    Only '\\"' and '\\\\' are converted back; the newline and tab escapes
    produced by escapePoString are left untouched.
    """
    return text.replace('\\"', '"').replace('\\\\', '\\')


class NoneTranslations:
    """Translations object whose every lookup returns None.

    Main.merge installs it as the fallback of the loaded catalogue, so a
    missing translation is reported as None instead of the original message.
    """

    def gettext(self, message):
        return None

    def lgettext(self, message):
        return None

    def ngettext(self, msgid1, msgid2, n):
        return None

    def lngettext(self, msgid1, msgid2, n):
        return None

    def ugettext(self, message):
        return None

    def ungettext(self, msgid1, msgid2, n):
        return None


class MessageOutput:
    """ Class to abstract po/pot file """

    def __init__(self, app):
        self.app = app
        # Escaped msgids, in output order.
        self.messages = []
        # msgid -> extracted comment.
        self.comments = {}
        # msgid -> list of (filename, tag, lineno) references.
        self.linenos = {}
        # msgid -> True when the message must not be re-wrapped.
        self.nowrap = {}
        # Escaped msgstrs collected while in msgstr mode, in order.
        self.translations = []
        self.do_translations = False
        # this is msgid mode for outputMessage; True is for msgstr mode
        self.output_msgstr = False
        # File currently being processed; defined up front so that
        # outputMessage works even before setFilename has been called.
        self.filename = None

    def translationsFollow(self):
        """Indicate that what follows are translations."""
        self.output_msgstr = True

    def setFilename(self, filename):
        """Set the file name recorded in the references of later messages."""
        self.filename = filename

    def outputMessage(self, text, lineno = 0, comment = None, spacepreserve = False, tag = None):
        """Adds a string to the list of messages.

        Whitespace-only text is ignored.  In msgstr mode the escaped text is
        only appended to the list of translations.  In msgid mode the text is
        added as a new message (always when do_translations is set, otherwise
        only if it is not known yet) and a (filename, tag, lineno) reference
        is recorded for it.  The first non-empty comment seen for a message is
        kept, unless do_translations is set.
        """
        if text.strip() == '':
            return

        t = escapePoString(text)
        if self.output_msgstr:
            self.translations.append(t)
            return

        if self.do_translations or t not in self.messages:
            self.messages.append(t)
            if spacepreserve:
                self.nowrap[t] = True

        # The reference is recorded for new and repeated messages alike.
        self.linenos.setdefault(t, []).append((self.filename, tag, lineno))
        if (not self.do_translations) and comment and t not in self.comments:
            self.comments[t] = comment

    def outputHeader(self, out):
        """Write the PO header entry, stamped with the current time, to out."""
        import time
        out.write("""msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=UTF-8\\n"
"Content-Transfer-Encoding: 8bit\\n"

""" % (time.strftime("%Y-%m-%d %H:%M%z")))

    def outputAll(self, out):
        """Write the header followed by every collected message to out.

        When do_translations is set, each message consumes the next collected
        translation as its msgstr; a translation identical to the msgid is
        written as an empty msgstr.
        """
        self.outputHeader(out)

        for k in self.messages:
            if k in self.comments:
                out.write("#. %s\n" % (self.comments[k].replace("\n", "\n#. ")))
            references = " ".join("%s:%d(%s)" % (reference[0], reference[2], reference[1])
                                  for reference in self.linenos[k])
            out.write("#: %s\n" % (references.strip()))
            if k in self.nowrap and self.nowrap[k]:
                out.write("#, no-wrap\n")
            out.write("msgid \"%s\"\n" % (k))
            translation = ""
            if self.do_translations:
                if len(self.translations) > 0:
                    translation = self.translations.pop(0)
            if translation == k:
                translation = ""
            out.write("msgstr \"%s\"\n\n" % (translation))


class XMLDocument(object):
    """A parsed XML file together with the extraction/merge tree walk."""

    def __init__(self, filename, app):
        """Parse filename with libxml2, using the options and mode of app.

        Raises Exception when the parsed document does not carry the
        requested file name.
        """
        self.app = app
        self.expand_entities = self.app.options.get('expand_entities')
        self.ignored_tags = self.app.current_mode.getIgnoredTags()
        ctxt = libxml2.createFileParserCtxt(filename)
        ctxt.lineNumbers(1)
        if self.app.options.get('expand_all_entities'):
            ctxt.replaceEntities(1)
        ctxt.parseDocument()
        self.doc = ctxt.doc()
        if self.doc.name != filename:
            raise Exception("Error: I tried to open '%s' but got '%s' -- how did that happen?" % (filename, self.doc.name))
        if self.app.msg:
            self.app.msg.setFilename(filename)
        self.isFinalNode = self.app.current_mode.isFinalNode

    def generate_messages(self):
        """Walk the document and emit its messages to the app's MessageOutput."""
        self.app.msg.setFilename(self.doc.name)
        self.doSerialize(self.doc)

    def normalizeNode(self, node):
        """Normalize whitespace in node and its descendants, in place.

        Space-preserving nodes are left alone.  Runs of whitespace in
        non-blank text nodes are collapsed to a single space.  Blank text
        nodes are emptied when entities are being expanded or when they are
        not surrounded by non-blank siblings on both sides.
        """
        #print >>sys.stderr, "<%s> (%s) [%s]" % (node.name, node.type, node.serialize('utf-8'))
        if not node:
            return
        elif self.app.isSpacePreserveNode(node):
            return
        elif node.isText():
            if node.isBlankNode():
                if self.app.options.get('expand_entities') or \
                  (not (node.prev and not node.prev.isBlankNode() and node.next and not node.next.isBlankNode()) ):
                    #print >>sys.stderr, "BLANK"
                    node.setContent('')
            else:
                node.setContent(re.sub(r'\s+', ' ', node.content))

        elif node.children and node.type == 'element':
            child = node.children
            while child:
                self.normalizeNode(child)
                child = child.next

    def normalizeString(self, text, spacepreserve = False):
        """Normalizes string to be used as key for gettext lookup.

        Removes all unnecessary whitespace.  The text is returned unchanged
        when spacepreserve is set or when it cannot be parsed as XML.
        """
        if spacepreserve:
            return text
        try:
            # Lets add document DTD so entities are resolved
            dtd = self.doc.intSubset()
            tmp = dtd.serialize('utf-8')
            tmp = tmp + '<norm>%s</norm>' % text
        except Exception:
            # No usable internal subset: parse the snippet on its own.
            tmp = '<norm>%s</norm>' % text

        try:
            ctxt = libxml2.createDocParserCtxt(tmp)
            if self.app.options.get('expand_entities'):
                ctxt.replaceEntities(1)
            ctxt.parseDocument()
            tree = ctxt.doc()
            newnode = tree.getRootElement()
        except Exception:
            sys.stderr.write("""Error while normalizing string as XML:\n"%s"\n""" % (text) + "\n")
            return text

        self.normalizeNode(newnode)

        result = ''
        child = newnode.children
        while child:
            result += child.serialize('utf-8')
            child = child.next

        # Drop the single leading/trailing space left by the normalization.
        result = re.sub('^ ', '', result)
        result = re.sub(' $', '', result)
        tree.freeDoc()

        return result

    def stringForEntity(self, node):
        """Replaces entities in the node."""
        text = node.serialize('utf-8')
        try:
            # Lets add document DTD so entities are resolved
            dtd = self.doc.intSubset()
            tmp = dtd.serialize('utf-8') + '<norm>%s</norm>' % text
            has_dtd = True
        except Exception:
            tmp = '<norm>%s</norm>' % text
            has_dtd = False

        ctxt = libxml2.createDocParserCtxt(tmp)
        if self.expand_entities:
            ctxt.replaceEntities(1)
        ctxt.parseDocument()
        tree = ctxt.doc()
        if has_dtd:
            # The first child of the document is the DTD; <norm> follows it.
            newnode = tree.children.next
        else:
            newnode = tree.children

        result = ''
        child = newnode.children
        while child:
            result += child.serialize('utf-8')
            child = child.next
        tree.freeDoc()
        return result

    def myAttributeSerialize(self, node):
        """Serialize the value of an attribute node.

        Text children are entity-encoded; entity references are written as
        '&name;' unless entities are being expanded, in which case their
        content is used.  A node without children is serialized as a whole.
        """
        result = ''
        if node.children:
            child = node.children
            while child:
                if child.type == 'text':
                    result += self.doc.encodeEntitiesReentrant(child.content)
                elif child.type == 'entity_ref':
                    if not self.expand_entities:
                        result += '&' + child.name + ';'
                    else:
                        result += child.content.decode('utf-8')
                else:
                    result += self.myAttributeSerialize(child)
                child = child.next
        else:
            result = node.serialize('utf-8')
        return result

    def startTagForNode(self, node):
        """Return the node name followed by its serialized attributes.

        The result carries no angle brackets.  Returns 0 for a false node.
        """
        if not node:
            return 0

        result = node.name
        params = ''
        if node.properties:
            for p in node.properties:
                if p.type == 'attribute':
                    try:
                        nsprop = p.ns().name + ":" + p.name
                    except Exception:
                        # Attribute without a namespace.
                        nsprop = p.name
                    params += " %s=\"%s\"" % (nsprop, self.myAttributeSerialize(p))
        return result + params

    def endTagForNode(self, node):
        """Return the node name, or False for a false node."""
        if not node:
            return False
        return node.name

    def ignoreNode(self, node):
        """Tell whether node is skipped entirely during serialization.

        Final nodes are never ignored; otherwise ignored tags, DTD nodes and
        comments are.
        """
        if self.isFinalNode(node):
            return False
        if node.name in self.ignored_tags or node.type in ('dtd', 'comment'):
            return True
        return False

    def getCommentForNode(self, node):
        """Walk through previous siblings until a comment is found, or other element.

        Only whitespace is allowed between comment and current node.
        Returns the stripped comment text, or None.
        """
        prev = node.prev
        while prev and prev.type == 'text' and prev.content.strip() == '':
            prev = prev.prev
        if prev and prev.type == 'comment':
            return prev.content.strip()
        else:
            return None

    def replaceAttributeContentsWithText(self, node, text):
        """Set the content of an attribute node to text."""
        node.setContent(text)

    def replaceNodeContentsWithText(self, node, text):
        """Replaces all subnodes of a node with contents of text treated as XML."""

        if node.children:
            starttag = self.startTagForNode(node)
            endtag = self.endTagForNode(node)

            # Lets add document DTD so entities are resolved
            tmp = '<?xml version="1.0" encoding="utf-8" ?>'
            try:
                dtd = self.doc.intSubset()
                tmp = tmp + dtd.serialize('utf-8')
            except libxml2.treeError:
                pass

            content = '<%s>%s</%s>' % (starttag, text, endtag)
            tmp = tmp + content.encode('utf-8')

            newnode = None
            try:
                ctxt = libxml2.createDocParserCtxt(tmp)
                ctxt.replaceEntities(0)
                ctxt.parseDocument()
                newnode = ctxt.doc()
            except Exception:
                # Reported just below through the "not newnode" check.
                pass

            if not newnode:
                sys.stderr.write("""Error while parsing translation as XML:\n"%s"\n""" % (text.encode('utf-8')) + "\n")
                return

            newelem = newnode.getRootElement()

            if newelem and newelem.children:
                free = node.children
                while free:
                    next = free.next
                    free.unlinkNode()
                    free = next

                if node:
                    next = node.next
                    node.replaceNode(newelem.copyNodeList())
                    node.next = next

            else:
                # In practice, this happens with tags such as "<para> </para>" (only whitespace in between)
                pass
        else:
            node.setContent(text)

    def hasText(self, node):
        """Whether or not a node contains text

        A node "contains text" if the node itself or one of its children
        is a text node containing non-empty text.
        """
        if node.name in self.ignored_tags:
            return False
        if node.isText() and node.content.strip() != '':
            return True
        child = node.children
        while child:
            if child.isText() and child.content.strip() != '':
                return True
            else:
                child = child.next
        return False

    def worthOutputting(self, node):
        """Whether or not a node is worth outputting

        A node is "worth outputting", if the node itself or one of its
        children is a text node -- unless the node is not final and there
        is a parent node which is already worth outputting.
        """
        worth = self.hasText(node) # is or has non-empty text node
        if not (self.isFinalNode(node) or node.get_name() in self.ignored_tags):
            parent = node.get_parent()
            while worth and parent:
                if self.worthOutputting(parent):
                    worth = False
                else:
                    parent = parent.get_parent()
        return worth

    def processAttribute(self, node, attr):
        """Translate (merge) or extract (other operations) one attribute."""
        assert node and attr

        outtxt = self.normalizeString(attr.content)
        if self.app.operation == 'merge':
            translation = self.app.getTranslation(outtxt) # unicode or None
            if translation is not None:
                self.replaceAttributeContentsWithText(attr,
                                                      translation.encode('utf-8'))
        else:
            self.app.msg.outputMessage(outtxt, node.lineNo(), "", spacepreserve=False,
                                       tag = node.name + ":" + attr.name)

    def processElementTag(self, node, replacements, restart = False):
        """Process node with node.type == 'element'.

        Children that are final or worth outputting on their own are processed
        recursively and stand in the text of this node as
        '<placeholder-N/>'.  Returns the tuple
        (starttag, outtxt, endtag, translation).  Raises Exception when node
        is not an element.
        """
        if node.type != 'element':
            raise Exception("You must pass node with node.type=='element'.")

        # Translate attributes if needed
        treated_attributes = self.app.current_mode.getTreatedAttributes()
        if node.properties and treated_attributes:
            for p in node.properties:
                if p.name in treated_attributes:
                    self.processAttribute(node, p)

        outtxt = ''
        if restart:
            myrepl = []
        else:
            myrepl = replacements

        child = node.children
        while child:
            if (self.isFinalNode(child)) or (child.type == 'element' and self.worthOutputting(child)):
                myrepl.append(self.processElementTag(child, myrepl, True))
                outtxt += '<placeholder-%d/>' % (len(myrepl))
            else:
                if child.type == 'element':
                    (starttag, content, endtag, translation) = self.processElementTag(child, myrepl, False)
                    outtxt += '<%s>%s</%s>' % (starttag, content, endtag)
                else:
                    outtxt += self.doSerialize(child)
            child = child.next

        if self.app.operation == 'merge':
            norm_outtxt = self.normalizeString(outtxt, self.app.isSpacePreserveNode(node))
            translation = self.app.getTranslation(norm_outtxt)
        else:
            translation = outtxt.decode('utf-8')

        starttag = self.startTagForNode(node)
        endtag = self.endTagForNode(node)

        worth = self.worthOutputting(node)
        if not translation:
            # No translation available: fall back to the original text.
            translation = outtxt.decode('utf-8')
            if worth and self.app.options.get('mark_untranslated'):
                node.setLang('C')

        if restart or worth:
            for i, repl in enumerate(myrepl):
                # repl[0] may contain translated attributes with
                # non-ASCII chars, so implicit conversion to <str> may fail
                replacement = '<%s>%s</%s>' % \
                    (repl[0].decode('utf-8'), repl[3], repl[2])
                translation = translation.replace('<placeholder-%d/>' % (i+1), replacement)

        if worth:
            if self.app.operation == 'merge':
                self.replaceNodeContentsWithText(node, translation)
            else:
                norm_outtxt = self.normalizeString(outtxt, self.app.isSpacePreserveNode(node))
                self.app.msg.outputMessage(norm_outtxt, node.lineNo(), self.getCommentForNode(node), self.app.isSpacePreserveNode(node), tag = node.name)

        return (starttag, outtxt, endtag, translation)

    def isExternalGeneralParsedEntity(self, node):
        """Tell whether the debug dump of node mentions an external general parsed entity.

        Returns False when the node cannot be dumped to a temporary file.
        """
        try:
            # it would be nice if debugDumpNode could use StringIO, but it apparently cannot
            tmp = tempfile.TemporaryFile()
            try:
                node.debugDumpNode(tmp, 0)
                tmp.seek(0)
                tmpstr = tmp.read()
            finally:
                tmp.close()
        except Exception:
            # We fail silently, and replace all entities if we cannot
            # write .xml2po-entitychecking
            # !!! This is not very nice thing to do, but I don't know if
            # raising an exception is any better
            return False
        return tmpstr.find('EXTERNAL_GENERAL_PARSED_ENTITY') != -1

    def doSerialize(self, node):
        """Serializes a node and its children, emitting PO messages along the way.

        node is the node to serialize.  Ignored nodes yield an empty string.
        Elements are handed to processElementTag and returned wrapped in
        their start and end tags.
        """

        if self.ignoreNode(node):
            return ''
        elif not node.children:
            return node.serialize("utf-8")
        elif node.type == 'entity_ref':
            if self.isExternalGeneralParsedEntity(node):
                return node.serialize('utf-8')
            else:
                return self.stringForEntity(node) #content #content #serialize("utf-8")
        elif node.type == 'entity_decl':
            return node.serialize('utf-8') #'<%s>%s</%s>' % (startTagForNode(node), node.content, node.name)
        elif node.type == 'text':
            return node.serialize('utf-8')
        elif node.type == 'element':
            repl = []
            (starttag, content, endtag, translation) = self.processElementTag(node, repl, True)
            return '<%s>%s</%s>' % (starttag, content, endtag)
        else:
            child = node.children
            outtxt = ''
            while child:
                outtxt += self.doSerialize(child)
                child = child.next
            return outtxt


def xml_error_handler(arg, ctxt):
    """libxml2 error callback that discards every message."""
    #deactivate error messages from the validation
    pass


class Main(object):
    """Entry point object tying together mode, operation, options and output."""

    def __init__(self, mode, operation, output, options):
        """Load the mode class for mode and prepare the output stream.

        The 'update' operation writes to a temporary file; otherwise output
        is '-' for standard output or the path of the file to create.
        """
        libxml2.registerErrorHandler(xml_error_handler, None)
        self.operation = operation
        self.options = options
        self.msg = None
        self.gt = None
        self.current_mode = self.load_mode(mode)()
        # Prepare output
        if operation == 'update':
            self.out = tempfile.TemporaryFile()
        elif output == '-':
            self.out = sys.stdout
        else:
            self.out = open(output, 'w')

    def load_mode(self, modename):
        """Return the '<modename>XmlMode' class from xml2po.modes.<modename>.

        Falls back to the 'basic' mode when the requested one cannot be
        loaded, and exits with status 1 when 'basic' itself is missing.
        """
        try:
            module = __import__('xml2po.modes.%s' % modename, globals(), locals(), ['%sXmlMode' % modename])
            return getattr(module, '%sXmlMode' % modename)
        except (ImportError, AttributeError):
            if modename == 'basic':
                sys.stderr.write("Unable to find xml2po modes. Please check your xml2po installation.\n")
                sys.exit(1)
            else:
                sys.stderr.write("Unable to load mode '%s'. Falling back to 'basic' mode with automatic detection (-a).\n" % modename)
                return self.load_mode('basic')

    def to_pot(self, xmlfiles):
        """ Produce a pot file from the list of 'xmlfiles' """
        self.msg = MessageOutput(self)
        for xmlfile in xmlfiles:
            if not os.access(xmlfile, os.R_OK):
                raise IOError("Unable to read file '%s'" % xmlfile)
            try:
                doc = XMLDocument(xmlfile, self)
            except Exception as e:
                sys.stderr.write("Unable to parse XML file '%s': %s" % (xmlfile, str(e)) + "\n")
                sys.exit(1)
            self.current_mode.preProcessXml(doc.doc, self.msg)
            doc.generate_messages()
        self.output_po()

    def merge(self, mofile, xmlfile):
        """ Merge translations from mofile into xmlfile to generate a translated XML file """
        if not os.access(xmlfile, os.R_OK):
            raise IOError("Unable to read file '%s'" % xmlfile)
        try:
            doc = XMLDocument(xmlfile, self)
        except Exception as e:
            sys.stderr.write(str(e) + "\n")
            sys.exit(1)

        try:
            mfile = open(mofile, "rb")
        except (IOError, OSError):
            sys.stderr.write("Can't open MO file '%s'." % (mofile) + "\n")
            # Without a catalogue nothing can be merged; stop here instead of
            # failing below on the unbound file object.
            sys.exit(1)
        try:
            # The catalogue is read completely by the constructor.
            self.gt = gettext.GNUTranslations(mfile)
        finally:
            mfile.close()
        self.gt.add_fallback(NoneTranslations())
        # Has preProcessXml use cases for merge?
        #self.current_mode.preProcessXml(doc.doc, self.msg)

        doc.doSerialize(doc.doc)
        tcmsg = self.current_mode.getStringForTranslators()
        outtxt = self.getTranslation(tcmsg)
        self.current_mode.postProcessXmlTranslation(doc.doc, self.options.get('translationlanguage'), outtxt)
        self.out.write(doc.doc.serialize('utf-8', 1))

    def reuse(self, origxml, xmlfile):
        """ Produce a po file from xmlfile pot and using translations from origxml """
        self.msg = MessageOutput(self)
        self.msg.do_translations = True
        if not os.access(xmlfile, os.R_OK):
            raise IOError("Unable to read file '%s'" % xmlfile)
        if not os.access(origxml, os.R_OK):
            raise IOError("Unable to read file '%s'" % origxml)
        try:
            doc = XMLDocument(xmlfile, self)
        except Exception as e:
            sys.stderr.write(str(e) + "\n")
            sys.exit(1)
        doc.generate_messages()

        # Everything extracted from now on is collected as msgstr.
        self.msg.translationsFollow()
        try:
            doc = XMLDocument(origxml, self)
        except Exception as e:
            sys.stderr.write(str(e) + "\n")
            sys.exit(1)
        doc.generate_messages()
        self.output_po()

    def update(self, xmlfiles, lang_file):
        """ Merge the produced pot with an existing po file (lang_file) """
        if not os.access(lang_file, os.W_OK):
            raise IOError("'%s' does not exist or is not writable." % lang_file)
        self.to_pot(xmlfiles)
        lang = os.path.basename(lang_file).split(".")[0]

        sys.stderr.write("Merging translations for %s: \n" % (lang))
        # msgmerge reads the generated pot through the file descriptor, so
        # make sure everything buffered has reached it before rewinding.
        self.out.flush()
        self.out.seek(0)
        merge_cmd = subprocess.Popen(["msgmerge", "-o", ".tmp.%s.po" % lang, lang_file, "-"],
                                     stdin=self.out, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        cmdout, cmderr = merge_cmd.communicate()
        if merge_cmd.returncode:
            if cmderr:
                # Surface msgmerge's own diagnostics before giving up.
                sys.stderr.write(cmderr)
            raise Exception("Error during msgmerge command.")
        else:
            result = subprocess.call(["mv", ".tmp.%s.po" % lang, lang_file])
            if result:
                raise Exception("Error: cannot rename file.")
            else:
                subprocess.call(["msgfmt", "-cv", "-o", NULL_STRING, lang_file])

    def getTranslation(self, text):
        """Returns a translation via gettext for specified snippet.

        text should be a string to look for.  Empty or whitespace-only text,
        and any text while no catalogue is loaded, is returned unchanged.
        """
        #print >>sys.stderr,"getTranslation('%s')" % (text.encode('utf-8'))
        if not text or text.strip() == '':
            return text
        if self.gt:
            res = self.gt.ugettext(text.decode('utf-8'))
            return res

        return text

    def output_po(self):
        """ Write the resulting po/pot file to specified output """
        tcmsg = self.current_mode.getStringForTranslators()
        tccom = self.current_mode.getCommentForTranslators()
        if tcmsg:
            self.msg.outputMessage(tcmsg, lineno=0, comment=tccom)

        self.msg.outputAll(self.out)

    # **** XML utility functions ****
    def isSpacePreserveNode(self, node):
        """Tell whether whitespace in node must be kept as is.

        True when the node itself requests space preservation or when its
        name is one of the current mode's space-preserving tags.
        """
        if node.getSpacePreserve() == 1:
            return True
        else:
            return node.name in self.current_mode.getSpacePreserveTags()