1# -*- encoding: utf-8 -*-
2# Copyright (c) 2004, 2005, 2006 Danilo Šegan <danilo@gnome.org>.
3# Copyright (c) 2009 Claude Paroz <claude@2xlibre.net>.
4#
5# This file is part of xml2po.
6#
7# xml2po is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# xml2po is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with xml2po; if not, write to the Free Software Foundation, Inc.,
19# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20#
21import os
22import sys
23import re
24import subprocess
25import tempfile
26import gettext
27import libxml2
28
# Path used to discard tool output: the POSIX null device when it exists,
# otherwise the Windows 'NUL' device name.
NULL_STRING = '/dev/null' if os.path.exists('/dev/null') else 'NUL'
31
32# Utility functions
def escapePoString(text):
    """Escape backslash, double quote, newline and tab for use inside a PO string."""
    # The backslash pair must come first so that the backslashes introduced
    # by the later replacements are not escaped a second time.
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\t', '\\t')):
        text = text.replace(raw, escaped)
    return text
35
def unEscapePoString(text):
    """Undo the double-quote and backslash escaping of a PO string.

    Only escaped quotes and escaped backslashes are reverted; escaped
    newline and tab sequences are returned untouched.
    """
    unquoted = text.replace('\\"', '"')
    return unquoted.replace('\\\\', '\\')
38
class NoneTranslations:
    """Translations stand-in whose every lookup method returns None.

    Main.merge() installs an instance as the fallback of the loaded
    catalog.
    """

    def gettext(self, message):
        """Single-message lookup: always None."""
        return None

    def ngettext(self, msgid1, msgid2, n):
        """Plural lookup: always None."""
        return None

    # The remaining lookup variants share the two signatures above.
    lgettext = gettext
    ugettext = gettext
    lngettext = ngettext
    ungettext = ngettext
57
class MessageOutput:
    """ Class to abstract po/pot file

    Messages are collected through outputMessage() and written with
    outputAll().  The collector has two phases: while output_msgstr is
    False, texts are recorded as msgids; after translationsFollow() they
    are recorded as msgstrs, which outputAll() pairs with the msgids in
    order when do_translations is set.
    """
    def __init__(self, app):
        self.app = app
        # Name of the file messages currently come from; None until
        # setFilename() is called, so references can always be recorded.
        self.filename = None
        self.messages = []      # escaped msgids, in output order
        self.comments = {}      # msgid -> comment written as "#. ..."
        self.linenos = {}       # msgid -> list of (filename, tag, lineno)
        self.nowrap = {}        # msgid -> True when collected with spacepreserve
        self.translations = []  # escaped msgstrs, consumed in order by outputAll()
        self.do_translations = False
        self.output_msgstr = False # this is msgid mode for outputMessage; True is for msgstr mode

    def translationsFollow(self):
        """Indicate that what follows are translations."""
        self.output_msgstr = True

    def setFilename(self, filename):
        """Set the file name recorded in the references of later messages."""
        self.filename = filename

    def outputMessage(self, text, lineno = 0, comment = None, spacepreserve = False, tag = None):
        """Adds a string to the list of messages.

        text -- message text; ignored when it is only whitespace.
        lineno, tag -- stored with the current filename as a reference.
        comment -- kept for the msgid the first time one is supplied
                   (never when do_translations is set).
        spacepreserve -- marks the msgid as "no-wrap" when it is recorded.

        In msgstr mode the escaped text is only appended to
        self.translations.
        """
        if text.strip() == '':
            return

        t = escapePoString(text)
        if self.output_msgstr:
            self.translations.append(t)
            return

        # Every recorded msgid gets a linenos entry below, so this dict
        # lookup answers "already in self.messages?" without a list scan.
        if self.do_translations or t not in self.linenos:
            self.messages.append(t)
            if spacepreserve:
                self.nowrap[t] = True

        self.linenos.setdefault(t, []).append((self.filename, tag, lineno))
        if not self.do_translations and comment and t not in self.comments:
            self.comments[t] = comment

    def outputHeader(self, out):
        """Write the PO header entry, stamped with the current local time, to out."""
        import time
        out.write("""msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=UTF-8\\n"
"Content-Transfer-Encoding: 8bit\\n"

""" % (time.strftime("%Y-%m-%d %H:%M%z")))

    def outputAll(self, out):
        """Write the header followed by one entry per collected message to out.

        When do_translations is set, msgstrs are taken from the front of
        self.translations (which is consumed); a translation equal to its
        msgid is written as empty.
        """
        self.outputHeader(out)

        for k in self.messages:
            if k in self.comments:
                out.write("#. %s\n" % (self.comments[k].replace("\n","\n#. ")))
            references = " ".join("%s:%d(%s)" % (fname, lineno, tag)
                                  for (fname, tag, lineno) in self.linenos[k])
            out.write("#: %s\n" % (references.strip()))
            if k in self.nowrap and self.nowrap[k]:
                out.write("#, no-wrap\n")
            out.write("msgid \"%s\"\n" % (k))
            translation = ""
            if self.do_translations and self.translations:
                translation = self.translations.pop(0)
            if translation == k:
                translation = ""
            out.write("msgstr \"%s\"\n\n" % (translation))
138
139class XMLDocument(object):
140    def __init__(self, filename, app):
141        self.app = app
142        self.expand_entities = self.app.options.get('expand_entities')
143        self.ignored_tags = self.app.current_mode.getIgnoredTags()
144        ctxt = libxml2.createFileParserCtxt(filename)
145        ctxt.lineNumbers(1)
146        if self.app.options.get('expand_all_entities'):
147            ctxt.replaceEntities(1)
148        ctxt.parseDocument()
149        self.doc = ctxt.doc()
150        if self.doc.name != filename:
151            raise Exception("Error: I tried to open '%s' but got '%s' -- how did that happen?" % (filename, self.doc.name))
152        if self.app.msg:
153            self.app.msg.setFilename(filename)
154        self.isFinalNode = self.app.current_mode.isFinalNode
155
156    def generate_messages(self):
157        self.app.msg.setFilename(self.doc.name)
158        self.doSerialize(self.doc)
159
160    def normalizeNode(self, node):
161        #print >>sys.stderr, "<%s> (%s) [%s]" % (node.name, node.type, node.serialize('utf-8'))
162        if not node:
163            return
164        elif self.app.isSpacePreserveNode(node):
165            return
166        elif node.isText():
167            if node.isBlankNode():
168                if self.app.options.get('expand_entities') or \
169                  (not (node.prev and not node.prev.isBlankNode() and node.next and not node.next.isBlankNode()) ):
170                    #print >>sys.stderr, "BLANK"
171                    node.setContent('')
172            else:
173                node.setContent(re.sub('\s+',' ', node.content))
174
175        elif node.children and node.type == 'element':
176            child = node.children
177            while child:
178                self.normalizeNode(child)
179                child = child.next
180
181    def normalizeString(self, text, spacepreserve = False):
182        """Normalizes string to be used as key for gettext lookup.
183
184        Removes all unnecessary whitespace."""
185        if spacepreserve:
186            return text
187        try:
188            # Lets add document DTD so entities are resolved
189            dtd = self.doc.intSubset()
190            tmp = dtd.serialize('utf-8')
191            tmp = tmp + '<norm>%s</norm>' % text
192        except:
193            tmp = '<norm>%s</norm>' % text
194
195        try:
196            ctxt = libxml2.createDocParserCtxt(tmp)
197            if self.app.options.get('expand_entities'):
198                ctxt.replaceEntities(1)
199            ctxt.parseDocument()
200            tree = ctxt.doc()
201            newnode = tree.getRootElement()
202        except:
203            print >> sys.stderr, """Error while normalizing string as XML:\n"%s"\n""" % (text)
204            return text
205
206        self.normalizeNode(newnode)
207
208        result = ''
209        child = newnode.children
210        while child:
211            result += child.serialize('utf-8')
212            child = child.next
213
214        result = re.sub('^ ','', result)
215        result = re.sub(' $','', result)
216        tree.freeDoc()
217
218        return result
219
220    def stringForEntity(self, node):
221        """Replaces entities in the node."""
222        text = node.serialize('utf-8')
223        try:
224            # Lets add document DTD so entities are resolved
225            dtd = self.doc.intSubset()
226            tmp = dtd.serialize('utf-8') + '<norm>%s</norm>' % text
227            next = True
228        except:
229            tmp = '<norm>%s</norm>' % text
230            next = False
231
232        ctxt = libxml2.createDocParserCtxt(tmp)
233        if self.expand_entities:
234            ctxt.replaceEntities(1)
235        ctxt.parseDocument()
236        tree = ctxt.doc()
237        if next:
238            newnode = tree.children.next
239        else:
240            newnode = tree.children
241
242        result = ''
243        child = newnode.children
244        while child:
245            result += child.serialize('utf-8')
246            child = child.next
247        tree.freeDoc()
248        return result
249
250
251    def myAttributeSerialize(self, node):
252        result = ''
253        if node.children:
254            child = node.children
255            while child:
256                if child.type=='text':
257                    result += self.doc.encodeEntitiesReentrant(child.content)
258                elif child.type=='entity_ref':
259                    if not self.expand_entities:
260                        result += '&' + child.name + ';'
261                    else:
262                        result += child.content.decode('utf-8')
263                else:
264                    result += self.myAttributeSerialize(child)
265                child = child.next
266        else:
267            result = node.serialize('utf-8')
268        return result
269
270    def startTagForNode(self, node):
271        if not node:
272            return 0
273
274        result = node.name
275        params = ''
276        if node.properties:
277            for p in node.properties:
278                if p.type == 'attribute':
279                    try:
280                        nsprop = p.ns().name + ":" + p.name
281                    except:
282                        nsprop = p.name
283                    params += " %s=\"%s\"" % (nsprop, self.myAttributeSerialize(p))
284        return result+params
285
286    def endTagForNode(self, node):
287        if not node:
288            return False
289        return node.name
290
291    def ignoreNode(self, node):
292        if self.isFinalNode(node):
293            return False
294        if node.name in self.ignored_tags or node.type in ('dtd', 'comment'):
295            return True
296        return False
297
298    def getCommentForNode(self, node):
299        """Walk through previous siblings until a comment is found, or other element.
300
301        Only whitespace is allowed between comment and current node."""
302        prev = node.prev
303        while prev and prev.type == 'text' and prev.content.strip() == '':
304            prev = prev.prev
305        if prev and prev.type == 'comment':
306            return prev.content.strip()
307        else:
308            return None
309
310    def replaceAttributeContentsWithText(self, node, text):
311        node.setContent(text)
312
313    def replaceNodeContentsWithText(self, node, text):
314        """Replaces all subnodes of a node with contents of text treated as XML."""
315
316        if node.children:
317            starttag = self.startTagForNode(node)
318            endtag = self.endTagForNode(node)
319
320            # Lets add document DTD so entities are resolved
321            tmp = '<?xml version="1.0" encoding="utf-8" ?>'
322            try:
323                dtd = self.doc.intSubset()
324                tmp = tmp + dtd.serialize('utf-8')
325            except libxml2.treeError:
326                pass
327
328            content = '<%s>%s</%s>' % (starttag, text, endtag)
329            tmp = tmp + content.encode('utf-8')
330
331            newnode = None
332            try:
333                ctxt = libxml2.createDocParserCtxt(tmp)
334                ctxt.replaceEntities(0)
335                ctxt.parseDocument()
336                newnode = ctxt.doc()
337            except:
338                pass
339
340            if not newnode:
341                print >> sys.stderr, """Error while parsing translation as XML:\n"%s"\n""" % (text.encode('utf-8'))
342                return
343
344            newelem = newnode.getRootElement()
345
346            if newelem and newelem.children:
347                free = node.children
348                while free:
349                    next = free.next
350                    free.unlinkNode()
351                    free = next
352
353                if node:
354                    copy = newelem.copyNodeList()
355                    next = node.next
356                    node.replaceNode(newelem.copyNodeList())
357                    node.next = next
358
359            else:
360                # In practice, this happens with tags such as "<para>    </para>" (only whitespace in between)
361                pass
362        else:
363            node.setContent(text)
364
365    def hasText(self, node):
366        """Whether or not a node contains text
367
368        A node "contains text" if the node itself or one of its children
369        is a text node containing non-empty text.
370        """
371        if node.name in self.ignored_tags:
372            return False
373        if node.isText() and node.content.strip() != '':
374            return True
375        child = node.children
376        while child:
377            if child.isText() and child.content.strip() != '':
378                return True
379            else:
380                child = child.next
381        return False
382
383
384    def worthOutputting(self,node):
385        """Whether or not a node is worth outputting
386
387        A node is "worth outputting", if the node itself or one of its
388        children is a text node -- unless the node is not final and there
389        is a parent node which is already worth outputting.
390        """
391        worth = self.hasText(node)	# is or has non-empty text node
392        if not (self.isFinalNode(node) or node.get_name() in self.ignored_tags):
393            parent = node.get_parent()
394            while worth and parent:
395                if self.worthOutputting(parent):
396                    worth = False
397                else:
398                    parent = parent.get_parent()
399        return worth
400
401    def processAttribute(self, node, attr):
402        assert node and attr
403
404        outtxt = self.normalizeString(attr.content)
405        if self.app.operation == 'merge':
406            translation = self.app.getTranslation(outtxt)  # unicode or None
407            if translation is not None:
408                self.replaceAttributeContentsWithText(attr,
409                                                      translation.encode('utf-8'))
410        else:
411            self.app.msg.outputMessage(outtxt, node.lineNo(),  "", spacepreserve=False,
412                              tag = node.name + ":" + attr.name)
413
    def processElementTag(self, node, replacements, restart = False):
        """Process node with node.type == 'element'.

        Builds the message text for node from its children, extracts or
        merges the message when the node is worth outputting, and returns
        the pieces the caller needs to embed this element in its own text.

        node -- element node to process.
        replacements -- list of tuples for placeholder children; it is
                        appended to in place unless restart is true.
        restart -- when true, a fresh replacement list is started for
                   this node and `replacements` is not touched.

        Returns (starttag, outtxt, endtag, translation): the start and end
        tag text, the untranslated inner text (with placeholders), and the
        translated (or decoded original) inner text.

        Raises Exception when node is not an element.
        """
        if node.type != 'element':
            raise Exception("You must pass node with node.type=='element'.")

        # Translate attributes if needed
        if node.properties and self.app.current_mode.getTreatedAttributes():
            for p in node.properties:
                if p.name in self.app.current_mode.getTreatedAttributes():
                    self.processAttribute(node, p)

        outtxt = ''
        if restart:
            myrepl = []
        else:
            # Shares (and mutates) the caller's list, so placeholder
            # numbers keep counting across inlined elements.
            myrepl = replacements

        # Not used anywhere below.
        submsgs = []

        child = node.children
        while child:
            if (self.isFinalNode(child)) or (child.type == 'element' and self.worthOutputting(child)):
                # The child is a message of its own: process it with a fresh
                # replacement list and leave a numbered placeholder here.
                myrepl.append(self.processElementTag(child, myrepl, True))
                outtxt += '<placeholder-%d/>' % (len(myrepl))
            else:
                if child.type == 'element':
                    # Inline element: its markup becomes part of this message.
                    (starttag, content, endtag, translation) = self.processElementTag(child, myrepl, False)
                    outtxt += '<%s>%s</%s>' % (starttag, content, endtag)
                else:
                    outtxt += self.doSerialize(child)
            child = child.next

        if self.app.operation == 'merge':
            # Look the translation up under the normalized text.
            norm_outtxt = self.normalizeString(outtxt, self.app.isSpacePreserveNode(node))
            translation = self.app.getTranslation(norm_outtxt)
        else:
            translation = outtxt.decode('utf-8')

        starttag = self.startTagForNode(node)
        endtag = self.endTagForNode(node)

        worth = self.worthOutputting(node)
        if not translation:
            # No translation found: fall back to the original text and
            # optionally mark the node as untranslated.
            translation = outtxt.decode('utf-8')
            if worth and self.app.options.get('mark_untranslated'):
                node.setLang('C')

        if restart or worth:
            # Substitute each placeholder with the start tag, translation
            # and end tag recorded for the corresponding child.
            for i, repl in enumerate(myrepl):
                # repl[0] may contain translated attributes with
                # non-ASCII chars, so implicit conversion to <str> may fail
                replacement = '<%s>%s</%s>' % \
                              (repl[0].decode('utf-8'), repl[3], repl[2])
                translation = translation.replace('<placeholder-%d/>' % (i+1), replacement)

            if worth:
                if self.app.operation == 'merge':
                    self.replaceNodeContentsWithText(node, translation)
                else:
                    # Extraction: emit the normalized text as a message.
                    norm_outtxt = self.normalizeString(outtxt, self.app.isSpacePreserveNode(node))
                    self.app.msg.outputMessage(norm_outtxt, node.lineNo(), self.getCommentForNode(node), self.app.isSpacePreserveNode(node), tag = node.name)

        return (starttag, outtxt, endtag, translation)
477
478
479    def isExternalGeneralParsedEntity(self, node):
480        try:
481            # it would be nice if debugDumpNode could use StringIO, but it apparently cannot
482            tmp = tempfile.TemporaryFile()
483            node.debugDumpNode(tmp,0)
484            tmp.seek(0)
485            tmpstr = tmp.read()
486            tmp.close()
487        except:
488            # We fail silently, and replace all entities if we cannot
489            # write .xml2po-entitychecking
490            # !!! This is not very nice thing to do, but I don't know if
491            #     raising an exception is any better
492            return False
493        return tmpstr.find('EXTERNAL_GENERAL_PARSED_ENTITY') != -1
494
495    def doSerialize(self, node):
496        """Serializes a node and its children, emitting PO messages along the way.
497
498        node is the node to serialize, first indicates whether surrounding
499        tags should be emitted as well.
500        """
501
502        if self.ignoreNode(node):
503            return ''
504        elif not node.children:
505            return node.serialize("utf-8")
506        elif node.type == 'entity_ref':
507            if self.isExternalGeneralParsedEntity(node):
508                return node.serialize('utf-8')
509            else:
510                return self.stringForEntity(node) #content #content #serialize("utf-8")
511        elif node.type == 'entity_decl':
512            return node.serialize('utf-8') #'<%s>%s</%s>' % (startTagForNode(node), node.content, node.name)
513        elif node.type == 'text':
514            return node.serialize('utf-8')
515        elif node.type == 'element':
516            repl = []
517            (starttag, content, endtag, translation) = self.processElementTag(node, repl, True)
518            return '<%s>%s</%s>' % (starttag, content, endtag)
519        else:
520            child = node.children
521            outtxt = ''
522            while child:
523                outtxt += self.doSerialize(child)
524                child = child.next
525            return outtxt
526
def xml_error_handler(arg, ctxt):
    """Error callback that does nothing, silencing libxml2 error messages.

    Main.__init__ registers it with libxml2.registerErrorHandler().
    """
    return None
530
531class Main(object):
532    def __init__(self, mode, operation, output, options):
533        libxml2.registerErrorHandler(xml_error_handler, None)
534        self.operation = operation
535        self.options = options
536        self.msg = None
537        self.gt = None
538        self.current_mode = self.load_mode(mode)()
539        # Prepare output
540        if operation == 'update':
541            self.out = tempfile.TemporaryFile()
542        elif output == '-':
543            self.out = sys.stdout
544        else:
545            self.out = file(output, 'w')
546
547    def load_mode(self, modename):
548        try:
549            module = __import__('xml2po.modes.%s' % modename, globals(), locals(), ['%sXmlMode' % modename])
550            return getattr(module, '%sXmlMode' % modename)
551        except (ImportError, AttributeError):
552            if modename == 'basic':
553                sys.stderr.write("Unable to find xml2po modes. Please check your xml2po installation.\n")
554                sys.exit(1)
555            else:
556                sys.stderr.write("Unable to load mode '%s'. Falling back to 'basic' mode with automatic detection (-a).\n" % modename)
557                return self.load_mode('basic')
558
559    def to_pot(self, xmlfiles):
560        """ Produce a pot file from the list of 'xmlfiles' """
561        self.msg = MessageOutput(self)
562        for xmlfile in xmlfiles:
563            if not os.access(xmlfile, os.R_OK):
564                raise IOError("Unable to read file '%s'" % xmlfile)
565            try:
566                doc = XMLDocument(xmlfile, self)
567            except Exception as e:
568                print >> sys.stderr, "Unable to parse XML file '%s': %s" % (xmlfile, str(e))
569                sys.exit(1)
570            self.current_mode.preProcessXml(doc.doc, self.msg)
571            doc.generate_messages()
572        self.output_po()
573
574    def merge(self, mofile, xmlfile):
575        """ Merge translations from mofile into xmlfile to generate a translated XML file """
576        if not os.access(xmlfile, os.R_OK):
577            raise IOError("Unable to read file '%s'" % xmlfile)
578        try:
579            doc = XMLDocument(xmlfile, self)
580        except Exception as e:
581            print >> sys.stderr, str(e)
582            sys.exit(1)
583
584        try:
585            mfile = open(mofile, "rb")
586        except:
587            print >> sys.stderr, "Can't open MO file '%s'." % (mofile)
588        self.gt = gettext.GNUTranslations(mfile)
589        self.gt.add_fallback(NoneTranslations())
590        # Has preProcessXml use cases for merge?
591        #self.current_mode.preProcessXml(doc.doc, self.msg)
592
593        doc.doSerialize(doc.doc)
594        tcmsg = self.current_mode.getStringForTranslators()
595        outtxt = self.getTranslation(tcmsg)
596        self.current_mode.postProcessXmlTranslation(doc.doc, self.options.get('translationlanguage'), outtxt)
597        self.out.write(doc.doc.serialize('utf-8', 1))
598
599    def reuse(self, origxml, xmlfile):
600        """ Produce a po file from xmlfile pot and using translations from origxml """
601        self.msg = MessageOutput(self)
602        self.msg.do_translations = True
603        if not os.access(xmlfile, os.R_OK):
604            raise IOError("Unable to read file '%s'" % xmlfile)
605        if not os.access(origxml, os.R_OK):
606            raise IOError("Unable to read file '%s'" % xmlfile)
607        try:
608            doc = XMLDocument(xmlfile, self)
609        except Exception as e:
610            print >> sys.stderr, str(e)
611            sys.exit(1)
612        doc.generate_messages()
613
614        self.msg.translationsFollow()
615        try:
616            doc = XMLDocument(origxml, self)
617        except Exception as e:
618            print >> sys.stderr, str(e)
619            sys.exit(1)
620        doc.generate_messages()
621        self.output_po()
622
623    def update(self, xmlfiles, lang_file):
624        """ Merge the produced pot with an existing po file (lang_file) """
625        if not os.access(lang_file, os.W_OK):
626            raise IOError("'%s' does not exist or is not writable." % lang_file)
627        self.to_pot(xmlfiles)
628        lang = os.path.basename(lang_file).split(".")[0]
629
630        sys.stderr.write("Merging translations for %s: \n" % (lang))
631        self.out.seek(0)
632        merge_cmd = subprocess.Popen(["msgmerge", "-o", ".tmp.%s.po" % lang, lang_file, "-"],
633                                     stdin=self.out, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
634        cmdout, cmderr = merge_cmd.communicate()
635        if merge_cmd.returncode:
636             raise Exception("Error during msgmerge command.")
637        else:
638            result = subprocess.call(["mv", ".tmp.%s.po" % lang, lang_file])
639            if result:
640                raise Exception("Error: cannot rename file.")
641            else:
642                subprocess.call(["msgfmt", "-cv", "-o", NULL_STRING, lang_file])
643
644    def getTranslation(self, text):
645        """Returns a translation via gettext for specified snippet.
646
647        text should be a string to look for.
648        """
649        #print >>sys.stderr,"getTranslation('%s')" % (text.encode('utf-8'))
650        if not text or text.strip() == '':
651            return text
652        if self.gt:
653            res = self.gt.ugettext(text.decode('utf-8'))
654            return res
655
656        return text
657
658    def output_po(self):
659        """ Write the resulting po/pot file to specified output """
660        tcmsg = self.current_mode.getStringForTranslators()
661        tccom = self.current_mode.getCommentForTranslators()
662        if tcmsg:
663            self.msg.outputMessage(tcmsg, lineno=0, comment=tccom)
664
665        self.msg.outputAll(self.out)
666
667    # **** XML utility functions ****
668    def isSpacePreserveNode(self, node):
669        if node.getSpacePreserve() == 1:
670            return True
671        else:
672            return node.name in self.current_mode.getSpacePreserveTags()
673
674