1#
2# Copyright 2002-2013 Zuza Software Foundation
3#
4# This file is part of translate.
5#
6# translate is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 2 of the License, or
9# (at your option) any later version.
10#
11# translate is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, see <http://www.gnu.org/licenses/>.
18
19r"""Classes that hold units of .dtd files (:class:`dtdunit`) or entire files
20(:class:`dtdfile`).
21
22These are specific .dtd files for localisation used by mozilla.
23
24Specifications
25    The following information is provided by Mozilla:
26
27    `Specification <http://www.w3.org/TR/REC-xml/#sec-entexpand>`_
28
29    There is a grammar for entity definitions, which isn't really precise,
30    as the spec says.  There's no formal specification for DTD files, it's
31    just "whatever makes this work" basically. The whole piece is clearly not
32    the strongest point of the xml spec
33
34    XML elements are allowed in entity values. A number of things that are
35    allowed will just break the resulting document, Mozilla forbids these
36    in their DTD parser.
37
38Dialects
39    There are two dialects:
40
41    - Regular DTD
42    - Android DTD
43
44    Both dialects are similar, but the Android DTD uses some particular escapes
45    that regular DTDs don't have.
46
47Escaping in regular DTD
    In DTD usually there are characters escaped in the entities. In order to
    ease the translation some of those escaped characters are unescaped when
    reading from, or converting, the DTD, and are escaped again when
    saving, or converting to a DTD.
52
53    In regular DTD the following characters are usually or sometimes escaped:
54
55    - The % character is escaped using &#037; or &#37; or &#x25;
56    - The " character is escaped using &quot;
57    - The ' character is escaped using &apos; (partial roundtrip)
58    - The & character is escaped using &amp;
59    - The < character is escaped using &lt; (not yet implemented)
60    - The > character is escaped using &gt; (not yet implemented)
61
    Besides the previous ones there are a lot of escapes for a huge number of
    characters. These escapes usually have the form of &#NUMBER; where NUMBER
    represents the numerical code for the character.
65
    There are a few particularities in DTD escaping. Some of the escapes are
    not yet implemented since they are not really necessary, or because their
    implementation is too hard.
69
70    A special case is the ' escaping using &apos; which doesn't provide a full
71    roundtrip conversion in order to support some special Mozilla DTD files.
72
73    Also the " character is never escaped in the case that the previous
74    character is = (the sequence =" is present on the string) in order to avoid
75    escaping the " character indicating an attribute assignment, for example in
76    a href attribute for an a tag in HTML (anchor tag).
77
78Escaping in Android DTD
    It has the same escapes as in regular DTD, plus these ones:
80
81    - The ' character is escaped using \&apos; or \' or \u0027
82    - The " character is escaped using \&quot;
83"""
84
85import re
86import warnings
87from io import BytesIO
88
89from lxml import etree
90
91from translate.misc import quote
92from translate.storage import base
93
94
labelsuffixes = (".label", ".title")
"""Label suffixes: entries with this suffix are able to be combined with
accesskeys found in entries ending with :attr:`.accesskeysuffixes`"""
accesskeysuffixes = (".accesskey", ".accessKey", ".akey")
"""Accesskey suffixes: entries with this suffix may be combined with labels
ending in :attr:`.labelsuffixes` into accelerator notation"""
101
102
def quoteforandroid(source):
    r"""Quote and escape *source* as an entity value for an Android DTD file.

    Apostrophes are written as ``\u0027`` and double quotes as ``\&quot;``;
    the result is then passed through :func:`quotefordtd`.
    """
    # "\\&apos;" or "\\'" would be alternative escapes for the apostrophe;
    # the \u0027 form is the one emitted here.
    android_escaped = source.replace("'", "\\u0027").replace('"', "\\&quot;")
    return quotefordtd(android_escaped)
111
112
def unquotefromandroid(source):
    r"""Return the plain text of a quoted Android DTD definition.

    The definition is first unquoted with :func:`unquotefromdtd`; the
    Android-specific backslash escapes for ``'`` and ``"`` are then replaced
    by the plain characters.
    """
    text = unquotefromdtd(source)
    # Applied one after another, in this order.
    android_escapes = (
        ("\\&apos;", "'"),
        ("\\'", "'"),
        ("\\u0027", "'"),
        # Covers \&quot; -- presumably its &quot; part was already decoded to
        # a plain " by unquotefromdtd.
        ('\\"', '"'),
    )
    for escaped, plain in android_escapes:
        text = text.replace(escaped, plain)
    return text
121
122
# Code point -> entity name table that quotefordtd() hands to
# quote.entityencode(); the name is what appears between "&" and ";" in the
# written DTD (e.g. "%" is written as &#037;).
_DTD_CODEPOINT2NAME = {
    ord("%"): "#037",  # Always escape % sign as &#037;.
    ord("&"): "amp",
    # ord("<"): "lt",  # Not really so useful.
    # ord(">"): "gt",  # Not really so useful.
}
129
130
def quotefordtd(source):
    """Quote and escape *source* as an entity value for a regular DTD file.

    The characters listed in ``_DTD_CODEPOINT2NAME`` are entity-encoded
    first.  The value is then wrapped in double quotes, except when it
    contains the sequence ``="`` (an attribute assignment such as an href),
    in which case it is wrapped in single quotes and its double quotes are
    left untouched.
    """
    encoded = quote.entityencode(source, _DTD_CODEPOINT2NAME)

    # Nothing that clashes with the delimiter: plain double-quoted value.
    if '"' not in encoded:
        return '"' + encoded + '"'

    # Apostrophes are only escaped when the value also holds a double quote.
    encoded = encoded.replace("'", "&apos;")

    # Keep the " of attribute assignments intact and delimit with ' instead.
    if '="' in encoded:
        return "'" + encoded + "'"

    return '"' + encoded.replace('"', "&quot;") + '"'
144
145
# Entity name -> code point table that unquotefromdtd() hands to
# quote.entitydecode(); the keys are the text between "&" and ";", so the
# numeric references are listed literally ("#037", "#37", "#x25", ...).
_DTD_NAME2CODEPOINT = {
    "quot": ord('"'),
    "amp": ord("&"),
    # "lt": ord("<"),  # Not really so useful.
    # "gt": ord(">"),  # Not really so useful.
    # FIXME these should probably be handled in a more general way
    "#x0022": ord('"'),
    "#187": ord("»"),
    "#037": ord("%"),
    "#37": ord("%"),
    "#x25": ord("%"),
}
158
159
def unquotefromdtd(source):
    """Strip the quoting from a DTD definition and decode its entities.

    The first character of *source* is taken as the quote character.  An
    empty *source* is handled as an empty double-quoted string.  ``&apos;``
    is only turned back into ``'`` for single-quoted definitions; the
    remaining entities are decoded through ``_DTD_NAME2CODEPOINT``.
    """
    # Treat a missing definition as "".
    if len(source) == 0:
        source = '""'
    # The delimiter is expected to be both the first and the last character
    # of the definition; it may also occur inside the string.
    delimiter = source[0]
    body, _ = quote.extractwithoutquotes(
        source, delimiter, delimiter, allowreentry=False
    )
    if delimiter == "'":
        body = body.replace("&apos;", "'")
    return quote.entitydecode(body, _DTD_NAME2CODEPOINT)
174
175
def removeinvalidamps(name, value):
    """Find and remove ampersands that are not part of an entity definition.

    A stray & in a DTD file can break an application's ability to parse the
    file. In Mozilla localisation this is very important and these can break the
    parsing of files used in XUL and thus break interface rendering. Tracking
    down the problem is very difficult, thus by removing potential broken
    ampersand and warning the users we can ensure that the output DTD will
    always be parsable.

    An ampersand is kept only when a ``;`` follows it and the text between the
    two is a valid entity name: either alphanumeric once ``.`` and ``_`` are
    ignored, or ``#`` followed by alphanumerics.  Anything else, including the
    empty reference ``&;``, counts as invalid and the ampersand is dropped.
    A single warning is issued if at least one ampersand was removed.

    :type name: String
    :param name: Entity name (only used in the warning message)
    :type value: String
    :param value: Entity text value
    :rtype: String
    :return: Entity value without bad ampersands
    """

    def is_valid_entity_name(candidate):
        """Check that *candidate* (the text between & and ;) is an entity name."""
        if candidate.replace(".", "").replace("_", "").isalnum():
            return True
        # startswith() rather than candidate[0]: the candidate is empty for
        # "&;" and indexing it would raise IndexError.
        return candidate.startswith("#") and candidate[1:].isalnum()

    # Positions of the ampersands that must be dropped.
    invalid_amps = set()
    amppos = value.find("&")
    while amppos != -1:
        semipos = value.find(";", amppos + 1)
        if semipos == -1 or not is_valid_entity_name(value[amppos + 1 : semipos]):
            invalid_amps.add(amppos)
        # Resume right after this ampersand so that one nested in a rejected
        # candidate (e.g. "& &amp;") is still examined on its own.
        amppos = value.find("&", amppos + 1)

    if not invalid_amps:
        return value

    warnings.warn("invalid ampersands in dtd entity %s" % (name))
    return "".join(
        char for index, char in enumerate(value) if index not in invalid_amps
    )
220
221
class dtdunit(base.TranslationUnit):
    """An entity definition from a DTD file (and any associated comments).

    The quoted and escaped entity value is kept in ``definition``.  Both
    ``source`` and ``target`` read and write that single attribute, using the
    Android or the regular DTD quoting functions depending on ``android``.
    """

    def __init__(self, source="", android=False):
        """Construct the dtdunit and prepare it for parsing.

        :param source: Unquoted text used as the initial entity value
        :param android: Use the Android DTD quoting functions instead of the
            regular DTD ones
        """
        # Set before anything else: the source/target setters read it.
        # NOTE(review): presumably the base constructor assigns source --
        # confirm against base.TranslationUnit.
        self.android = android

        super().__init__(source)
        # (commenttype, text) tuples; written out first by getoutput()
        self.comments = []
        # lines found outside any comment or entity; written after the comments
        self.unparsedlines = []
        # parser state flags used by parse()
        self.incomment = False
        self.inentity = False
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        self.source = source
        # layout pieces that getoutput() puts around the name and definition
        # of an internal entity
        self.space_pre_entity = " "
        self.space_pre_definition = " "
        self.closing = ">"

    # Note that source and target are equivalent for monolingual units
    @property
    def source(self):
        """Return the definition unquoted with the dialect's unquote function."""
        if self.android:
            return unquotefromandroid(self.definition)
        else:
            return unquotefromdtd(self.definition)

    @source.setter
    def source(self, source):
        """Set the definition to the quoted value of *source*."""
        if self.android:
            self.definition = quoteforandroid(source)
        else:
            self.definition = quotefordtd(source)
        # reset the rich-text attribute alongside the definition
        self._rich_source = None

    @property
    def target(self):
        """Return the definition unquoted with the dialect's unquote function."""
        if self.android:
            return unquotefromandroid(self.definition)
        else:
            return unquotefromdtd(self.definition)

    @target.setter
    def target(self, target):
        """Set the definition to the quoted value of *target*.

        ``None`` is stored as an empty string.
        """
        if target is None:
            target = ""
        if self.android:
            self.definition = quoteforandroid(target)
        else:
            self.definition = quotefordtd(target)
        # reset the rich-text attribute alongside the definition
        self._rich_target = None

    def getid(self):
        """Return the entity name, which serves as the unit id."""
        return self.entity

    def setid(self, new_id):
        """Use *new_id* as the entity name."""
        self.entity = new_id

    def getlocations(self):
        """Return the entity as location (identifier)."""
        # sanity check: the entity name must not carry a trailing end of line
        assert quote.rstripeol(self.entity) == self.entity
        return [self.entity]

    def addlocation(self, location):
        """Set the entity to the given "location"."""
        self.entity = location

    def isblank(self):
        """Return whether this dtdunit doesn't actually have an entity definition."""
        # for dtds, we currently return a blank string if there is no .entity (==location in other files)
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def istranslatable(self):
        """Return False for blank units and for entities whose parameter is SYSTEM."""
        # entityparameter only exists once parse() has seen an external (%)
        # entity, hence the getattr
        if getattr(self, "entityparameter", None) == "SYSTEM" or self.isblank():
            return False
        return True

    def parse(self, dtdsrc):
        """Read the first DTD element from *dtdsrc* into this object.

        The text is processed line by line.  Comments are collected into
        ``comments``, lines outside comments and entities into
        ``unparsedlines``, and the first ``<!ENTITY`` declaration fills
        ``entity``, ``definition`` and the layout attributes.  Processing
        stops as soon as that declaration's quoted definition is complete.

        :param dtdsrc: DTD source text
        :return: Number of lines consumed (0 when *dtdsrc* is empty)
        :raises ValueError: If the definition does not start with ``'`` or ``"``
        """
        self.comments = []
        # make all the lists the same: every kind of note is appended to the
        # one shared list
        self._locfilenotes = self.comments
        self._locgroupstarts = self.comments
        self._locgroupends = self.comments
        self._locnotes = self.comments
        # self._locfilenotes = []
        # self._locgroupstarts = []
        # self._locgroupends = []
        # self._locnotes = []
        # self.comments = []
        self.entity = None
        self.definition = ""
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            # split() dropped the separators; put the end of line back
            line += "\n"
            linesprocessed += 1
            if not self.incomment:
                if line.find("<!--") != -1:
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find("LOCALIZATION NOTE") != -1:
                        l = quote.findend(comment, "LOCALIZATION NOTE")
                        # skip the spaces between the marker and its keyword
                        while comment[l] == " ":
                            l += 1
                        if comment.find("FILE", l) == l:
                            self.commenttype = "locfile"
                        elif comment.find("BEGIN", l) == l:
                            self.commenttype = "locgroupstart"
                        elif comment.find("END", l) == l:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                # FIXME: the entity might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # a line matching %...; outside an entity declaration is
                    # kept verbatim as a plain comment and not parsed further
                    self.comments.append(("comment", line))
                    line = ""
                    continue

            if self.incomment:
                # some kind of comment
                # NOTE(review): the second value is stored as the "still in
                # comment" flag and fed back in on the next line -- confirm
                # against quote.extract
                (comment, self.incomment) = quote.extract(
                    line, "<!--", "-->", None, self.continuecomment
                )
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ""
                    else:
                        comment += "\n"
                # check if there's actually an entity definition that's commented out
                # TODO: parse these, store as obsolete messages
                # if comment.find('<!ENTITY') != -1:
                #     # remove the entity from the comment
                #     comment, dummy = quote.extractwithoutquotes(comment, ">", "<!ENTITY", None, 1)
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self._locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self._locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self._locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self._locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find("<!ENTITY")
                if entitypos != -1:
                    self.inentity = True
                    # text before <!ENTITY is only remembered when it starts
                    # with "#"; getoutput() writes it back in front of the entity
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                # entitypart moves through "start" -> "name" ->
                # ("parameter" ->) "definition"; several steps can be taken
                # on the same line
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line, "<!ENTITY")
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    s = 0
                    e = 0
                    # whitespace between <!ENTITY and the name
                    while e < len(line) and line[e].isspace():
                        e += 1
                    self.space_pre_entity = " " * (e - s)
                    s = e
                    self.entity = ""
                    # a leading % marks the entity as external; it is then
                    # followed by a parameter before the definition
                    if e < len(line) and line[e] == "%":
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while e < len(line) and line[e].isspace():
                            e += 1
                    # the name runs up to the next whitespace
                    while e < len(line) and not line[e].isspace():
                        self.entity += line[e]
                        e += 1
                    s = e

                    assert quote.rstripeol(self.entity) == self.entity
                    # whitespace between the name and what follows it
                    while e < len(line) and line[e].isspace():
                        e += 1
                    self.space_pre_definition = " " * (e - s)
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # the rest of the declaration is on a later line
                            self.entityhelp = None
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while e < len(line) and line[e].isspace():
                        e += 1
                    paramstart = e
                    # the parameter is a run of alphanumeric characters
                    while e < len(line) and line[e].isalnum():
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while e < len(line) and line[e].isspace():
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    # a quote character starts the definition
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        # the definition starts on this line: find its first
                        # non-whitespace character
                        e = 0
                        while e < len(line) and line[e].isspace():
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # actually the lines below should remember instring, rather than using it as dummy
                    e = self.entityhelp[0]
                    if self.entityhelp[1] == "'":
                        (defpart, self.instring) = quote.extract(
                            line[e:],
                            "'",
                            "'",
                            startinstring=self.instring,
                            allowreentry=False,
                        )
                    elif self.entityhelp[1] == '"':
                        (defpart, self.instring) = quote.extract(
                            line[e:],
                            '"',
                            '"',
                            startinstring=self.instring,
                            allowreentry=False,
                        )
                    else:
                        raise ValueError(
                            "Unexpected quote character... %r" % (self.entityhelp[1])
                        )
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # the definition is complete: keep whatever follows it
                        # on the line (minus the end of line) and stop, one
                        # entity per unit
                        self.closing = line[e + len(defpart) :].rstrip("\n\r")
                        self.inentity = False
                        break

        return linesprocessed

    def __str__(self):
        """Convert to a string."""
        return self.getoutput()

    def getoutput(self):
        """Convert the dtd entity back to string form.

        Comments come first, then the unparsed lines, then the entity line.
        A blank unit yields only the comments and unparsed lines, with
        trailing whitespace replaced by a single end of line.
        """
        lines = []
        lines.extend([comment for commenttype, comment in self.comments])
        lines.extend(self.unparsedlines)
        if self.isblank():
            result = "".join(lines)
            return result.rstrip() + "\n"
        # for f in self._locfilenotes: yield f
        # for ge in self._locgroupends: yield ge
        # for gs in self._locgroupstarts: yield gs
        # for n in self._locnotes: yield n
        if len(self.entity) > 0:
            # entitytype and hashprefix are only set by parse(), hence the
            # getattr
            if getattr(self, "entitytype", None) == "external":
                # external entities are written with single spaces rather
                # than the recorded layout
                entityline = (
                    "<!ENTITY % "
                    + self.entity
                    + " "
                    + self.entityparameter
                    + " "
                    + self.definition
                    + self.closing
                )
            else:
                entityline = (
                    "<!ENTITY"
                    + self.space_pre_entity
                    + self.entity
                    + self.space_pre_definition
                    + self.definition
                    + self.closing
                )
            if getattr(self, "hashprefix", None):
                entityline = self.hashprefix + " " + entityline
            lines.append(entityline + "\n")
        return "".join(lines)
540
541
class dtdfile(base.TranslationStore):
    """A .dtd file made up of dtdunits."""

    UnitClass = dtdunit

    def __init__(self, inputfile=None, android=False):
        """Construct a dtdfile, optionally reading in from *inputfile*.

        :param inputfile: Object with a ``read()`` method returning the DTD
            source as bytes; its ``name`` attribute, if any, becomes
            ``filename``
        :param android: Treat the content as Android DTD (Android quoting
            rules for the units, no DTD validation on output)
        """
        super().__init__()
        self.filename = getattr(inputfile, "name", "")
        self.android = android
        if inputfile is not None:
            dtdsrc = inputfile.read()
            self.parse(dtdsrc)

    def parse(self, dtdsrc):
        """Read the source code of a dtd file in and add its dtdunits to self.units.

        The source is cut into windows of lines.  A window is extended until a
        line containing ``<!ENTITY`` has been seen and a line starts with a
        quote character followed by optional whitespace and ``>``, or until
        the input ends.  Units are then parsed out of the window one after
        another until a unit consumes no line.  A parse error is reported as
        a warning instead of being raised.

        :param dtdsrc: DTD source as bytes
        """
        start = 0
        end = 0
        lines = dtdsrc.split(b"\n")
        while end < len(lines):
            if start == end:
                end += 1
            foundentity = False
            while end < len(lines):
                if b"<!ENTITY" in lines[end]:
                    foundentity = True
                # re.match is anchored: only a line *starting* with the
                # closing quote and > ends the window here
                if foundentity and re.match(br"[\"']\s*>", lines[end]):
                    end += 1
                    break
                end += 1

            linesprocessed = 1  # to initialise loop
            while linesprocessed >= 1:
                newdtd = dtdunit(android=self.android)
                try:
                    linesprocessed = newdtd.parse(
                        (b"\n".join(lines[start:end])).decode(self.encoding)
                    )
                    # keep real entities, and blank units that still carry
                    # unparsed lines
                    if linesprocessed >= 1 and (
                        not newdtd.isblank() or newdtd.unparsedlines
                    ):
                        self.units.append(newdtd)
                except Exception as e:
                    # NOTE(review): linesprocessed keeps the value from the
                    # previous iteration here, so start advances by that many
                    # lines -- confirm this is the intended recovery
                    warnings.warn(
                        "%s\nError occured between lines %d and %d:\n%s"
                        % (e, start + 1, end, b"\n".join(lines[start:end]))
                    )
                start += linesprocessed

    def serialize(self, out):
        """Write the store to *out*, or nothing at all if it does not validate.

        The whole content is built and validated before anything is written,
        so an invalid store leaves *out* untouched.  This avoids writing and
        then calling ``truncate(0)``, which does not move the stream position
        and is not supported by every stream.  Unlike that earlier behaviour,
        content already present in *out* is not wiped when the store is
        invalid.

        :param out: Binary stream with a ``write()`` method
        """
        content = b"".join(str(dtd).encode(self.encoding) for dtd in self.units)
        if not self._valid_store(content):
            warnings.warn("DTD file '%s' does not validate" % self.filename)
            return
        out.write(content)

    def _valid_store(self, content):
        """Validate the store to determine if it is valid

        This uses lxml's ``etree.DTD`` to parse the content.

        :param content: Serialized store as bytes
        :return: If the store passes validation
        :rtype: Boolean
        """
        # Android files are invalid DTDs, so they are never checked
        if self.android:
            return True
        # #expand is a Mozilla hack and is removed as it is not valid in DTDs
        _input = content.replace(b"#expand", b"")
        try:
            etree.DTD(BytesIO(_input))
        except etree.DTDParseError as e:
            warnings.warn("DTD parse error: %s" % e.error_log)
            return False
        return True
622