1#
2# Copyright 2004-2006,2008 Zuza Software Foundation
3#
4# This file is part of translate.
5#
6# translate is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 2 of the License, or
9# (at your option) any later version.
10#
11# translate is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, see <http://www.gnu.org/licenses/>.
18#
19
20"""module for parsing html files for translation"""
21
22import html.parser
23import re
24from html.entities import html5
25
26from translate.storage import base
27from translate.storage.base import ParseError
28
29
# html.parser ends a processing instruction at the first ">" by default.
# Swap its ``piclose`` pattern for one that only matches "?>", otherwise HTML
# markup embedded inside a processing instruction would be consumed as the
# end of the instruction.
_PI_CLOSE_RE = re.compile(r"\?>")
html.parser.piclose = _PI_CLOSE_RE
33
34
class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content.

    On top of the base unit it keeps a plain list of location strings.
    """

    def __init__(self, source=None):
        """Create the unit for ``source`` with an empty list of locations."""
        super().__init__(source)
        self.locations = []

    def addlocation(self, location):
        """Append ``location`` to the locations recorded for this unit."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of recorded locations (the list itself, not a copy)."""
        return self.locations
47
48
class htmlfile(html.parser.HTMLParser, base.TranslationStore):
    """Translation store that extracts translation units from an HTML document.

    The document is parsed with :class:`html.parser.HTMLParser`; while parsing,
    translation units are added to the store and the (optionally translated)
    document is rebuilt in ``filesrc``.
    """

    UnitClass = htmlunit

    TRANSLATABLE_ELEMENTS = [
        "address", "article", "aside", "blockquote", "caption",
        "dd", "dt", "div", "figcaption", "footer", "header",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "li", "main", "nav", "option", "p", "pre", "section",
        "td", "th", "title",
    ]
    """These HTML elements (tags) will be extracted as translation units, unless
    they lack translatable text content.
    In case one translatable element is embedded in another, the outer translation
    unit will be split into the parts before and after the inner translation unit."""

    TRANSLATABLE_ATTRIBUTES = [
        "abbr",  # abbreviation for a table header cell
        "alt",
        "lang",  # only for the html element -- see extract_translatable_attributes()
        "summary",
        "title",  # tooltip text for an element
        "value",
    ]
    """Text from these HTML attributes will be extracted as translation units.
    Note: the content attribute of meta tags is a special case."""

    TRANSLATABLE_METADATA = ["description", "keywords"]
    """Document metadata from meta elements with these names will be extracted as translation units.
    Reference `<https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name>`_"""

    EMPTY_HTML_ELEMENTS = [
        "area", "base", "br", "col", "embed", "hr", "img",
        "input", "link", "meta", "param", "source", "track", "wbr",
    ]
    """An empty element is an element that cannot have any child nodes (i.e., nested
    elements or text nodes). In HTML, using a closing tag on an empty element is
    usually invalid.
    Reference `<https://developer.mozilla.org/en-US/docs/Glossary/Empty_element>`_"""

    # any run of whitespace; used to collapse whitespace to a single space
    WHITESPACE_RE = re.compile(r"\s+")

    # whitespace at the very start / very end of a string
    LEADING_WHITESPACE_RE = re.compile(r"^(\s+)")

    TRAILING_WHITESPACE_RE = re.compile(r"(\s+)$")

    # bytes pattern capturing the charset named in the content attribute of a
    # meta tag; whitespace inside the pattern is ignored (re.VERBOSE)
    ENCODING_RE = re.compile(
        br"""<meta.*
                                content.*=.*?charset.*?=\s*?
                                ([^\s]*)
                                \s*?["']\s*?>
                             """,
        re.VERBOSE | re.IGNORECASE,
    )
136
137    def __init__(self, inputfile=None, callback=None):
138        super().__init__(convert_charrefs=False)
139        base.TranslationStore.__init__(self)
140
141        # store parameters
142        self.filename = getattr(inputfile, "name", None)
143        if callback is None:
144            self.callback = self._simple_callback
145        else:
146            self.callback = callback
147
148        # initialize state
149        self.filesrc = ""
150        self.tag_path = []
151        self.tu_content = []
152        self.tu_location = None
153
154        # parse
155        if inputfile is not None:
156            htmlsrc = inputfile.read()
157            inputfile.close()
158            self.parse(htmlsrc)
159
160    def _simple_callback(self, string):
161        return string
162
163    def guess_encoding(self, htmlsrc):
164        """Returns the encoding of the html text.
165
166        We look for 'charset=' within a meta tag to do this.
167        """
168        result = self.ENCODING_RE.findall(htmlsrc)
169        if result:
170            self.encoding = result[0].decode("ascii")
171        return self.encoding
172
173    def do_encoding(self, htmlsrc):
174        """Return the html text properly encoded based on a charset."""
175        self.guess_encoding(htmlsrc)
176        return htmlsrc.decode(self.encoding)
177
178    def parse(self, htmlsrc):
179        htmlsrc = self.do_encoding(htmlsrc)
180        self.feed(htmlsrc)
181
182    def begin_translation_unit(self):
183        # at the start of a translation unit:
184        # this interrupts any translation unit in progress, so process the queue
185        # and prepare for the new.
186        self.emit_translation_unit()
187        self.tu_content = []
188        self.tu_location = "%s+%s:%d-%d" % (
189            self.filename,
190            ".".join(self.tag_path),
191            self.getpos()[0],
192            self.getpos()[1] + 1,
193        )
194
195    def end_translation_unit(self):
196        # at the end of a translation unit:
197        # process the queue and reset state.
198        self.emit_translation_unit()
199        self.tu_content = []
200        self.tu_location = None
201
202    def append_markup(self, markup):
203        # if within a translation unit: add to the queue to be processed later.
204        # otherwise handle immediately.
205        if self.tu_location:
206            self.tu_content.append(markup)
207        else:
208            self.emit_attribute_translation_units(markup)
209            self.filesrc += markup["html_content"]
210
211    def emit_translation_unit(self):
212        # scan through the queue:
213        # - find the first and last translatable markup elements: the captured
214        #   interval [start, end)
215        # - match start and end tags
216        start = 0
217        end = 0
218        tagstack = []
219        tagmap = {}
220        tag = None
221        do_normalize = True
222        for pos, content in enumerate(self.tu_content):
223            if content["type"] != "endtag" and tag in self.EMPTY_HTML_ELEMENTS:
224                match = tagstack.pop()
225                tag = None
226
227            if self.has_translatable_content(content):
228                if end == 0:
229                    start = pos
230                end = pos + 1
231            elif content["type"] == "starttag":
232                tagstack.append(pos)
233                tag = content["tag"]
234                if tag == "pre":
235                    do_normalize = False
236            elif content["type"] == "endtag":
237                if tagstack:
238                    match = tagstack.pop()
239                    tagmap[match] = pos
240                    tagmap[pos] = match
241                tag = None
242
243        # if no translatable content found: process all the content in the queue
244        # as if the translation unit didn't exist.
245        if end == 0:
246            for markup in self.tu_content:
247                self.emit_attribute_translation_units(markup)
248                self.filesrc += markup["html_content"]
249            return
250
251        # scan the start and end tags captured between translatable content;
252        # extend the captured interval to include the matching tags
253        for pos in range(start + 1, end - 1):
254            if (
255                self.tu_content[pos]["type"] == "starttag"
256                or self.tu_content[pos]["type"] == "endtag"
257            ) and pos in tagmap:
258                match = tagmap[pos]
259                start = min(start, match)
260                end = max(end, match + 1)
261
262        # emit leading uncaptured markup elements
263        for markup in self.tu_content[0:start]:
264            if markup["type"] != "comment":
265                self.emit_attribute_translation_units(markup)
266                self.filesrc += markup["html_content"]
267
268        # emit captured markup elements
269        if start < end:
270            html_content = []
271            for markup in self.tu_content[start:end]:
272                if markup["type"] != "comment":
273                    if "untranslated_html" in markup:
274                        html_content.append(markup["untranslated_html"])
275                    else:
276                        html_content.append(markup["html_content"])
277            html_content = "".join(html_content)
278            if do_normalize:
279                normalized_content = self.WHITESPACE_RE.sub(" ", html_content.strip())
280            else:
281                normalized_content = html_content.strip()
282            assert normalized_content  # shouldn't be here otherwise
283
284            unit = self.addsourceunit(normalized_content)
285            unit.addlocation(self.tu_location)
286            comments = [
287                markup["note"]
288                for markup in self.tu_content
289                if markup["type"] == "comment"
290            ]
291            if comments:
292                unit.addnote("\n".join(comments))
293
294            html_content = (
295                self.get_leading_whitespace(html_content)
296                + self.callback(normalized_content)
297                + self.get_trailing_whitespace(html_content)
298            )
299            self.filesrc += html_content
300
301        # emit trailing uncaptured markup elements
302        for markup in self.tu_content[end : len(self.tu_content)]:
303            if markup["type"] != "comment":
304                self.emit_attribute_translation_units(markup)
305                self.filesrc += markup["html_content"]
306
307    @staticmethod
308    def has_translatable_content(markup):
309        # processing instructions count as translatable content, because PHP
310        return markup["type"] in {"data", "pi"} and markup["html_content"].strip()
311
312    def extract_translatable_attributes(self, tag, attrs):
313        result = []
314        if tag == "meta":
315            tu = self.create_metadata_attribute_tu(attrs)
316            if tu:
317                result.append(tu)
318        else:
319            for attrname, attrvalue in attrs:
320                if (
321                    attrname in self.TRANSLATABLE_ATTRIBUTES
322                    and self.translatable_attribute_matches_tag(attrname, tag)
323                ):
324                    tu = self.create_attribute_tu(attrname, attrvalue)
325                    if tu:
326                        result.append(tu)
327        return result
328
329    def create_metadata_attribute_tu(self, attrs):
330        attrs_dict = dict(attrs)
331        name = attrs_dict["name"].lower() if "name" in attrs_dict else None
332        if name in self.TRANSLATABLE_METADATA and "content" in attrs_dict:
333            return self.create_attribute_tu("content", attrs_dict["content"])
334
335    def translatable_attribute_matches_tag(self, attrname, tag):
336        if attrname == "lang":
337            return tag == "html"
338        return True
339
340    def create_attribute_tu(self, attrname, attrvalue):
341        normalized_value = self.WHITESPACE_RE.sub(" ", attrvalue).strip()
342        if normalized_value:
343            return {
344                "html_content": normalized_value,
345                "location": "%s+%s:%d-%d"
346                % (
347                    self.filename,
348                    ".".join(self.tag_path) + "[" + attrname + "]",
349                    self.getpos()[0],
350                    self.getpos()[1] + 1,
351                ),
352            }
353
354    def emit_attribute_translation_units(self, markup):
355        if "attribute_tus" in markup:
356            for tu in markup["attribute_tus"]:
357                unit = self.addsourceunit(tu["html_content"])
358                unit.addlocation(tu["location"])
359
360    def translate_attributes(self, attrs):
361        result = []
362        for attrname, attrvalue in attrs:
363            if attrvalue:
364                normalized_value = self.WHITESPACE_RE.sub(" ", attrvalue).strip()
365                translated_value = self.callback(normalized_value)
366                if translated_value != normalized_value:
367                    result.append((attrname, translated_value))
368                    continue
369            result.append((attrname, attrvalue))
370        return result
371
372    def create_start_tag(self, tag, attrs=None, startend=False):
373        attr_strings = []
374        for attrname, attrvalue in attrs:
375            if attrvalue is None:
376                attr_strings.append(" " + attrname)
377            else:
378                attr_strings.append(f' {attrname}="{attrvalue}"')
379        return "<{}{}{}>".format(tag, "".join(attr_strings), " /" if startend else "")
380
381    def auto_close_empty_element(self):
382        if self.tag_path and self.tag_path[-1] in self.EMPTY_HTML_ELEMENTS:
383            self.tag_path.pop()
384
385    def get_leading_whitespace(self, str):
386        match = self.LEADING_WHITESPACE_RE.search(str)
387        return match.group(1) if match else ""
388
389    def get_trailing_whitespace(self, str):
390        match = self.TRAILING_WHITESPACE_RE.search(str)
391        return match.group(1) if match else ""
392
393    # From here on below, follows the methods of the HTMLParser
394
395    def handle_starttag(self, tag, attrs):
396        self.auto_close_empty_element()
397        self.tag_path.append(tag)
398
399        if tag in self.TRANSLATABLE_ELEMENTS:
400            self.begin_translation_unit()
401
402        translated_attrs = self.translate_attributes(attrs)
403        markup = {
404            "type": "starttag",
405            "tag": tag,
406            "html_content": self.create_start_tag(tag, translated_attrs),
407            "untranslated_html": self.create_start_tag(tag, attrs),
408            "attribute_tus": self.extract_translatable_attributes(tag, attrs),
409        }
410        self.append_markup(markup)
411
412    def handle_endtag(self, tag):
413        try:
414            popped = self.tag_path.pop()
415        except IndexError:
416            raise ParseError(
417                "Mismatched tags: no more tags: line %s" % self.getpos()[0]
418            )
419        if popped != tag and popped in self.EMPTY_HTML_ELEMENTS:
420            popped = self.tag_path.pop()
421        if popped != tag:
422            raise ParseError(
423                "Mismatched closing tag: "
424                "expected '%s' got '%s' at line %s" % (popped, tag, self.getpos()[0])
425            )
426
427        self.append_markup({"type": "endtag", "html_content": "</%s>" % tag})
428
429        if tag in self.TRANSLATABLE_ELEMENTS:
430            self.end_translation_unit()
431            if any(t in self.TRANSLATABLE_ELEMENTS for t in self.tag_path):
432                self.begin_translation_unit()
433
434    def handle_startendtag(self, tag, attrs):
435        self.auto_close_empty_element()
436        self.tag_path.append(tag)
437
438        if tag in self.TRANSLATABLE_ELEMENTS:
439            self.begin_translation_unit()
440
441        translated_attrs = self.translate_attributes(attrs)
442        markup = {
443            "type": "startendtag",
444            "html_content": self.create_start_tag(tag, translated_attrs, startend=True),
445            "untranslated_html": self.create_start_tag(tag, attrs, startend=True),
446            "attribute_tus": self.extract_translatable_attributes(tag, attrs),
447        }
448        self.append_markup(markup)
449
450        if tag in self.TRANSLATABLE_ELEMENTS:
451            self.end_translation_unit()
452            if any(t in self.TRANSLATABLE_ELEMENTS for t in self.tag_path):
453                self.begin_translation_unit()
454
455        self.tag_path.pop()
456
457    def handle_data(self, data):
458        self.auto_close_empty_element()
459        self.append_markup({"type": "data", "html_content": data})
460
461    def handle_charref(self, name):
462        """Handle entries in the form &#NNNN; e.g. &#8417;"""
463        if name.lower().startswith("x"):
464            self.handle_data(chr(int(name[1:], 16)))
465        else:
466            self.handle_data(chr(int(name)))
467
468    def handle_entityref(self, name):
469        """Handle named entities of the form &aaaa; e.g. &rsquo;"""
470        converted = html5.get(name + ";", None)
471        if name in ["gt", "lt", "amp"] or not converted:
472            self.handle_data("&%s;" % name)
473        else:
474            self.handle_data(converted)
475
476    def handle_comment(self, data):
477        self.auto_close_empty_element()
478        self.append_markup(
479            {"type": "comment", "html_content": "<!--%s-->" % data, "note": data}
480        )
481
482    def handle_decl(self, decl):
483        self.auto_close_empty_element()
484        self.append_markup({"type": "decl", "html_content": "<!%s>" % decl})
485
486    def handle_pi(self, data):
487        self.auto_close_empty_element()
488        self.append_markup({"type": "pi", "html_content": "<?%s?>" % data})
489
490    def unknown_decl(self, data):
491        self.auto_close_empty_element()
492        self.append_markup({"type": "cdecl", "html_content": "<![%s]>" % data})
493
494
class POHTMLParser(htmlfile):
    """Subclass of :class:`htmlfile` that adds nothing of its own."""
497