1#
2# Copyright (c), 2016-2020, SISSA (International School for Advanced Studies).
3# All rights reserved.
4# This file is distributed under the terms of the MIT License.
5# See the file 'LICENSE' in the root directory of the present
6# distribution, or http://opensource.org/licenses/MIT.
7#
8# @author Davide Brunato <brunato@sissa.it>
9#
10"""
11A unified setup module for ElementTree with a safe parser and helper functions.
12"""
13import sys
14import re
15from collections import namedtuple
16from typing import Any, MutableMapping, Optional, Union
17
18from .exceptions import XMLSchemaTypeError
19
# Matches prefixes of the form "ns<digits>" (e.g. "ns0", "ns12"), the format
# that ElementTree reserves for auto-generated prefixes. etree_tostring()
# skips prefixes matching this pattern when registering namespaces.
_REGEX_NS_PREFIX = re.compile(r'ns\d+$')
21
22###
23# Programmatic import of xml.etree.ElementTree
24#
25# In Python 3 the pure python implementation is overwritten by the C module API,
26# so use a programmatic re-import to obtain the pure Python module, necessary for
27# defining a safer XMLParser.
28#
# The block binds two module objects:
#   * ElementTree: the standard module, as any other code would import it
#     (C-optimized when the accelerator is available);
#   * PyElementTree: a separate copy of the module loaded while the
#     '_elementtree' accelerator is masked, i.e. the pure Python implementation.
# A None entry in sys.modules makes "import _elementtree" raise an ImportError,
# which forces the fallback to the pure Python classes. The sys.modules entries
# are always restored/cleaned in a "finally" clause, so a failure of the pure
# Python import cannot leave the accelerator masked for the rest of the process.
if '_elementtree' in sys.modules:
    if 'xml.etree.ElementTree' not in sys.modules:
        raise RuntimeError("Inconsistent status for ElementTree module: module "
                           "is missing but the C optimized version is imported.")

    # Bind the package name too: it's needed for restoring the attribute
    # 'xml.etree.ElementTree' after the import of the pure Python module.
    import xml.etree
    import xml.etree.ElementTree as ElementTree

    # Temporarily remove the loaded modules
    sys.modules.pop('xml.etree.ElementTree')
    _cmod = sys.modules.pop('_elementtree')

    try:
        # Load the pure Python module
        sys.modules['_elementtree'] = None  # type: ignore[assignment]
        import xml.etree.ElementTree as PyElementTree
    finally:
        # Restore original modules, also when the import above fails
        sys.modules['_elementtree'] = _cmod
        xml.etree.ElementTree = ElementTree
        sys.modules['xml.etree.ElementTree'] = ElementTree

else:
    try:
        # Load the pure Python module
        sys.modules['_elementtree'] = None  # type: ignore[assignment]
        import xml.etree.ElementTree as PyElementTree
    finally:
        # Remove the pure Python module and the placeholder from imported
        # modules. pop() with a default is used because, if the import above
        # fails, some of these entries may be missing.
        sys.modules.pop('xml.etree', None)
        sys.modules.pop('xml.etree.ElementTree', None)
        sys.modules.pop('_elementtree', None)

    # Load the C optimized ElementTree module
    import xml.etree.ElementTree as ElementTree
62
63
# Shortcuts for the classes used across the package: the Element class and the
# ParseError exception of the standard ElementTree module, and the Element
# class of the pure Python module (PyElementTree).
etree_element = ElementTree.Element
ParseError = ElementTree.ParseError
py_etree_element = PyElementTree.Element
67
68
class SafeXMLParser(PyElementTree.XMLParser):
    """
    A pure Python XMLParser that refuses to process entities: entity
    declarations, unparsed entity declarations and external entity references
    all raise a `ParseError`. The *html* argument, deprecated since
    version 3.4, is not accepted.

    :param target: the target object called by the `feed()` method of the \
    parser, that defaults to `TreeBuilder`.
    :param encoding: if provided, its value overrides the encoding specified \
    in the XML file.
    """
    def __init__(self, target: Optional[Any] = None, encoding: Optional[str] = None) -> None:
        super().__init__(target=target, encoding=encoding)

        # Install the blocking handlers on the low-level parser created
        # by the base class.
        low_level_parser = self.parser
        low_level_parser.EntityDeclHandler = self.entity_declaration
        low_level_parser.UnparsedEntityDeclHandler = self.unparsed_entity_declaration
        low_level_parser.ExternalEntityRefHandler = self.external_entity_reference

    def entity_declaration(self, entity_name, is_parameter_entity, value, base,  # type: ignore
                           system_id, public_id, notation_name):
        """Handler for entity declarations: always raises a ParseError."""
        message = "Entities are forbidden (entity_name={!r})".format(entity_name)
        raise PyElementTree.ParseError(message)

    def unparsed_entity_declaration(self, entity_name, base, system_id,  # type: ignore
                                    public_id, notation_name):
        """Handler for unparsed entity declarations: always raises a ParseError."""
        message = "Unparsed entities are forbidden (entity_name={!r})".format(entity_name)
        raise PyElementTree.ParseError(message)

    def external_entity_reference(self, context, base, system_id, public_id):  # type: ignore
        """Handler for external entity references: always raises a ParseError."""
        # pragma: no cover (EntityDeclHandler is called before)
        message = (
            "External references are forbidden (system_id={!r}, "
            "public_id={!r})".format(system_id, public_id)
        )
        raise PyElementTree.ParseError(message)  # pragma: no cover
102
103
# Record type with four fields: tag, text, content and attributes. The string
# literal that follows the assignment is an attribute docstring: it has no
# effect at runtime and it is not stored in ``ElementData.__doc__``.
ElementData = namedtuple('ElementData', ['tag', 'text', 'content', 'attributes'])
"""
Namedtuple for Element data interchange between decoders and converters.
The field *tag* is a string containing the Element's tag, *text* can be `None`
or a string representing the Element's text, *content* can be `None`, a list
containing the Element's children or a dictionary containing element name to
list of element contents for the Element's children (used for unordered input
data), *attributes* can be `None` or a dictionary containing the Element's
attributes.
"""
114
115
def is_etree_element(obj: Any) -> bool:
    """
    Return `True` if *obj* exposes the `append`, `tag` and `attrib` attributes
    of ElementTree elements. The check is done only by attribute presence and
    is intended to exclude XsdElement objects.
    """
    required_attributes = ('append', 'tag', 'attrib')
    return all(hasattr(obj, name) for name in required_attributes)
119
120
def etree_tostring(elem: etree_element,
                   namespaces: Optional[MutableMapping[str, str]] = None,
                   indent: str = '',
                   max_lines: Optional[int] = None,
                   spaces_for_tab: Optional[int] = None,
                   xml_declaration: Optional[bool] = None,
                   encoding: str = 'unicode',
                   method: str = 'xml') -> Union[str, bytes]:
    """
    Serialize an Element tree to a string. Tab characters are replaced by whitespaces.

    :param elem: the Element instance.
    :param namespaces: is an optional mapping from namespace prefix to URI. \
    Provided namespaces are registered before serialization, except for the \
    prefixes of the form "ns<digits>". The default namespace (empty prefix) \
    is registered only for non-lxml elements and only if its URI is not \
    already mapped by another registered prefix.
    :param indent: the base line indentation.
    :param max_lines: if provided, truncates the serialization after a number \
    of lines (default: do not truncate).
    :param spaces_for_tab: number of spaces for replacing tab characters. \
    For default tabs are replaced with 4 spaces, but only if not empty \
    indentation or a max lines limit are provided.
    :param xml_declaration: if set to `True` inserts the XML declaration at the \
    head (not for "unicode" encoding), if set to `False` removes a declaration \
    produced by the serializer.
    :param encoding: if "unicode" (the default) the output is a string, otherwise it's binary.
    :param method: is either "xml" (the default), "html" or "text".
    :return: a Unicode string if *encoding* is "unicode", otherwise a bytes \
    string encoded with the provided encoding.
    :raises XMLSchemaTypeError: if *elem* is not an ElementTree-like element.
    """
    def leading_spaces(line: str) -> int:
        # Counts only the space characters (U+0020) at the start of the line
        return len(line) - len(line.lstrip(' '))

    def reindent(line: str) -> str:
        if not line:
            return line
        elif line.startswith(min_indent):
            # Shift the line: cut the extra indentation or add the missing part
            return line[start:] if start >= 0 else indent[start:] + line
        else:
            return indent + line

    etree_module: Any
    if not is_etree_element(elem):
        raise XMLSchemaTypeError("{!r} is not an Element".format(elem))

    elif isinstance(elem, py_etree_element):
        etree_module = PyElementTree
    elif not hasattr(elem, 'nsmap'):
        etree_module = ElementTree
    else:
        # Only lxml elements have the 'nsmap' attribute
        import lxml.etree as etree_module  # type: ignore[no-redef]

    if namespaces:
        default_namespace = namespaces.get('')
        for prefix, uri in namespaces.items():
            if prefix and not _REGEX_NS_PREFIX.match(prefix):
                etree_module.register_namespace(prefix, uri)
                if uri == default_namespace:
                    # Already mapped by a prefix: skip the default registration
                    default_namespace = None

        if default_namespace and not hasattr(elem, 'nsmap'):
            etree_module.register_namespace('', default_namespace)

    xml_text = etree_module.tostring(elem, encoding=encoding, method=method)
    if isinstance(xml_text, bytes):
        # Decode with the same encoding used by the serializer: a fixed
        # UTF-8 decoding fails or corrupts the text for other encodings.
        xml_text = xml_text.decode(encoding)

    if spaces_for_tab:
        xml_text = xml_text.replace('\t', ' ' * spaces_for_tab)
    elif method != 'text' and (indent or max_lines):
        xml_text = xml_text.replace('\t', ' ' * 4)

    if xml_text.startswith('<?xml '):
        if xml_declaration is False:
            lines = xml_text.splitlines()[1:]
        else:
            lines = xml_text.splitlines()
    elif xml_declaration and encoding.lower() != 'unicode':
        lines = ['<?xml version="1.0" encoding="{}"?>'.format(encoding)]
        lines.extend(xml_text.splitlines())
    else:
        lines = xml_text.splitlines()

    # Clear ending empty lines
    while lines and not lines[-1].strip():
        lines.pop(-1)

    if not lines or method == 'text' or (not indent and not max_lines):
        # Nothing to reindent or to truncate
        text = '\n'.join(lines)
        return text if encoding == 'unicode' else text.encode(encoding)

    # The last line is never blank here, the intermediate lines could be:
    # blank lines are ignored for computing the indentation of the children.
    last_indent = ' ' * leading_spaces(lines[-1])
    inner_indents = [leading_spaces(line) for line in lines[1:-1] if line.strip(' ')]
    if inner_indents:
        child_indent = ' ' * min(inner_indents)
        min_indent = min(child_indent, last_indent)
    else:
        min_indent = child_indent = last_indent

    # Positive: chars to remove from each line, negative: chars of indent to add
    start = len(min_indent) - len(indent)

    if max_lines is not None and len(lines) > max_lines + 2:
        lines = lines[:max_lines] + [child_indent + '...'] * 2 + lines[-1:]

    text = '\n'.join(reindent(line) for line in lines)
    return text if encoding == 'unicode' else text.encode(encoding)
222
223
# Names exported by a star-import of this module.
__all__ = ['ElementTree', 'PyElementTree', 'ParseError', 'SafeXMLParser', 'etree_element',
           'py_etree_element', 'ElementData', 'is_etree_element', 'etree_tostring']
226