#
# Copyright (c), 2016-2020, SISSA (International School for Advanced Studies).
# All rights reserved.
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# @author Davide Brunato <brunato@sissa.it>
#
"""
A unified setup module for ElementTree with a safe parser and helper functions.
"""
import sys
import re
from collections import namedtuple
from typing import Any, MutableMapping, Optional, Union

from .exceptions import XMLSchemaTypeError

# Matches auto-generated namespace prefixes (ns0, ns1, ...), which are never
# registered explicitly by etree_tostring().
_REGEX_NS_PREFIX = re.compile(r'ns\d+$')

###
# Programmatic import of xml.etree.ElementTree
#
# In Python 3 the pure python implementation is overwritten by the C module API,
# so use a programmatic re-import to obtain the pure Python module, necessary for
# defining a safer XMLParser.
#
# NOTE: the statements below are order-sensitive, because they temporarily
# manipulate sys.modules. Do not reorder them.
#
if '_elementtree' in sys.modules:
    if 'xml.etree.ElementTree' not in sys.modules:
        raise RuntimeError("Inconsistent status for ElementTree module: module "
                           "is missing but the C optimized version is imported.")

    import xml.etree.ElementTree as ElementTree

    # Temporarily remove the loaded modules
    sys.modules.pop('xml.etree.ElementTree')
    _cmod = sys.modules.pop('_elementtree')

    # Load the pure Python module: a None entry in sys.modules makes the
    # import of the C accelerator fail, so the Python code is kept.
    sys.modules['_elementtree'] = None  # type: ignore[assignment]
    import xml.etree.ElementTree as PyElementTree
    import xml.etree

    # Restore original modules
    sys.modules['_elementtree'] = _cmod
    xml.etree.ElementTree = ElementTree
    sys.modules['xml.etree.ElementTree'] = ElementTree

else:
    # Load the pure Python module
    sys.modules['_elementtree'] = None  # type: ignore[assignment]
    import xml.etree.ElementTree as PyElementTree

    # Remove the pure Python module from imported modules
    del sys.modules['xml.etree']
    del sys.modules['xml.etree.ElementTree']
    del sys.modules['_elementtree']

    # Load the C optimized ElementTree module
    import xml.etree.ElementTree as ElementTree


etree_element = ElementTree.Element
ParseError = ElementTree.ParseError
py_etree_element = PyElementTree.Element


class SafeXMLParser(PyElementTree.XMLParser):
    """
    An XMLParser that forbids entities processing. Drops the *html* argument
    that is deprecated since version 3.4.

    :param target: the target object called by the `feed()` method of the \
    parser, that defaults to `TreeBuilder`.
    :param encoding: if provided, its value overrides the encoding specified \
    in the XML file.
    """
    def __init__(self, target: Optional[Any] = None, encoding: Optional[str] = None) -> None:
        super().__init__(target=target, encoding=encoding)
        # Install handlers on the underlying parser object so that any entity
        # declaration or external reference aborts the parsing with an error.
        self.parser.EntityDeclHandler = self.entity_declaration
        self.parser.UnparsedEntityDeclHandler = self.unparsed_entity_declaration
        self.parser.ExternalEntityRefHandler = self.external_entity_reference

    def entity_declaration(self, entity_name, is_parameter_entity, value, base,  # type: ignore
                           system_id, public_id, notation_name):
        """Reject any entity declaration by raising a ParseError."""
        raise PyElementTree.ParseError(
            "Entities are forbidden (entity_name={!r})".format(entity_name)
        )

    def unparsed_entity_declaration(self, entity_name, base, system_id,  # type: ignore
                                    public_id, notation_name):
        """Reject any unparsed entity declaration by raising a ParseError."""
        raise PyElementTree.ParseError(
            "Unparsed entities are forbidden (entity_name={!r})".format(entity_name)
        )

    def external_entity_reference(self, context, base, system_id, public_id):  # type: ignore
        """Reject any external entity reference by raising a ParseError."""
        raise PyElementTree.ParseError(
            "External references are forbidden (system_id={!r}, "
            "public_id={!r})".format(system_id, public_id)
        )  # pragma: no cover (EntityDeclHandler is called before)


ElementData = namedtuple('ElementData', ['tag', 'text', 'content', 'attributes'])
"""
Namedtuple for Element data interchange between decoders and converters.
The field *tag* is a string containing the Element's tag, *text* can be `None`
or a string representing the Element's text, *content* can be `None`, a list
containing the Element's children or a dictionary containing element name to
list of element contents for the Element's children (used for unordered input
data), *attributes* can be `None` or a dictionary containing the Element's
attributes.
"""


def is_etree_element(obj: Any) -> bool:
    """A checker for valid ElementTree elements that excludes XsdElement objects."""
    return hasattr(obj, 'append') and hasattr(obj, 'tag') and hasattr(obj, 'attrib')


def etree_tostring(elem: etree_element,
                   namespaces: Optional[MutableMapping[str, str]] = None,
                   indent: str = '',
                   max_lines: Optional[int] = None,
                   spaces_for_tab: Optional[int] = None,
                   xml_declaration: Optional[bool] = None,
                   encoding: str = 'unicode',
                   method: str = 'xml') -> Union[str, bytes]:
    """
    Serialize an Element tree to a string. Tab characters are replaced by whitespaces.

    :param elem: the Element instance.
    :param namespaces: is an optional mapping from namespace prefix to URI. \
    Provided namespaces are registered before serialization, except for \
    prefixes of the form *nsN*.
    :param indent: the base line indentation.
    :param max_lines: if provided, the serialization is truncated after this \
    number of lines (default: do not truncate).
    :param spaces_for_tab: number of spaces used for replacing each tab character. \
    If not provided (or zero) tabs are replaced with 4 spaces, but only if the \
    method is not "text" and an indentation or a max lines limit is provided.
    :param xml_declaration: if set to `True` inserts the XML declaration at the \
    head (only for encodings other than "unicode"), if set to `False` removes \
    the declaration produced by the serializer.
    :param encoding: if "unicode" (the default) the output is a string, \
    otherwise it's binary data encoded with the provided encoding.
    :param method: is either "xml" (the default), "html" or "text".
    :return: a Unicode string if *encoding* is "unicode", otherwise bytes.
    :raises XMLSchemaTypeError: if *elem* is not an ElementTree-like element.
    """
    def indent_width(line: str) -> int:
        # Number of leading space characters of the line.
        return len(line) - len(line.lstrip(' '))

    def reindent(line: str) -> str:
        # Shift a line so that the minimal indentation becomes *indent*.
        if not line:
            return line
        elif line.startswith(min_indent):
            return line[start:] if start >= 0 else indent[start:] + line
        else:
            return indent + line

    etree_module: Any
    if not is_etree_element(elem):
        raise XMLSchemaTypeError("{!r} is not an Element".format(elem))

    elif isinstance(elem, py_etree_element):
        etree_module = PyElementTree
    elif not hasattr(elem, 'nsmap'):
        etree_module = ElementTree
    else:
        # Elements with an 'nsmap' attribute are treated as lxml elements
        import lxml.etree as etree_module  # type: ignore[no-redef]

    if namespaces:
        default_namespace = namespaces.get('')
        for prefix, uri in namespaces.items():
            if prefix and not _REGEX_NS_PREFIX.match(prefix):
                etree_module.register_namespace(prefix, uri)
                if uri == default_namespace:
                    # Already mapped to an explicit prefix: do not register
                    # it again as the default namespace.
                    default_namespace = None

        if default_namespace and not hasattr(elem, 'nsmap'):
            etree_module.register_namespace('', default_namespace)

    xml_text = etree_module.tostring(elem, encoding=encoding, method=method)
    if isinstance(xml_text, bytes):
        # The serializer produced the bytes using *encoding*, so the same
        # codec has to be used for decoding them (a fixed 'utf-8' fails or
        # garbles non-ASCII data for encodings like 'iso-8859-1' or 'utf-16').
        xml_text = xml_text.decode(encoding)

    if spaces_for_tab:
        xml_text = xml_text.replace('\t', ' ' * spaces_for_tab)
    elif method != 'text' and (indent or max_lines):
        xml_text = xml_text.replace('\t', ' ' * 4)

    if xml_text.startswith('<?xml '):
        if xml_declaration is False:
            lines = xml_text.splitlines()[1:]
        else:
            lines = xml_text.splitlines()
    elif xml_declaration and encoding.lower() != 'unicode':
        lines = ['<?xml version="1.0" encoding="{}"?>'.format(encoding)]
        lines.extend(xml_text.splitlines())
    else:
        lines = xml_text.splitlines()

    # Clear ending empty lines
    while lines and not lines[-1].strip():
        lines.pop()

    if not lines or method == 'text' or (not indent and not max_lines):
        if encoding == 'unicode':
            return '\n'.join(lines)
        return '\n'.join(lines).encode(encoding)

    # The last line is not blank here, so its indentation is always defined.
    last_indent = ' ' * indent_width(lines[-1])

    # Indentation of the inner lines, skipping empty and spaces-only lines
    # that cannot contribute to the measure.
    inner_widths = [indent_width(line) for line in lines[1:-1] if line.strip(' ')]
    if inner_widths:
        child_indent = ' ' * min(inner_widths)
        min_indent = min(child_indent, last_indent)
    else:
        # No measurable inner line (at most two lines, or only blank lines
        # in between): fall back to the indentation of the last line instead
        # of failing with a ValueError on an empty min().
        min_indent = child_indent = last_indent

    start = len(min_indent) - len(indent)

    if max_lines is not None and len(lines) > max_lines + 2:
        lines = lines[:max_lines] + [child_indent + '...'] * 2 + lines[-1:]

    result = '\n'.join(reindent(line) for line in lines)
    if encoding == 'unicode':
        return result
    return result.encode(encoding)


__all__ = ['ElementTree', 'PyElementTree', 'ParseError', 'SafeXMLParser', 'etree_element',
           'py_etree_element', 'ElementData', 'is_etree_element', 'etree_tostring']