# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
Contains a class that makes it simple to stream out well-formed and
nicely-indented XML.
"""

# STDLIB
import contextlib
import textwrap

try:
    from . import _iterparser
except ImportError:
    # Pure-Python fallbacks, used when the optional ``_iterparser``
    # module cannot be imported.

    def xml_escape_cdata(s):
        """
        Escapes &, < and > in an XML CDATA string.
        """
        # "&" must be replaced first, otherwise the ampersands introduced
        # by the other replacements would be escaped a second time.
        s = s.replace("&", "&amp;")
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")
        return s

    def xml_escape(s):
        """
        Escapes &, ', ", < and > in an XML attribute value.
        """
        # "&" must be replaced first (see `xml_escape_cdata`).
        s = s.replace("&", "&amp;")
        s = s.replace("'", "&apos;")
        s = s.replace("\"", "&quot;")
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")
        return s
else:
    xml_escape_cdata = _iterparser.escape_xml_cdata
    xml_escape = _iterparser.escape_xml


class XMLWriter:
    """
    A class to write well-formed and nicely indented XML.

    Use like this::

        w = XMLWriter(fh)
        with w.tag('html'):
            with w.tag('body'):
                w.data('This is the content')

    Which produces (modulo the exact placement of whitespace)::

        <html>
         <body>
          This is the content
         </body>
        </html>
    """

    def __init__(self, file):
        """
        Parameters
        ----------
        file : writable file-like
            Only ``file.write`` is required; ``file.flush`` is used
            when it exists.
        """
        self.write = file.write
        if hasattr(file, "flush"):
            # Shadows the no-op `flush` method defined on the class.
            self.flush = file.flush
        self._open = 0  # true if start tag is open
        self._tags = []  # stack of currently open element names
        self._data = []  # character data buffered until the next flush
        # One space per nesting level; indentation strings are slices of
        # this buffer, so indentation stops growing beyond 64 levels.
        self._indentation = " " * 64

        self.xml_escape_cdata = xml_escape_cdata
        self.xml_escape = xml_escape

    def _flush(self, indent=True, wrap=False):
        """
        Flush internal buffers.

        Finishes a pending start tag (writing ``>``) and writes any
        buffered character data, escaped with ``self.xml_escape_cdata``.

        Parameters
        ----------
        indent : bool
            If true, a newline is written after the ``>`` that finishes
            a pending start tag.

        wrap : bool
            If true, the buffered data is re-flowed with
            `textwrap.fill`, indented one level deeper than the current
            level, and written on its own line(s).
        """
        if self._open:
            self.write(">\n" if indent else ">")
            self._open = 0
        if self._data:
            data = ''.join(self._data)
            if wrap:
                prefix = self.get_indentation_spaces(1)
                data = textwrap.fill(
                    data,
                    initial_indent=prefix,
                    subsequent_indent=prefix)
                self.write('\n')
                self.write(self.xml_escape_cdata(data))
                self.write('\n')
                self.write(self.get_indentation_spaces())
            else:
                self.write(self.xml_escape_cdata(data))
            self._data = []

    # NOTE: the ``attrib={}`` defaults below are never mutated (`start`
    # copies the mapping before updating it), so sharing them is safe.
    def start(self, tag, attrib={}, **extra):
        """
        Opens a new element.  Attributes can be given as keyword
        arguments, or as a string/string dictionary.  The method
        returns an identifier that can be passed to the :meth:`close`
        method.

        Parameters
        ----------
        tag : str
            The element name.  It is written verbatim (not escaped).

        attrib : dict of str -> str
            Attribute dictionary.  Alternatively, attributes can
            be given as keyword arguments.  Attributes are written
            sorted by name; those whose value is `None` are skipped.
            Names are written verbatim, values are escaped.

        Returns
        -------
        id : int
            The nesting depth after this element has been opened.
        """
        self._flush()
        # This is just busy work -- we know our tag names are clean
        # tag = xml_escape_cdata(tag)
        self._data = []
        self._tags.append(tag)
        # The tag is already on the stack, hence the -1 offset.
        self.write(self.get_indentation_spaces(-1))
        self.write(f"<{tag}")
        if attrib or extra:
            attrib = attrib.copy()
            attrib.update(extra)
            for k, v in sorted(attrib.items()):
                if v is not None:
                    # This is just busy work -- we know our keys are clean
                    # k = xml_escape_cdata(k)
                    v = self.xml_escape(v)
                    self.write(f' {k}="{v}"')
        self._open = 1

        return len(self._tags)

    @contextlib.contextmanager
    def xml_cleaning_method(self, method='escape_xml', **clean_kwargs):
        """Context manager to control how XML data tags are cleaned (escaped) to
        remove potentially unsafe characters or constructs.

        The default (``method='escape_xml'``) applies brute-force escaping of
        certain key XML characters like ``<``, ``>``, and ``&`` to ensure that
        the data is not interpreted as XML markup.  With this method the
        escaping function that is currently active is left in place.

        In order to explicitly allow certain XML tags (e.g. link reference or
        emphasis tags), use ``method='bleach_clean'``.  This sanitizes the data
        string using the ``clean`` function of the
        `bleach <https://bleach.readthedocs.io/en/latest/clean.html>`_ package.
        Any additional keyword arguments will be passed directly to the
        ``clean`` function.

        Finally, use ``method='none'`` to disable any sanitization. This should
        be used sparingly.

        The previously active escaping function is restored when the
        ``with`` block exits, including when it exits with an exception.

        Example::

          w = writer.XMLWriter(ListWriter(lines))
          with w.xml_cleaning_method('bleach_clean'):
              w.start('td')
              w.data('<a href="https://google.com">google.com</a>')
              w.end()

        Parameters
        ----------
        method : str
            Cleaning method.  Allowed values are "escape_xml",
            "bleach_clean", and "none".

        **clean_kwargs : keyword args
            Additional keyword args that are passed to the
            bleach.clean() function.

        Raises
        ------
        ValueError
            If ``method`` is not one of the allowed values, or if
            ``method='bleach_clean'`` and bleach cannot be imported.
        """
        previous_escape_cdata = self.xml_escape_cdata

        if method == 'bleach_clean':
            # NOTE: bleach is imported locally to avoid importing it when
            # it is not necessary
            try:
                import bleach
            except ImportError as err:
                raise ValueError('bleach package is required when HTML escaping is disabled.\n'
                                 'Use "pip install bleach".') from err

            def escape_cdata(x):
                return bleach.clean(x, **clean_kwargs)
        elif method == "none":
            def escape_cdata(x):
                return x
        elif method == 'escape_xml':
            escape_cdata = previous_escape_cdata
        else:
            raise ValueError('allowed values of method are "escape_xml", "bleach_clean", and "none"')

        self.xml_escape_cdata = escape_cdata
        try:
            yield
        finally:
            # Restore even if the body raised, so that a failure inside the
            # block cannot leave sanitization permanently changed.
            self.xml_escape_cdata = previous_escape_cdata

    @contextlib.contextmanager
    def tag(self, tag, attrib={}, **extra):
        """
        A convenience method for creating wrapper elements using the
        ``with`` statement.

        The closing tag is only written when the ``with`` body finishes
        without raising.

        Examples
        --------

        >>> with writer.tag('foo'):  # doctest: +SKIP
        ...     writer.element('bar')
        ... # </foo> is implicitly closed here
        ...

        Parameters are the same as to `start`.
        """
        self.start(tag, attrib, **extra)
        yield
        self.end(tag)

    def comment(self, comment):
        """
        Adds a comment to the output stream.

        The text is passed through ``self.xml_escape_cdata`` before it
        is written.

        Parameters
        ----------
        comment : str
            Comment text, as a Unicode string.
        """
        self._flush()
        self.write(self.get_indentation_spaces())
        self.write(f"<!-- {self.xml_escape_cdata(comment)} -->\n")

    def data(self, text):
        """
        Adds character data to the output stream.

        The text is only buffered here; it is escaped and written the
        next time the internal buffers are flushed.

        Parameters
        ----------
        text : str
            Character data, as a Unicode string.
        """
        self._data.append(text)

    def end(self, tag=None, indent=True, wrap=False):
        """
        Closes the current element (opened by the most recent call to
        `start`).

        An element that has no content at all is written in the
        self-closing form ``<tag/>``.

        Parameters
        ----------
        tag : str
            Element name.  If given, the tag must match the start tag.
            If omitted, the current element is closed.

        indent : bool
            If true, the closing tag is preceded by the indentation of
            the current level.

        wrap : bool
            If true, buffered character data is re-flowed with
            `textwrap.fill` before being written.

        Raises
        ------
        ValueError
            If no element is open, or if ``tag`` does not match the
            innermost open element.
        """
        if tag:
            if not self._tags:
                raise ValueError(f"unbalanced end({tag})")
            if tag != self._tags[-1]:
                raise ValueError(f"expected end({self._tags[-1]}), got {tag}")
        else:
            if not self._tags:
                raise ValueError("unbalanced end()")
        tag = self._tags.pop()
        if self._data:
            self._flush(indent, wrap)
        elif self._open:
            # Nothing was written since the start tag: self-close it.
            self._open = 0
            self.write("/>\n")
            return
        if indent:
            self.write(self.get_indentation_spaces())
        self.write(f"</{tag}>\n")

    def close(self, id):
        """
        Closes open elements until the nesting depth is back to ``id``.

        NOTE: because `start` returns the depth *after* opening its
        element, passing that value here closes every element opened
        after it but leaves the identified element itself open.

        Parameters
        ----------
        id : int
            Element identifier, as returned by the `start` method.
        """
        while len(self._tags) > id:
            self.end()

    def element(self, tag, text=None, wrap=False, attrib={}, **extra):
        """
        Adds an entire element.  This is the same as calling `start`,
        `data`, and `end` in sequence.  The ``text`` argument
        can be omitted, in which case (or if it is otherwise falsy) a
        self-closing element is written.
        """
        self.start(tag, attrib, **extra)
        if text:
            self.data(text)
        self.end(indent=False, wrap=wrap)

    def flush(self):
        pass  # replaced by the constructor

    def get_indentation(self):
        """
        Returns the number of indentation levels the file is currently
        in.
        """
        return len(self._tags)

    def get_indentation_spaces(self, offset=0):
        """
        Returns a string of spaces that matches the current
        indentation level.

        Parameters
        ----------
        offset : int
            Number of levels to add to (or, if negative, subtract from)
            the current level.

        Returns
        -------
        spaces : str
            One space per level.  The result is empty when the adjusted
            level is zero or negative, and is at most 64 spaces long.
        """
        # Clamp at zero: a negative slice end would count from the end of
        # the buffer and return 63 spaces rather than none.
        width = max(len(self._tags) + offset, 0)
        return self._indentation[:width]

    @staticmethod
    def object_attrs(obj, attrs):
        """
        Converts an object with a bunch of attributes on an object
        into a dictionary for use by the `XMLWriter`.

        Parameters
        ----------
        obj : object
            Any Python object

        attrs : sequence of str
            Attribute names to pull from the object

        Returns
        -------
        attrs : dict
            Maps attribute names (with ``_`` replaced by ``-``) to the
            `str` of the values retrieved from ``obj.attr``.  If any of
            the attributes is `None`, it will not appear in the output
            dictionary.
        """
        d = {}
        for attr in attrs:
            value = getattr(obj, attr)
            if value is not None:
                d[attr.replace('_', '-')] = str(value)
        return d