1# Licensed under a 3-clause BSD style license - see LICENSE.rst
2"""
3Contains a class that makes it simple to stream out well-formed and
4nicely-indented XML.
5"""
6
7# STDLIB
8import contextlib
9import textwrap
10
try:
    from . import _iterparser
except ImportError:
    # ``_iterparser`` could not be imported, so provide pure-Python
    # stand-ins for the two escaping helpers.

    def xml_escape_cdata(s):
        """
        Return ``s`` with ``&``, ``<`` and ``>`` replaced by their XML
        entities, for use as character data.
        """
        # "&" is handled first so that the ampersands introduced by the
        # later entities are not themselves escaped again.
        for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            s = s.replace(char, entity)
        return s

    def xml_escape(s):
        """
        Return ``s`` with ``&``, ``'``, ``"``, ``<`` and ``>`` replaced by
        their XML entities, for use as an attribute value.
        """
        # Same ordering constraint as above: "&" must be replaced first.
        substitutions = (
            ("&", "&amp;"),
            ("'", "&apos;"),
            ("\"", "&quot;"),
            ("<", "&lt;"),
            (">", "&gt;"),
        )
        for char, entity in substitutions:
            s = s.replace(char, entity)
        return s
else:
    # Import succeeded: expose the helpers supplied by ``_iterparser``.
    xml_escape = _iterparser.escape_xml
    xml_escape_cdata = _iterparser.escape_xml_cdata
36
37
class XMLWriter:
    """
    A class to write well-formed and nicely indented XML.

    Use like this::

        w = XMLWriter(fh)
        with w.tag('html'):
            with w.tag('body'):
                w.data('This is the content')

    Which produces::

        <html>
         <body>
        This is the content </body>
        </html>

    Start and end tags are indented by one space per nesting level.
    Character data is written exactly as given unless the element is
    closed with ``wrap=True`` (see `end` and `element`), in which case it
    is re-flowed with `textwrap.fill` and indented one level deeper than
    its enclosing tag.
    """

    def __init__(self, file):
        """
        Parameters
        ----------
        file : writable file-like
            Object providing a ``write`` method.  If it also has a
            ``flush`` method, that method replaces `flush` on this
            instance.
        """
        self.write = file.write
        if hasattr(file, "flush"):
            self.flush = file.flush
        self._open = False  # True while a start tag still lacks its ">"
        self._tags = []  # stack of the names of the currently open elements
        self._data = []  # character data buffered until the next flush
        self._indentation = " " * 64  # cache of spaces, grown on demand

        # Per-instance escaping hooks; ``xml_cleaning_method`` temporarily
        # swaps the character-data one.
        self.xml_escape_cdata = xml_escape_cdata
        self.xml_escape = xml_escape

    def _flush(self, indent=True, wrap=False):
        """
        Flush internal buffers.

        Terminates a pending start tag, if any, and writes out buffered
        character data after passing it through ``self.xml_escape_cdata``.

        Parameters
        ----------
        indent : bool
            If true, a pending start tag is terminated with ``">\\n"``,
            otherwise with ``">"``.

        wrap : bool
            If true, buffered data is re-flowed with `textwrap.fill`,
            indented one level deeper than the current depth, placed on
            its own line(s), and followed by the current indentation.
        """
        if self._open:
            self.write(">\n" if indent else ">")
            self._open = False
        if self._data:
            data = ''.join(self._data)
            if wrap:
                pad = self.get_indentation_spaces(1)
                data = textwrap.fill(
                    data,
                    initial_indent=pad,
                    subsequent_indent=pad)
                # The individual writes are kept separate on purpose: the
                # sink may record each ``write`` call as its own item.
                self.write('\n')
                self.write(self.xml_escape_cdata(data))
                self.write('\n')
                self.write(self.get_indentation_spaces())
            else:
                self.write(self.xml_escape_cdata(data))
            self._data = []

    def start(self, tag, attrib=None, **extra):
        """
        Opens a new element.  Attributes can be given as keyword
        arguments, or as a string/string dictionary.  The method
        returns an opaque identifier that can be passed to the
        :meth:`close` method, to close all open elements up to and
        including this one.

        Parameters
        ----------
        tag : str
            The element name.  It is written as-is, without escaping.

        attrib : dict of str -> str, optional
            Attribute dictionary.  Alternatively, attributes can
            be given as keyword arguments.  Keyword arguments override
            entries of ``attrib`` with the same name.  Attributes are
            written sorted by name; attributes whose value is `None` are
            omitted.  Values are escaped with ``self.xml_escape``; names
            are written as-is.

        Returns
        -------
        id : int
            Returns an element identifier (the number of open elements
            once this one has been opened).
        """
        self._flush()
        # This is just busy work -- we know our tag names are clean
        # tag = xml_escape_cdata(tag)
        self._data = []
        self._tags.append(tag)
        # The new tag is already on the stack, hence the -1 to indent it
        # at the depth of its parent's content.
        self.write(self.get_indentation_spaces(-1))
        self.write(f"<{tag}")
        if attrib or extra:
            # Work on a copy so the caller's mapping is never modified.
            merged = dict(attrib) if attrib else {}
            merged.update(extra)
            for k, v in sorted(merged.items()):
                if v is not None:
                    # This is just busy work -- we know our keys are clean
                    # k = xml_escape_cdata(k)
                    v = self.xml_escape(v)
                    self.write(f" {k}=\"{v}\"")
        self._open = True

        return len(self._tags)

    @contextlib.contextmanager
    def xml_cleaning_method(self, method='escape_xml', **clean_kwargs):
        """Context manager to control how XML data tags are cleaned (escaped) to
        remove potentially unsafe characters or constructs.

        The default (``method='escape_xml'``) applies brute-force escaping of
        certain key XML characters like ``<``, ``>``, and ``&`` to ensure that
        the data is not interpreted as XML markup.  Note that this method
        leaves the writer's current escaping function in place rather than
        installing a new one.

        In order to explicitly allow certain XML tags (e.g. link reference or
        emphasis tags), use ``method='bleach_clean'``.  This sanitizes the data
        string using the ``clean`` function of the
        `bleach <https://bleach.readthedocs.io/en/latest/clean.html>`_ package.
        Any additional keyword arguments will be passed directly to the
        ``clean`` function.

        Finally, use ``method='none'`` to disable any sanitization. This should
        be used sparingly.

        The escaping function that was active on entry is restored when the
        ``with`` block exits, including when it exits through an exception.
        Character data is escaped when it is flushed (e.g. by `end`), not
        when `data` is called, so close the element inside the ``with``
        block.

        Example::

          w = writer.XMLWriter(ListWriter(lines))
          with w.xml_cleaning_method('bleach_clean'):
              w.start('td')
              w.data('<a href="https://google.com">google.com</a>')
              w.end()

        Parameters
        ----------
        method : str
            Cleaning method.  Allowed values are "escape_xml",
            "bleach_clean", and "none".

        **clean_kwargs : keyword args
            Additional keyword args that are passed to the
            bleach.clean() function.

        Raises
        ------
        ValueError
            If ``method`` is not one of the allowed values, or if
            ``method='bleach_clean'`` is requested and the ``bleach``
            package cannot be imported.
        """
        current_xml_escape_cdata = self.xml_escape_cdata

        if method == 'bleach_clean':
            # NOTE: bleach is imported locally to avoid importing it when
            # it is not necessary
            try:
                import bleach
            except ImportError as err:
                raise ValueError('bleach package is required when HTML escaping is disabled.\n'
                                 'Use "pip install bleach".') from err

            self.xml_escape_cdata = lambda x: bleach.clean(x, **clean_kwargs)
        elif method == "none":
            self.xml_escape_cdata = lambda x: x
        elif method != 'escape_xml':
            raise ValueError('allowed values of method are "escape_xml", "bleach_clean", and "none"')

        try:
            yield
        finally:
            # Always undo the swap, otherwise an exception in the body
            # would leave the writer with sanitization altered for good.
            self.xml_escape_cdata = current_xml_escape_cdata

    @contextlib.contextmanager
    def tag(self, tag, attrib=None, **extra):
        """
        A convenience method for creating wrapper elements using the
        ``with`` statement.

        The element is closed with ``end(tag)`` when the ``with`` block
        completes normally; if the block raises, the exception propagates
        and no end tag is written.

        Examples
        --------

        >>> with writer.tag('foo'):  # doctest: +SKIP
        ...     writer.element('bar')
        ... # </foo> is implicitly closed here
        ...

        Parameters are the same as to `start`.
        """
        self.start(tag, attrib, **extra)
        yield
        self.end(tag)

    def comment(self, comment):
        """
        Adds a comment to the output stream.

        Any pending start tag and buffered data are flushed first.  The
        comment text is passed through ``self.xml_escape_cdata``.

        Parameters
        ----------
        comment : str
            Comment text, as a Unicode string.
        """
        self._flush()
        self.write(self.get_indentation_spaces())
        self.write(f"<!-- {self.xml_escape_cdata(comment)} -->\n")

    def data(self, text):
        """
        Adds character data to the output stream.

        The text is only buffered here; it is escaped and written the
        next time the writer flushes (on `start`, `comment` or `end`).

        Parameters
        ----------
        text : str
            Character data, as a Unicode string.
        """
        self._data.append(text)

    def end(self, tag=None, indent=True, wrap=False):
        """
        Closes the current element (opened by the most recent call to
        `start`).

        An element with no buffered data whose start tag is still pending
        is written in self-closing form (``<tag/>``).

        Parameters
        ----------
        tag : str
            Element name.  If given, the tag must match the start tag.
            If omitted, the current element is closed.

        indent : bool
            If true, a pending start tag is terminated with a newline and
            the end tag is preceded by the current indentation.

        wrap : bool
            If true, buffered character data is re-flowed with
            `textwrap.fill` and indented before being written.

        Raises
        ------
        ValueError
            If no element is open, or if ``tag`` is given and does not
            match the most recently opened element.
        """
        if not self._tags:
            if tag:
                raise ValueError(f"unbalanced end({tag})")
            raise ValueError("unbalanced end()")
        if tag and tag != self._tags[-1]:
            raise ValueError(f"expected end({self._tags[-1]}), got {tag}")
        tag = self._tags.pop()
        if self._data:
            self._flush(indent, wrap)
        elif self._open:
            # Nothing was written inside the element: collapse it.
            self._open = False
            self.write("/>\n")
            return
        if indent:
            self.write(self.get_indentation_spaces())
        self.write(f"</{tag}>\n")

    def close(self, id):
        """
        Closes open elements, up to (and including) the element identified
        by the given identifier.

        Parameters
        ----------
        id : int
            Element identifier, as returned by the `start` method.  Values
            smaller than 1 close every open element.
        """
        # ``start`` returns the stack depth *after* pushing the element, so
        # the element itself is still open while the depth is >= id.  The
        # emptiness check keeps ``close(0)`` from calling ``end`` on an
        # empty stack.
        while self._tags and len(self._tags) >= id:
            self.end()

    def element(self, tag, text=None, wrap=False, attrib=None, **extra):
        """
        Adds an entire element.  This is the same as calling `start`,
        `data`, and `end` in sequence. The ``text`` argument
        can be omitted.

        Parameters
        ----------
        tag : str
            The element name.

        text : str, optional
            Character data for the element.  If omitted or empty, the
            element is written in self-closing form.

        wrap : bool
            Passed to `end`; if true the text is re-flowed and indented.

        attrib : dict of str -> str, optional
            Attribute dictionary, as for `start`.

        **extra
            Additional attributes given as keyword arguments, as for
            `start`.
        """
        self.start(tag, attrib, **extra)
        if text:
            self.data(text)
        # indent=False keeps the start tag, text and end tag on one line
        # (unless ``wrap`` is requested).
        self.end(indent=False, wrap=wrap)

    def flush(self):
        """
        Does nothing by default; the constructor replaces this method with
        the ``flush`` method of the underlying file when it has one.
        """
        pass  # replaced by the constructor

    def get_indentation(self):
        """
        Returns the number of indentation levels the file is currently
        in.
        """
        return len(self._tags)

    def get_indentation_spaces(self, offset=0):
        """
        Returns a string of spaces that matches the current
        indentation level.

        Parameters
        ----------
        offset : int
            Number of levels to add to (or, if negative, subtract from)
            the current level.  The resulting level is clamped at zero.

        Returns
        -------
        spaces : str
            One space per indentation level.
        """
        depth = max(len(self._tags) + offset, 0)
        if depth > len(self._indentation):
            # Grow the cached run of spaces so arbitrarily deep nesting is
            # still indented correctly.
            self._indentation = " " * depth
        return self._indentation[:depth]

    @staticmethod
    def object_attrs(obj, attrs):
        """
        Converts an object with a bunch of attributes on an object
        into a dictionary for use by the `XMLWriter`.

        Parameters
        ----------
        obj : object
            Any Python object

        attrs : sequence of str
            Attribute names to pull from the object

        Returns
        -------
        attrs : dict
            Maps attribute names (with ``_`` replaced by ``-``) to the
            string form of the values retrieved from ``obj.attr``.  If any
            of the attributes is `None`, it will not appear in the output
            dictionary.
        """
        d = {}
        for attr in attrs:
            value = getattr(obj, attr)
            if value is not None:
                d[attr.replace('_', '-')] = str(value)
        return d
346