1"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree.  This module has two classes for this purpose:
5
6    1. ElementTree represents the whole XML document as a tree and
7
8    2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level.  Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary.  Each Element has a number of properties associated with it:
17
18    'tag' - a string containing the element's name.
19
    'attrib' - a Python dictionary storing the element's attributes.
21
22    'text' - a string containing the element's text content.
23
24    'tail' - an optional string containing text after the element's end tag.
25
26    And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
36#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See https://www.python.org/psf/license for licensing details.
39#
40# ElementTree
41# Copyright (c) 1999-2008 by Fredrik Lundh.  All rights reserved.
42#
43# fredrik@pythonware.com
44# http://www.pythonware.com
45# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
48# Copyright (c) 1999-2008 by Fredrik Lundh
49#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Public names exported by a star-import of this module.
__all__ = [
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "indent",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
    "canonicalize",
    "C14NWriterTarget",
]

# Version string; exported as part of the public API (see __all__).
VERSION = "1.3.0"
94
95import sys
96import re
97import warnings
98import io
99import collections
100import collections.abc
101import contextlib
102
103from . import ElementPath
104
105
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the regular exception value, a ParseError carries
    two extra attributes:
        'code'     - the specific exception code
        'position' - the line and column of the error

    """
116
117# --------------------------------------------------------------------
118
119
def iselement(element):
    """Report whether *element* looks like an Element.

    The check is duck-typed: any object exposing a 'tag' attribute counts.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True
123
124
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag* with no children.

        *attrib* and *extra* are merged into a new dictionary, with *extra*
        taking precedence on duplicate keys.

        TypeError is raised if *attrib* is not a dict.

        """
        # The shared default dict is safe here: it is only read, and the
        # element always stores a freshly built copy.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        Deprecated: emits a DeprecationWarning; use copy.copy(elem) instead.

        """
        # stacklevel=2 attributes the warning to the caller of copy(), not
        # to this module, so it is reported at the line that needs changing.
        warnings.warn(
            "elem.copy() is deprecated. Use copy.copy(elem) instead.",
            DeprecationWarning, stacklevel=2
            )
        return self.__copy__()

    def __copy__(self):
        """Return a shallow copy: new element, same child objects."""
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        # Slice assignment copies the child list but shares the children.
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has at least one subelement.

        Emits a FutureWarning because this behaviour is expected to change.

        """
        warnings.warn(
            "The behavior of this method will change in future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement (or list of subelements) at *index*."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*.

        For a slice, *element* is an iterable of elements; each one is
        validated before the assignment takes place.

        """
        if isinstance(index, slice):
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        """Remove the subelement(s) at *index*."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from a sequence.

        *elements* is a sequence with zero or more elements.

        Elements are validated and appended one at a time, so items that
        precede an invalid one remain appended when TypeError is raised.

        """
        for element in elements:
            self._assert_is_element(element)
            self._children.append(element)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        # The attribute dict is emptied in place; the child list is replaced.
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is whatever that call
        returns (a dictionary view for a plain dict), not a list copy.

        """
        return self.attrib.keys()

    def items(self):
        """Get the element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is whatever that call
        returns (a dictionary view for a plain dict), not a list copy.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements)

        Return an iterator containing all the matching elements.

        """
        # "*" is a wildcard and means the same as no filter at all.
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
424
425
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to an existing parent.

    The element is built by calling parent.makeelement() and is then
    added as the last child of *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    Return the newly created element.

    """
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child
442
443
def Comment(text=None):
    """Comment element factory.

    Build a special element, tagged with this factory function itself,
    that the standard serializer writes out as an XML comment.

    *text* is a string containing the comment string.

    """
    comment = Element(Comment)
    comment.text = text
    return comment
456
457
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element, tagged with this factory function itself,
    that the standard serializer writes out as an XML processing
    instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is given, the element text is "target text", joined by a space.

    """
    node = Element(ProcessingInstruction)
    if text:
        node.text = target + " " + text
    else:
        node.text = target
    return node

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction
475
476
class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that namespaces are handled properly
    on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing are delegated to the wrapped text; the other
    operand may be another QName or any value comparable to that text.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs
521
522# --------------------------------------------------------------------
523
524
class ElementTree:
    """An XML element hierarchy.

    This class also provides support for serialization to and from
    standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """
    def __init__(self, element=None, file=None):
        self._root = element # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        This will discard the current contents of the tree and replace it
        with the given element.  Use with care!

        """
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        # Only a file opened here is closed here; a caller-supplied file
        # object is left open.
        close_source = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            close_source = True
        try:
            if parser is None:
                # If no parser was specified, create a default XMLParser
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an accelerator,
                    # can define an internal _parse_whole API for efficiency.
                    # It can be used to parse the whole source without feeding
                    # it with chunks.
                    self._root = parser._parse_whole(source)
                    return self._root
            while True:
                data = source.read(65536)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        return self._root.iter(tag)

    @staticmethod
    def _relative_path(path):
        """Return *path* in the form actually searched from the root.

        A path starting with "/" is searched as if it were prefixed with
        "."; in that case a FutureWarning naming the rewritten path is
        issued.  Any other path is returned unchanged.

        """
        if path[:1] == "/":
            path = "." + path
            # stacklevel=3: skip this helper and the public find*() method
            # so the warning is attributed to the code that called find*().
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=3
                )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        path = self._relative_path(path)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        Same as getroot().findtext(path),  which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the text content of the first matching element, or *default*
        if no element was found.

        """
        path = self._relative_path(path)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return list containing all matching elements in document order.

        """
        path = self._relative_path(path)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        path = self._relative_path(path)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when *method* is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode.  A declaration is
                               only ever written for the "xml" method.

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        ValueError is raised if *method* is not a known serialization method.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        # The writer and the checks below use the lower-cased name; the
        # declaration itself keeps the spelling supplied by the caller.
        enc_lower = encoding.lower()
        with _get_writer(file_or_filename, enc_lower) as write:
            if method == "xml" and (xml_declaration or
                    (xml_declaration is None and
                     enc_lower not in ("utf-8", "us-ascii", "unicode"))):
                declared_encoding = encoding
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                write("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                serialize = _serialize[method]
                serialize(write, self._root, qnames, namespaces,
                          short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        """Write the tree to *file* using the "c14n" method.

        Kept for lxml.etree compatibility; prefer write(file, method="c14n").

        """
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
754
755# --------------------------------------------------------------------
756# serialization support
757
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object with a ``write`` attribute or a
    value that is passed to open() as a file name.  *encoding* is the output
    encoding; the exact string "unicode" means text is written without an
    encoding wrapper.

    A file opened here is closed when the context exits.  Wrappers created
    around a caller-supplied object are detached on exit, so the caller's
    object stays open.

    """
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            # No explicit encoding is passed to open() in this case.
            file = open(file_or_filename, "w")
        else:
            # Characters the encoding cannot represent are written as
            # XML character references instead of raising.
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="xmlcharrefreplace")
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses this methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors="xmlcharrefreplace",
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                # ExitStack runs callbacks in reverse registration order, so
                # the TextIOWrapper is detached before the BufferedWriter.
                stack.callback(file.detach)
                yield file.write
809
def _namespaces(elem, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    Walk *elem* and all its subelements and return a ``(qnames, namespaces)``
    pair.  *qnames* maps every tag, attribute name and QName value found to
    its serialized "prefix:local" form; *namespaces* maps namespace URIs to
    prefixes.  When *default_namespace* is given it is mapped to the empty
    prefix, and unqualified names are rejected with ValueError.

    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {default_namespace: ""} if default_namespace else {}

    def register(name):
        # work out the serialized form of *name* and record it in qnames
        try:
            if name[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[name] = name
                return
            uri, local = name[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                # fall back to a registered prefix, then to a generated one
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            # an empty prefix marks the default namespace: no "prefix:" part
            qnames[name] = "%s:%s" % (prefix, local) if prefix else local
        except TypeError:
            _raise_serialization_error(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, (QName, str)):
            tag_name = tag.text if isinstance(tag, QName) else tag
            if tag_name not in qnames:
                register(tag_name)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)
        for key, value in node.items():
            attr_name = key.text if isinstance(key, QName) else key
            if attr_name not in qnames:
                register(attr_name)
            if isinstance(value, QName) and value.text not in qnames:
                register(value.text)
        content = node.text
        if isinstance(content, QName) and content.text not in qnames:
            register(content.text)
    return qnames, namespaces
870
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML by calling *write* with strings.

    *qnames* maps tags and attribute names to the names to write.
    *namespaces*, when not empty, maps namespace URIs to prefixes; the
    matching xmlns declarations are written on this element only, and
    children are serialized with no namespaces.  If *short_empty_elements*
    is true, an element with no text and no children is written as
    "<tag />".
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[tag]
        if name is None:
            # an element without a name contributes only its content
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda item: item[1])  # by prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            if not (text or len(elem)) and short_empty_elements:
                write(" />")
            else:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
920
# Lower-case names of the HTML elements for which _serialize_html() writes
# no end tag.  Built directly as a set: the former tuple-then-set() fallback
# guarded against a missing ``set`` builtin, which cannot happen on any
# Python able to run this module.  Deliberately left a plain (mutable) set,
# exactly as before.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
928
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML by calling *write* with strings.

    *qnames* maps tags and attribute names to the names to write.
    *namespaces*, when not empty, maps namespace URIs to prefixes; the
    xmlns declarations are written on this element only.  The text of
    "script" and "style" elements is written unescaped, and no end tag is
    written for elements whose lower-cased name is in HTML_EMPTY.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        name = qnames[tag]
        if name is None:
            # an element without a name contributes only its content
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda item: item[1])  # by prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    # raw text elements: content is written as is
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
978
def _serialize_text(write, elem):
    """Write the text content of *elem* and its subtree, then its tail."""
    for chunk in elem.itertext():
        write(chunk)
    tail = elem.tail
    if tail:
        write(tail)
984
# Dispatch table: serialization method name -> function implementing it.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
992
993
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global.  Any mapping that already uses the given
    prefix or the given namespace URI is dropped before the new one is
    stored.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix has the reserved "ns<digits>" form.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect first, delete afterwards: the mapping must not change size
    # while it is being iterated.
    stale = [known_uri
             for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
1012
# Global registry mapping namespace URIs to their preferred prefixes.
# register_namespace() adds to and removes from this dictionary.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the registry as an attribute of
# the register_namespace function.
register_namespace._namespace_map = _namespace_map
1027
def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* (and its type) can't be serialized."""
    type_name = type(text).__name__
    raise TypeError("cannot serialize %r (type %s)" % (text, type_name))
1032
def _escape_cdata(text):
    """Return *text* with "&", "<" and ">" replaced by entity references.

    A value that does not support the string operations used here is
    reported through _raise_serialization_error().
    """
    try:
        # Check with "in" before calling replace(): avoiding do-nothing
        # calls is worthwhile for strings shorter than 500 characters or
        # so, which are assumed to be by far the most common case.
        # "&" must be handled first so the entities added later are kept.
        for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1048
def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    A value that does not support the string operations used here is
    reported through _raise_serialization_error().
    """
    replacements = (
        ("&", "&amp;"),   # first, so the entities added below are kept
        ("<", "&lt;"),
        (">", "&gt;"),
        ("\"", "&quot;"),
        # Although section 2.11 of the XML specification states that CR or
        # CR LF should be replaced with just LF, that applies only to the
        # line ends that organize the file into lines.  Within attributes
        # these characters are written as character references, so they do
        # not count.
        # http://www.w3.org/TR/REC-xml/#sec-line-ends
        # The current solution was discussed in issue 17582 and 39011.
        ("\r", "&#13;"),
        ("\n", "&#10;"),
        ("\t", "&#09;"),
    )
    try:
        for char, entity in replacements:
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1076
def _escape_attrib_html(text):
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    Only "&", ">" and the double quote are replaced.  A value that does
    not support the string operations used here is reported through
    _raise_serialization_error().
    """
    try:
        # "&" first, so the entities added afterwards are kept
        for char, entity in (("&", "&amp;"), (">", "&gt;"), ("\"", "&quot;")):
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1089
1090# --------------------------------------------------------------------
1091
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").  The remaining keyword
    arguments are handed to ElementTree.write() unchanged.

    Returns an (optionally) encoded string containing the XML data.

    """
    # text buffer for "unicode" output, binary buffer for everything else
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    document = ElementTree(element)
    document.write(stream, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return stream.getvalue()
1115
class _ListDataStream(io.BufferedIOBase):
    """Auxiliary write-only stream that appends every chunk to a list.

    The list is supplied by the caller and is used as is, so the caller
    sees the chunks through its own reference.
    """

    def __init__(self, lst):
        self.lst = lst

    def write(self, b):
        # each write() becomes one list item; nothing is copied or joined
        self.lst.append(b)

    def tell(self):
        # position counted in chunks written so far
        return len(self.lst)

    def writable(self):
        return True

    def seekable(self):
        return True
1132
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate the representation of an XML element as a list of pieces.

    Takes the same arguments as tostring() and passes them to
    ElementTree.write() in the same way, but returns the list of chunks
    that were written instead of a single joined value.
    """
    pieces = []
    document = ElementTree(element)
    document.write(_ListDataStream(pieces), encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return pieces
1144
1145
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    document = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    document.write(sys.stdout, encoding="unicode")
    # finish the output with a newline unless the root's tail already did
    tail = document.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1163
1164
def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level. Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    ValueError is raised for a negative *level*.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        return

    # One whitespace string per nesting depth, created on first use and
    # then shared by every element at that depth to save memory.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Look up (or create) the indentation used in front of the children.
        child_level = level + 1
        if child_level < len(indentations):
            child_indentation = indentations[child_level]
        else:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        # Only whitespace-only (or missing) text is replaced.
        if not (elem.text and elem.text.strip()):
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not (child.tail and child.tail.strip()):
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]

    _indent_children(tree, 0)
1213
1214
1215# --------------------------------------------------------------------
1216# parsing
1217
1218
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return the new ElementTree instance after loading *source* into it.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1231
1232
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    The returned iterator also reports what's going on to the user based
    on the *events* it is given.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None at first and is set to the root element once the
    whole input has been consumed.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)
    def iterator():
        # Generator: none of this body runs before the first next() call,
        # so the values of *source* and *close_source* assigned further
        # down are the ones used here.
        try:
            while True:
                # hand out whatever is queued before reading more input
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            # events produced while closing the parser
            yield from pullparser.read_events()
            it.root = root
        finally:
            # only a file opened by iterparse itself is closed here
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        # bound to the single generator created for this call
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        # *source* is treated as a file name
        source = open(source, "rb")
        close_source = True

    return it
1279
1280
class XMLPullParser:
    """Parser that is fed data in pieces and queues the resulting events.

    *events* is a sequence of event names to report; only "end" events are
    reported when it is omitted.  Use feed() to supply data and
    read_events() to collect the (event, elem) pairs produced so far.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See https://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            # queued, so that read_events() raises it in sequence
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
1332
1333
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()
1349
1350
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    root = parser.close()
    by_id = {}
    for node in root.iter():
        # elements with a missing or empty "id" attribute are left out
        ident = node.get("id")
        if ident:
            by_id[ident] = node
    return root, by_id
1371
# Parse XML document from string constant.  fromstring is an alias for
# XML(): both names refer to the same function object.
fromstring = XML
1374
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, which are fed to
    the parser one after another; *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        parser.feed(fragment)
    return parser.close()
1389
1390# --------------------------------------------------------------------
1391
1392
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """
    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        # factories, falling back to the standard ones
        self._factory = (
            Element if element_factory is None else element_factory)
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_comments = insert_comments
        self.insert_pis = insert_pis
        # parsing state
        self._data = []    # text chunks not yet attached to an element
        self._elem = []    # stack of the elements that are still open
        self._last = None  # element most recently started or ended
        self._root = None  # first element started at top level
        self._tail = None  # true if we're after an end tag

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected text to the last element: as its tail when
        # we are after an end tag, otherwise as its text.
        if not self._data:
            return
        last = self._last
        if last is not None:
            text = "".join(self._data)
            if self._tail:
                assert last.tail is None, "internal error (tail)"
                last.tail = text
            else:
                assert last.text is None, "internal error (text)"
                last.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        opened = self._factory(tag, attrs)
        self._last = opened
        if self._elem:
            self._elem[-1].append(opened)
        elif self._root is None:
            self._root = opened
        self._elem.append(opened)
        self._tail = 0
        return opened

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Create a childless node with *factory*; when *insert* is true it
        # is also added to the currently open element, if there is one.
        node = factory(*args)
        if insert:
            self._flush()
            self._last = node
            if self._elem:
                self._elem[-1].append(node)
            self._tail = 1
        return node
1512
1513
1514# also see ElementTree and TreeBuilder
1515class XMLParser:
1516    """Element structure builder for XML source data based on the expat parser.
1517
1518    *target* is an optional target object which defaults to an instance of the
1519    standard TreeBuilder class, *encoding* is an optional encoding string
1520    which if given, overrides the encoding specified in the XML file:
1521    http://www.iana.org/assignments/character-sets
1522
1523    """
1524
    def __init__(self, *, target=None, encoding=None):
        # Create the underlying expat parser and connect its callbacks to
        # *target*.  Only the callbacks for which the target has the
        # matching method are installed.
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is passed as the namespace separator argument; _fixname()
        # later relies on that character to recognize qualified names.
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            # text goes to the target directly, without a wrapper method
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        self._doctype = None  # list of doctype tokens while inside <!DOCTYPE
        self.entity = {}  # entity name -> replacement text, read by _default()
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown
1569
    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor).
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # For every requested event the matching expat handler is replaced
        # by a function that appends an (event, value) pair to the queue.
        # The values each handler needs are bound as default arguments at
        # definition time, so later loop iterations cannot change them.
        # An unknown event name raises ValueError.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    # no target method: report the (prefix, uri) pair itself
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    # no target method: the event carries no value
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)
1622
1623    def _raiseerror(self, value):
1624        err = ParseError(value)
1625        err.code = value.code
1626        err.position = value.lineno, value.offset
1627        raise err
1628
1629    def _fixname(self, key):
1630        # expand qname, and convert name string to ascii, if possible
1631        try:
1632            name = self._names[key]
1633        except KeyError:
1634            name = key
1635            if "}" in name:
1636                name = "{" + name
1637            self._names[key] = name
1638        return name
1639
1640    def _start_ns(self, prefix, uri):
1641        return self.target.start_ns(prefix or '', uri or '')
1642
1643    def _end_ns(self, prefix):
1644        return self.target.end_ns(prefix or '')
1645
1646    def _start(self, tag, attr_list):
1647        # Handler for expat's StartElementHandler. Since ordered_attributes
1648        # is set, the attributes are reported as a list of alternating
1649        # attribute name,value.
1650        fixname = self._fixname
1651        tag = fixname(tag)
1652        attrib = {}
1653        if attr_list:
1654            for i in range(0, len(attr_list), 2):
1655                attrib[fixname(attr_list[i])] = attr_list[i+1]
1656        return self.target.start(tag, attrib)
1657
1658    def _end(self, tag):
1659        return self.target.end(self._fixname(tag))
1660
    def _default(self, text):
        # Installed as the parser's DefaultHandlerExpand callback.  Two
        # things are handled here: entity references, which are looked up
        # in self.entity, and the pieces of a <!DOCTYPE ...> declaration,
        # which are collected in self._doctype until the declaration can be
        # reported to the target.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                # the target takes no text, so there is nothing to expand
                return
            try:
                # text[1:-1] drops the first and last character of *text*
                # (presumably "&" and ";") to get the entity name
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                # end of the declaration: stop collecting
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # collected so far: name, "PUBLIC" or "SYSTEM", then the
                # identifier(s); [1:-1] drops the first and last character
                # of an identifier (presumably its quotes)
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    # not complete yet, or not a form handled here
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                # reported (or ignored): stop collecting
                self._doctype = None
1714
1715    def feed(self, data):
1716        """Feed encoded data to parser."""
1717        try:
1718            self.parser.Parse(data, False)
1719        except self._error as v:
1720            self._raiseerror(v)
1721
1722    def close(self):
1723        """Finish feeding data to parser and return element structure."""
1724        try:
1725            self.parser.Parse(b"", True) # end of data
1726        except self._error as v:
1727            self._raiseerror(v)
1728        try:
1729            close_handler = self.target.close
1730        except AttributeError:
1731            pass
1732        else:
1733            return close_handler()
1734        finally:
1735            # get rid of circular references
1736            del self.parser, self._parser
1737            del self.target, self._target
1738
1739
1740# --------------------------------------------------------------------
1741# C14N 2.0
1742
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    If *out* is not provided, this function returns the output as text string.

    Either *xml_data* (an XML string) or *from_file* (a file path or
    file-like object) must be provided as input.  When both are given,
    *xml_data* is used.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect the output in memory only when the caller gave no sink.
    buffer = None
    if out is None:
        buffer = out = io.StringIO()

    parser = XMLParser(target=C14NWriterTarget(out.write, **options))

    if xml_data is None:
        # The guard above guarantees that from_file is set here.
        parse(from_file, parser=parser)
    else:
        parser.feed(xml_data)
        parser.close()

    if buffer is None:
        return None
    return buffer.getvalue()
1771
1772
# Matcher for text shaped like a prefixed name: word characters, a single
# colon, then word characters, anchored at both ends ("prefix:local").
# Returns a match object on success and None otherwise.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
1774
1775
class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The *write* function is used for writing out the resulting data stream
    as text (not bytes).  To write to a file, open it in text mode with encoding
    "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        # Text chunks received since the last flush.
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
            self._qname_aware_tags = set(qname_aware_tags)
        else:
            self._qname_aware_tags = None
        if qname_aware_attrs:
            self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
        else:
            self._find_qname_aware_attrs = None

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        # Maps namespace URIs to generated prefixes (used with rewrite_prefixes).
        self._prefix_map = {}
        # One entry per open element, plus the initial document-level entry.
        self._preserve_space = [False]
        # Arguments of a start tag that is deferred until the text content of
        # a qname-aware element is known; None when nothing is deferred.
        self._pending_start = None
        self._root_seen = False
        self._root_done = False
        # Nesting depth inside an excluded element; 0 when not ignoring.
        self._ignored_depth = 0

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        """Yield the (uri, prefix) pairs of *ns_stack*, last level first."""
        for namespaces in _reversed(ns_stack):
            if namespaces:  # almost no element declares new namespaces
                yield from namespaces

    def _resolve_prefix_name(self, prefixed_name):
        """Expand "prefix:name" to "{uri}name" using the prefixes in scope.

        Raises ValueError if the prefix has not been declared.
        """
        prefix, name = prefixed_name.split(':', 1)
        for uri, p in self._iter_namespaces(self._ns_stack):
            if p == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        """Return ``(serialised name, local name, uri)`` for *qname*.

        *qname* is a "{uri}local" or plain name; if *uri* is given, *qname*
        is taken as the local name.  A namespace that has no usable
        declaration yet is added to the innermost level of the declaration
        stack.  Raises ValueError for a namespace URI that is unknown.
        """
        if uri is None:
            uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
        else:
            tag = qname

        # A prefix seen at an inner level hides the same prefix further out.
        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        if not uri:
            # As soon as a default namespace is defined,
            # anything that has no namespace (and thus, no prefix) goes there.
            return tag, tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        """Buffer text content unless inside an excluded element."""
        if not self._ignored_depth:
            self._data.append(data)

    def _flush(self, _join_text=''.join):
        """Write out a deferred start tag (if any) and the buffered text."""
        data = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            data = data.strip()
        if self._pending_start is not None:
            args, self._pending_start = self._pending_start, None
            qname_text = data if data and _looks_like_prefix_name(data) else None
            self._start(*args, qname_text)
            if qname_text is not None:
                # _start() has already written the resolved qname text.
                return
        if data and self._root_seen:
            self._write(_escape_cdata_c14n(data))

    def start_ns(self, prefix, uri):
        """Record a namespace declaration for the element about to start."""
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        # A deferred start tag belongs to an enclosing element and must be
        # written first, even when no text has been buffered.
        if self._data or self._pending_start is not None:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        """Handle the start of an element.

        Excluded elements only increase the ignore depth.  For qname-aware
        tags the start tag is deferred until the text content is known.
        """
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            self._ignored_depth += 1
            return
        # Also flush a deferred start tag of the parent that has no text,
        # otherwise this element would be written before its parent.
        if self._data or self._pending_start is not None:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        """Write the start tag of an element.

        *new_namespaces* is the element's level of the declaration stack; it
        is filled while the names are resolved.  *qname_text*, if given, is
        prefixed-name text content that is resolved and written right after
        the start tag.
        """
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        parsed_qnames = {n: parse_qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        self._ns_stack.append([])

    def end(self, tag):
        """Handle the end of an element and write its end tag."""
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        # An empty qname-aware element still has its start tag deferred:
        # write it before the end tag so that the stacks popped below have
        # actually been pushed.
        if self._data or self._pending_start is not None:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        # Only the initial document-level entry is left after the root ends.
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        """Write a comment if comments are enabled and not being ignored.

        Comments outside the root element are separated from it by a newline.
        """
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (self._root_seen and self._data):
            # Write a deferred start tag and preceding text first.
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            self._write('\n')

    def pi(self, target, data):
        """Write a processing instruction unless it is being ignored.

        Processing instructions outside the root element are separated from
        it by a newline.
        """
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (self._root_seen and self._data):
            # Write a deferred start tag and preceding text first.
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
2032
2033
2034def _escape_cdata_c14n(text):
2035    # escape character data
2036    try:
2037        # it's worth avoiding do-nothing calls for strings that are
2038        # shorter than 500 character, or so.  assume that's, by far,
2039        # the most common case in most applications.
2040        if '&' in text:
2041            text = text.replace('&', '&amp;')
2042        if '<' in text:
2043            text = text.replace('<', '&lt;')
2044        if '>' in text:
2045            text = text.replace('>', '&gt;')
2046        if '\r' in text:
2047            text = text.replace('\r', '&#xD;')
2048        return text
2049    except (TypeError, AttributeError):
2050        _raise_serialization_error(text)
2051
2052
2053def _escape_attrib_c14n(text):
2054    # escape attribute value
2055    try:
2056        if '&' in text:
2057            text = text.replace('&', '&amp;')
2058        if '<' in text:
2059            text = text.replace('<', '&lt;')
2060        if '"' in text:
2061            text = text.replace('"', '&quot;')
2062        if '\t' in text:
2063            text = text.replace('\t', '&#x9;')
2064        if '\n' in text:
2065            text = text.replace('\n', '&#xA;')
2066        if '\r' in text:
2067            text = text.replace('\r', '&#xD;')
2068        return text
2069    except (TypeError, AttributeError):
2070        _raise_serialization_error(text)
2071
2072
2073# --------------------------------------------------------------------
2074
2075# Import the C accelerators
2076try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
2080    _Element_Py = Element
2081
2082    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
2083    from _elementtree import *
2084    from _elementtree import _set_factories
2085except ImportError:
2086    pass
2087else:
2088    _set_factories(Comment, ProcessingInstruction)
2089