1"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree.  This module has two classes for this purpose:
5
6    1. ElementTree represents the whole XML document as a tree and
7
8    2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level.  Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary.  Each Element has a number of properties associated with it:
17
18    'tag' - a string containing the element's name.
19
20    'attributes' - a Python dictionary storing the element's attributes.
21
22    'text' - a string containing the element's text content.
23
24    'tail' - an optional string containing text after the element's end tag.
25
26    And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
36#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
39#
40# ElementTree
41# Copyright (c) 1999-2008 by Fredrik Lundh.  All rights reserved.
42#
43# fredrik@pythonware.com
44# http://www.pythonware.com
45# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
48# Copyright (c) 1999-2008 by Fredrik Lundh
49#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Names exported by a star-import of this module.  Keep this list in sync
# with the public definitions in the file.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser", "XMLPullParser",
    "register_namespace",
    "canonicalize", "C14NWriterTarget",
    ]

# Version string exposed as part of the public API (it is listed in
# __all__ above).
VERSION = "1.3.0"
94
95import sys
96import re
97import warnings
98import io
99import collections
100import collections.abc
101import contextlib
102
103from . import ElementPath
104
105
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances are expected to carry
    two extra attributes set by whoever raises the error:

        'code'     - the specific exception code
        'position' - the line and column of the error

    """
116
117# --------------------------------------------------------------------
118
119
def iselement(element):
    """Return True if *element* looks like an Element.

    The check is duck-typed: any object exposing a 'tag' attribute
    qualifies.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True
123
124
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); it is copied and
        merged with the *extra* keyword arguments, so neither the caller's
        dictionary nor the shared default is ever mutated.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement (or list of subelements) at *index*."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*.

        For an integer *index*, *element* must be a single Element.  For a
        slice, *element* is an iterable of Elements.  TypeError is raised,
        and the children are left untouched, if anything that is not an
        Element is supplied.

        """
        if isinstance(index, slice):
            # Materialize the iterable once.  Validating by iterating and
            # then assigning the same object would leave a one-shot
            # iterator (e.g. a generator) exhausted, silently replacing the
            # slice with nothing.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        """Remove the subelement(s) at *index*."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from a sequence.

        *elements* is a sequence with zero or more elements.

        Each item is checked and appended in turn, so if a TypeError is
        raised for a non-Element item, the items before it have already
        been added.

        """
        for element in elements:
            self._assert_is_element(element)
            self._children.append(element)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        # NOTE(review): _Element_Py is not defined in this class; it is
        # presumably a module-level alias for this class -- confirm.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        # assert iselement(element)
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  Emits a
        DeprecationWarning.  The returned object is the element's own
        internal list of children, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns a string containing the attribute value, or the default if
        attribute was not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the element's attribute names.

        Equivalent to attrib.keys(): the result is whatever the attribute
        dictionary's keys() method returns, not a new list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is whatever the attribute
        dictionary's items() method returns, not a new list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return list(self.iter(tag)); emits DeprecationWarning."""
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # factory-function tags used for comments and processing
        # instructions) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
439
440
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and attach it as the last child of *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The new element is built with parent.makeelement() and returned after
    being appended.

    """
    # Build a fresh dict so that neither the caller's mapping nor the shared
    # default argument is handed to the new element.
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child
457
458
def Comment(text=None):
    """Build a comment element.

    The returned element uses this factory function itself as its tag,
    which is how the standard serializer recognizes it and writes it out
    as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
471
472
def ProcessingInstruction(target, text=None):
    """Build a processing instruction element.

    The returned element uses this factory function itself as its tag,
    which is how the standard serializer recognizes it and writes it out
    as an XML processing instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is given and non-empty, the element's text is the target and the
    contents joined by a single space; otherwise it is just the target.

    """
    node = Element(ProcessingInstruction)
    node.text = target + " " + text if text else target
    return node

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction
490
491
class QName:
    """Wrapper for a qualified name.

    Wrapping a QName attribute value in this class lets the serializer
    apply proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their 'text' attribute, and can be
    compared with other QName instances as well as with other objects
    (the other object is then compared against the text directly).

    """

    def __init__(self, text_or_uri, tag=None):
        # A falsy tag (None or "") means text_or_uri already holds the
        # complete name.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    # Each comparison unwraps another QName to its text and otherwise
    # compares the text against the other operand as given.

    def __le__(self, other):
        return self.text <= (other.text if isinstance(other, QName) else other)

    def __lt__(self, other):
        return self.text < (other.text if isinstance(other, QName) else other)

    def __ge__(self, other):
        return self.text >= (other.text if isinstance(other, QName) else other)

    def __gt__(self, other):
        return self.text > (other.text if isinstance(other, QName) else other)

    def __eq__(self, other):
        return self.text == (other.text if isinstance(other, QName) else other)
536
537# --------------------------------------------------------------------
538
539
class ElementTree:
    """An XML element hierarchy.

    This class also provides support for serialization to and from
    standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """
    def __init__(self, element=None, file=None):
        # assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        This will discard the current contents of the tree and replace it
        with the given element.  Use with care!

        """
        # assert iselement(element)
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        close_source = False
        if not hasattr(source, "read"):
            # A file name was given: open it here and make sure it is
            # closed again in the 'finally' clause below.  File objects
            # supplied by the caller are left open.
            source = open(source, "rb")
            close_source = True
        try:
            if parser is None:
                # If no parser was specified, create a default XMLParser
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an accelerator,
                    # can define an internal _parse_whole API for efficiency.
                    # It can be used to parse the whole source without feeding
                    # it with chunks.
                    self._root = parser._parse_whole(source)
                    return self._root
            while True:
                data = source.read(65536)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        # assert self._root is not None
        return self._root.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return list(self.iter(tag)); emits DeprecationWarning."""
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'tree.iter()' or 'list(tree.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    @staticmethod
    def _relative_path(path):
        """Return *path* in the form used to search from the root element.

        A path starting with "/" is prefixed with "." and a FutureWarning is
        issued; any other path is returned unchanged.  Shared by find(),
        findtext(), findall() and iterfind().

        """
        if path[:1] == "/":
            path = "." + path
            # stacklevel=3 skips this helper and the public method that
            # called it, so the warning is attributed to the user's call.
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=3
                )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        # assert self._root is not None
        path = self._relative_path(path)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        Same as getroot().findtext(path),  which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the text content of the first matching element, or *default*
        if no element was found.

        """
        # assert self._root is not None
        path = self._relative_path(path)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return list containing all matching elements in document order.

        """
        # assert self._root is not None
        path = self._relative_path(path)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        # assert self._root is not None
        path = self._relative_path(path)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when *method* is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        ValueError is raised if *method* is not a known serialization method.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        # Encoding names are compared case-insensitively; the original
        # spelling is kept for the XML declaration.
        enc_lower = encoding.lower()
        with _get_writer(file_or_filename, enc_lower) as write:
            if method == "xml" and (xml_declaration or
                    (xml_declaration is None and
                     enc_lower not in ("utf-8", "us-ascii", "unicode"))):
                declared_encoding = encoding
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                write("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                serialize = _serialize[method]
                serialize(write, self._root, qnames, namespaces,
                          short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        """Write the tree to *file* using the "c14n" output method."""
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
778
779# --------------------------------------------------------------------
780# serialization support
781
782@contextlib.contextmanager
783def _get_writer(file_or_filename, encoding):
784    # returns text write method and release all resources after using
785    try:
786        write = file_or_filename.write
787    except AttributeError:
788        # file_or_filename is a file name
789        if encoding == "unicode":
790            file = open(file_or_filename, "w")
791        else:
792            file = open(file_or_filename, "w", encoding=encoding,
793                        errors="xmlcharrefreplace")
794        with file:
795            yield file.write
796    else:
797        # file_or_filename is a file-like object
798        # encoding determines if it is a text or binary writer
799        if encoding == "unicode":
800            # use a text writer as is
801            yield write
802        else:
803            # wrap a binary writer with TextIOWrapper
804            with contextlib.ExitStack() as stack:
805                if isinstance(file_or_filename, io.BufferedIOBase):
806                    file = file_or_filename
807                elif isinstance(file_or_filename, io.RawIOBase):
808                    file = io.BufferedWriter(file_or_filename)
809                    # Keep the original file open when the BufferedWriter is
810                    # destroyed
811                    stack.callback(file.detach)
812                else:
813                    # This is to handle passed objects that aren't in the
814                    # IOBase hierarchy, but just have a write method
815                    file = io.BufferedIOBase()
816                    file.writable = lambda: True
817                    file.write = write
818                    try:
819                        # TextIOWrapper uses this methods to determine
820                        # if BOM (for UTF-16, etc) should be added
821                        file.seekable = file_or_filename.seekable
822                        file.tell = file_or_filename.tell
823                    except AttributeError:
824                        pass
825                file = io.TextIOWrapper(file,
826                                        encoding=encoding,
827                                        errors="xmlcharrefreplace",
828                                        newline="\n")
829                # Keep the original file open when the TextIOWrapper is
830                # destroyed
831                stack.callback(file.detach)
832                yield file.write
833
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in the tree at *elem*.

    Walks *elem* and all of its subelements, looking at every tag, every
    attribute name, and any QName instance used as an attribute value or as
    element text.

    Returns a (qnames, namespaces) pair:

      qnames     - maps each name found (plus None) to the string used for
                   it on output: "prefix:local", or the bare local name when
                   the namespace has an empty prefix
      namespaces - maps each namespace URI to its prefix; *default_namespace*,
                   if given, is entered with the empty prefix

    ValueError is raised if *default_namespace* is given and a
    non-qualified name is found.  Tags and names of an unsupported type are
    reported through _raise_serialization_error().

    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # work out the serialized form of qname and record it
        try:
            if qname[:1] != "{":
                # plain, non-qualified name
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                # first use of this uri: take a registered prefix if there
                # is one, otherwise generate a numbered one
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            # an empty prefix means the default namespace: no "prefix:" part
            qnames[qname] = "%s:%s" % (prefix, local) if prefix else local
        except TypeError:
            _raise_serialization_error(qname)

    def register(name):
        # add name to the qnames table unless it is already known
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            register(tag.text)
        elif isinstance(tag, str):
            register(tag)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)
        for key, value in node.items():
            register(key.text if isinstance(key, QName) else key)
            if isinstance(value, QName):
                register(value.text)
        text = node.text
        if isinstance(text, QName):
            register(text.text)
    return qnames, namespaces
894
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML through the *write* callable.

    *qnames* maps tags and attribute names to their serialized form.
    *namespaces* maps namespace URIs to prefixes; when it is non-empty the
    corresponding xmlns declarations are written on this element, and
    children are always serialized with *namespaces* set to None.  If
    *short_empty_elements* is true, an element without text and children
    is written as "<tag />", otherwise as "<tag></tag>".
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % content)
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % content)
    else:
        name = qnames[raw_tag]
        if name is None:
            # no tag of its own: only the text and the children are written
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # declarations are written sorted on prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda item: item[1])
                for uri, prefix in declarations:
                    suffix = ":" + prefix if prefix else prefix
                    write(" xmlns%s=\"%s\"" % (suffix, _escape_attrib(uri)))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            needs_end_tag = content or len(elem) or not short_empty_elements
            if needs_end_tag:
                write(">")
                if content:
                    write(_escape_cdata(content))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
            else:
                write(" />")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
944
# HTML elements that never have content.  The HTML serializer does not
# write an end tag for them.  The list also covers the HTML5 void
# elements "embed", "source", "track" and "wbr", which would otherwise be
# written with an invalid end tag.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param", "source",
              "track", "wbr"}
952
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML through the *write* callable.

    *qnames* maps tags and attribute names to their serialized form.
    *namespaces* maps namespace URIs to prefixes; when it is non-empty the
    corresponding xmlns declarations are written on this element, and
    children are always serialized with *namespaces* set to None.  The
    text of "script" and "style" elements is written unescaped, and
    elements listed in HTML_EMPTY get no end tag.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % _escape_cdata(content))
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(content))
    else:
        name = qnames[raw_tag]
        if name is None:
            # no tag of its own: only the text and the children are written
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # declarations are written sorted on prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda item: item[1])
                for uri, prefix in declarations:
                    suffix = ":" + prefix if prefix else prefix
                    write(" xmlns%s=\"%s\"" % (suffix, _escape_attrib(uri)))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if content:
                verbatim = lowered in ("script", "style")
                write(content if verbatim else _escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
1002
1003def _serialize_text(write, elem):
1004    for part in elem.itertext():
1005        write(part)
1006    if elem.tail:
1007        write(elem.tail)
1008
# Serializer dispatch table: maps the name of an output method to the
# function implementing it.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
1016
1017
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix has the reserved form "ns<number>".

    """
    if re.match(r"ns\d+$", prefix) is not None:
        raise ValueError("Prefix format reserved for internal use")
    # collect the conflicting entries first, so that the registry is not
    # modified while it is being iterated
    stale = [known_uri
             for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
1036
# Global registry mapping namespace URIs to their preferred prefixes.
# register_namespace() modifies it, and the serializer consults it when it
# has to pick a prefix for a namespace URI.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the registry as an attribute of
# register_namespace().
register_namespace._namespace_map = _namespace_map
1051
1052def _raise_serialization_error(text):
1053    raise TypeError(
1054        "cannot serialize %r (type %s)" % (text, type(text).__name__)
1055        )
1056
1057def _escape_cdata(text):
1058    # escape character data
1059    try:
1060        # it's worth avoiding do-nothing calls for strings that are
1061        # shorter than 500 characters, or so.  assume that's, by far,
1062        # the most common case in most applications.
1063        if "&" in text:
1064            text = text.replace("&", "&amp;")
1065        if "<" in text:
1066            text = text.replace("<", "&lt;")
1067        if ">" in text:
1068            text = text.replace(">", "&gt;")
1069        return text
1070    except (TypeError, AttributeError):
1071        _raise_serialization_error(text)
1072
1073def _escape_attrib(text):
1074    # escape attribute value
1075    try:
1076        if "&" in text:
1077            text = text.replace("&", "&amp;")
1078        if "<" in text:
1079            text = text.replace("<", "&lt;")
1080        if ">" in text:
1081            text = text.replace(">", "&gt;")
1082        if "\"" in text:
1083            text = text.replace("\"", "&quot;")
1084        # The following business with carriage returns is to satisfy
1085        # Section 2.11 of the XML specification, stating that
1086        # CR or CR LN should be replaced with just LN
1087        # http://www.w3.org/TR/REC-xml/#sec-line-ends
1088        if "\r\n" in text:
1089            text = text.replace("\r\n", "\n")
1090        if "\r" in text:
1091            text = text.replace("\r", "\n")
1092        #The following four lines are issue 17582
1093        if "\n" in text:
1094            text = text.replace("\n", "&#10;")
1095        if "\t" in text:
1096            text = text.replace("\t", "&#09;")
1097        return text
1098    except (TypeError, AttributeError):
1099        _raise_serialization_error(text)
1100
1101def _escape_attrib_html(text):
1102    # escape attribute value
1103    try:
1104        if "&" in text:
1105            text = text.replace("&", "&amp;")
1106        if ">" in text:
1107            text = text.replace(">", "&gt;")
1108        if "\"" in text:
1109            text = text.replace("\"", "&quot;")
1110        return text
1111    except (TypeError, AttributeError):
1112        _raise_serialization_error(text)
1113
1114# --------------------------------------------------------------------
1115
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").  *xml_declaration* and
    *short_empty_elements* are passed on to ElementTree.write() unchanged.

    Returns an (optionally) encoded string containing the XML data.

    """
    # text output is collected in a str buffer, encoded output in a
    # bytes buffer
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    tree = ElementTree(element)
    tree.write(buffer, encoding,
               xml_declaration=xml_declaration,
               default_namespace=default_namespace,
               method=method,
               short_empty_elements=short_empty_elements)
    return buffer.getvalue()
1139
1140class _ListDataStream(io.BufferedIOBase):
1141    """An auxiliary stream accumulating into a list reference."""
1142    def __init__(self, lst):
1143        self.lst = lst
1144
1145    def writable(self):
1146        return True
1147
1148    def seekable(self):
1149        return True
1150
1151    def write(self, b):
1152        self.lst.append(b)
1153
1154    def tell(self):
1155        return len(self.lst)
1156
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate string representation of XML element as a list of chunks.

    Takes the same arguments as tostring(), all of which are passed on to
    ElementTree.write().  Instead of one string, the list of the chunks
    written by the serializer is returned.

    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding,
               xml_declaration=xml_declaration,
               default_namespace=default_namespace,
               method=method,
               short_empty_elements=short_empty_elements)
    return chunks
1168
1169
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    # make sure that the output ends with a newline
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1187
1188# --------------------------------------------------------------------
1189# parsing
1190
1191
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1204
1205
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until the whole document has been parsed, and is
    then set to the value returned when the parser is closed.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)
    def iterator():
        # NOTE: source, close_source and it are bound further down, after
        # this generator has been created.  That works because the body
        # does not run before the first next() call.
        try:
            while True:
                # hand out the events produced so far
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            # events that were produced while closing the parser
            yield from pullparser.read_events()
            it.root = root
        finally:
            # only close the file if it was opened by this function
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        # every instance shares the __next__ of the single generator
        # created here
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        # no read attribute: source is used as the name of a file to open
        source = open(source, "rb")
        close_source = True

    return it
1252
1253
class XMLPullParser:
    """Non-blocking parser: feed it data, then pull the resulting events.

    Data is pushed in with feed(); the (event, elem) pairs produced so far
    are retrieved with read_events().  *events* is a sequence of events to
    report back; if it is omitted, only "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        queue = collections.deque()
        self._events_queue = queue
        parser = _parser if _parser else XMLParser(target=TreeBuilder())
        self._parser = parser
        # wire up the parser for event reporting
        parser._setevents(queue, ("end",) if events is None else events)

    def feed(self, data):
        """Feed encoded data to parser."""
        parser = self._parser
        if parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            parser.feed(data)
        except SyntaxError as error:
            # queue the error, so that read_events() raises it after the
            # events that were produced before it
            self._events_queue.append(error)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        parser = self._parser
        root = parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
1305
1306
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()
1322
1323
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    root = active_parser.close()
    by_id = {}
    for node in root.iter():
        ident = node.get("id")
        if not ident:
            # no "id" attribute, or an empty one
            continue
        by_id[ident] = node
    return root, by_id
1344
# Parse XML document from string constant.  fromstring() is an alias for
# XML() and takes the same arguments.
fromstring = XML
1347
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of string fragments, *parser*
    is an optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    feed = active_parser.feed
    for fragment in sequence:
        feed(fragment)
    return active_parser.close()
1362
1363# --------------------------------------------------------------------
1364
1365
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """
    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []    # text chunks not yet attached to an element
        self._elem = []    # stack of the currently open elements
        self._last = None  # element opened, closed or inserted last
        self._root = None  # first element that was opened
        self._tail = None  # true if we're after an end tag
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self.insert_comments = insert_comments
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_pis = insert_pis
        self._factory = (
            Element if element_factory is None else element_factory)

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert not self._elem, "missing end tags"
        root = self._root
        assert root is not None, "missing toplevel element"
        return root

    def _flush(self):
        # attach the collected text to the last element, as its tail if
        # that element has been closed already and as its text otherwise
        if not self._data:
            return
        last = self._last
        if last is not None:
            collected = "".join(self._data)
            if self._tail:
                assert last.tail is None, "internal error (tail)"
                last.tail = collected
            else:
                assert last.text is None, "internal error (text)"
                last.text = collected
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        node = self._factory(tag, attrs)
        self._last = node
        open_elements = self._elem
        if open_elements:
            open_elements[-1].append(node)
        elif self._root is None:
            self._root = node
        open_elements.append(node)
        self._tail = 0
        return node

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        factory = self._comment_factory
        return self._handle_single(factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        factory = self._pi_factory
        return self._handle_single(factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # create a node with *factory*; if *insert* is true, also add it
        # to the tree at the current position
        node = factory(*args)
        if not insert:
            return node
        self._flush()
        self._last = node
        if self._elem:
            self._elem[-1].append(node)
        self._tail = 1
        return node
1485
1486
1487# also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    Parser events are only forwarded to those of the methods start(), end(),
    start_ns(), end_ns(), data(), comment(), pi(), doctype() and close()
    that the target actually defines.

    """

    def __init__(self, *, target=None, encoding=None):
        # expat is looked up in the xml.parsers package first, then as
        # the top-level pyexpat module
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is passed as the second ParserCreate() argument; _fixname()
        # relies on names in a namespace containing a "}"
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        # list of doctype tokens while inside a doctype declaration,
        # None otherwise (see _default)
        self._doctype = None
        # replacement text for entities the parser does not know, keyed
        # by entity name (see _default)
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Every handler below replaces the expat callback installed by
        # __init__ and appends an (event, value) pair to the queue.  The
        # values a handler needs are bound as default arguments, so each
        # handler keeps its own event name.
        # Raises ValueError for an unknown event name.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    # without a target method, report the (prefix, uri)
                    # pair itself
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise the expat error *value* as a ParseError carrying the
        # same code, and the position as a (lineno, offset) pair.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        # A name containing "}" gets a leading "{", which gives the
        # "{uri}local" form.  Results are cached in self._names.
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # Forward a namespace declaration to the target; false values
        # are replaced by empty strings.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        # Forward the end of a namespace declaration to the target.
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        # Handler for the end of an element: forward the expanded tag
        # name to the target.
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Installed as DefaultHandlerExpand.  It deals with two things:
        # entity references the parser does not know, and the tokens of a
        # doctype declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                # text[1:-1] drops the first and the last character
                # (presumably the "&" and the ";" of the reference)
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # tokens so far: name, "PUBLIC" or "SYSTEM", then the ids
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        # drop the first and last character (presumably
                        # the surrounding quotes)
                        pubid = pubid[1:-1]
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    # not complete yet: wait for more tokens
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            # expat errors are converted to ParseError
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse("", 1) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            # the target has no close() method: nothing is returned
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
1713
1714
1715# --------------------------------------------------------------------
1716# C14N 2.0
1717
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    Input comes from *xml_data* (an XML string) or, if that is not given,
    from *from_file* (a file path or file-like object).  At least one of
    the two must be provided; otherwise ValueError is raised.

    Output is text, not bytes.  If *out* is provided, it must be a file or
    file-like object; the canonical XML is passed to its ``.write()``
    method and the function returns None.  To write to a file, open it in
    text mode with encoding "utf-8".  If *out* is not provided, the
    serialised document is returned as a text string.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect into an in-memory buffer only when the caller gave no sink.
    buffer = io.StringIO() if out is None else None
    sink = out if buffer is None else buffer

    writer = C14NWriterTarget(sink.write, **options)
    parser = XMLParser(target=writer)

    if xml_data is None:
        # from_file is known to be set here (checked above)
        parse(from_file, parser=parser)
    else:
        parser.feed(xml_data)
        parser.close()

    if buffer is None:
        return None
    return buffer.getvalue()
1746
1747
# Matcher for strings of the form "prefix:name" (word characters on both sides
# of a single colon).  C14NWriterTarget uses it to decide whether text content
# or an attribute value should be treated as a prefixed QName.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
1749
1750
class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The *write* function is used for writing out the resulting data stream
    as text (not bytes).  To write to a file, open it in text mode with encoding
    "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised

    The start tag of a qname-aware element is held back (see ``start()``)
    until its text content is known.  Every event handler that writes output
    therefore flushes a held-back start tag first, even when no text was
    collected, so that empty qname-aware elements and those that begin with
    a child element, comment or processing instruction are written in
    document order.
    """
    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        # Text chunks collected since the last flush.
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
            self._qname_aware_tags = set(qname_aware_tags)
        else:
            self._qname_aware_tags = None
        if qname_aware_attrs:
            # Bound method: called with the attribute dict, it returns the
            # set of qname-aware attribute names that are actually present.
            self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
        else:
            self._find_qname_aware_attrs = None

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        # uri -> generated prefix; only filled when prefixes are rewritten.
        self._prefix_map = {}
        # One xml:space flag per open element, plus the document-level default.
        self._preserve_space = [False]
        # Arguments of a start tag that is held back until its text is known.
        self._pending_start = None
        self._root_seen = False
        self._root_done = False
        # Nesting depth inside an excluded element (0 means "not excluded").
        self._ignored_depth = 0

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        """Yield the (uri, prefix) pairs of *ns_stack*, innermost level first."""
        for namespaces in _reversed(ns_stack):
            if namespaces:  # almost no element declares new namespaces
                yield from namespaces

    def _resolve_prefix_name(self, prefixed_name):
        """Return "prefix:name" in "{uri}name" form.

        The prefix is looked up in the user declared namespaces.  Raises
        ValueError if it is not declared in scope.
        """
        prefix, name = prefixed_name.split(':', 1)
        for uri, p in self._iter_namespaces(self._ns_stack):
            if p == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        """Return ``(prefixed name, local name, uri)`` for *qname*.

        *qname* is a name in "{uri}local" form (or a plain local name) unless
        *uri* is given, in which case *qname* is taken as the local name.
        If the namespace has not been declared in the output yet, a
        declaration is added to the innermost level of the declared
        namespace stack.  Raises ValueError for a namespace that is not
        declared in scope.
        """
        if uri is None:
            uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
        else:
            tag = qname

        # Prefixes met on inner levels hide the same prefix on outer levels.
        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        if not uri:
            # As soon as a default namespace is defined,
            # anything that has no namespace (and thus, no prefix) goes there.
            return tag, tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        """Collect a chunk of text content (dropped inside excluded elements)."""
        if not self._ignored_depth:
            self._data.append(data)

    def _flush(self, _join_text=''.join):
        """Write out a held-back start tag and the collected text.

        If a start tag is pending, it is written first; text that looks like
        a prefixed name is then handed to ``_start()`` as the element's QName
        content instead of being written as plain text.  Text seen before the
        root element is discarded.
        """
        data = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            data = data.strip()
        if self._pending_start is not None:
            args, self._pending_start = self._pending_start, None
            qname_text = data if data and _looks_like_prefix_name(data) else None
            self._start(*args, qname_text)
            if qname_text is not None:
                # _start() already wrote the (rewritten) text content.
                return
        if data and self._root_seen:
            self._write(_escape_cdata_c14n(data))

    def start_ns(self, prefix, uri):
        """Record a namespace declaration for the element that follows."""
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        # (a held-back start tag must also be written first, so that the new
        # declaration ends up on the namespace level opened by that element)
        if self._data or self._pending_start is not None:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        """Handle an element start event.

        Excluded elements (and everything inside them) are only counted.
        For qname-aware tags the start tag is held back until the text
        content is known; all other start tags are written immediately.
        """
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            self._ignored_depth += 1
            return
        # Write the parent's text and, if still held back, its start tag
        # before opening a new level for this element.
        if self._data or self._pending_start is not None:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        """Write a start tag with its namespace declarations and attributes.

        *new_namespaces* is this element's level of the declared namespace
        stack; resolving the names below may add declarations to it.
        *qname_text*, if given, is prefixed-name text content that is
        resolved and written right after the start tag.
        """
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        parsed_qnames = {n: parse_qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        # Open a level for namespaces declared by child elements.
        self._ns_stack.append([])

    def end(self, tag):
        """Handle an element end event and close the element's scopes."""
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        # An element without text still needs its held-back start tag (and
        # the stack levels that _start() pushes) before it can be closed.
        if self._data or self._pending_start is not None:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        # Only the document-level entry is left once the root is closed.
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        """Write a comment, if comments are enabled and not excluded.

        Comments before the root element are followed by a newline and
        comments after it are preceded by one.
        """
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (self._root_seen and self._data):
            # Keep document order: preceding text and a held-back start tag
            # go out before the comment.
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            self._write('\n')

    def pi(self, target, data):
        """Write a processing instruction, unless inside an excluded element.

        Processing instructions before the root element are followed by a
        newline and those after it are preceded by one.
        """
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (self._root_seen and self._data):
            # Keep document order: preceding text and a held-back start tag
            # go out before the processing instruction.
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
2007
2008
2009def _escape_cdata_c14n(text):
2010    # escape character data
2011    try:
2012        # it's worth avoiding do-nothing calls for strings that are
2013        # shorter than 500 character, or so.  assume that's, by far,
2014        # the most common case in most applications.
2015        if '&' in text:
2016            text = text.replace('&', '&amp;')
2017        if '<' in text:
2018            text = text.replace('<', '&lt;')
2019        if '>' in text:
2020            text = text.replace('>', '&gt;')
2021        if '\r' in text:
2022            text = text.replace('\r', '&#xD;')
2023        return text
2024    except (TypeError, AttributeError):
2025        _raise_serialization_error(text)
2026
2027
2028def _escape_attrib_c14n(text):
2029    # escape attribute value
2030    try:
2031        if '&' in text:
2032            text = text.replace('&', '&amp;')
2033        if '<' in text:
2034            text = text.replace('<', '&lt;')
2035        if '"' in text:
2036            text = text.replace('"', '&quot;')
2037        if '\t' in text:
2038            text = text.replace('\t', '&#x9;')
2039        if '\n' in text:
2040            text = text.replace('\n', '&#xA;')
2041        if '\r' in text:
2042            text = text.replace('\r', '&#xD;')
2043        return text
2044    except (TypeError, AttributeError):
2045        _raise_serialization_error(text)
2046
2047
2048# --------------------------------------------------------------------
2049
2050# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests).
    _Element_Py = Element

    # Names expected from the accelerator module:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    # Imported explicitly as well: a star import skips names starting with an
    # underscore unless the module lists them in __all__.
    from _elementtree import _set_factories
except ImportError:
    # No C accelerator available: the pure-Python definitions stay in place.
    pass
else:
    # Only reached when the accelerator was imported successfully; passes the
    # Comment and ProcessingInstruction factories of this module to it.
    _set_factories(Comment, ProcessingInstruction)
2064