1"""Lightweight XML support for Python. 2 3 XML is an inherently hierarchical data format, and the most natural way to 4 represent it is with a tree. This module has two classes for this purpose: 5 6 1. ElementTree represents the whole XML document as a tree and 7 8 2. Element represents a single node in this tree. 9 10 Interactions with the whole document (reading and writing to/from files) are 11 usually done on the ElementTree level. Interactions with a single XML element 12 and its sub-elements are done on the Element level. 13 14 Element is a flexible container object designed to store hierarchical data 15 structures in memory. It can be described as a cross between a list and a 16 dictionary. Each Element has a number of properties associated with it: 17 18 'tag' - a string containing the element's name. 19 20 'attributes' - a Python dictionary storing the element's attributes. 21 22 'text' - a string containing the element's text content. 23 24 'tail' - an optional string containing text after the element's end tag. 25 26 And a number of child elements stored in a Python sequence. 27 28 To create an element instance, use the Element constructor, 29 or the SubElement factory function. 30 31 You can also use the ElementTree class to wrap an element structure 32 and convert it to and from XML. 33 34""" 35 36#--------------------------------------------------------------------- 37# Licensed to PSF under a Contributor Agreement. 38# See http://www.python.org/psf/license for licensing details. 39# 40# ElementTree 41# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved. 
42# 43# fredrik@pythonware.com 44# http://www.pythonware.com 45# -------------------------------------------------------------------- 46# The ElementTree toolkit is 47# 48# Copyright (c) 1999-2008 by Fredrik Lundh 49# 50# By obtaining, using, and/or copying this software and/or its 51# associated documentation, you agree that you have read, understood, 52# and will comply with the following terms and conditions: 53# 54# Permission to use, copy, modify, and distribute this software and 55# its associated documentation for any purpose and without fee is 56# hereby granted, provided that the above copyright notice appears in 57# all copies, and that both that copyright notice and this permission 58# notice appear in supporting documentation, and that the name of 59# Secret Labs AB or the author not be used in advertising or publicity 60# pertaining to distribution of the software without specific, written 61# prior permission. 62# 63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD 64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- 65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR 66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 70# OF THIS SOFTWARE. 
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    On top of the normal exception value, a ParseError carries two extra
    attributes:
        'code' - the specific exception code
        'position' - the line and column of the error

    """

# --------------------------------------------------------------------


def iselement(element):
    """Tell whether *element* looks like an Element.

    The check is duck-typed: anything exposing a 'tag' attribute qualifies.

    """
    looks_like_element = hasattr(element, 'tag')
    return looks_like_element


class Element:
    """A single node of an XML tree.

    This is the reference implementation of the Element interface.

    len() of an element is the number of its direct subelements, so an
    element is only truly empty when BOTH its length is zero AND its text
    attribute is unset.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name, *attrib* is an optional dictionary of
    element attributes, and *extra* supplies further attributes as keyword
    arguments (they win over same-named keys in *attrib*).

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    # The element's name.
    tag = None

    # Dictionary of the element's attributes.
    attrib = None

    # Text before the first subelement: either a string or None.  With no
    # text present this may be None or the empty string, depending on the
    # parser.
    text = None

    # Text after this element's end tag but before the next sibling's start
    # tag: either a string or None.  With no text present this may be None
    # or the empty string, depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        # The shared default dict is never mutated; it is copied below.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self._children = []
        self.attrib = {**attrib, **extra}
        self.tag = tag

    def __repr__(self):
        details = (self.__class__.__name__, self.tag, id(self))
        return "<%s %r at %#x>" % details

    def makeelement(self, tag, attrib):
        """Build a new element of the same class as this one.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        factory = self.__class__
        return factory(tag, attrib)

    def copy(self):
        """Return a shallow copy of this element.

        Tag, attributes (as a new dict), text and tail are copied; the
        subelements themselves are shared with the original tree.

        """
        duplicate = self.makeelement(self.tag, self.attrib)
        duplicate.text, duplicate.tail = self.text, self.tail
        duplicate[:] = self
        return duplicate

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        message = (
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead."
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
        # Old behaviour, for now: an element is truthy iff it has children.
        return bool(self._children)

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # A slice receives an iterable of elements, an int a single element;
        # validate every incoming object before touching the child list.
        if not isinstance(index, slice):
            self._assert_is_element(element)
        else:
            for candidate in element:
                self._assert_is_element(candidate)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* as the last child of this element.

        In document order the new element follows the last existing
        subelement (or the text, if it is the first subelement) and
        precedes this element's end tag.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append every subelement from *elements*.

        *elements* is a sequence with zero or more elements.  Items are
        validated and appended one at a time, so items preceding an invalid
        one have already been added when TypeError is raised.

        """
        add_child = self._children.append
        for candidate in elements:
            self._assert_is_element(candidate)
            add_child(candidate)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if isinstance(e, _Element_Py):
            return
        raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove the matching subelement.

        Unlike the find methods, matching is delegated to list.remove on
        the child list rather than to tag value or contents.  To remove
        subelements by other means, select the elements to keep with a list
        comprehension and update the parent through slice assignment.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.

        """
        message = (
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead."
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        return self._children

    def find(self, path, namespaces=None):
        """Return the first subelement matching *path*, or None.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        The search is delegated to ElementPath.find.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Return the text of the first subelement matching *path*.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        The search is delegated to ElementPath.findtext.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Return all subelements matching *path*.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        The search is delegated to ElementPath.findall.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Iterate over all subelements matching *path*.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        The search is delegated to ElementPath.iterfind.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset the element.

        All attributes are dropped (the attrib dict is emptied in place),
        the child list is replaced by a new empty list, and text and tail
        are set to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = None
        self.tail = None

    def get(self, key, default=None):
        """Return the value of attribute *key*.

        Equivalent to attrib.get: *default* is returned when the attribute
        is not present.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set attribute *key* to *value*.

        Equivalent to attrib[key] = value.

        """
        self.attrib[key] = value

    def keys(self):
        """Return the attribute names.

        Equivalent to attrib.keys().

        """
        return self.attrib.keys()

    def items(self):
        """Return the attributes as (name, value) pairs.

        Equivalent to attrib.items().

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create a tree iterator.

        The generator yields this element and then, depth first and in
        document order, every subelement whose tag matches.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for; None or "*" selects all elements.

        """
        wanted = None if tag == "*" else tag
        if wanted is None or self.tag == wanted:
            yield self
        for child in self._children:
            yield from child.iter(wanted)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return list(self.iter(tag))."""
        message = (
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead."
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        return list(self.iter(tag))

    def itertext(self):
        """Create a text iterator.

        The generator yields, in document order, the non-empty text of this
        element, the inner text of each subelement, and each subelement's
        non-empty tail.  Elements whose tag is neither a string nor None
        contribute nothing.

        """
        own_tag = self.tag
        if own_tag is not None and not isinstance(own_tag, str):
            return
        if self.text:
            yield self.text
        for child in self:
            yield from child.itertext()
            if child.tail:
                yield child.tail


def SubElement(parent, tag, attrib={}, **extra):
    """Create an element and append it to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The element is built with parent.makeelement(), attached with
    parent.append(), and returned.

    """
    merged = {**attrib, **extra}
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
def Comment(text=None):
    """Comment element factory.

    Builds a special element, tagged with this very function, which the
    standard serializer writes out as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node


def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Builds a special element, tagged with this very function, which the
    standard serializer writes out as an XML processing instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  The
    element's text is "target" or "target text".

    """
    node = Element(ProcessingInstruction)
    node.text = target
    if text:
        node.text = node.text + " " + text
    return node

PI = ProcessingInstruction


class QName:
    """Qualified name wrapper.

    Wrapping a QName attribute value in this class gives it proper
    namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their text, against both other QName
    objects and plain values.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    @staticmethod
    def _comparable(other):
        # Unwrap another QName to its text; leave any other value untouched.
        return other.text if isinstance(other, QName) else other

    def __le__(self, other):
        return self.text <= self._comparable(other)

    def __lt__(self, other):
        return self.text < self._comparable(other)

    def __ge__(self, other):
        return self.text >= self._comparable(other)

    def __gt__(self, other):
        return self.text > self._comparable(other)

    def __eq__(self, other):
        return self.text == self._comparable(other)
class ElementTree:
    """An XML element hierarchy.

    This class also provides support for serialization to and from
    standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """
    def __init__(self, element=None, file=None):
        self._root = element  # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        This will discard the current contents of the tree and replace it
        with the given element.  Use with care!

        """
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        A *source* without a read() method is opened in binary mode here and
        closed again before returning, also when parsing fails.  Exceptions
        raised by the parser propagate to the caller.

        Returns the root element of the given source document.

        """
        close_source = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            close_source = True
        try:
            if parser is None:
                # If no parser was specified, create a default XMLParser
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an accelerator,
                    # can define an internal _parse_whole API for efficiency.
                    # It can be used to parse the whole source without feeding
                    # it with chunks.
                    self._root = parser._parse_whole(source)
                    return self._root
            while True:
                data = source.read(65536)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        return self._root.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return list(self.iter(tag))."""
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'tree.iter()' or 'list(tree.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def _fix_absolute_path(self, path):
        # Shared by find/findtext/findall/iterfind: a path starting with "/"
        # is rewritten to be relative to the root ("./...") and a
        # FutureWarning is issued.  stacklevel=3 attributes the warning to
        # the caller of the public method (this helper -> public method ->
        # caller), exactly where the inlined code used to point.
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=3
            )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        path = self._fix_absolute_path(path)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text of first matching element by tag name or path.

        Same as getroot().findtext(path), which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return whatever the root element's findtext() returns for these
        arguments.

        """
        path = self._fix_absolute_path(path)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return list containing all matching elements in document order.

        """
        path = self._fix_absolute_path(path)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        path = self._fix_absolute_path(path)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        for the "c14n" method)

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        ValueError is raised for a *method* that is not registered in the
        serializer table.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        enc_lower = encoding.lower()
        with _get_writer(file_or_filename, enc_lower) as write:
            # The declaration is only ever written for the "xml" method.
            if method == "xml" and (xml_declaration or
                    (xml_declaration is None and
                     enc_lower not in ("utf-8", "us-ascii", "unicode"))):
                declared_encoding = encoding
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                write("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                serialize = _serialize[method]
                serialize(write, self._root, qnames, namespaces,
                          short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        """Write the tree to *file* using the "c14n" output method."""
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Yield a text write callable for *file_or_filename*.

    *file_or_filename* is either a file name or an object with a write()
    method.  *encoding* is the output encoding; the special value "unicode"
    means that text is handed to the target unencoded.

    Files opened here are closed on exit.  File objects passed in by the
    caller are left open: any wrapper created around them is detached
    rather than closed.

    """
    try:
        target_write = file_or_filename.write
    except AttributeError:
        # No write(): file_or_filename is a file name.
        if encoding == "unicode":
            opened = open(file_or_filename, "w")
        else:
            opened = open(file_or_filename, "w", encoding=encoding,
                          errors="xmlcharrefreplace")
        with opened:
            yield opened.write
        return

    # file_or_filename is a file-like object; the encoding decides whether
    # it is treated as a text or as a binary writer.
    if encoding == "unicode":
        # A text writer is used as is.
        yield target_write
        return

    # A binary writer gets wrapped in a TextIOWrapper.
    with contextlib.ExitStack() as cleanup:
        if isinstance(file_or_filename, io.BufferedIOBase):
            buffered = file_or_filename
        elif isinstance(file_or_filename, io.RawIOBase):
            buffered = io.BufferedWriter(file_or_filename)
            # Keep the original file open when the BufferedWriter is
            # destroyed
            cleanup.callback(buffered.detach)
        else:
            # Objects outside the IOBase hierarchy that merely offer a
            # write method are adapted through a bare BufferedIOBase.
            buffered = io.BufferedIOBase()
            buffered.writable = lambda: True
            buffered.write = target_write
            try:
                # TextIOWrapper uses these methods to determine
                # if BOM (for UTF-16, etc) should be added
                buffered.seekable = file_or_filename.seekable
                buffered.tell = file_or_filename.tell
            except AttributeError:
                pass
        text_layer = io.TextIOWrapper(buffered,
                                      encoding=encoding,
                                      errors="xmlcharrefreplace",
                                      newline="\n")
        # Keep the original file open when the TextIOWrapper is
        # destroyed
        cleanup.callback(text_layer.detach)
        yield text_layer.write
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    *elem* is the root of the tree to scan (walked with elem.iter()),
    *default_namespace* is an optional URI that is mapped to the empty
    prefix.

    Returns a (qnames, namespaces) tuple: *qnames* maps every tag,
    attribute name and QName value seen to its serialized "prefix:local"
    (or bare) form, *namespaces* maps namespace URIs to prefixes.

    ValueError is raised for a non-qualified name when *default_namespace*
    is set.

    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    # (None is pre-seeded so that elements with tag None resolve to None)
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                # "{uri}local" form: split at the last "}"
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    # not seen in this tree yet: fall back to the registry,
                    # then to a generated "nsN" prefix
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    # the "xml" prefix is never recorded for declaration
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
        except TypeError:
            # slicing/splitting failed: qname is not string-like
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    # (note: the loop variable deliberately reuses the name 'elem')
    for elem in elem.iter():
        tag = elem.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            # anything else cannot be written as a tag
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            # attribute values are only registered when they are QNames
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML through the *write* callable.

    *qnames* maps tags/attribute names to their serialized form and
    *namespaces* maps URIs to prefixes; the namespace declarations are
    emitted on this element only, recursive calls pass None instead.
    *short_empty_elements* selects "<tag />" over "<tag></tag>" for
    elements without text and children.  Extra keyword arguments are
    accepted and ignored.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # comment text is written as is, without escaping
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        # processing instruction text is written as is, without escaping
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # no tag to write: emit only the text and the children
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        # QName values are written in prefixed form
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    # the tail belongs after the end tag, whatever kind of node this was
    if elem.tail:
        write(_escape_cdata(elem.tail))

# Lower-case tag names for which _serialize_html writes no end tag.
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

# Prefer a set; the tuple is kept if the name 'set' is not available.
try:
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    pass

def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML through the *write* callable.

    *qnames* and *namespaces* have the same meaning as for the XML
    serializer.  The text of "script" and "style" elements is written
    unescaped and tags listed in HTML_EMPTY get no end tag.  Extra keyword
    arguments are accepted and ignored.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # no tag to write: emit only the text and the children
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    # script/style content is written without escaping
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))

def _serialize_text(write, elem):
    """Write the inner text of *elem* (via itertext()) followed by its tail."""
    for part in elem.itertext():
        write(part)
    if elem.tail:
        write(elem.tail)

# Output method name -> serializer function.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri.  Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if *prefix* has the "ns<digits>" form, which is
    reserved for internal use.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the clashing entries first, so that the registry is not
    # modified while it is being iterated.
    stale_uris = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in stale_uris:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map
1063 if "&" in text: 1064 text = text.replace("&", "&") 1065 if "<" in text: 1066 text = text.replace("<", "<") 1067 if ">" in text: 1068 text = text.replace(">", ">") 1069 return text 1070 except (TypeError, AttributeError): 1071 _raise_serialization_error(text) 1072 1073def _escape_attrib(text): 1074 # escape attribute value 1075 try: 1076 if "&" in text: 1077 text = text.replace("&", "&") 1078 if "<" in text: 1079 text = text.replace("<", "<") 1080 if ">" in text: 1081 text = text.replace(">", ">") 1082 if "\"" in text: 1083 text = text.replace("\"", """) 1084 # The following business with carriage returns is to satisfy 1085 # Section 2.11 of the XML specification, stating that 1086 # CR or CR LN should be replaced with just LN 1087 # http://www.w3.org/TR/REC-xml/#sec-line-ends 1088 if "\r\n" in text: 1089 text = text.replace("\r\n", "\n") 1090 if "\r" in text: 1091 text = text.replace("\r", "\n") 1092 #The following four lines are issue 17582 1093 if "\n" in text: 1094 text = text.replace("\n", " ") 1095 if "\t" in text: 1096 text = text.replace("\t", "	") 1097 return text 1098 except (TypeError, AttributeError): 1099 _raise_serialization_error(text) 1100 1101def _escape_attrib_html(text): 1102 # escape attribute value 1103 try: 1104 if "&" in text: 1105 text = text.replace("&", "&") 1106 if ">" in text: 1107 text = text.replace(">", ">") 1108 if "\"" in text: 1109 text = text.replace("\"", """) 1110 return text 1111 except (TypeError, AttributeError): 1112 _raise_serialization_error(text) 1113 1114# -------------------------------------------------------------------- 1115 1116def tostring(element, encoding=None, method=None, *, 1117 xml_declaration=None, default_namespace=None, 1118 short_empty_elements=True): 1119 """Generate string representation of XML element. 1120 1121 All subelements are included. If encoding is "unicode", a string 1122 is returned. Otherwise a bytestring is returned. 
1123 1124 *element* is an Element instance, *encoding* is an optional output 1125 encoding defaulting to US-ASCII, *method* is an optional output which can 1126 be one of "xml" (default), "html", "text" or "c14n", *default_namespace* 1127 sets the default XML namespace (for "xmlns"). 1128 1129 Returns an (optionally) encoded string containing the XML data. 1130 1131 """ 1132 stream = io.StringIO() if encoding == 'unicode' else io.BytesIO() 1133 ElementTree(element).write(stream, encoding, 1134 xml_declaration=xml_declaration, 1135 default_namespace=default_namespace, 1136 method=method, 1137 short_empty_elements=short_empty_elements) 1138 return stream.getvalue() 1139 1140class _ListDataStream(io.BufferedIOBase): 1141 """An auxiliary stream accumulating into a list reference.""" 1142 def __init__(self, lst): 1143 self.lst = lst 1144 1145 def writable(self): 1146 return True 1147 1148 def seekable(self): 1149 return True 1150 1151 def write(self, b): 1152 self.lst.append(b) 1153 1154 def tell(self): 1155 return len(self.lst) 1156 1157def tostringlist(element, encoding=None, method=None, *, 1158 xml_declaration=None, default_namespace=None, 1159 short_empty_elements=True): 1160 lst = [] 1161 stream = _ListDataStream(lst) 1162 ElementTree(element).write(stream, encoding, 1163 xml_declaration=xml_declaration, 1164 default_namespace=default_namespace, 1165 method=method, 1166 short_empty_elements=short_empty_elements) 1167 return lst 1168 1169 1170def dump(elem): 1171 """Write element tree or element structure to sys.stdout. 1172 1173 This function should be used for debugging only. 1174 1175 *elem* is either an ElementTree, or a single Element. The exact output 1176 format is implementation dependent. In this version, it's written as an 1177 ordinary XML file. 
1178 1179 """ 1180 # debugging 1181 if not isinstance(elem, ElementTree): 1182 elem = ElementTree(elem) 1183 elem.write(sys.stdout, encoding="unicode") 1184 tail = elem.getroot().tail 1185 if not tail or tail[-1] != "\n": 1186 sys.stdout.write("\n") 1187 1188# -------------------------------------------------------------------- 1189# parsing 1190 1191 1192def parse(source, parser=None): 1193 """Parse XML document into element tree. 1194 1195 *source* is a filename or file object containing XML data, 1196 *parser* is an optional parser instance defaulting to XMLParser. 1197 1198 Return an ElementTree instance. 1199 1200 """ 1201 tree = ElementTree() 1202 tree.parse(source, parser) 1203 return tree 1204 1205 1206def iterparse(source, events=None, parser=None): 1207 """Incrementally parse XML document into ElementTree. 1208 1209 This class also reports what's going on to the user based on the 1210 *events* it is initialized with. The supported events are the strings 1211 "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get 1212 detailed namespace information). If *events* is omitted, only 1213 "end" events are reported. 1214 1215 *source* is a filename or file object containing XML data, *events* is 1216 a list of events to report back, *parser* is an optional parser instance. 1217 1218 Returns an iterator providing (event, elem) pairs. 1219 1220 """ 1221 # Use the internal, undocumented _parser argument for now; When the 1222 # parser argument of iterparse is removed, this can be killed. 
1223 pullparser = XMLPullParser(events=events, _parser=parser) 1224 def iterator(): 1225 try: 1226 while True: 1227 yield from pullparser.read_events() 1228 # load event buffer 1229 data = source.read(16 * 1024) 1230 if not data: 1231 break 1232 pullparser.feed(data) 1233 root = pullparser._close_and_return_root() 1234 yield from pullparser.read_events() 1235 it.root = root 1236 finally: 1237 if close_source: 1238 source.close() 1239 1240 class IterParseIterator(collections.abc.Iterator): 1241 __next__ = iterator().__next__ 1242 it = IterParseIterator() 1243 it.root = None 1244 del iterator, IterParseIterator 1245 1246 close_source = False 1247 if not hasattr(source, "read"): 1248 source = open(source, "rb") 1249 close_source = True 1250 1251 return it 1252 1253 1254class XMLPullParser: 1255 1256 def __init__(self, events=None, *, _parser=None): 1257 # The _parser argument is for internal use only and must not be relied 1258 # upon in user code. It will be removed in a future release. 1259 # See http://bugs.python.org/issue17741 for more details. 1260 1261 self._events_queue = collections.deque() 1262 self._parser = _parser or XMLParser(target=TreeBuilder()) 1263 # wire up the parser for event reporting 1264 if events is None: 1265 events = ("end",) 1266 self._parser._setevents(self._events_queue, events) 1267 1268 def feed(self, data): 1269 """Feed encoded data to parser.""" 1270 if self._parser is None: 1271 raise ValueError("feed() called after end of stream") 1272 if data: 1273 try: 1274 self._parser.feed(data) 1275 except SyntaxError as exc: 1276 self._events_queue.append(exc) 1277 1278 def _close_and_return_root(self): 1279 # iterparse needs this to set its root attribute properly :( 1280 root = self._parser.close() 1281 self._parser = None 1282 return root 1283 1284 def close(self): 1285 """Finish feeding data to parser. 1286 1287 Unlike XMLParser, does not return the root element. Use 1288 read_events() to consume elements from XMLPullParser. 
1289 """ 1290 self._close_and_return_root() 1291 1292 def read_events(self): 1293 """Return an iterator over currently available (event, elem) pairs. 1294 1295 Events are consumed from the internal event queue as they are 1296 retrieved from the iterator. 1297 """ 1298 events = self._events_queue 1299 while events: 1300 event = events.popleft() 1301 if isinstance(event, Exception): 1302 raise event 1303 else: 1304 yield event 1305 1306 1307def XML(text, parser=None): 1308 """Parse XML document from string constant. 1309 1310 This function can be used to embed "XML Literals" in Python code. 1311 1312 *text* is a string containing XML data, *parser* is an 1313 optional parser instance, defaulting to the standard XMLParser. 1314 1315 Returns an Element instance. 1316 1317 """ 1318 if not parser: 1319 parser = XMLParser(target=TreeBuilder()) 1320 parser.feed(text) 1321 return parser.close() 1322 1323 1324def XMLID(text, parser=None): 1325 """Parse XML document from string constant for its IDs. 1326 1327 *text* is a string containing XML data, *parser* is an 1328 optional parser instance, defaulting to the standard XMLParser. 1329 1330 Returns an (Element, dict) tuple, in which the 1331 dict maps element id:s to elements. 1332 1333 """ 1334 if not parser: 1335 parser = XMLParser(target=TreeBuilder()) 1336 parser.feed(text) 1337 tree = parser.close() 1338 ids = {} 1339 for elem in tree.iter(): 1340 id = elem.get("id") 1341 if id: 1342 ids[id] = elem 1343 return tree, ids 1344 1345# Parse XML document from string constant. Alias for XML(). 1346fromstring = XML 1347 1348def fromstringlist(sequence, parser=None): 1349 """Parse XML document from sequence of string fragments. 1350 1351 *sequence* is a list of other sequence, *parser* is an optional parser 1352 instance, defaulting to the standard XMLParser. 1353 1354 Returns an Element instance. 
1355 1356 """ 1357 if not parser: 1358 parser = XMLParser(target=TreeBuilder()) 1359 for text in sequence: 1360 parser.feed(text) 1361 return parser.close() 1362 1363# -------------------------------------------------------------------- 1364 1365 1366class TreeBuilder: 1367 """Generic element structure builder. 1368 1369 This builder converts a sequence of start, data, and end method 1370 calls to a well-formed element structure. 1371 1372 You can use this class to build an element structure using a custom XML 1373 parser, or a parser for some other XML-like format. 1374 1375 *element_factory* is an optional element factory which is called 1376 to create new Element instances, as necessary. 1377 1378 *comment_factory* is a factory to create comments to be used instead of 1379 the standard factory. If *insert_comments* is false (the default), 1380 comments will not be inserted into the tree. 1381 1382 *pi_factory* is a factory to create processing instructions to be used 1383 instead of the standard factory. If *insert_pis* is false (the default), 1384 processing instructions will not be inserted into the tree. 
1385 """ 1386 def __init__(self, element_factory=None, *, 1387 comment_factory=None, pi_factory=None, 1388 insert_comments=False, insert_pis=False): 1389 self._data = [] # data collector 1390 self._elem = [] # element stack 1391 self._last = None # last element 1392 self._root = None # root element 1393 self._tail = None # true if we're after an end tag 1394 if comment_factory is None: 1395 comment_factory = Comment 1396 self._comment_factory = comment_factory 1397 self.insert_comments = insert_comments 1398 if pi_factory is None: 1399 pi_factory = ProcessingInstruction 1400 self._pi_factory = pi_factory 1401 self.insert_pis = insert_pis 1402 if element_factory is None: 1403 element_factory = Element 1404 self._factory = element_factory 1405 1406 def close(self): 1407 """Flush builder buffers and return toplevel document Element.""" 1408 assert len(self._elem) == 0, "missing end tags" 1409 assert self._root is not None, "missing toplevel element" 1410 return self._root 1411 1412 def _flush(self): 1413 if self._data: 1414 if self._last is not None: 1415 text = "".join(self._data) 1416 if self._tail: 1417 assert self._last.tail is None, "internal error (tail)" 1418 self._last.tail = text 1419 else: 1420 assert self._last.text is None, "internal error (text)" 1421 self._last.text = text 1422 self._data = [] 1423 1424 def data(self, data): 1425 """Add text to current element.""" 1426 self._data.append(data) 1427 1428 def start(self, tag, attrs): 1429 """Open new element and return it. 1430 1431 *tag* is the element name, *attrs* is a dict containing element 1432 attributes. 1433 1434 """ 1435 self._flush() 1436 self._last = elem = self._factory(tag, attrs) 1437 if self._elem: 1438 self._elem[-1].append(elem) 1439 elif self._root is None: 1440 self._root = elem 1441 self._elem.append(elem) 1442 self._tail = 0 1443 return elem 1444 1445 def end(self, tag): 1446 """Close and return current Element. 1447 1448 *tag* is the element name. 
1449 1450 """ 1451 self._flush() 1452 self._last = self._elem.pop() 1453 assert self._last.tag == tag,\ 1454 "end tag mismatch (expected %s, got %s)" % ( 1455 self._last.tag, tag) 1456 self._tail = 1 1457 return self._last 1458 1459 def comment(self, text): 1460 """Create a comment using the comment_factory. 1461 1462 *text* is the text of the comment. 1463 """ 1464 return self._handle_single( 1465 self._comment_factory, self.insert_comments, text) 1466 1467 def pi(self, target, text=None): 1468 """Create a processing instruction using the pi_factory. 1469 1470 *target* is the target name of the processing instruction. 1471 *text* is the data of the processing instruction, or ''. 1472 """ 1473 return self._handle_single( 1474 self._pi_factory, self.insert_pis, target, text) 1475 1476 def _handle_single(self, factory, insert, *args): 1477 elem = factory(*args) 1478 if insert: 1479 self._flush() 1480 self._last = elem 1481 if self._elem: 1482 self._elem[-1].append(elem) 1483 self._tail = 1 1484 return elem 1485 1486 1487# also see ElementTree and TreeBuilder 1488class XMLParser: 1489 """Element structure builder for XML source data based on the expat parser. 
1490 1491 *target* is an optional target object which defaults to an instance of the 1492 standard TreeBuilder class, *encoding* is an optional encoding string 1493 which if given, overrides the encoding specified in the XML file: 1494 http://www.iana.org/assignments/character-sets 1495 1496 """ 1497 1498 def __init__(self, *, target=None, encoding=None): 1499 try: 1500 from xml.parsers import expat 1501 except ImportError: 1502 try: 1503 import pyexpat as expat 1504 except ImportError: 1505 raise ImportError( 1506 "No module named expat; use SimpleXMLTreeBuilder instead" 1507 ) 1508 parser = expat.ParserCreate(encoding, "}") 1509 if target is None: 1510 target = TreeBuilder() 1511 # underscored names are provided for compatibility only 1512 self.parser = self._parser = parser 1513 self.target = self._target = target 1514 self._error = expat.error 1515 self._names = {} # name memo cache 1516 # main callbacks 1517 parser.DefaultHandlerExpand = self._default 1518 if hasattr(target, 'start'): 1519 parser.StartElementHandler = self._start 1520 if hasattr(target, 'end'): 1521 parser.EndElementHandler = self._end 1522 if hasattr(target, 'start_ns'): 1523 parser.StartNamespaceDeclHandler = self._start_ns 1524 if hasattr(target, 'end_ns'): 1525 parser.EndNamespaceDeclHandler = self._end_ns 1526 if hasattr(target, 'data'): 1527 parser.CharacterDataHandler = target.data 1528 # miscellaneous callbacks 1529 if hasattr(target, 'comment'): 1530 parser.CommentHandler = target.comment 1531 if hasattr(target, 'pi'): 1532 parser.ProcessingInstructionHandler = target.pi 1533 # Configure pyexpat: buffering, new-style attribute handling. 
1534 parser.buffer_text = 1 1535 parser.ordered_attributes = 1 1536 parser.specified_attributes = 1 1537 self._doctype = None 1538 self.entity = {} 1539 try: 1540 self.version = "Expat %d.%d.%d" % expat.version_info 1541 except AttributeError: 1542 pass # unknown 1543 1544 def _setevents(self, events_queue, events_to_report): 1545 # Internal API for XMLPullParser 1546 # events_to_report: a list of events to report during parsing (same as 1547 # the *events* of XMLPullParser's constructor. 1548 # events_queue: a list of actual parsing events that will be populated 1549 # by the underlying parser. 1550 # 1551 parser = self._parser 1552 append = events_queue.append 1553 for event_name in events_to_report: 1554 if event_name == "start": 1555 parser.ordered_attributes = 1 1556 parser.specified_attributes = 1 1557 def handler(tag, attrib_in, event=event_name, append=append, 1558 start=self._start): 1559 append((event, start(tag, attrib_in))) 1560 parser.StartElementHandler = handler 1561 elif event_name == "end": 1562 def handler(tag, event=event_name, append=append, 1563 end=self._end): 1564 append((event, end(tag))) 1565 parser.EndElementHandler = handler 1566 elif event_name == "start-ns": 1567 # TreeBuilder does not implement .start_ns() 1568 if hasattr(self.target, "start_ns"): 1569 def handler(prefix, uri, event=event_name, append=append, 1570 start_ns=self._start_ns): 1571 append((event, start_ns(prefix, uri))) 1572 else: 1573 def handler(prefix, uri, event=event_name, append=append): 1574 append((event, (prefix or '', uri or ''))) 1575 parser.StartNamespaceDeclHandler = handler 1576 elif event_name == "end-ns": 1577 # TreeBuilder does not implement .end_ns() 1578 if hasattr(self.target, "end_ns"): 1579 def handler(prefix, event=event_name, append=append, 1580 end_ns=self._end_ns): 1581 append((event, end_ns(prefix))) 1582 else: 1583 def handler(prefix, event=event_name, append=append): 1584 append((event, None)) 1585 parser.EndNamespaceDeclHandler = handler 1586 
elif event_name == 'comment': 1587 def handler(text, event=event_name, append=append, self=self): 1588 append((event, self.target.comment(text))) 1589 parser.CommentHandler = handler 1590 elif event_name == 'pi': 1591 def handler(pi_target, data, event=event_name, append=append, 1592 self=self): 1593 append((event, self.target.pi(pi_target, data))) 1594 parser.ProcessingInstructionHandler = handler 1595 else: 1596 raise ValueError("unknown event %r" % event_name) 1597 1598 def _raiseerror(self, value): 1599 err = ParseError(value) 1600 err.code = value.code 1601 err.position = value.lineno, value.offset 1602 raise err 1603 1604 def _fixname(self, key): 1605 # expand qname, and convert name string to ascii, if possible 1606 try: 1607 name = self._names[key] 1608 except KeyError: 1609 name = key 1610 if "}" in name: 1611 name = "{" + name 1612 self._names[key] = name 1613 return name 1614 1615 def _start_ns(self, prefix, uri): 1616 return self.target.start_ns(prefix or '', uri or '') 1617 1618 def _end_ns(self, prefix): 1619 return self.target.end_ns(prefix or '') 1620 1621 def _start(self, tag, attr_list): 1622 # Handler for expat's StartElementHandler. Since ordered_attributes 1623 # is set, the attributes are reported as a list of alternating 1624 # attribute name,value. 
1625 fixname = self._fixname 1626 tag = fixname(tag) 1627 attrib = {} 1628 if attr_list: 1629 for i in range(0, len(attr_list), 2): 1630 attrib[fixname(attr_list[i])] = attr_list[i+1] 1631 return self.target.start(tag, attrib) 1632 1633 def _end(self, tag): 1634 return self.target.end(self._fixname(tag)) 1635 1636 def _default(self, text): 1637 prefix = text[:1] 1638 if prefix == "&": 1639 # deal with undefined entities 1640 try: 1641 data_handler = self.target.data 1642 except AttributeError: 1643 return 1644 try: 1645 data_handler(self.entity[text[1:-1]]) 1646 except KeyError: 1647 from xml.parsers import expat 1648 err = expat.error( 1649 "undefined entity %s: line %d, column %d" % 1650 (text, self.parser.ErrorLineNumber, 1651 self.parser.ErrorColumnNumber) 1652 ) 1653 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY 1654 err.lineno = self.parser.ErrorLineNumber 1655 err.offset = self.parser.ErrorColumnNumber 1656 raise err 1657 elif prefix == "<" and text[:9] == "<!DOCTYPE": 1658 self._doctype = [] # inside a doctype declaration 1659 elif self._doctype is not None: 1660 # parse doctype contents 1661 if prefix == ">": 1662 self._doctype = None 1663 return 1664 text = text.strip() 1665 if not text: 1666 return 1667 self._doctype.append(text) 1668 n = len(self._doctype) 1669 if n > 2: 1670 type = self._doctype[1] 1671 if type == "PUBLIC" and n == 4: 1672 name, type, pubid, system = self._doctype 1673 if pubid: 1674 pubid = pubid[1:-1] 1675 elif type == "SYSTEM" and n == 3: 1676 name, type, system = self._doctype 1677 pubid = None 1678 else: 1679 return 1680 if hasattr(self.target, "doctype"): 1681 self.target.doctype(name, pubid, system[1:-1]) 1682 elif hasattr(self, "doctype"): 1683 warnings.warn( 1684 "The doctype() method of XMLParser is ignored. 
" 1685 "Define doctype() method on the TreeBuilder target.", 1686 RuntimeWarning) 1687 1688 self._doctype = None 1689 1690 def feed(self, data): 1691 """Feed encoded data to parser.""" 1692 try: 1693 self.parser.Parse(data, 0) 1694 except self._error as v: 1695 self._raiseerror(v) 1696 1697 def close(self): 1698 """Finish feeding data to parser and return element structure.""" 1699 try: 1700 self.parser.Parse("", 1) # end of data 1701 except self._error as v: 1702 self._raiseerror(v) 1703 try: 1704 close_handler = self.target.close 1705 except AttributeError: 1706 pass 1707 else: 1708 return close_handler() 1709 finally: 1710 # get rid of circular references 1711 del self.parser, self._parser 1712 del self.target, self._target 1713 1714 1715# -------------------------------------------------------------------- 1716# C14N 2.0 1717 1718def canonicalize(xml_data=None, *, out=None, from_file=None, **options): 1719 """Convert XML to its C14N 2.0 serialised form. 1720 1721 If *out* is provided, it must be a file or file-like object that receives 1722 the serialised canonical XML output (text, not bytes) through its ``.write()`` 1723 method. To write to a file, open it in text mode with encoding "utf-8". 1724 If *out* is not provided, this function returns the output as text string. 1725 1726 Either *xml_data* (an XML string) or *from_file* (a file path or 1727 file-like object) must be provided as input. 1728 1729 The configuration options are the same as for the ``C14NWriterTarget``. 
1730 """ 1731 if xml_data is None and from_file is None: 1732 raise ValueError("Either 'xml_data' or 'from_file' must be provided as input") 1733 sio = None 1734 if out is None: 1735 sio = out = io.StringIO() 1736 1737 parser = XMLParser(target=C14NWriterTarget(out.write, **options)) 1738 1739 if xml_data is not None: 1740 parser.feed(xml_data) 1741 parser.close() 1742 elif from_file is not None: 1743 parse(from_file, parser=parser) 1744 1745 return sio.getvalue() if sio is not None else None 1746 1747 1748_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match 1749 1750 1751class C14NWriterTarget: 1752 """ 1753 Canonicalization writer target for the XMLParser. 1754 1755 Serialises parse events to XML C14N 2.0. 1756 1757 The *write* function is used for writing out the resulting data stream 1758 as text (not bytes). To write to a file, open it in text mode with encoding 1759 "utf-8" and pass its ``.write`` method. 1760 1761 Configuration options: 1762 1763 - *with_comments*: set to true to include comments 1764 - *strip_text*: set to true to strip whitespace before and after text content 1765 - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}" 1766 - *qname_aware_tags*: a set of qname aware tag names in which prefixes 1767 should be replaced in text content 1768 - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes 1769 should be replaced in text content 1770 - *exclude_attrs*: a set of attribute names that should not be serialised 1771 - *exclude_tags*: a set of tag names that should not be serialised 1772 """ 1773 def __init__(self, write, *, 1774 with_comments=False, strip_text=False, rewrite_prefixes=False, 1775 qname_aware_tags=None, qname_aware_attrs=None, 1776 exclude_attrs=None, exclude_tags=None): 1777 self._write = write 1778 self._data = [] 1779 self._with_comments = with_comments 1780 self._strip_text = strip_text 1781 self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None 1782 
self._exclude_tags = set(exclude_tags) if exclude_tags else None 1783 1784 self._rewrite_prefixes = rewrite_prefixes 1785 if qname_aware_tags: 1786 self._qname_aware_tags = set(qname_aware_tags) 1787 else: 1788 self._qname_aware_tags = None 1789 if qname_aware_attrs: 1790 self._find_qname_aware_attrs = set(qname_aware_attrs).intersection 1791 else: 1792 self._find_qname_aware_attrs = None 1793 1794 # Stack with globally and newly declared namespaces as (uri, prefix) pairs. 1795 self._declared_ns_stack = [[ 1796 ("http://www.w3.org/XML/1998/namespace", "xml"), 1797 ]] 1798 # Stack with user declared namespace prefixes as (uri, prefix) pairs. 1799 self._ns_stack = [] 1800 if not rewrite_prefixes: 1801 self._ns_stack.append(list(_namespace_map.items())) 1802 self._ns_stack.append([]) 1803 self._prefix_map = {} 1804 self._preserve_space = [False] 1805 self._pending_start = None 1806 self._root_seen = False 1807 self._root_done = False 1808 self._ignored_depth = 0 1809 1810 def _iter_namespaces(self, ns_stack, _reversed=reversed): 1811 for namespaces in _reversed(ns_stack): 1812 if namespaces: # almost no element declares new namespaces 1813 yield from namespaces 1814 1815 def _resolve_prefix_name(self, prefixed_name): 1816 prefix, name = prefixed_name.split(':', 1) 1817 for uri, p in self._iter_namespaces(self._ns_stack): 1818 if p == prefix: 1819 return f'{{{uri}}}{name}' 1820 raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope') 1821 1822 def _qname(self, qname, uri=None): 1823 if uri is None: 1824 uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname) 1825 else: 1826 tag = qname 1827 1828 prefixes_seen = set() 1829 for u, prefix in self._iter_namespaces(self._declared_ns_stack): 1830 if u == uri and prefix not in prefixes_seen: 1831 return f'{prefix}:{tag}' if prefix else tag, tag, uri 1832 prefixes_seen.add(prefix) 1833 1834 # Not declared yet => add new declaration. 
1835 if self._rewrite_prefixes: 1836 if uri in self._prefix_map: 1837 prefix = self._prefix_map[uri] 1838 else: 1839 prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}' 1840 self._declared_ns_stack[-1].append((uri, prefix)) 1841 return f'{prefix}:{tag}', tag, uri 1842 1843 if not uri and '' not in prefixes_seen: 1844 # No default namespace declared => no prefix needed. 1845 return tag, tag, uri 1846 1847 for u, prefix in self._iter_namespaces(self._ns_stack): 1848 if u == uri: 1849 self._declared_ns_stack[-1].append((uri, prefix)) 1850 return f'{prefix}:{tag}' if prefix else tag, tag, uri 1851 1852 if not uri: 1853 # As soon as a default namespace is defined, 1854 # anything that has no namespace (and thus, no prefix) goes there. 1855 return tag, tag, uri 1856 1857 raise ValueError(f'Namespace "{uri}" is not declared in scope') 1858 1859 def data(self, data): 1860 if not self._ignored_depth: 1861 self._data.append(data) 1862 1863 def _flush(self, _join_text=''.join): 1864 data = _join_text(self._data) 1865 del self._data[:] 1866 if self._strip_text and not self._preserve_space[-1]: 1867 data = data.strip() 1868 if self._pending_start is not None: 1869 args, self._pending_start = self._pending_start, None 1870 qname_text = data if data and _looks_like_prefix_name(data) else None 1871 self._start(*args, qname_text) 1872 if qname_text is not None: 1873 return 1874 if data and self._root_seen: 1875 self._write(_escape_cdata_c14n(data)) 1876 1877 def start_ns(self, prefix, uri): 1878 if self._ignored_depth: 1879 return 1880 # we may have to resolve qnames in text content 1881 if self._data: 1882 self._flush() 1883 self._ns_stack[-1].append((uri, prefix)) 1884 1885 def start(self, tag, attrs): 1886 if self._exclude_tags is not None and ( 1887 self._ignored_depth or tag in self._exclude_tags): 1888 self._ignored_depth += 1 1889 return 1890 if self._data: 1891 self._flush() 1892 1893 new_namespaces = [] 1894 self._declared_ns_stack.append(new_namespaces) 1895 1896 
if self._qname_aware_tags is not None and tag in self._qname_aware_tags: 1897 # Need to parse text first to see if it requires a prefix declaration. 1898 self._pending_start = (tag, attrs, new_namespaces) 1899 return 1900 self._start(tag, attrs, new_namespaces) 1901 1902 def _start(self, tag, attrs, new_namespaces, qname_text=None): 1903 if self._exclude_attrs is not None and attrs: 1904 attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs} 1905 1906 qnames = {tag, *attrs} 1907 resolved_names = {} 1908 1909 # Resolve prefixes in attribute and tag text. 1910 if qname_text is not None: 1911 qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text) 1912 qnames.add(qname) 1913 if self._find_qname_aware_attrs is not None and attrs: 1914 qattrs = self._find_qname_aware_attrs(attrs) 1915 if qattrs: 1916 for attr_name in qattrs: 1917 value = attrs[attr_name] 1918 if _looks_like_prefix_name(value): 1919 qname = resolved_names[value] = self._resolve_prefix_name(value) 1920 qnames.add(qname) 1921 else: 1922 qattrs = None 1923 else: 1924 qattrs = None 1925 1926 # Assign prefixes in lexicographical order of used URIs. 1927 parse_qname = self._qname 1928 parsed_qnames = {n: parse_qname(n) for n in sorted( 1929 qnames, key=lambda n: n.split('}', 1))} 1930 1931 # Write namespace declarations in prefix order ... 1932 if new_namespaces: 1933 attr_list = [ 1934 ('xmlns:' + prefix if prefix else 'xmlns', uri) 1935 for uri, prefix in new_namespaces 1936 ] 1937 attr_list.sort() 1938 else: 1939 # almost always empty 1940 attr_list = [] 1941 1942 # ... followed by attributes in URI+name order 1943 if attrs: 1944 for k, v in sorted(attrs.items()): 1945 if qattrs is not None and k in qattrs and v in resolved_names: 1946 v = parsed_qnames[resolved_names[v]][0] 1947 attr_qname, attr_name, uri = parsed_qnames[k] 1948 # No prefix for attributes in default ('') namespace. 
1949 attr_list.append((attr_qname if uri else attr_name, v)) 1950 1951 # Honour xml:space attributes. 1952 space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space') 1953 self._preserve_space.append( 1954 space_behaviour == 'preserve' if space_behaviour 1955 else self._preserve_space[-1]) 1956 1957 # Write the tag. 1958 write = self._write 1959 write('<' + parsed_qnames[tag][0]) 1960 if attr_list: 1961 write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list])) 1962 write('>') 1963 1964 # Write the resolved qname text content. 1965 if qname_text is not None: 1966 write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0])) 1967 1968 self._root_seen = True 1969 self._ns_stack.append([]) 1970 1971 def end(self, tag): 1972 if self._ignored_depth: 1973 self._ignored_depth -= 1 1974 return 1975 if self._data: 1976 self._flush() 1977 self._write(f'</{self._qname(tag)[0]}>') 1978 self._preserve_space.pop() 1979 self._root_done = len(self._preserve_space) == 1 1980 self._declared_ns_stack.pop() 1981 self._ns_stack.pop() 1982 1983 def comment(self, text): 1984 if not self._with_comments: 1985 return 1986 if self._ignored_depth: 1987 return 1988 if self._root_done: 1989 self._write('\n') 1990 elif self._root_seen and self._data: 1991 self._flush() 1992 self._write(f'<!--{_escape_cdata_c14n(text)}-->') 1993 if not self._root_seen: 1994 self._write('\n') 1995 1996 def pi(self, target, data): 1997 if self._ignored_depth: 1998 return 1999 if self._root_done: 2000 self._write('\n') 2001 elif self._root_seen and self._data: 2002 self._flush() 2003 self._write( 2004 f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>') 2005 if not self._root_seen: 2006 self._write('\n') 2007 2008 2009def _escape_cdata_c14n(text): 2010 # escape character data 2011 try: 2012 # it's worth avoiding do-nothing calls for strings that are 2013 # shorter than 500 character, or so. 
assume that's, by far, 2014 # the most common case in most applications. 2015 if '&' in text: 2016 text = text.replace('&', '&') 2017 if '<' in text: 2018 text = text.replace('<', '<') 2019 if '>' in text: 2020 text = text.replace('>', '>') 2021 if '\r' in text: 2022 text = text.replace('\r', '
') 2023 return text 2024 except (TypeError, AttributeError): 2025 _raise_serialization_error(text) 2026 2027 2028def _escape_attrib_c14n(text): 2029 # escape attribute value 2030 try: 2031 if '&' in text: 2032 text = text.replace('&', '&') 2033 if '<' in text: 2034 text = text.replace('<', '<') 2035 if '"' in text: 2036 text = text.replace('"', '"') 2037 if '\t' in text: 2038 text = text.replace('\t', '	') 2039 if '\n' in text: 2040 text = text.replace('\n', '
') 2041 if '\r' in text: 2042 text = text.replace('\r', '
') 2043 return text 2044 except (TypeError, AttributeError): 2045 _raise_serialization_error(text) 2046 2047 2048# -------------------------------------------------------------------- 2049 2050# Import the C accelerators 2051try: 2052 # Element is going to be shadowed by the C implementation. We need to keep 2053 # the Python version of it accessible for some "creative" by external code 2054 # (see tests) 2055 _Element_Py = Element 2056 2057 # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories 2058 from _elementtree import * 2059 from _elementtree import _set_factories 2060except ImportError: 2061 pass 2062else: 2063 _set_factories(Comment, ProcessingInstruction) 2064