# Copyright 2008-2014 by Michiel de Hoon. All rights reserved.
# Revisions copyright 2008-2015 by Peter Cock. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Parser for XML results returned by NCBI's Entrez Utilities.

This parser is used by the read() function in Bio.Entrez, and is not
intended to be used directly.

The question is how to represent an XML file as Python objects. Some
XML files returned by NCBI look like lists, others look like dictionaries,
and others look like a mix of lists and dictionaries.

My approach is to classify each possible element in the XML as a plain
string, an integer, a list, a dictionary, or a structure. The latter is a
dictionary where the same key can occur multiple times; in Python, it is
represented as a dictionary where that key occurs once, pointing to a list
of values found in the XML file.

The parser then goes through the XML and creates the appropriate Python
object for each element. The different levels encountered in the XML are
preserved on the Python side. So a subelement of a subelement of an element
is a value in a dictionary that is stored in a list which is a value in
some other dictionary (or a value in a list which itself belongs to a list
which is a value in a dictionary, and so on). Attributes encountered in
the XML are stored as a dictionary in a member .attributes of each element,
and the tag name is saved in a member .tag.

To decide which kind of Python object corresponds to each element in the
XML, the parser analyzes the DTD referred to at the top of (almost) every
XML file returned by the Entrez Utilities. This is preferred over a hand-
written solution, since the number of DTDs is rather large and their
contents may change over time. About half the code in this parser deals
with parsing the DTD, and the other half with the XML itself.
"""
import os
import warnings
from collections import Counter
from xml.parsers import expat
from io import BytesIO
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape

from urllib.request import urlopen, urlparse


# The following five classes are used to add the members .tag, .key and
# .attributes to None, integers, strings, lists, and dictionaries,
# respectively.


class NoneElement:
    """NCBI Entrez XML element mapped to None."""

    def __init__(self, tag, attributes, key=None):
        """Create a NoneElement.

        ``key`` defaults to ``tag`` when it is not given.
        """
        self.tag = tag
        if key is None:
            self.key = tag
        else:
            self.key = key
        self.attributes = attributes

    def __eq__(self, other):
        """Define equality with other None objects.

        Returns True if ``other`` is None or itself compares equal to None
        (for example another NoneElement), and False otherwise.
        """
        if other is None:
            return True
        result = other.__eq__(None)
        # Most types answer NotImplemented here, which is truthy; treating
        # it as "equal" would make NoneElement equal to almost anything.
        return result is not NotImplemented and bool(result)

    def __ne__(self, other):
        """Define non-equality as the negation of equality."""
        return not self.__eq__(other)

    def __repr__(self):
        """Return a string representation of the object."""
        try:
            attributes = self.attributes
        except AttributeError:
            return "NoneElement"
        return "NoneElement(attributes=%r)" % attributes


class IntegerElement(int):
    """NCBI Entrez XML element mapped to an integer."""

    def __new__(cls, value, tag, attributes, key=None):
        """Create an IntegerElement.

        ``key`` defaults to ``tag`` when it is not given.
        """
        self = int.__new__(cls, value)
        self.tag = tag
        if key is None:
            self.key = tag
        else:
            self.key = key
        self.attributes = attributes
        return self

    def __repr__(self):
        """Return a string representation of the object."""
        text = int.__repr__(self)
        try:
            attributes = self.attributes
        except AttributeError:
            return text
        return "IntegerElement(%s, attributes=%r)" % (text, attributes)


class StringElement(str):
    """NCBI Entrez XML element mapped to a string."""

    def __new__(cls, value, tag, attributes, key=None):
        """Create a StringElement.

        ``key`` defaults to ``tag`` when it is not given.
        """
        self = str.__new__(cls, value)
        self.tag = tag
        if key is None:
            self.key = tag
        else:
            self.key = key
        self.attributes = attributes
        return self

    def __repr__(self):
        """Return a string representation of the object."""
        text = str.__repr__(self)
        attributes = self.attributes
        if not attributes:
            return text
        return "StringElement(%s, attributes=%r)" % (text, attributes)


class ListElement(list):
    """NCBI Entrez XML element mapped to a list."""

    def __init__(self, tag, attributes, allowed_tags, key=None):
        """Create a ListElement.

        ``allowed_tags`` is the collection of keys accepted by store(), or
        None to accept any key. ``key`` defaults to ``tag``.
        """
        self.tag = tag
        if key is None:
            self.key = tag
        else:
            self.key = key
        self.attributes = attributes
        self.allowed_tags = allowed_tags

    def __repr__(self):
        """Return a string representation of the object."""
        text = list.__repr__(self)
        attributes = self.attributes
        if not attributes:
            return text
        return "ListElement(%s, attributes=%r)" % (text, attributes)

    def store(self, value):
        """Append an element to the list, checking tags.

        Raises ValueError if the key of ``value`` is not an allowed tag.
        """
        key = value.key
        if self.allowed_tags is not None and key not in self.allowed_tags:
            raise ValueError("Unexpected item '%s' in list" % key)
        self.append(value)


class DictionaryElement(dict):
    """NCBI Entrez XML element mapped to a dictionary."""

    def __init__(self, tag, attrs, allowed_tags, repeated_tags=None, key=None):
        """Create a DictionaryElement.

        ``allowed_tags`` is the collection of tags accepted by store(), or
        None to accept any tag. Every key in ``repeated_tags`` is initialized
        to an empty list to which store() appends. ``key`` defaults to
        ``tag``.
        """
        self.tag = tag
        if key is None:
            self.key = tag
        else:
            self.key = key
        self.attributes = attrs
        self.allowed_tags = allowed_tags
        self.repeated_tags = repeated_tags
        if repeated_tags:
            for repeated_tag in repeated_tags:
                self[repeated_tag] = []

    def __repr__(self):
        """Return a string representation of the object."""
        text = dict.__repr__(self)
        attributes = self.attributes
        if not attributes:
            return text
        return "DictElement(%s, attributes=%r)" % (text, attributes)

    def store(self, value):
        """Add an entry to the dictionary, checking tags.

        Raises ValueError if the tag of ``value`` is not an allowed tag.
        Values whose key is a repeated tag are appended to the list stored
        under that key; other values replace any previous entry.
        """
        key = value.key
        tag = value.tag
        if self.allowed_tags is not None and tag not in self.allowed_tags:
            raise ValueError("Unexpected item '%s' in dictionary" % key)
        if self.repeated_tags and key in self.repeated_tags:
            self[key].append(value)
        else:
            self[key] = value


class NotXMLError(ValueError):
    """Failed to parse file as XML."""

    def __init__(self, message):
        """Initialize the class."""
        self.msg = message

    def __str__(self):
        """Return a string summary of the exception."""
        return (
            "Failed to parse the XML data (%s). Please make sure that the input data "
            "are in XML format." % self.msg
        )


class CorruptedXMLError(ValueError):
    """Corrupted XML."""

    def __init__(self, message):
        """Initialize the class."""
        self.msg = message

    def __str__(self):
        """Return a string summary of the exception."""
        return (
            "Failed to parse the XML data (%s). Please make sure that the input data "
            "are not corrupted." % self.msg
        )


class ValidationError(ValueError):
    """XML tag found which was not defined in the DTD.

    Validating parsers raise this error if the parser finds a tag in the XML
    that is not defined in the DTD. Non-validating parsers do not raise this
    error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
    parsers by default (see those functions for more information).
    """

    def __init__(self, name):
        """Initialize the class."""
        self.name = name

    def __str__(self):
        """Return a string summary of the exception."""
        return (
            "Failed to find tag '%s' in the DTD. To skip all tags that "
            "are not represented in the DTD, please call Bio.Entrez.read "
            "or Bio.Entrez.parse with validate=False." % self.name
        )


class DataHandlerMeta(type):
    """A metaclass is needed until Python supports @classproperty."""

    def __init__(cls, *args, **kwargs):
        """Initialize the class."""
        cls._directory = None

    @property
    def directory(cls):
        """Directory for caching XSD and DTD files."""
        return cls._directory

    @directory.setter
    def directory(cls, value):
        """Set a custom directory for the local DTD/XSD directories.

        Passing None selects a platform-specific default location. The
        DTD and XSD cache directories are created if they do not exist.
        """
        if value is None:
            import platform

            if platform.system() == "Windows":
                value = os.path.join(os.getenv("APPDATA"), "biopython")
            else:  # Unix/Linux/Mac
                home = os.path.expanduser("~")
                value = os.path.join(home, ".config", "biopython")
        cls._directory = value
        # Create DTD local directory
        cls.local_dtd_dir = os.path.join(cls._directory, "Bio", "Entrez", "DTDs")
        os.makedirs(cls.local_dtd_dir, exist_ok=True)
        # Create XSD local directory
        cls.local_xsd_dir = os.path.join(cls._directory, "Bio", "Entrez", "XSDs")
        os.makedirs(cls.local_xsd_dir, exist_ok=True)


class DataHandler(metaclass=DataHandlerMeta):
    """Data handler for parsing NCBI XML from Entrez."""

    from Bio import Entrez

    # Directories shipped with the Bio.Entrez package.
    global_dtd_dir = os.path.join(Entrez.__path__[0], "DTDs")
    global_xsd_dir = os.path.join(Entrez.__path__[0], "XSDs")
    # Cache directories; assigned by the DataHandlerMeta.directory setter.
    local_dtd_dir = ""
    local_xsd_dir = ""

    del Entrez

    def __init__(self, validate, escape):
        """Create a DataHandler object.

        If ``validate`` is true, tags missing from the DTD raise a
        ValidationError; otherwise they are skipped. If ``escape`` is true,
        character data are passed through xml.sax.saxutils.escape before
        being stored.
        """
        self.dtd_urls = []
        self.element = None
        self.level = 0
        self.data = []
        self.attributes = None
        self.allowed_tags = None
        self.strings = {}
        self.lists = {}
        self.dictionaries = {}
        self.items = set()
        self.errors = set()
        self.validating = validate
        self.parser = expat.ParserCreate(namespace_separator=" ")
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        self.parser.XmlDeclHandler = self.xmlDeclHandler
        self.schema_namespace = None
        self.namespace_level = Counter()
        self.namespace_prefix = {}
        if escape:
            self.characterDataHandler = self.characterDataHandlerEscape
        else:
            self.characterDataHandler = self.characterDataHandlerRaw

    def read(self, handle):
        """Set up the parser and let it parse the XML results.

        ``handle`` must be opened in binary mode. Returns the parsed record.
        Raises TypeError for a text-mode handle, CorruptedXMLError or
        NotXMLError if expat fails, and RuntimeError if no record was
        produced although an XML declaration was seen.
        """
        # Expat's parser.ParseFile function only accepts binary data;
        # see also the comment below for Entrez.parse.
        if handle.read(0) != b"":
            raise TypeError("file should be opened in binary mode")
        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError as e:
            if self.parser.StartElementHandler:
                # We saw the initial <?xml declaration, so we can be sure that
                # we are parsing XML data. Most likely, the XML file is
                # corrupted.
                raise CorruptedXMLError(e) from None
            else:
                # We have not seen the initial <?xml declaration, so probably
                # the input data is not in XML format.
                raise NotXMLError(e) from None
        try:
            return self.record
        except AttributeError:
            if self.parser.StartElementHandler:
                # We saw the initial <?xml declaration, and expat didn't notice
                # any errors, so self.record should be defined. If not, this is
                # a bug.
                raise RuntimeError(
                    "Failed to parse the XML file correctly, possibly due to a bug "
                    "in Bio.Entrez. Please contact the Biopython developers via "
                    "the mailing list or GitHub for assistance."
                ) from None
            else:
                # We did not see the initial <?xml declaration, so probably
                # the input data is not in XML format.
                raise NotXMLError("XML declaration not found") from None

    def parse(self, handle):
        """Parse the XML in the given file handle.

        This is a generator yielding the records of a list-type XML document
        one by one as they are completed. ``handle`` must be opened in
        binary mode.
        """
        # The handle should have been opened in binary mode; data read from
        # the handle are then bytes. Expat will pick up the encoding from the
        # XML declaration (or assume UTF-8 if it is missing), and use this
        # encoding to convert the binary data to a string before giving it to
        # characterDataHandler.
        # While parser.ParseFile only accepts binary data, parser.Parse accepts
        # both binary data and strings. However, a file in text mode may have
        # been opened with an encoding different from the encoding specified in
        # the XML declaration at the top of the file. If so, the data in the
        # file will have been decoded with an incorrect encoding. To avoid
        # this, and to be consistent with parser.ParseFile (which is used in
        # the Entrez.read function above), we require the handle to be in
        # binary mode here as well.
        if handle.read(0) != b"":
            raise TypeError("file should be opened in binary mode")
        BLOCK = 1024
        while True:
            # Read in another block of data from the file.
            data = handle.read(BLOCK)
            try:
                self.parser.Parse(data, False)
            except expat.ExpatError as e:
                if self.parser.StartElementHandler:
                    # We saw the initial <?xml declaration, so we can be sure
                    # that we are parsing XML data. Most likely, the XML file
                    # is corrupted.
                    raise CorruptedXMLError(e) from None
                else:
                    # We have not seen the initial <?xml declaration, so
                    # probably the input data is not in XML format.
                    raise NotXMLError(e) from None
            try:
                records = self.record
            except AttributeError:
                if self.parser.StartElementHandler:
                    # We saw the initial <?xml declaration, and expat
                    # didn't notice any errors, so self.record should be
                    # defined. If not, this is a bug.
                    raise RuntimeError(
                        "Failed to parse the XML file correctly, possibly due to a "
                        "bug in Bio.Entrez. Please contact the Biopython "
                        "developers via the mailing list or GitHub for assistance."
                    ) from None
                else:
                    # We did not see the initial <?xml declaration, so
                    # probably the input data is not in XML format.
                    raise NotXMLError("XML declaration not found") from None

            if not isinstance(records, list):
                raise ValueError(
                    "The XML file does not represent a list. Please use Entrez.read "
                    "instead of Entrez.parse"
                )

            if not data:
                break

            while len(records) >= 2:
                # Then the first record is finished, while the second record
                # is still a work in progress.
                record = records.pop(0)
                yield record

        # We have reached the end of the XML file
        self.parser = None
        if self.element is not None:
            # No more XML data, but there is still some unfinished business
            raise CorruptedXMLError("Premature end of data")

        # Send out the remaining records
        yield from records

    def xmlDeclHandler(self, version, encoding, standalone):
        """Set XML handlers when an XML declaration is found."""
        self.parser.CharacterDataHandler = self.characterDataHandler
        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler
        self.parser.EndNamespaceDeclHandler = self.endNamespaceDeclHandler
        # Replaced by externalEntityRefHandler or startNamespaceDeclHandler
        # once a DTD or an XML schema has been found.
        self.parser.StartElementHandler = self.handleMissingDocumentDefinition

    def handleMissingDocumentDefinition(self, tag, attrs):
        """Raise an Exception if neither a DTD nor an XML Schema is found."""
        raise ValueError(
            "As the XML data contained neither a Document Type Definition (DTD) "
            "nor an XML Schema, Bio.Entrez is unable to parse these data. "
            "We recommend using a generic XML parser from the Python standard "
            "library instead, for example ElementTree."
        )

    def startNamespaceDeclHandler(self, prefix, uri):
        """Handle start of an XML namespace declaration."""
        if prefix == "xsi":
            # This is an xml schema
            self.schema_namespace = uri
            self.parser.StartElementHandler = self.schemaHandler
        else:
            # Note that the DTD for MathML specifies a default attribute
            # that declares the namespace for each MathML element. This means
            # that MathML element in the XML has an invisible MathML namespace
            # declaration that triggers a call to startNamespaceDeclHandler
            # and endNamespaceDeclHandler. Therefore we need to count how often
            # startNamespaceDeclHandler and endNamespaceDeclHandler were called
            # to find out their first and last invocation for each namespace.
            if prefix == "mml":
                assert uri == "http://www.w3.org/1998/Math/MathML"
            elif prefix == "xlink":
                assert uri == "http://www.w3.org/1999/xlink"
            else:
                raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri))
            self.namespace_level[prefix] += 1
            self.namespace_prefix[uri] = prefix

    def endNamespaceDeclHandler(self, prefix):
        """Handle end of an XML namespace declaration."""
        if prefix != "xsi":
            self.namespace_level[prefix] -= 1
            if self.namespace_level[prefix] == 0:
                # Last invocation for this prefix: forget its uri.
                for key, value in self.namespace_prefix.items():
                    if value == prefix:
                        break
                else:
                    raise RuntimeError("Failed to find namespace prefix")
                del self.namespace_prefix[key]

    def schemaHandler(self, name, attrs):
        """Process the XML schema (before processing the element).

        The schema is read from a local XSD file if one is available;
        otherwise it is downloaded and saved to the local XSD cache.
        """
        key = "%s noNamespaceSchemaLocation" % self.schema_namespace
        schema = attrs[key]
        filename = os.path.basename(schema)
        handle = self.open_xsd_file(filename)
        if not handle:
            # if there is no local xsd file grab the url and parse the file
            with urlopen(schema) as handle:
                text = handle.read()
            self.save_xsd_file(filename, text)
        else:
            with handle:
                text = handle.read()
        self.parse_xsd(ET.fromstring(text))
        # continue handling the element
        self.startElementHandler(name, attrs)
        # reset the element handler
        self.parser.StartElementHandler = self.startElementHandler

    def _enter_container(self, element):
        """Attach a new list or dictionary element and make it current."""
        parent = self.element
        element.parent = parent
        if parent is None:
            # Set self.record here to let Entrez.parse iterate over it
            self.record = element
        else:
            parent.store(element)
        self.element = element
        self.parser.EndElementHandler = self.endElementHandler
        self.parser.CharacterDataHandler = self.skipCharacterDataHandler

    def startElementHandler(self, tag, attrs):
        """Handle start of an XML element.

        Depending on how the tag was classified (item, error, string,
        dictionary or list), either a container element is created or the
        parser handlers are switched to collect the element's contents.
        Raises ValidationError for an unclassified tag when validating, and
        ValueError for an Item element with an unknown Type.
        """
        if tag in self.items:
            assert tag == "Item"
            name = attrs["Name"]
            itemtype = attrs["Type"]
            del attrs["Type"]
            if itemtype == "Structure":
                del attrs["Name"]
                element = DictionaryElement(
                    name, attrs, allowed_tags=None, repeated_tags=None
                )
                self._enter_container(element)
            elif name in ("ArticleIds", "History"):
                del attrs["Name"]
                allowed_tags = None  # allowed tags are unknown
                repeated_tags = frozenset(["pubmed", "medline"])
                element = DictionaryElement(
                    tag,
                    attrs,
                    allowed_tags=allowed_tags,
                    repeated_tags=repeated_tags,
                    key=name,
                )
                self._enter_container(element)
            elif itemtype == "List":
                del attrs["Name"]
                allowed_tags = None  # allowed tags are unknown
                element = ListElement(tag, attrs, allowed_tags, name)
                self._enter_container(element)
            elif itemtype == "Integer":
                self.parser.EndElementHandler = self.endIntegerElementHandler
                self.parser.CharacterDataHandler = self.characterDataHandler
                self.attributes = attrs
            elif itemtype in ("String", "Unknown", "Date", "Enumerator"):
                assert self.attributes is None
                self.attributes = attrs
                self.parser.StartElementHandler = self.startRawElementHandler
                self.parser.EndElementHandler = self.endStringElementHandler
                self.parser.CharacterDataHandler = self.characterDataHandler
            else:
                # Report the offending Type attribute, not the item's Name.
                raise ValueError("Unknown item type %s" % itemtype)
        elif tag in self.errors:
            self.parser.EndElementHandler = self.endErrorElementHandler
            self.parser.CharacterDataHandler = self.characterDataHandler
        elif tag in self.strings:
            self.parser.StartElementHandler = self.startRawElementHandler
            self.parser.EndElementHandler = self.endStringElementHandler
            self.parser.CharacterDataHandler = self.characterDataHandler
            assert self.allowed_tags is None
            self.allowed_tags = self.strings[tag]
            assert self.attributes is None
            self.attributes = attrs
        elif tag in self.dictionaries:
            allowed_tags, repeated_tags = self.dictionaries[tag]
            element = DictionaryElement(tag, attrs, allowed_tags, repeated_tags)
            self._enter_container(element)
        elif tag in self.lists:
            allowed_tags = self.lists[tag]
            element = ListElement(tag, attrs, allowed_tags)
            self._enter_container(element)
        else:
            # Element not found in DTD
            if self.validating:
                raise ValidationError(tag)
            else:
                # this will not be stored in the record
                self.parser.StartElementHandler = self.startSkipElementHandler
                self.parser.EndElementHandler = self.endSkipElementHandler
                self.parser.CharacterDataHandler = self.skipCharacterDataHandler
                self.level = 1

    def startRawElementHandler(self, name, attrs):
        """Handle start of an XML raw element.

        The start tag is re-serialized and appended to the character data
        of the enclosing string element.
        """
        # check if the name is in a namespace
        if self.namespace_prefix:
            try:
                uri, name = name.split()
            except ValueError:
                pass
            else:
                prefix = self.namespace_prefix[uri]
                if self.namespace_level[prefix] == 1:
                    attrs = {"xmlns": uri}
        # self.allowed_tags is ignored for now. Anyway we know what to do
        # with this tag.
        pieces = ["<%s" % name]
        pieces.extend(' %s="%s"' % item for item in attrs.items())
        pieces.append(">")
        self.data.append("".join(pieces))
        self.parser.EndElementHandler = self.endRawElementHandler
        self.level += 1

    def startSkipElementHandler(self, name, attrs):
        """Handle start of an XML skip element."""
        self.level += 1

    def endStringElementHandler(self, tag):
        """Handle end of an XML string element.

        The collected character data are wrapped in a StringElement, which
        is stored in the current container or, if there is none, becomes
        the record itself.
        """
        element = self.element
        if element is not None:
            self.parser.StartElementHandler = self.startElementHandler
            self.parser.EndElementHandler = self.endElementHandler
            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
        value = "".join(self.data)
        self.data = []
        attributes = self.attributes
        self.attributes = None
        if tag in self.items:
            assert tag == "Item"
            key = attributes["Name"]
            del attributes["Name"]
        else:
            key = tag
        value = StringElement(value, tag, attributes, key)
        if element is None:
            # Top-level string: the value itself is the record.
            self.record = value
        else:
            element.store(value)
        self.allowed_tags = None

    def endRawElementHandler(self, name):
        """Handle end of an XML raw element."""
        self.level -= 1
        if self.level == 0:
            self.parser.EndElementHandler = self.endStringElementHandler
        if self.namespace_prefix:
            try:
                _uri, name = name.split()
            except ValueError:
                pass
        tag = "</%s>" % name
        self.data.append(tag)

    def endSkipElementHandler(self, name):
        """Handle end of an XML skip element."""
        self.level -= 1
        if self.level == 0:
            self.parser.StartElementHandler = self.startElementHandler
            self.parser.EndElementHandler = self.endElementHandler

    def endErrorElementHandler(self, name):
        """Handle end of an XML error element.

        Raises RuntimeError with the element's text if it is not empty.
        """
        if self.data:
            # error found:
            value = "".join(self.data)
            raise RuntimeError(value)
        # no error found:
        if self.element is not None:
            self.parser.EndElementHandler = self.endElementHandler
            self.parser.CharacterDataHandler = self.skipCharacterDataHandler

    def endElementHandler(self, name):
        """Handle end of an XML element."""
        element = self.element
        self.element = element.parent
        del element.parent

    def endIntegerElementHandler(self, tag):
        """Handle end of an XML integer element.

        An empty element is represented by a NoneElement; otherwise the
        character data are converted to an IntegerElement.
        """
        attributes = self.attributes
        self.attributes = None
        assert tag == "Item"
        key = attributes["Name"]
        del attributes["Name"]
        if self.data:
            value = int("".join(self.data))
            self.data = []
            value = IntegerElement(value, tag, attributes, key)
        else:
            value = NoneElement(tag, attributes, key)
        element = self.element
        if element is None:
            self.record = value
        else:
            self.parser.EndElementHandler = self.endElementHandler
            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
            element.store(value)

    def characterDataHandlerRaw(self, content):
        """Handle character data as-is (raw)."""
        self.data.append(content)

    def characterDataHandlerEscape(self, content):
        """Handle character data by encoding it."""
        content = escape(content)
        self.data.append(content)

    def skipCharacterDataHandler(self, content):
        """Handle character data by skipping it."""

    def parse_xsd(self, root):
        """Parse an XSD file.

        ``root`` is the root element of the parsed schema. Each element
        declaration is classified as a list, a dictionary or a string and
        registered in self.lists, self.dictionaries or self.strings.
        """
        prefix = "{http://www.w3.org/2001/XMLSchema}"
        for element in root:
            isSimpleContent = False
            attribute_keys = []
            keys = []
            multiple = []
            assert element.tag == prefix + "element"
            name = element.attrib["name"]
            assert len(element) == 1
            complexType = element[0]
            assert complexType.tag == prefix + "complexType"
            for component in complexType:
                tag = component.tag
                if tag == prefix + "attribute":
                    # we could distinguish by type; keeping string for now
                    attribute_keys.append(component.attrib["name"])
                elif tag == prefix + "sequence":
                    maxOccurs = component.attrib.get("maxOccurs", "1")
                    for key in component:
                        assert key.tag == prefix + "element"
                        ref = key.attrib["ref"]
                        keys.append(ref)
                        if maxOccurs != "1" or key.attrib.get("maxOccurs", "1") != "1":
                            multiple.append(ref)
                elif tag == prefix + "simpleContent":
                    assert len(component) == 1
                    extension = component[0]
                    assert extension.tag == prefix + "extension"
                    assert extension.attrib["base"] == "xs:string"
                    for attribute in extension:
                        assert attribute.tag == prefix + "attribute"
                        # we could distinguish by type; keeping string for now
                        attribute_keys.append(attribute.attrib["name"])
                    isSimpleContent = True
            allowed_tags = frozenset(keys)
            if len(keys) == 1 and keys == multiple:
                assert not isSimpleContent
                self.lists[name] = allowed_tags
            elif len(keys) >= 1:
                assert not isSimpleContent
                repeated_tags = frozenset(multiple)
                self.dictionaries[name] = (allowed_tags, repeated_tags)
            else:
                self.strings[name] = allowed_tags

    def elementDecl(self, name, model):
        """Call a call-back function for each element declaration in a DTD.

        This is used for each element declaration in a DTD like::

            <!ELEMENT       name          (...)>

        The purpose of this function is to determine whether this element
        should be regarded as a string, integer, list, dictionary, structure,
        or error.
        """
        if name.upper() == "ERROR":
            self.errors.add(name)
            return
        if name == "Item" and model == (
            expat.model.XML_CTYPE_MIXED,
            expat.model.XML_CQUANT_REP,
            None,
            ((expat.model.XML_CTYPE_NAME, expat.model.XML_CQUANT_NONE, "Item", ()),),
        ):
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.add(name)
            return
        # First, remove ignorable parentheses around declarations
        while (
            model[0] in (expat.model.XML_CTYPE_SEQ, expat.model.XML_CTYPE_CHOICE)
            and model[1] in (expat.model.XML_CQUANT_NONE, expat.model.XML_CQUANT_OPT)
            and len(model[3]) == 1
        ):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (expat.model.XML_CTYPE_MIXED, expat.model.XML_CTYPE_EMPTY):
            if model[1] == expat.model.XML_CQUANT_REP:
                children = model[3]
                allowed_tags = frozenset(child[2] for child in children)
            else:
                allowed_tags = frozenset()
            self.strings[name] = allowed_tags
            return
        # List-type elements
        if model[0] in (
            expat.model.XML_CTYPE_CHOICE,
            expat.model.XML_CTYPE_SEQ,
        ) and model[1] in (expat.model.XML_CQUANT_PLUS, expat.model.XML_CQUANT_REP):
            children = model[3]
            if model[0] == expat.model.XML_CTYPE_SEQ:
                assert len(children) == 1
            allowed_tags = frozenset(child[2] for child in children)
            self.lists[name] = allowed_tags
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []
        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.

        def count(model):
            quantifier, key, children = model[1:]
            if key is None:
                if quantifier in (
                    expat.model.XML_CQUANT_PLUS,
                    expat.model.XML_CQUANT_REP,
                ):
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif key.upper() != "ERROR":
                if quantifier in (
                    expat.model.XML_CQUANT_NONE,
                    expat.model.XML_CQUANT_OPT,
                ):
                    single.append(key)
                elif quantifier in (
                    expat.model.XML_CQUANT_PLUS,
                    expat.model.XML_CQUANT_REP,
                ):
                    multiple.append(key)

        count(model)
        if len(single) == 0 and len(multiple) == 1:
            allowed_tags = frozenset(multiple)
            self.lists[name] = allowed_tags
        else:
            allowed_tags = frozenset(single + multiple)
            repeated_tags = frozenset(multiple)
            self.dictionaries[name] = (allowed_tags, repeated_tags)

    def open_dtd_file(self, filename):
        """Open specified DTD file.

        The local cache directory is tried first, then the directory shipped
        with the package. Returns a binary file handle, or None if the file
        is in neither directory.
        """
        for directory in (DataHandler.local_dtd_dir, DataHandler.global_dtd_dir):
            path = os.path.join(directory, filename)
            try:
                return open(path, "rb")
            except FileNotFoundError:
                continue
        return None

    def open_xsd_file(self, filename):
        """Open specified XSD file.

        The local cache directory is tried first, then the directory shipped
        with the package. Returns a binary file handle, or None if the file
        is in neither directory.
        """
        for directory in (DataHandler.local_xsd_dir, DataHandler.global_xsd_dir):
            path = os.path.join(directory, filename)
            try:
                return open(path, "rb")
            except FileNotFoundError:
                continue
        return None

    def save_dtd_file(self, filename, text):
        """Save DTD file to cache.

        A warning is issued if the cache file cannot be opened for writing.
        """
        path = os.path.join(DataHandler.local_dtd_dir, filename)
        try:
            handle = open(path, "wb")
        except OSError:
            warnings.warn("Failed to save %s at %s" % (filename, path))
        else:
            with handle:
                handle.write(text)

    def save_xsd_file(self, filename, text):
        """Save XSD file to cache.

        A warning is issued if the cache file cannot be opened for writing.
        """
        path = os.path.join(DataHandler.local_xsd_dir, filename)
        try:
            handle = open(path, "wb")
        except OSError:
            warnings.warn("Failed to save %s at %s" % (filename, path))
        else:
            with handle:
                handle.write(text)

    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """Handle external entity reference in order to cache DTD locally.

        The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        Raises ValueError for an unexpected URL scheme and RuntimeError if
        the DTD can be neither found locally nor downloaded.
        """
        urlinfo = urlparse(systemId)
        if urlinfo.scheme in ["http", "https", "ftp"]:
            # Then this is an absolute path to the DTD.
            url = systemId
        elif urlinfo.scheme == "":
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                source = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(source)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            raise ValueError("Unexpected URL scheme %r" % urlinfo.scheme)
        self.dtd_urls.append(url)
        # First, try to load the local version of the DTD file
        filename = os.path.basename(systemId)
        handle = self.open_dtd_file(filename)
        if not handle:
            # DTD is not available as a local file. Try accessing it through
            # the internet instead.
            try:
                response = urlopen(url)
            except OSError:
                raise RuntimeError(
                    "Failed to access %s at %s" % (filename, url)
                ) from None
            with response:
                text = response.read()
            self.save_dtd_file(filename, text)
            handle = BytesIO(text)

        parser = self.parser.ExternalEntityParserCreate(context)
        parser.ElementDeclHandler = self.elementDecl
        # Close the DTD handle even if parsing the DTD fails.
        with handle:
            parser.ParseFile(handle)
        self.dtd_urls.pop()
        self.parser.StartElementHandler = self.startElementHandler
        return 1