1# Copyright 2008-2014 by Michiel de Hoon.  All rights reserved.
2# Revisions copyright 2008-2015 by Peter Cock. All rights reserved.
3#
4# This file is part of the Biopython distribution and governed by your
5# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
6# Please see the LICENSE file that should have been included as part of this
7# package.
8
9"""Parser for XML results returned by NCBI's Entrez Utilities.
10
This parser is used by the read() function in Bio.Entrez, and is not
intended to be used directly.
13
14The question is how to represent an XML file as Python objects. Some
15XML files returned by NCBI look like lists, others look like dictionaries,
16and others look like a mix of lists and dictionaries.
17
18My approach is to classify each possible element in the XML as a plain
19string, an integer, a list, a dictionary, or a structure. The latter is a
20dictionary where the same key can occur multiple times; in Python, it is
21represented as a dictionary where that key occurs once, pointing to a list
22of values found in the XML file.
23
24The parser then goes through the XML and creates the appropriate Python
25object for each element. The different levels encountered in the XML are
26preserved on the Python side. So a subelement of a subelement of an element
27is a value in a dictionary that is stored in a list which is a value in
28some other dictionary (or a value in a list which itself belongs to a list
29which is a value in a dictionary, and so on). Attributes encountered in
30the XML are stored as a dictionary in a member .attributes of each element,
31and the tag name is saved in a member .tag.
32
To decide which kind of Python object corresponds to each element in the
XML, the parser analyzes the DTD referred to at the top of (almost) every
35XML file returned by the Entrez Utilities. This is preferred over a hand-
36written solution, since the number of DTDs is rather large and their
37contents may change over time. About half the code in this parser deals
38with parsing the DTD, and the other half with the XML itself.
39"""
40import os
41import warnings
42from collections import Counter
43from xml.parsers import expat
44from io import BytesIO
45import xml.etree.ElementTree as ET
46from xml.sax.saxutils import escape
47
48from urllib.request import urlopen, urlparse
49
50
# The following five classes add the members .tag, .key, and .attributes to
# None, integers, strings, lists, and dictionaries, respectively.
53
54
class NoneElement:
    """NCBI Entrez XML element mapped to None.

    Instances compare equal to None (and to any object that itself compares
    equal to None, such as another NoneElement), while still carrying the
    tag, the key, and the XML attributes of the element.
    """

    def __init__(self, tag, attributes, key=None):
        """Create a NoneElement.

        Arguments:
         - tag: name of the XML tag.
         - attributes: XML attributes of the element.
         - key: key under which a parent element stores this value;
           defaults to the tag.
        """
        self.tag = tag
        self.key = tag if key is None else key
        self.attributes = attributes

    def __eq__(self, other):
        """Return True if other is None or compares equal to None."""
        if other is None:
            return True
        # Most objects answer NotImplemented when asked to compare themselves
        # with None. NotImplemented is not False, so it must be recognized
        # explicitly rather than evaluated for truth. The lookup goes through
        # the type so that class objects can be compared as well.
        result = type(other).__eq__(other, None)
        if result is NotImplemented:
            return False
        return bool(result)

    def __ne__(self, other):
        """Return the negation of the equality test."""
        return not self.__eq__(other)

    def __repr__(self):
        """Return a string representation of the object."""
        try:
            attributes = self.attributes
        except AttributeError:
            return "NoneElement"
        return "NoneElement(attributes=%r)" % attributes
92
93
class IntegerElement(int):
    """Integer parsed from NCBI Entrez XML, annotated with XML metadata."""

    def __new__(cls, value, tag, attributes, key=None):
        """Build the integer and attach its tag, key, and attributes.

        The key defaults to the tag if it is not given.
        """
        instance = super().__new__(cls, value)
        instance.tag = tag
        instance.key = tag if key is None else key
        instance.attributes = attributes
        return instance

    def __repr__(self):
        """Show the plain integer, wrapped with its attributes if it has any set."""
        number = super().__repr__()
        if not hasattr(self, "attributes"):
            return number
        return "IntegerElement(%s, attributes=%r)" % (number, self.attributes)
116
117
class StringElement(str):
    """String parsed from NCBI Entrez XML, annotated with XML metadata."""

    def __new__(cls, value, tag, attributes, key=None):
        """Build the string and attach its tag, key, and attributes.

        The key defaults to the tag if it is not given.
        """
        instance = super().__new__(cls, value)
        instance.tag = tag
        instance.key = tag if key is None else key
        instance.attributes = attributes
        return instance

    def __repr__(self):
        """Show the plain string; include the attributes only if non-empty."""
        quoted = super().__repr__()
        if self.attributes:
            return "StringElement(%s, attributes=%r)" % (quoted, self.attributes)
        return quoted
139
140
class ListElement(list):
    """List parsed from NCBI Entrez XML, annotated with XML metadata."""

    def __init__(self, tag, attributes, allowed_tags, key=None):
        """Start an empty list and record its tag, key, and attributes.

        If allowed_tags is not None, store() only accepts values whose key
        is contained in it. The key defaults to the tag.
        """
        self.tag = tag
        self.key = tag if key is None else key
        self.attributes = attributes
        self.allowed_tags = allowed_tags

    def __repr__(self):
        """Show the plain list; include the attributes only if non-empty."""
        items = super().__repr__()
        if self.attributes:
            return "ListElement(%s, attributes=%r)" % (items, self.attributes)
        return items

    def store(self, value):
        """Append value, raising ValueError if its key is not allowed."""
        permitted = self.allowed_tags
        item_key = value.key
        if permitted is None or item_key in permitted:
            self.append(value)
            return
        raise ValueError("Unexpected item '%s' in list" % item_key)
168
169
class DictionaryElement(dict):
    """Dictionary parsed from NCBI Entrez XML, annotated with XML metadata."""

    def __init__(self, tag, attrs, allowed_tags, repeated_tags=None, key=None):
        """Start the dictionary and record its tag, key, and attributes.

        Every key in repeated_tags is initialized to an empty list, to which
        store() appends. If allowed_tags is not None, store() only accepts
        values whose tag is contained in it. The key defaults to the tag.
        """
        self.tag = tag
        self.key = tag if key is None else key
        self.attributes = attrs
        self.allowed_tags = allowed_tags
        self.repeated_tags = repeated_tags
        if repeated_tags:
            self.update((repeated, []) for repeated in repeated_tags)

    def __repr__(self):
        """Show the plain dictionary; include the attributes only if non-empty."""
        content = super().__repr__()
        if self.attributes:
            return "DictElement(%s, attributes=%r)" % (content, self.attributes)
        return content

    def store(self, value):
        """Store value under its key, raising ValueError for a disallowed tag.

        Values whose key is one of the repeated tags are collected in a list;
        any other value replaces whatever was stored under its key before.
        """
        entry_key, entry_tag = value.key, value.tag
        permitted = self.allowed_tags
        if permitted is not None and entry_tag not in permitted:
            raise ValueError("Unexpected item '%s' in dictionary" % entry_key)
        repeated = self.repeated_tags
        if repeated and entry_key in repeated:
            self[entry_key].append(value)
            return
        self[entry_key] = value
205
206
class NotXMLError(ValueError):
    """Failed to parse file as XML."""

    def __init__(self, message):
        """Initialize the class.

        The message (a string or the underlying exception) is also passed to
        the base class so that it appears in .args; this keeps repr(),
        copying, and pickling of the exception working.
        """
        super().__init__(message)
        self.msg = message

    def __str__(self):
        """Return a string summary of the exception."""
        return (
            "Failed to parse the XML data (%s). Please make sure that the input data "
            "are in XML format." % self.msg
        )
220
221
class CorruptedXMLError(ValueError):
    """Corrupted XML."""

    def __init__(self, message):
        """Initialize the class.

        The message (a string or the underlying exception) is also passed to
        the base class so that it appears in .args; this keeps repr(),
        copying, and pickling of the exception working.
        """
        super().__init__(message)
        self.msg = message

    def __str__(self):
        """Return a string summary of the exception."""
        return (
            "Failed to parse the XML data (%s). Please make sure that the input data "
            "are not corrupted." % self.msg
        )
235
236
class ValidationError(ValueError):
    """XML tag found which was not defined in the DTD.

    Validating parsers raise this error if the parser finds a tag in the XML
    that is not defined in the DTD. Non-validating parsers do not raise this
    error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
    parsers by default (see those functions for more information).
    """

    def __init__(self, name):
        """Initialize the class.

        The tag name is also passed to the base class so that it appears in
        .args; this keeps repr(), copying, and pickling of the exception
        working.
        """
        super().__init__(name)
        self.name = name

    def __str__(self):
        """Return a string summary of the exception."""
        return (
            "Failed to find tag '%s' in the DTD. To skip all tags that "
            "are not represented in the DTD, please call Bio.Entrez.read "
            "or Bio.Entrez.parse with validate=False." % self.name
        )
257
258
class DataHandlerMeta(type):
    """A metaclass is needed until Python supports @classproperty.

    It gives each class created with it a class-level ``directory`` property;
    assigning to the property also creates the local DTD and XSD directories.
    """

    def __init__(cls, *args, **kwargs):
        """Initialize the class."""
        cls._directory = None

    @property
    def directory(cls):
        """Directory for caching XSD and DTD files."""
        return cls._directory

    @directory.setter
    def directory(cls, value):
        """Set a custom directory for the local DTD/XSD directories.

        Passing None selects a platform-dependent default. The directories
        <value>/Bio/Entrez/DTDs and <value>/Bio/Entrez/XSDs are created if
        they do not exist yet, and stored as local_dtd_dir and local_xsd_dir.
        """
        if value is None:
            import platform

            home = os.path.expanduser("~")
            if platform.system() == "Windows":
                # os.getenv returns None if APPDATA is not defined, which
                # os.path.join cannot handle; fall back to the home directory.
                value = os.path.join(os.getenv("APPDATA") or home, "biopython")
            else:  # Unix/Linux/Mac
                value = os.path.join(home, ".config", "biopython")
        cls._directory = value
        # Create DTD local directory
        cls.local_dtd_dir = os.path.join(cls._directory, "Bio", "Entrez", "DTDs")
        os.makedirs(cls.local_dtd_dir, exist_ok=True)
        # Create XSD local directory
        cls.local_xsd_dir = os.path.join(cls._directory, "Bio", "Entrez", "XSDs")
        os.makedirs(cls.local_xsd_dir, exist_ok=True)
289
290
291class DataHandler(metaclass=DataHandlerMeta):
292    """Data handler for parsing NCBI XML from Entrez."""
293
294    from Bio import Entrez
295
296    global_dtd_dir = os.path.join(Entrez.__path__[0], "DTDs")
297    global_xsd_dir = os.path.join(Entrez.__path__[0], "XSDs")
298    local_dtd_dir = ""
299    local_xsd_dir = ""
300
301    del Entrez
302
303    def __init__(self, validate, escape):
304        """Create a DataHandler object."""
305        self.dtd_urls = []
306        self.element = None
307        self.level = 0
308        self.data = []
309        self.attributes = None
310        self.allowed_tags = None
311        self.strings = {}
312        self.lists = {}
313        self.dictionaries = {}
314        self.items = set()
315        self.errors = set()
316        self.validating = validate
317        self.parser = expat.ParserCreate(namespace_separator=" ")
318        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
319        self.parser.XmlDeclHandler = self.xmlDeclHandler
320        self.schema_namespace = None
321        self.namespace_level = Counter()
322        self.namespace_prefix = {}
323        if escape:
324            self.characterDataHandler = self.characterDataHandlerEscape
325        else:
326            self.characterDataHandler = self.characterDataHandlerRaw
327
328    def read(self, handle):
329        """Set up the parser and let it parse the XML results."""
330        # Expat's parser.ParseFile function only accepts binary data;
331        # see also the comment below for Entrez.parse.
332        if handle.read(0) != b"":
333            raise TypeError("file should be opened in binary mode")
334        try:
335            self.parser.ParseFile(handle)
336        except expat.ExpatError as e:
337            if self.parser.StartElementHandler:
338                # We saw the initial <!xml declaration, so we can be sure that
339                # we are parsing XML data. Most likely, the XML file is
340                # corrupted.
341                raise CorruptedXMLError(e) from None
342            else:
343                # We have not seen the initial <!xml declaration, so probably
344                # the input data is not in XML format.
345                raise NotXMLError(e) from None
346        try:
347            return self.record
348        except AttributeError:
349            if self.parser.StartElementHandler:
350                # We saw the initial <!xml declaration, and expat didn't notice
351                # any errors, so self.record should be defined. If not, this is
352                # a bug.
353                raise RuntimeError(
354                    "Failed to parse the XML file correctly, possibly due to a bug "
355                    "in Bio.Entrez. Please contact the Biopython developers via "
356                    "the mailing list or GitHub for assistance."
357                ) from None
358            else:
359                # We did not see the initial <!xml declaration, so probably
360                # the input data is not in XML format.
361                raise NotXMLError("XML declaration not found") from None
362
363    def parse(self, handle):
364        """Parse the XML in the given file handle."""
365        # The handle should have been opened in binary mode; data read from
366        # the handle are then bytes. Expat will pick up the encoding from the
367        # XML declaration (or assume UTF-8 if it is missing), and use this
368        # encoding to convert the binary data to a string before giving it to
369        # characterDataHandler.
370        # While parser.ParseFile only accepts binary data, parser.Parse accepts
371        # both binary data and strings. However, a file in text mode may have
372        # been opened with an encoding different from the encoding specified in
373        # the XML declaration at the top of the file. If so, the data in the
374        # file will have been decoded with an incorrect encoding. To avoid
375        # this, and to be consistent with parser.ParseFile (which is used in
376        # the Entrez.read function above), we require the handle to be in
377        # binary mode here as well.
378        if handle.read(0) != b"":
379            raise TypeError("file should be opened in binary mode")
380        BLOCK = 1024
381        while True:
382            # Read in another block of data from the file.
383            data = handle.read(BLOCK)
384            try:
385                self.parser.Parse(data, False)
386            except expat.ExpatError as e:
387                if self.parser.StartElementHandler:
388                    # We saw the initial <!xml declaration, so we can be sure
389                    # that we are parsing XML data. Most likely, the XML file
390                    # is corrupted.
391                    raise CorruptedXMLError(e) from None
392                else:
393                    # We have not seen the initial <!xml declaration, so
394                    # probably the input data is not in XML format.
395                    raise NotXMLError(e) from None
396            try:
397                records = self.record
398            except AttributeError:
399                if self.parser.StartElementHandler:
400                    # We saw the initial <!xml declaration, and expat
401                    # didn't notice any errors, so self.record should be
402                    # defined. If not, this is a bug.
403
404                    raise RuntimeError(
405                        "Failed to parse the XML file correctly, possibly due to a "
406                        "bug in Bio.Entrez. Please contact the Biopython "
407                        "developers via the mailing list or GitHub for assistance."
408                    ) from None
409                else:
410                    # We did not see the initial <!xml declaration, so
411                    # probably the input data is not in XML format.
412                    raise NotXMLError("XML declaration not found") from None
413
414            if not isinstance(records, list):
415                raise ValueError(
416                    "The XML file does not represent a list. Please use Entrez.read "
417                    "instead of Entrez.parse"
418                )
419
420            if not data:
421                break
422
423            while len(records) >= 2:
424                # Then the first record is finished, while the second record
425                # is still a work in progress.
426                record = records.pop(0)
427                yield record
428
429        # We have reached the end of the XML file
430        self.parser = None
431        if self.element is not None:
432            # No more XML data, but there is still some unfinished business
433            raise CorruptedXMLError("Premature end of data")
434
435        # Send out the remaining records
436        yield from records
437
438    def xmlDeclHandler(self, version, encoding, standalone):
439        """Set XML handlers when an XML declaration is found."""
440        self.parser.CharacterDataHandler = self.characterDataHandler
441        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
442        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler
443        self.parser.EndNamespaceDeclHandler = self.endNamespaceDeclHandler
444        self.parser.StartElementHandler = self.handleMissingDocumentDefinition
445
446    def handleMissingDocumentDefinition(self, tag, attrs):
447        """Raise an Exception if neither a DTD nor an XML Schema is found."""
448        raise ValueError(
449            "As the XML data contained neither a Document Type Definition (DTD) nor an XML Schema, Bio.Entrez is unable to parse these data. We recommend using a generic XML parser from the Python standard library instead, for example ElementTree."
450        )
451
452    def startNamespaceDeclHandler(self, prefix, uri):
453        """Handle start of an XML namespace declaration."""
454        if prefix == "xsi":
455            # This is an xml schema
456            self.schema_namespace = uri
457            self.parser.StartElementHandler = self.schemaHandler
458        else:
459            # Note that the DTD for MathML specifies a default attribute
460            # that declares the namespace for each MathML element. This means
461            # that MathML element in the XML has an invisible MathML namespace
462            # declaration that triggers a call to startNamespaceDeclHandler
463            # and endNamespaceDeclHandler. Therefore we need to count how often
464            # startNamespaceDeclHandler and endNamespaceDeclHandler were called
465            # to find out their first and last invocation for each namespace.
466            if prefix == "mml":
467                assert uri == "http://www.w3.org/1998/Math/MathML"
468            elif prefix == "xlink":
469                assert uri == "http://www.w3.org/1999/xlink"
470            else:
471                raise ValueError("Unknown prefix '%s' with uri '%s'" % (prefix, uri))
472            self.namespace_level[prefix] += 1
473            self.namespace_prefix[uri] = prefix
474
475    def endNamespaceDeclHandler(self, prefix):
476        """Handle end of an XML namespace declaration."""
477        if prefix != "xsi":
478            self.namespace_level[prefix] -= 1
479            if self.namespace_level[prefix] == 0:
480                for key, value in self.namespace_prefix.items():
481                    if value == prefix:
482                        break
483                else:
484                    raise RuntimeError("Failed to find namespace prefix")
485                del self.namespace_prefix[key]
486
487    def schemaHandler(self, name, attrs):
488        """Process the XML schema (before processing the element)."""
489        key = "%s noNamespaceSchemaLocation" % self.schema_namespace
490        schema = attrs[key]
491        handle = self.open_xsd_file(os.path.basename(schema))
492        # if there is no local xsd file grab the url and parse the file
493        if not handle:
494            handle = urlopen(schema)
495            text = handle.read()
496            self.save_xsd_file(os.path.basename(schema), text)
497            handle.close()
498            self.parse_xsd(ET.fromstring(text))
499        else:
500            self.parse_xsd(ET.fromstring(handle.read()))
501            handle.close()
502        # continue handling the element
503        self.startElementHandler(name, attrs)
504        # reset the element handler
505        self.parser.StartElementHandler = self.startElementHandler
506
507    def startElementHandler(self, tag, attrs):
508        """Handle start of an XML element."""
509        if tag in self.items:
510            assert tag == "Item"
511            name = attrs["Name"]
512            itemtype = attrs["Type"]
513            del attrs["Type"]
514            if itemtype == "Structure":
515                del attrs["Name"]
516                element = DictionaryElement(
517                    name, attrs, allowed_tags=None, repeated_tags=None
518                )
519                parent = self.element
520                element.parent = parent
521                # For consistency with lists below, store the element here
522                if parent is None:
523                    self.record = element
524                else:
525                    parent.store(element)
526                self.element = element
527                self.parser.EndElementHandler = self.endElementHandler
528                self.parser.CharacterDataHandler = self.skipCharacterDataHandler
529            elif name in ("ArticleIds", "History"):
530                del attrs["Name"]
531                allowed_tags = None  # allowed tags are unknown
532                repeated_tags = frozenset(["pubmed", "medline"])
533                element = DictionaryElement(
534                    tag,
535                    attrs,
536                    allowed_tags=allowed_tags,
537                    repeated_tags=repeated_tags,
538                    key=name,
539                )
540                parent = self.element
541                element.parent = parent
542                # For consistency with lists below, store the element here
543                if parent is None:
544                    self.record = element
545                else:
546                    parent.store(element)
547                self.element = element
548                self.parser.EndElementHandler = self.endElementHandler
549                self.parser.CharacterDataHandler = self.skipCharacterDataHandler
550            elif itemtype == "List":
551                del attrs["Name"]
552                allowed_tags = None  # allowed tags are unknown
553                element = ListElement(tag, attrs, allowed_tags, name)
554                parent = self.element
555                element.parent = parent
556                if self.element is None:
557                    # Set self.record here to let Entrez.parse iterate over it
558                    self.record = element
559                else:
560                    parent.store(element)
561                self.element = element
562                self.parser.EndElementHandler = self.endElementHandler
563                self.parser.CharacterDataHandler = self.skipCharacterDataHandler
564            elif itemtype == "Integer":
565                self.parser.EndElementHandler = self.endIntegerElementHandler
566                self.parser.CharacterDataHandler = self.characterDataHandler
567                self.attributes = attrs
568            elif itemtype in ("String", "Unknown", "Date", "Enumerator"):
569                assert self.attributes is None
570                self.attributes = attrs
571                self.parser.StartElementHandler = self.startRawElementHandler
572                self.parser.EndElementHandler = self.endStringElementHandler
573                self.parser.CharacterDataHandler = self.characterDataHandler
574            else:
575                raise ValueError("Unknown item type %s" % name)
576        elif tag in self.errors:
577            self.parser.EndElementHandler = self.endErrorElementHandler
578            self.parser.CharacterDataHandler = self.characterDataHandler
579        elif tag in self.strings:
580            self.parser.StartElementHandler = self.startRawElementHandler
581            self.parser.EndElementHandler = self.endStringElementHandler
582            self.parser.CharacterDataHandler = self.characterDataHandler
583            assert self.allowed_tags is None
584            self.allowed_tags = self.strings[tag]
585            assert self.attributes is None
586            self.attributes = attrs
587        elif tag in self.dictionaries:
588            allowed_tags, repeated_tags = self.dictionaries[tag]
589            element = DictionaryElement(tag, attrs, allowed_tags, repeated_tags)
590            parent = self.element
591            element.parent = parent
592            # For consistency with lists below, store the element here
593            if parent is None:
594                self.record = element
595            else:
596                parent.store(element)
597            self.element = element
598            self.parser.EndElementHandler = self.endElementHandler
599            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
600        elif tag in self.lists:
601            allowed_tags = self.lists[tag]
602            element = ListElement(tag, attrs, allowed_tags)
603            parent = self.element
604            element.parent = parent
605            if parent is None:
606                # Set self.record here to let Entrez.parse iterate over it
607                self.record = element
608            else:
609                parent.store(element)
610            self.element = element
611            self.parser.EndElementHandler = self.endElementHandler
612            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
613        else:
614            # Element not found in DTD
615            if self.validating:
616                raise ValidationError(tag)
617            else:
618                # this will not be stored in the record
619                self.parser.StartElementHandler = self.startSkipElementHandler
620                self.parser.EndElementHandler = self.endSkipElementHandler
621                self.parser.CharacterDataHandler = self.skipCharacterDataHandler
622                self.level = 1
623
624    def startRawElementHandler(self, name, attrs):
625        """Handle start of an XML raw element."""
626        # check if the name is in a namespace
627        prefix = None
628        if self.namespace_prefix:
629            try:
630                uri, name = name.split()
631            except ValueError:
632                pass
633            else:
634                prefix = self.namespace_prefix[uri]
635                if self.namespace_level[prefix] == 1:
636                    attrs = {"xmlns": uri}
637        if prefix:
638            key = "%s:%s" % (prefix, name)
639        else:
640            key = name
641        # self.allowed_tags is ignored for now. Anyway we know what to do
642        # with this tag.
643        tag = "<%s" % name
644        for key, value in attrs.items():
645            tag += ' %s="%s"' % (key, value)
646        tag += ">"
647        self.data.append(tag)
648        self.parser.EndElementHandler = self.endRawElementHandler
649        self.level += 1
650
651    def startSkipElementHandler(self, name, attrs):
652        """Handle start of an XML skip element."""
653        self.level += 1
654
655    def endStringElementHandler(self, tag):
656        """Handle end of an XML string element."""
657        element = self.element
658        if element is not None:
659            self.parser.StartElementHandler = self.startElementHandler
660            self.parser.EndElementHandler = self.endElementHandler
661            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
662        value = "".join(self.data)
663        self.data = []
664        attributes = self.attributes
665        self.attributes = None
666        if tag in self.items:
667            assert tag == "Item"
668            key = attributes["Name"]
669            del attributes["Name"]
670        else:
671            key = tag
672        value = StringElement(value, tag, attributes, key)
673        if element is None:
674            self.record = element
675        else:
676            element.store(value)
677        self.allowed_tags = None
678
679    def endRawElementHandler(self, name):
680        """Handle start of an XML raw element."""
681        self.level -= 1
682        if self.level == 0:
683            self.parser.EndElementHandler = self.endStringElementHandler
684        if self.namespace_prefix:
685            try:
686                uri, name = name.split()
687            except ValueError:
688                pass
689        tag = "</%s>" % name
690        self.data.append(tag)
691
692    def endSkipElementHandler(self, name):
693        """Handle start of an XML skip element."""
694        self.level -= 1
695        if self.level == 0:
696            self.parser.StartElementHandler = self.startElementHandler
697            self.parser.EndElementHandler = self.endElementHandler
698
699    def endErrorElementHandler(self, name):
700        """Handle start of an XML error element."""
701        if self.data:
702            # error found:
703            value = "".join(self.data)
704            raise RuntimeError(value)
705        # no error found:
706        if self.element is not None:
707            self.parser.EndElementHandler = self.endElementHandler
708            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
709
710    def endElementHandler(self, name):
711        """Handle end of an XML element."""
712        element = self.element
713        self.element = element.parent
714        del element.parent
715
716    def endIntegerElementHandler(self, tag):
717        """Handle end of an XML integer element."""
718        attributes = self.attributes
719        self.attributes = None
720        assert tag == "Item"
721        key = attributes["Name"]
722        del attributes["Name"]
723        if self.data:
724            value = int("".join(self.data))
725            self.data = []
726            value = IntegerElement(value, tag, attributes, key)
727        else:
728            value = NoneElement(tag, attributes, key)
729        element = self.element
730        if element is None:
731            self.record = value
732        else:
733            self.parser.EndElementHandler = self.endElementHandler
734            self.parser.CharacterDataHandler = self.skipCharacterDataHandler
735            if value is None:
736                return
737            element.store(value)
738
739    def characterDataHandlerRaw(self, content):
740        """Handle character data as-is (raw)."""
741        self.data.append(content)
742
743    def characterDataHandlerEscape(self, content):
744        """Handle character data by encoding it."""
745        content = escape(content)
746        self.data.append(content)
747
748    def skipCharacterDataHandler(self, content):
749        """Handle character data by skipping it."""
750
751    def parse_xsd(self, root):
752        """Parse an XSD file."""
753        prefix = "{http://www.w3.org/2001/XMLSchema}"
754        for element in root:
755            isSimpleContent = False
756            attribute_keys = []
757            keys = []
758            multiple = []
759            assert element.tag == prefix + "element"
760            name = element.attrib["name"]
761            assert len(element) == 1
762            complexType = element[0]
763            assert complexType.tag == prefix + "complexType"
764            for component in complexType:
765                tag = component.tag
766                if tag == prefix + "attribute":
767                    # we could distinguish by type; keeping string for now
768                    attribute_keys.append(component.attrib["name"])
769                elif tag == prefix + "sequence":
770                    maxOccurs = component.attrib.get("maxOccurs", "1")
771                    for key in component:
772                        assert key.tag == prefix + "element"
773                        ref = key.attrib["ref"]
774                        keys.append(ref)
775                        if maxOccurs != "1" or key.attrib.get("maxOccurs", "1") != "1":
776                            multiple.append(ref)
777                elif tag == prefix + "simpleContent":
778                    assert len(component) == 1
779                    extension = component[0]
780                    assert extension.tag == prefix + "extension"
781                    assert extension.attrib["base"] == "xs:string"
782                    for attribute in extension:
783                        assert attribute.tag == prefix + "attribute"
784                        # we could distinguish by type; keeping string for now
785                        attribute_keys.append(attribute.attrib["name"])
786                    isSimpleContent = True
787            allowed_tags = frozenset(keys)
788            if len(keys) == 1 and keys == multiple:
789                assert not isSimpleContent
790                self.lists[name] = allowed_tags
791            elif len(keys) >= 1:
792                assert not isSimpleContent
793                repeated_tags = frozenset(multiple)
794                self.dictionaries[name] = (allowed_tags, repeated_tags)
795            else:
796                self.strings[name] = allowed_tags
797
798    def elementDecl(self, name, model):
799        """Call a call-back function for each element declaration in a DTD.
800
801        This is used for each element declaration in a DTD like::
802
803            <!ELEMENT       name          (...)>
804
805        The purpose of this function is to determine whether this element
806        should be regarded as a string, integer, list, dictionary, structure,
807        or error.
808        """
809        if name.upper() == "ERROR":
810            self.errors.add(name)
811            return
812        if name == "Item" and model == (
813            expat.model.XML_CTYPE_MIXED,
814            expat.model.XML_CQUANT_REP,
815            None,
816            ((expat.model.XML_CTYPE_NAME, expat.model.XML_CQUANT_NONE, "Item", ()),),
817        ):
818            # Special case. As far as I can tell, this only occurs in the
819            # eSummary DTD.
820            self.items.add(name)
821            return
822        # First, remove ignorable parentheses around declarations
823        while (
824            model[0] in (expat.model.XML_CTYPE_SEQ, expat.model.XML_CTYPE_CHOICE)
825            and model[1] in (expat.model.XML_CQUANT_NONE, expat.model.XML_CQUANT_OPT)
826            and len(model[3]) == 1
827        ):
828            model = model[3][0]
829        # PCDATA declarations correspond to strings
830        if model[0] in (expat.model.XML_CTYPE_MIXED, expat.model.XML_CTYPE_EMPTY):
831            if model[1] == expat.model.XML_CQUANT_REP:
832                children = model[3]
833                allowed_tags = frozenset(child[2] for child in children)
834            else:
835                allowed_tags = frozenset()
836            self.strings[name] = allowed_tags
837            return
838        # List-type elements
839        if model[0] in (
840            expat.model.XML_CTYPE_CHOICE,
841            expat.model.XML_CTYPE_SEQ,
842        ) and model[1] in (expat.model.XML_CQUANT_PLUS, expat.model.XML_CQUANT_REP):
843            children = model[3]
844            if model[0] == expat.model.XML_CTYPE_SEQ:
845                assert len(children) == 1
846            allowed_tags = frozenset(child[2] for child in children)
847            self.lists[name] = allowed_tags
848            return
849        # This is the tricky case. Check which keys can occur multiple
850        # times. If only one key is possible, and it can occur multiple
851        # times, then this is a list. If more than one key is possible,
852        # but none of them can occur multiple times, then this is a
853        # dictionary. Otherwise, this is a structure.
854        # In 'single' and 'multiple', we keep track which keys can occur
855        # only once, and which can occur multiple times.
856        single = []
857        multiple = []
858        # The 'count' function is called recursively to make sure all the
859        # children in this model are counted. Error keys are ignored;
860        # they raise an exception in Python.
861
862        def count(model):
863            quantifier, key, children = model[1:]
864            if key is None:
865                if quantifier in (
866                    expat.model.XML_CQUANT_PLUS,
867                    expat.model.XML_CQUANT_REP,
868                ):
869                    for child in children:
870                        multiple.append(child[2])
871                else:
872                    for child in children:
873                        count(child)
874            elif key.upper() != "ERROR":
875                if quantifier in (
876                    expat.model.XML_CQUANT_NONE,
877                    expat.model.XML_CQUANT_OPT,
878                ):
879                    single.append(key)
880                elif quantifier in (
881                    expat.model.XML_CQUANT_PLUS,
882                    expat.model.XML_CQUANT_REP,
883                ):
884                    multiple.append(key)
885
886        count(model)
887        if len(single) == 0 and len(multiple) == 1:
888            allowed_tags = frozenset(multiple)
889            self.lists[name] = allowed_tags
890        else:
891            allowed_tags = frozenset(single + multiple)
892            repeated_tags = frozenset(multiple)
893            self.dictionaries[name] = (allowed_tags, repeated_tags)
894
895    def open_dtd_file(self, filename):
896        """Open specified DTD file."""
897        path = os.path.join(DataHandler.local_dtd_dir, filename)
898        try:
899            handle = open(path, "rb")
900        except FileNotFoundError:
901            pass
902        else:
903            return handle
904        path = os.path.join(DataHandler.global_dtd_dir, filename)
905        try:
906            handle = open(path, "rb")
907        except FileNotFoundError:
908            pass
909        else:
910            return handle
911        return None
912
913    def open_xsd_file(self, filename):
914        """Open specified XSD file."""
915        path = os.path.join(DataHandler.local_xsd_dir, filename)
916        try:
917            handle = open(path, "rb")
918        except FileNotFoundError:
919            pass
920        else:
921            return handle
922        path = os.path.join(DataHandler.global_xsd_dir, filename)
923        try:
924            handle = open(path, "rb")
925        except FileNotFoundError:
926            pass
927        else:
928            return handle
929        return None
930
931    def save_dtd_file(self, filename, text):
932        """Save DTD file to cache."""
933        path = os.path.join(DataHandler.local_dtd_dir, filename)
934        try:
935            handle = open(path, "wb")
936        except OSError:
937            warnings.warn("Failed to save %s at %s" % (filename, path))
938        else:
939            handle.write(text)
940            handle.close()
941
942    def save_xsd_file(self, filename, text):
943        """Save XSD file to cache."""
944        path = os.path.join(DataHandler.local_xsd_dir, filename)
945        try:
946            handle = open(path, "wb")
947        except OSError:
948            warnings.warn("Failed to save %s at %s" % (filename, path))
949        else:
950            handle.write(text)
951            handle.close()
952
953    def externalEntityRefHandler(self, context, base, systemId, publicId):
954        """Handle external entity reference in order to cache DTD locally.
955
956        The purpose of this function is to load the DTD locally, instead
957        of downloading it from the URL specified in the XML. Using the local
958        DTD results in much faster parsing. If the DTD is not found locally,
959        we try to download it. If new DTDs become available from NCBI,
960        putting them in Bio/Entrez/DTDs will allow the parser to see them.
961        """
962        urlinfo = urlparse(systemId)
963        if urlinfo.scheme in ["http", "https", "ftp"]:
964            # Then this is an absolute path to the DTD.
965            url = systemId
966        elif urlinfo.scheme == "":
967            # Then this is a relative path to the DTD.
968            # Look at the parent URL to find the full path.
969            try:
970                source = self.dtd_urls[-1]
971            except IndexError:
972                # Assume the default URL for DTDs if the top parent
973                # does not contain an absolute path
974                source = "http://www.ncbi.nlm.nih.gov/dtd/"
975            else:
976                source = os.path.dirname(source)
977            # urls always have a forward slash, don't use os.path.join
978            url = source.rstrip("/") + "/" + systemId
979        else:
980            raise ValueError("Unexpected URL scheme %r" % urlinfo.scheme)
981        self.dtd_urls.append(url)
982        # First, try to load the local version of the DTD file
983        location, filename = os.path.split(systemId)
984        handle = self.open_dtd_file(filename)
985        if not handle:
986            # DTD is not available as a local file. Try accessing it through
987            # the internet instead.
988            try:
989                handle = urlopen(url)
990            except OSError:
991                raise RuntimeError(
992                    "Failed to access %s at %s" % (filename, url)
993                ) from None
994            text = handle.read()
995            handle.close()
996            self.save_dtd_file(filename, text)
997            handle = BytesIO(text)
998
999        parser = self.parser.ExternalEntityParserCreate(context)
1000        parser.ElementDeclHandler = self.elementDecl
1001        parser.ParseFile(handle)
1002        handle.close()
1003        self.dtd_urls.pop()
1004        self.parser.StartElementHandler = self.startElementHandler
1005        return 1
1006