1# Copyright 2010 by Andrea Pierleoni
2# Revisions copyright 2010, 2016 by Peter Cock
3# All rights reserved.
4#
5# This file is part of the Biopython distribution and governed by your
6# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
7# Please see the LICENSE file that should have been included as part of this
8# package.
9"""Bio.SeqIO support for the "uniprot-xml" file format.
10
11See Also:
12http://www.uniprot.org
13
14The UniProt XML format essentially replaces the old plain text file format
15originally introduced by SwissProt ("swiss" format in Bio.SeqIO).
16
17"""
18from xml.etree import ElementTree
19from xml.parsers.expat import errors
20
21from Bio import SeqFeature
22from Bio.Seq import Seq
23from Bio.SeqRecord import SeqRecord
24
25
26NS = "{http://uniprot.org/uniprot}"
27REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)"
28
29
30def UniprotIterator(source, alphabet=None, return_raw_comments=False):
31    """Iterate over UniProt XML as SeqRecord objects.
32
33    parses an XML entry at a time from any UniProt XML file
34    returns a SeqRecord for each iteration
35
36    This generator can be used in Bio.SeqIO
37
38    Argument source is a file-like object or a path to a file.
39
40    Optional argument alphabet should not be used anymore.
41
42    return_raw_comments = True --> comment fields are returned as complete XML to allow further processing
43    skip_parsing_errors = True --> if parsing errors are found, skip to next entry
44    """
45    if alphabet is not None:
46        raise ValueError("The alphabet argument is no longer supported")
47    try:
48        for event, elem in ElementTree.iterparse(source, events=("start", "end")):
49            if event == "end" and elem.tag == NS + "entry":
50                yield Parser(elem, return_raw_comments=return_raw_comments).parse()
51                elem.clear()
52    except ElementTree.ParseError as exception:
53        if errors.messages[exception.code] == errors.XML_ERROR_NO_ELEMENTS:
54            assert exception.position == (1, 0)  # line 1, column 0
55            raise ValueError("Empty file.") from None
56        else:
57            raise
58
59
60class Parser:
61    """Parse a UniProt XML entry to a SeqRecord.
62
63    Optional argument alphabet is no longer used.
64
65    return_raw_comments=True to get back the complete comment field in XML format
66    """
67
68    def __init__(self, elem, alphabet=None, return_raw_comments=False):
69        """Initialize the class."""
70        if alphabet is not None:
71            raise ValueError("The alphabet argument is no longer supported")
72        self.entry = elem
73        self.return_raw_comments = return_raw_comments
74
75    def parse(self):
76        """Parse the input."""
77        assert self.entry.tag == NS + "entry"
78
79        def append_to_annotations(key, value):
80            if key not in self.ParsedSeqRecord.annotations:
81                self.ParsedSeqRecord.annotations[key] = []
82            if value not in self.ParsedSeqRecord.annotations[key]:
83                self.ParsedSeqRecord.annotations[key].append(value)
84
85        def _parse_name(element):
86            self.ParsedSeqRecord.name = element.text
87            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ":" + element.text)
88
89        def _parse_accession(element):
90            append_to_annotations(
91                "accessions", element.text
92            )  # to cope with SwissProt plain text parser
93            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ":" + element.text)
94
95        def _parse_protein(element):
96            """Parse protein names (PRIVATE)."""
97            descr_set = False
98            for protein_element in element:
99                if protein_element.tag in [
100                    NS + "recommendedName",
101                    NS + "submittedName",
102                    NS + "alternativeName",
103                ]:  # recommendedName tag are parsed before
104                    # use protein fields for name and description
105                    for rec_name in protein_element:
106                        ann_key = "%s_%s" % (
107                            protein_element.tag.replace(NS, ""),
108                            rec_name.tag.replace(NS, ""),
109                        )
110                        append_to_annotations(ann_key, rec_name.text)
111                        if (rec_name.tag == NS + "fullName") and not descr_set:
112                            self.ParsedSeqRecord.description = rec_name.text
113                            descr_set = True
114                elif protein_element.tag == NS + "component":
115                    pass  # not parsed
116                elif protein_element.tag == NS + "domain":
117                    pass  # not parsed
118
119        def _parse_gene(element):
120            for genename_element in element:
121                if "type" in genename_element.attrib:
122                    ann_key = "gene_%s_%s" % (
123                        genename_element.tag.replace(NS, ""),
124                        genename_element.attrib["type"],
125                    )
126                    if genename_element.attrib["type"] == "primary":
127                        self.ParsedSeqRecord.annotations[
128                            ann_key
129                        ] = genename_element.text
130                    else:
131                        append_to_annotations(ann_key, genename_element.text)
132
133        def _parse_geneLocation(element):
134            append_to_annotations("geneLocation", element.attrib["type"])
135
136        def _parse_organism(element):
137            organism_name = com_name = sci_name = ""
138            for organism_element in element:
139                if organism_element.tag == NS + "name":
140                    if organism_element.text:
141                        if organism_element.attrib["type"] == "scientific":
142                            sci_name = organism_element.text
143                        elif organism_element.attrib["type"] == "common":
144                            com_name = organism_element.text
145                        else:
146                            # e.g. synonym
147                            append_to_annotations(
148                                "organism_name", organism_element.text
149                            )
150                elif organism_element.tag == NS + "dbReference":
151                    self.ParsedSeqRecord.dbxrefs.append(
152                        organism_element.attrib["type"]
153                        + ":"
154                        + organism_element.attrib["id"]
155                    )
156                elif organism_element.tag == NS + "lineage":
157                    for taxon_element in organism_element:
158                        if taxon_element.tag == NS + "taxon":
159                            append_to_annotations("taxonomy", taxon_element.text)
160            if sci_name and com_name:
161                organism_name = "%s (%s)" % (sci_name, com_name)
162            elif sci_name:
163                organism_name = sci_name
164            elif com_name:
165                organism_name = com_name
166            self.ParsedSeqRecord.annotations["organism"] = organism_name
167
168        def _parse_organismHost(element):
169            for organism_element in element:
170                if organism_element.tag == NS + "name":
171                    append_to_annotations("organism_host", organism_element.text)
172
173        def _parse_keyword(element):
174            append_to_annotations("keywords", element.text)
175
176        def _parse_comment(element):
177            """Parse comments (PRIVATE).
178
179            Comment fields are very heterogeneus. each type has his own (frequently mutated) schema.
180            To store all the contained data, more complex data structures are needed, such as
181            annotated dictionaries. This is left to end user, by optionally setting:
182
183            return_raw_comments=True
184
185            The original XML is returned in the annotation fields.
186
187            Available comment types at december 2009:
188             - "allergen"
189             - "alternative products"
190             - "biotechnology"
191             - "biophysicochemical properties"
192             - "catalytic activity"
193             - "caution"
194             - "cofactor"
195             - "developmental stage"
196             - "disease"
197             - "domain"
198             - "disruption phenotype"
199             - "enzyme regulation"
200             - "function"
201             - "induction"
202             - "miscellaneous"
203             - "pathway"
204             - "pharmaceutical"
205             - "polymorphism"
206             - "PTM"
207             - "RNA editing"
208             - "similarity"
209             - "subcellular location"
210             - "sequence caution"
211             - "subunit"
212             - "tissue specificity"
213             - "toxic dose"
214             - "online information"
215             - "mass spectrometry"
216             - "interaction"
217
218            """
219            simple_comments = [
220                "allergen",
221                "biotechnology",
222                "biophysicochemical properties",
223                "catalytic activity",
224                "caution",
225                "cofactor",
226                "developmental stage",
227                "disease",
228                "domain",
229                "disruption phenotype",
230                "enzyme regulation",
231                "function",
232                "induction",
233                "miscellaneous",
234                "pathway",
235                "pharmaceutical",
236                "polymorphism",
237                "PTM",
238                "RNA editing",  # positions not parsed
239                "similarity",
240                "subunit",
241                "tissue specificity",
242                "toxic dose",
243            ]
244
245            if element.attrib["type"] in simple_comments:
246                ann_key = "comment_%s" % element.attrib["type"].replace(" ", "")
247                for text_element in element.iter(NS + "text"):
248                    if text_element.text:
249                        append_to_annotations(ann_key, text_element.text)
250            elif element.attrib["type"] == "subcellular location":
251                for subloc_element in element.iter(NS + "subcellularLocation"):
252                    for el in subloc_element:
253                        if el.text:
254                            ann_key = "comment_%s_%s" % (
255                                element.attrib["type"].replace(" ", ""),
256                                el.tag.replace(NS, ""),
257                            )
258                            append_to_annotations(ann_key, el.text)
259            elif element.attrib["type"] == "interaction":
260                for interact_element in element.iter(NS + "interactant"):
261                    ann_key = "comment_%s_intactId" % element.attrib["type"]
262                    append_to_annotations(ann_key, interact_element.attrib["intactId"])
263            elif element.attrib["type"] == "alternative products":
264                for alt_element in element.iter(NS + "isoform"):
265                    ann_key = "comment_%s_isoform" % element.attrib["type"].replace(
266                        " ", ""
267                    )
268                    for id_element in alt_element.iter(NS + "id"):
269                        append_to_annotations(ann_key, id_element.text)
270            elif element.attrib["type"] == "mass spectrometry":
271                ann_key = "comment_%s" % element.attrib["type"].replace(" ", "")
272                start = end = 0
273                for el in element.iter(NS + "location"):
274                    pos_els = list(el.iter(NS + "position"))
275                    # this try should be avoided, maybe it is safer to skip position parsing for mass spectrometry
276                    try:
277                        if pos_els:
278                            end = int(pos_els[0].attrib["position"])
279                            start = end - 1
280                        else:
281                            start = int(next(el.iter(NS + "begin")).attrib["position"])
282                            start -= 1
283                            end = int(next(el.iter(NS + "end")).attrib["position"])
284                    except (ValueError, KeyError):
285                        # undefined positions or erroneously mapped
286                        pass
287                mass = element.attrib["mass"]
288                method = element.attrib["method"]
289                if start == end == 0:
290                    append_to_annotations(ann_key, "undefined:%s|%s" % (mass, method))
291                else:
292                    append_to_annotations(
293                        ann_key, "%s..%s:%s|%s" % (start, end, mass, method)
294                    )
295            elif element.attrib["type"] == "sequence caution":
296                pass  # not parsed: few information, complex structure
297            elif element.attrib["type"] == "online information":
298                for link_element in element.iter(NS + "link"):
299                    ann_key = "comment_%s" % element.attrib["type"].replace(" ", "")
300                    for id_element in link_element.iter(NS + "link"):
301                        append_to_annotations(
302                            ann_key,
303                            "%s@%s"
304                            % (element.attrib["name"], link_element.attrib["uri"]),
305                        )
306
307            # return raw XML comments if needed
308            if self.return_raw_comments:
309                ann_key = "comment_%s_xml" % element.attrib["type"].replace(" ", "")
310                append_to_annotations(ann_key, ElementTree.tostring(element))
311
312        def _parse_dbReference(element):
313            self.ParsedSeqRecord.dbxrefs.append(
314                element.attrib["type"] + ":" + element.attrib["id"]
315            )
316            # e.g.
317            # <dbReference type="PDB" key="11" id="2GEZ">
318            #   <property value="X-ray" type="method"/>
319            #   <property value="2.60 A" type="resolution"/>
320            #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
321            # </dbReference>
322            if "type" in element.attrib:
323                if element.attrib["type"] == "PDB":
324                    method = ""
325                    resolution = ""
326                    for ref_element in element:
327                        if ref_element.tag == NS + "property":
328                            dat_type = ref_element.attrib["type"]
329                            if dat_type == "method":
330                                method = ref_element.attrib["value"]
331                            if dat_type == "resolution":
332                                resolution = ref_element.attrib["value"]
333                            if dat_type == "chains":
334                                pairs = ref_element.attrib["value"].split(",")
335                                for elem in pairs:
336                                    pair = elem.strip().split("=")
337                                    if pair[1] != "-":
338                                        # TODO - How best to store these, do SeqFeatures make sense?
339                                        feature = SeqFeature.SeqFeature()
340                                        feature.type = element.attrib["type"]
341                                        feature.qualifiers["name"] = element.attrib[
342                                            "id"
343                                        ]
344                                        feature.qualifiers["method"] = method
345                                        feature.qualifiers["resolution"] = resolution
346                                        feature.qualifiers["chains"] = pair[0].split(
347                                            "/"
348                                        )
349                                        start = int(pair[1].split("-")[0]) - 1
350                                        end = int(pair[1].split("-")[1])
351                                        feature.location = SeqFeature.FeatureLocation(
352                                            start, end
353                                        )
354                                        # self.ParsedSeqRecord.features.append(feature)
355
356            for ref_element in element:
357                if ref_element.tag == NS + "property":
358                    pass  # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs
359
360        def _parse_reference(element):
361            reference = SeqFeature.Reference()
362            authors = []
363            scopes = []
364            tissues = []
365            journal_name = ""
366            pub_type = ""
367            pub_date = ""
368            for ref_element in element:
369                if ref_element.tag == NS + "citation":
370                    pub_type = ref_element.attrib["type"]
371                    if pub_type == "submission":
372                        pub_type += " to the " + ref_element.attrib["db"]
373                    if "name" in ref_element.attrib:
374                        journal_name = ref_element.attrib["name"]
375                    pub_date = ref_element.attrib.get("date", "")
376                    j_volume = ref_element.attrib.get("volume", "")
377                    j_first = ref_element.attrib.get("first", "")
378                    j_last = ref_element.attrib.get("last", "")
379                    for cit_element in ref_element:
380                        if cit_element.tag == NS + "title":
381                            reference.title = cit_element.text
382                        elif cit_element.tag == NS + "authorList":
383                            for person_element in cit_element:
384                                authors.append(person_element.attrib["name"])
385                        elif cit_element.tag == NS + "dbReference":
386                            self.ParsedSeqRecord.dbxrefs.append(
387                                cit_element.attrib["type"]
388                                + ":"
389                                + cit_element.attrib["id"]
390                            )
391                            if cit_element.attrib["type"] == "PubMed":
392                                reference.pubmed_id = cit_element.attrib["id"]
393                            elif ref_element.attrib["type"] == "MEDLINE":
394                                reference.medline_id = cit_element.attrib["id"]
395                elif ref_element.tag == NS + "scope":
396                    scopes.append(ref_element.text)
397                elif ref_element.tag == NS + "source":
398                    for source_element in ref_element:
399                        if source_element.tag == NS + "tissue":
400                            tissues.append(source_element.text)
401            if scopes:
402                scopes_str = "Scope: " + ", ".join(scopes)
403            else:
404                scopes_str = ""
405            if tissues:
406                tissues_str = "Tissue: " + ", ".join(tissues)
407            else:
408                tissues_str = ""
409
410            # locations cannot be parsed since they are actually written in
411            # free text inside scopes so all the references are put in the
412            # annotation.
413            reference.location = []
414            reference.authors = ", ".join(authors)
415            if journal_name:
416                if pub_date and j_volume and j_first and j_last:
417                    reference.journal = REFERENCE_JOURNAL % {
418                        "name": journal_name,
419                        "volume": j_volume,
420                        "first": j_first,
421                        "last": j_last,
422                        "pub_date": pub_date,
423                    }
424                else:
425                    reference.journal = journal_name
426            reference.comment = " | ".join(
427                (pub_type, pub_date, scopes_str, tissues_str)
428            )
429            append_to_annotations("references", reference)
430
431        def _parse_position(element, offset=0):
432            try:
433                position = int(element.attrib["position"]) + offset
434            except KeyError:
435                position = None
436            status = element.attrib.get("status", "")
437            if status == "unknown":
438                assert position is None
439                return SeqFeature.UnknownPosition()
440            elif not status:
441                return SeqFeature.ExactPosition(position)
442            elif status == "greater than":
443                return SeqFeature.AfterPosition(position)
444            elif status == "less than":
445                return SeqFeature.BeforePosition(position)
446            elif status == "uncertain":
447                return SeqFeature.UncertainPosition(position)
448            else:
449                raise NotImplementedError("Position status %r" % status)
450
451        def _parse_feature(element):
452            feature = SeqFeature.SeqFeature()
453            for k, v in element.attrib.items():
454                feature.qualifiers[k] = v
455            feature.type = element.attrib.get("type", "")
456            if "id" in element.attrib:
457                feature.id = element.attrib["id"]
458            for feature_element in element:
459                if feature_element.tag == NS + "location":
460                    position_elements = feature_element.findall(NS + "position")
461                    if position_elements:
462                        element = position_elements[0]
463                        start_position = _parse_position(element, -1)
464                        end_position = _parse_position(element)
465                    else:
466                        element = feature_element.findall(NS + "begin")[0]
467                        start_position = _parse_position(element, -1)
468                        element = feature_element.findall(NS + "end")[0]
469                        end_position = _parse_position(element)
470                    feature.location = SeqFeature.FeatureLocation(
471                        start_position, end_position
472                    )
473                else:
474                    try:
475                        feature.qualifiers[
476                            feature_element.tag.replace(NS, "")
477                        ] = feature_element.text
478                    except Exception:  # TODO - Which exceptions?
479                        pass  # skip unparsable tag
480            self.ParsedSeqRecord.features.append(feature)
481
482        def _parse_proteinExistence(element):
483            append_to_annotations("proteinExistence", element.attrib["type"])
484
485        def _parse_evidence(element):
486            for k, v in element.attrib.items():
487                ann_key = k
488                append_to_annotations(ann_key, v)
489
490        def _parse_sequence(element):
491            for k, v in element.attrib.items():
492                if k in ("length", "mass", "version"):
493                    self.ParsedSeqRecord.annotations["sequence_%s" % k] = int(v)
494                else:
495                    self.ParsedSeqRecord.annotations["sequence_%s" % k] = v
496            self.ParsedSeqRecord.seq = Seq("".join(element.text.split()))
497            self.ParsedSeqRecord.annotations["molecule_type"] = "protein"
498
499        # ============================================#
500        # Initialize SeqRecord
501        self.ParsedSeqRecord = SeqRecord("", id="")
502
503        # Entry attribs parsing
504        # Unknown dataset should not happen!
505        self.dbname = self.entry.attrib.get("dataset", "UnknownDataset")
506        # add attribs to annotations
507        for k, v in self.entry.attrib.items():
508            if k in ("version"):
509                # original
510                # self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v)
511                # To cope with swissProt plain text parser. this can cause errors
512                # if the attrib has the same name of an other annotation
513                self.ParsedSeqRecord.annotations[k] = int(v)
514            else:
515                # self.ParsedSeqRecord.annotations["entry_%s" % k] = v
516                # to cope with swissProt plain text parser:
517                self.ParsedSeqRecord.annotations[k] = v
518
519        # Top-to-bottom entry children parsing
520        for element in self.entry:
521            if element.tag == NS + "name":
522                _parse_name(element)
523            elif element.tag == NS + "accession":
524                _parse_accession(element)
525            elif element.tag == NS + "protein":
526                _parse_protein(element)
527            elif element.tag == NS + "gene":
528                _parse_gene(element)
529            elif element.tag == NS + "geneLocation":
530                _parse_geneLocation(element)
531            elif element.tag == NS + "organism":
532                _parse_organism(element)
533            elif element.tag == NS + "organismHost":
534                _parse_organismHost(element)
535            elif element.tag == NS + "keyword":
536                _parse_keyword(element)
537            elif element.tag == NS + "comment":
538                _parse_comment(element)
539            elif element.tag == NS + "dbReference":
540                _parse_dbReference(element)
541            elif element.tag == NS + "reference":
542                _parse_reference(element)
543            elif element.tag == NS + "feature":
544                _parse_feature(element)
545            elif element.tag == NS + "proteinExistence":
546                _parse_proteinExistence(element)
547            elif element.tag == NS + "evidence":
548                _parse_evidence(element)
549            elif element.tag == NS + "sequence":
550                _parse_sequence(element)
551            else:
552                pass
553
554        # remove duplicate dbxrefs
555        self.ParsedSeqRecord.dbxrefs = sorted(set(self.ParsedSeqRecord.dbxrefs))
556
557        # use first accession as id
558        if not self.ParsedSeqRecord.id:
559            self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations["accessions"][0]
560
561        return self.ParsedSeqRecord
562