1# Copyright 2010 by Andrea Pierleoni 2# Revisions copyright 2010, 2016 by Peter Cock 3# All rights reserved. 4# 5# This file is part of the Biopython distribution and governed by your 6# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 7# Please see the LICENSE file that should have been included as part of this 8# package. 9"""Bio.SeqIO support for the "uniprot-xml" file format. 10 11See Also: 12http://www.uniprot.org 13 14The UniProt XML format essentially replaces the old plain text file format 15originally introduced by SwissProt ("swiss" format in Bio.SeqIO). 16 17""" 18from xml.etree import ElementTree 19from xml.parsers.expat import errors 20 21from Bio import SeqFeature 22from Bio.Seq import Seq 23from Bio.SeqRecord import SeqRecord 24 25 26NS = "{http://uniprot.org/uniprot}" 27REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" 28 29 30def UniprotIterator(source, alphabet=None, return_raw_comments=False): 31 """Iterate over UniProt XML as SeqRecord objects. 32 33 parses an XML entry at a time from any UniProt XML file 34 returns a SeqRecord for each iteration 35 36 This generator can be used in Bio.SeqIO 37 38 Argument source is a file-like object or a path to a file. 39 40 Optional argument alphabet should not be used anymore. 41 42 return_raw_comments = True --> comment fields are returned as complete XML to allow further processing 43 skip_parsing_errors = True --> if parsing errors are found, skip to next entry 44 """ 45 if alphabet is not None: 46 raise ValueError("The alphabet argument is no longer supported") 47 try: 48 for event, elem in ElementTree.iterparse(source, events=("start", "end")): 49 if event == "end" and elem.tag == NS + "entry": 50 yield Parser(elem, return_raw_comments=return_raw_comments).parse() 51 elem.clear() 52 except ElementTree.ParseError as exception: 53 if errors.messages[exception.code] == errors.XML_ERROR_NO_ELEMENTS: 54 assert exception.position == (1, 0) # line 1, column 0 55 raise ValueError("Empty file.") from None 56 else: 57 raise 58 59 60class Parser: 61 """Parse a UniProt XML entry to a SeqRecord. 62 63 Optional argument alphabet is no longer used. 64 65 return_raw_comments=True to get back the complete comment field in XML format 66 """ 67 68 def __init__(self, elem, alphabet=None, return_raw_comments=False): 69 """Initialize the class.""" 70 if alphabet is not None: 71 raise ValueError("The alphabet argument is no longer supported") 72 self.entry = elem 73 self.return_raw_comments = return_raw_comments 74 75 def parse(self): 76 """Parse the input.""" 77 assert self.entry.tag == NS + "entry" 78 79 def append_to_annotations(key, value): 80 if key not in self.ParsedSeqRecord.annotations: 81 self.ParsedSeqRecord.annotations[key] = [] 82 if value not in self.ParsedSeqRecord.annotations[key]: 83 self.ParsedSeqRecord.annotations[key].append(value) 84 85 def _parse_name(element): 86 self.ParsedSeqRecord.name = element.text 87 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ":" + element.text) 88 89 def _parse_accession(element): 90 append_to_annotations( 91 "accessions", element.text 92 ) # to cope with SwissProt plain text parser 93 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ":" + element.text) 94 95 def _parse_protein(element): 96 """Parse protein names (PRIVATE).""" 97 descr_set = False 98 for protein_element in element: 99 if protein_element.tag in [ 100 NS + "recommendedName", 101 NS + "submittedName", 102 NS + "alternativeName", 103 ]: # recommendedName tag are parsed before 104 # use protein fields for name and description 105 for rec_name in protein_element: 106 ann_key = "%s_%s" % ( 107 protein_element.tag.replace(NS, ""), 108 rec_name.tag.replace(NS, ""), 109 ) 110 append_to_annotations(ann_key, rec_name.text) 111 if (rec_name.tag == NS + "fullName") and not descr_set: 112 self.ParsedSeqRecord.description = rec_name.text 113 descr_set = True 114 elif protein_element.tag == NS + "component": 115 pass # not parsed 116 elif protein_element.tag == NS + "domain": 117 pass # not parsed 118 119 def _parse_gene(element): 120 for genename_element in element: 121 if "type" in genename_element.attrib: 122 ann_key = "gene_%s_%s" % ( 123 genename_element.tag.replace(NS, ""), 124 genename_element.attrib["type"], 125 ) 126 if genename_element.attrib["type"] == "primary": 127 self.ParsedSeqRecord.annotations[ 128 ann_key 129 ] = genename_element.text 130 else: 131 append_to_annotations(ann_key, genename_element.text) 132 133 def _parse_geneLocation(element): 134 append_to_annotations("geneLocation", element.attrib["type"]) 135 136 def _parse_organism(element): 137 organism_name = com_name = sci_name = "" 138 for organism_element in element: 139 if organism_element.tag == NS + "name": 140 if organism_element.text: 141 if organism_element.attrib["type"] == "scientific": 142 sci_name = organism_element.text 143 elif organism_element.attrib["type"] == "common": 144 com_name = organism_element.text 145 else: 146 # e.g. synonym 147 append_to_annotations( 148 "organism_name", organism_element.text 149 ) 150 elif organism_element.tag == NS + "dbReference": 151 self.ParsedSeqRecord.dbxrefs.append( 152 organism_element.attrib["type"] 153 + ":" 154 + organism_element.attrib["id"] 155 ) 156 elif organism_element.tag == NS + "lineage": 157 for taxon_element in organism_element: 158 if taxon_element.tag == NS + "taxon": 159 append_to_annotations("taxonomy", taxon_element.text) 160 if sci_name and com_name: 161 organism_name = "%s (%s)" % (sci_name, com_name) 162 elif sci_name: 163 organism_name = sci_name 164 elif com_name: 165 organism_name = com_name 166 self.ParsedSeqRecord.annotations["organism"] = organism_name 167 168 def _parse_organismHost(element): 169 for organism_element in element: 170 if organism_element.tag == NS + "name": 171 append_to_annotations("organism_host", organism_element.text) 172 173 def _parse_keyword(element): 174 append_to_annotations("keywords", element.text) 175 176 def _parse_comment(element): 177 """Parse comments (PRIVATE). 178 179 Comment fields are very heterogeneus. each type has his own (frequently mutated) schema. 180 To store all the contained data, more complex data structures are needed, such as 181 annotated dictionaries. This is left to end user, by optionally setting: 182 183 return_raw_comments=True 184 185 The original XML is returned in the annotation fields. 186 187 Available comment types at december 2009: 188 - "allergen" 189 - "alternative products" 190 - "biotechnology" 191 - "biophysicochemical properties" 192 - "catalytic activity" 193 - "caution" 194 - "cofactor" 195 - "developmental stage" 196 - "disease" 197 - "domain" 198 - "disruption phenotype" 199 - "enzyme regulation" 200 - "function" 201 - "induction" 202 - "miscellaneous" 203 - "pathway" 204 - "pharmaceutical" 205 - "polymorphism" 206 - "PTM" 207 - "RNA editing" 208 - "similarity" 209 - "subcellular location" 210 - "sequence caution" 211 - "subunit" 212 - "tissue specificity" 213 - "toxic dose" 214 - "online information" 215 - "mass spectrometry" 216 - "interaction" 217 218 """ 219 simple_comments = [ 220 "allergen", 221 "biotechnology", 222 "biophysicochemical properties", 223 "catalytic activity", 224 "caution", 225 "cofactor", 226 "developmental stage", 227 "disease", 228 "domain", 229 "disruption phenotype", 230 "enzyme regulation", 231 "function", 232 "induction", 233 "miscellaneous", 234 "pathway", 235 "pharmaceutical", 236 "polymorphism", 237 "PTM", 238 "RNA editing", # positions not parsed 239 "similarity", 240 "subunit", 241 "tissue specificity", 242 "toxic dose", 243 ] 244 245 if element.attrib["type"] in simple_comments: 246 ann_key = "comment_%s" % element.attrib["type"].replace(" ", "") 247 for text_element in element.iter(NS + "text"): 248 if text_element.text: 249 append_to_annotations(ann_key, text_element.text) 250 elif element.attrib["type"] == "subcellular location": 251 for subloc_element in element.iter(NS + "subcellularLocation"): 252 for el in subloc_element: 253 if el.text: 254 ann_key = "comment_%s_%s" % ( 255 element.attrib["type"].replace(" ", ""), 256 el.tag.replace(NS, ""), 257 ) 258 append_to_annotations(ann_key, el.text) 259 elif element.attrib["type"] == "interaction": 260 for interact_element in element.iter(NS + "interactant"): 261 ann_key = "comment_%s_intactId" % element.attrib["type"] 262 append_to_annotations(ann_key, interact_element.attrib["intactId"]) 263 elif element.attrib["type"] == "alternative products": 264 for alt_element in element.iter(NS + "isoform"): 265 ann_key = "comment_%s_isoform" % element.attrib["type"].replace( 266 " ", "" 267 ) 268 for id_element in alt_element.iter(NS + "id"): 269 append_to_annotations(ann_key, id_element.text) 270 elif element.attrib["type"] == "mass spectrometry": 271 ann_key = "comment_%s" % element.attrib["type"].replace(" ", "") 272 start = end = 0 273 for el in element.iter(NS + "location"): 274 pos_els = list(el.iter(NS + "position")) 275 # this try should be avoided, maybe it is safer to skip position parsing for mass spectrometry 276 try: 277 if pos_els: 278 end = int(pos_els[0].attrib["position"]) 279 start = end - 1 280 else: 281 start = int(next(el.iter(NS + "begin")).attrib["position"]) 282 start -= 1 283 end = int(next(el.iter(NS + "end")).attrib["position"]) 284 except (ValueError, KeyError): 285 # undefined positions or erroneously mapped 286 pass 287 mass = element.attrib["mass"] 288 method = element.attrib["method"] 289 if start == end == 0: 290 append_to_annotations(ann_key, "undefined:%s|%s" % (mass, method)) 291 else: 292 append_to_annotations( 293 ann_key, "%s..%s:%s|%s" % (start, end, mass, method) 294 ) 295 elif element.attrib["type"] == "sequence caution": 296 pass # not parsed: few information, complex structure 297 elif element.attrib["type"] == "online information": 298 for link_element in element.iter(NS + "link"): 299 ann_key = "comment_%s" % element.attrib["type"].replace(" ", "") 300 for id_element in link_element.iter(NS + "link"): 301 append_to_annotations( 302 ann_key, 303 "%s@%s" 304 % (element.attrib["name"], link_element.attrib["uri"]), 305 ) 306 307 # return raw XML comments if needed 308 if self.return_raw_comments: 309 ann_key = "comment_%s_xml" % element.attrib["type"].replace(" ", "") 310 append_to_annotations(ann_key, ElementTree.tostring(element)) 311 312 def _parse_dbReference(element): 313 self.ParsedSeqRecord.dbxrefs.append( 314 element.attrib["type"] + ":" + element.attrib["id"] 315 ) 316 # e.g. 317 # <dbReference type="PDB" key="11" id="2GEZ"> 318 # <property value="X-ray" type="method"/> 319 # <property value="2.60 A" type="resolution"/> 320 # <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/> 321 # </dbReference> 322 if "type" in element.attrib: 323 if element.attrib["type"] == "PDB": 324 method = "" 325 resolution = "" 326 for ref_element in element: 327 if ref_element.tag == NS + "property": 328 dat_type = ref_element.attrib["type"] 329 if dat_type == "method": 330 method = ref_element.attrib["value"] 331 if dat_type == "resolution": 332 resolution = ref_element.attrib["value"] 333 if dat_type == "chains": 334 pairs = ref_element.attrib["value"].split(",") 335 for elem in pairs: 336 pair = elem.strip().split("=") 337 if pair[1] != "-": 338 # TODO - How best to store these, do SeqFeatures make sense? 339 feature = SeqFeature.SeqFeature() 340 feature.type = element.attrib["type"] 341 feature.qualifiers["name"] = element.attrib[ 342 "id" 343 ] 344 feature.qualifiers["method"] = method 345 feature.qualifiers["resolution"] = resolution 346 feature.qualifiers["chains"] = pair[0].split( 347 "/" 348 ) 349 start = int(pair[1].split("-")[0]) - 1 350 end = int(pair[1].split("-")[1]) 351 feature.location = SeqFeature.FeatureLocation( 352 start, end 353 ) 354 # self.ParsedSeqRecord.features.append(feature) 355 356 for ref_element in element: 357 if ref_element.tag == NS + "property": 358 pass # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs 359 360 def _parse_reference(element): 361 reference = SeqFeature.Reference() 362 authors = [] 363 scopes = [] 364 tissues = [] 365 journal_name = "" 366 pub_type = "" 367 pub_date = "" 368 for ref_element in element: 369 if ref_element.tag == NS + "citation": 370 pub_type = ref_element.attrib["type"] 371 if pub_type == "submission": 372 pub_type += " to the " + ref_element.attrib["db"] 373 if "name" in ref_element.attrib: 374 journal_name = ref_element.attrib["name"] 375 pub_date = ref_element.attrib.get("date", "") 376 j_volume = ref_element.attrib.get("volume", "") 377 j_first = ref_element.attrib.get("first", "") 378 j_last = ref_element.attrib.get("last", "") 379 for cit_element in ref_element: 380 if cit_element.tag == NS + "title": 381 reference.title = cit_element.text 382 elif cit_element.tag == NS + "authorList": 383 for person_element in cit_element: 384 authors.append(person_element.attrib["name"]) 385 elif cit_element.tag == NS + "dbReference": 386 self.ParsedSeqRecord.dbxrefs.append( 387 cit_element.attrib["type"] 388 + ":" 389 + cit_element.attrib["id"] 390 ) 391 if cit_element.attrib["type"] == "PubMed": 392 reference.pubmed_id = cit_element.attrib["id"] 393 elif ref_element.attrib["type"] == "MEDLINE": 394 reference.medline_id = cit_element.attrib["id"] 395 elif ref_element.tag == NS + "scope": 396 scopes.append(ref_element.text) 397 elif ref_element.tag == NS + "source": 398 for source_element in ref_element: 399 if source_element.tag == NS + "tissue": 400 tissues.append(source_element.text) 401 if scopes: 402 scopes_str = "Scope: " + ", ".join(scopes) 403 else: 404 scopes_str = "" 405 if tissues: 406 tissues_str = "Tissue: " + ", ".join(tissues) 407 else: 408 tissues_str = "" 409 410 # locations cannot be parsed since they are actually written in 411 # free text inside scopes so all the references are put in the 412 # annotation. 413 reference.location = [] 414 reference.authors = ", ".join(authors) 415 if journal_name: 416 if pub_date and j_volume and j_first and j_last: 417 reference.journal = REFERENCE_JOURNAL % { 418 "name": journal_name, 419 "volume": j_volume, 420 "first": j_first, 421 "last": j_last, 422 "pub_date": pub_date, 423 } 424 else: 425 reference.journal = journal_name 426 reference.comment = " | ".join( 427 (pub_type, pub_date, scopes_str, tissues_str) 428 ) 429 append_to_annotations("references", reference) 430 431 def _parse_position(element, offset=0): 432 try: 433 position = int(element.attrib["position"]) + offset 434 except KeyError: 435 position = None 436 status = element.attrib.get("status", "") 437 if status == "unknown": 438 assert position is None 439 return SeqFeature.UnknownPosition() 440 elif not status: 441 return SeqFeature.ExactPosition(position) 442 elif status == "greater than": 443 return SeqFeature.AfterPosition(position) 444 elif status == "less than": 445 return SeqFeature.BeforePosition(position) 446 elif status == "uncertain": 447 return SeqFeature.UncertainPosition(position) 448 else: 449 raise NotImplementedError("Position status %r" % status) 450 451 def _parse_feature(element): 452 feature = SeqFeature.SeqFeature() 453 for k, v in element.attrib.items(): 454 feature.qualifiers[k] = v 455 feature.type = element.attrib.get("type", "") 456 if "id" in element.attrib: 457 feature.id = element.attrib["id"] 458 for feature_element in element: 459 if feature_element.tag == NS + "location": 460 position_elements = feature_element.findall(NS + "position") 461 if position_elements: 462 element = position_elements[0] 463 start_position = _parse_position(element, -1) 464 end_position = _parse_position(element) 465 else: 466 element = feature_element.findall(NS + "begin")[0] 467 start_position = _parse_position(element, -1) 468 element = feature_element.findall(NS + "end")[0] 469 end_position = _parse_position(element) 470 feature.location = SeqFeature.FeatureLocation( 471 start_position, end_position 472 ) 473 else: 474 try: 475 feature.qualifiers[ 476 feature_element.tag.replace(NS, "") 477 ] = feature_element.text 478 except Exception: # TODO - Which exceptions? 479 pass # skip unparsable tag 480 self.ParsedSeqRecord.features.append(feature) 481 482 def _parse_proteinExistence(element): 483 append_to_annotations("proteinExistence", element.attrib["type"]) 484 485 def _parse_evidence(element): 486 for k, v in element.attrib.items(): 487 ann_key = k 488 append_to_annotations(ann_key, v) 489 490 def _parse_sequence(element): 491 for k, v in element.attrib.items(): 492 if k in ("length", "mass", "version"): 493 self.ParsedSeqRecord.annotations["sequence_%s" % k] = int(v) 494 else: 495 self.ParsedSeqRecord.annotations["sequence_%s" % k] = v 496 self.ParsedSeqRecord.seq = Seq("".join(element.text.split())) 497 self.ParsedSeqRecord.annotations["molecule_type"] = "protein" 498 499 # ============================================# 500 # Initialize SeqRecord 501 self.ParsedSeqRecord = SeqRecord("", id="") 502 503 # Entry attribs parsing 504 # Unknown dataset should not happen! 505 self.dbname = self.entry.attrib.get("dataset", "UnknownDataset") 506 # add attribs to annotations 507 for k, v in self.entry.attrib.items(): 508 if k in ("version"): 509 # original 510 # self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v) 511 # To cope with swissProt plain text parser. this can cause errors 512 # if the attrib has the same name of an other annotation 513 self.ParsedSeqRecord.annotations[k] = int(v) 514 else: 515 # self.ParsedSeqRecord.annotations["entry_%s" % k] = v 516 # to cope with swissProt plain text parser: 517 self.ParsedSeqRecord.annotations[k] = v 518 519 # Top-to-bottom entry children parsing 520 for element in self.entry: 521 if element.tag == NS + "name": 522 _parse_name(element) 523 elif element.tag == NS + "accession": 524 _parse_accession(element) 525 elif element.tag == NS + "protein": 526 _parse_protein(element) 527 elif element.tag == NS + "gene": 528 _parse_gene(element) 529 elif element.tag == NS + "geneLocation": 530 _parse_geneLocation(element) 531 elif element.tag == NS + "organism": 532 _parse_organism(element) 533 elif element.tag == NS + "organismHost": 534 _parse_organismHost(element) 535 elif element.tag == NS + "keyword": 536 _parse_keyword(element) 537 elif element.tag == NS + "comment": 538 _parse_comment(element) 539 elif element.tag == NS + "dbReference": 540 _parse_dbReference(element) 541 elif element.tag == NS + "reference": 542 _parse_reference(element) 543 elif element.tag == NS + "feature": 544 _parse_feature(element) 545 elif element.tag == NS + "proteinExistence": 546 _parse_proteinExistence(element) 547 elif element.tag == NS + "evidence": 548 _parse_evidence(element) 549 elif element.tag == NS + "sequence": 550 _parse_sequence(element) 551 else: 552 pass 553 554 # remove duplicate dbxrefs 555 self.ParsedSeqRecord.dbxrefs = sorted(set(self.ParsedSeqRecord.dbxrefs)) 556 557 # use first accession as id 558 if not self.ParsedSeqRecord.id: 559 self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations["accessions"][0] 560 561 return self.ParsedSeqRecord 562