# Copyright 2000 by Jeffrey Chang, Brad Chapman. All rights reserved.
# Copyright 2006-2017 by Peter Cock. All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.

"""Code to work with GenBank formatted files.

Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
the "genbank" or "embl" format names to parse GenBank or EMBL files into
SeqRecord and SeqFeature objects (see the Biopython tutorial for details).

Using Bio.GenBank directly to parse GenBank files is only useful if you want
to obtain GenBank-specific Record objects, which is a much closer
representation to the raw file contents than the SeqRecord alternative from
the FeatureParser (used in Bio.SeqIO).

To use the Bio.GenBank parser, there are two helper functions:

    - read                  Parse a handle containing a single GenBank record
      as Bio.GenBank specific Record objects.
    - parse                 Iterate over a handle containing multiple GenBank
      records as Bio.GenBank specific Record objects.

The following internal classes are not intended for direct use and may
be deprecated in a future release.

Classes:
 - Iterator              Iterate through a file of GenBank entries
 - ErrorFeatureParser    Catch errors caused during parsing.
 - FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects.
 - RecordParser          Parse GenBank data into a Record object.

Exceptions:
 - ParserFailureError    Exception indicating a failure in the parser (ie.
   scanner or consumer)
 - LocationParserError   Exception indicating a problem with the spark based
   location parser.

"""

import re
import warnings

from Bio import BiopythonParserWarning
from Bio.Seq import Seq
from Bio import SeqFeature

# other Bio.GenBank stuff
from .utils import FeatureValueCleaner
from .Scanner import GenBankScanner


# Constants used to parse GenBank header lines
GENBANK_INDENT = 12
GENBANK_SPACER = " " * GENBANK_INDENT

# Constants for parsing GenBank feature lines
FEATURE_KEY_INDENT = 5
FEATURE_QUALIFIER_INDENT = 21
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT

# Regular expressions for location parsing.
# These build up from simple position forms to the full complex location
# grammar; the module-level asserts below act as import-time self tests
# of the patterns.

# e.g. "123", "<123" or ">123" (possibly fuzzy single position)
_solo_location = r"[<>]?\d+"
# e.g. "123..456", with optional fuzzy ends
_pair_location = r"[<>]?\d+\.\.[<>]?\d+"
# e.g. "123^124", a zero-length position between two bases
_between_location = r"\d+\^\d+"

# e.g. "(3.9)" meaning somewhere between base 3 and base 9
_within_position = r"\(\d+\.\d+\)"
_re_within_position = re.compile(_within_position)
_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (
    _within_position,
    _within_position,
)
assert _re_within_position.match("(3.9)")
assert re.compile(_within_location).match("(3.9)..10")
assert re.compile(_within_location).match("26..(30.33)")
assert re.compile(_within_location).match("(13.19)..(20.28)")

# e.g. "one-of(6,9)" meaning one of the listed positions (two or more)
_oneof_position = r"one\-of\(\d+(,\d+)+\)"
_re_oneof_position = re.compile(_oneof_position)
_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position)
assert _re_oneof_position.match("one-of(6,9)")
assert re.compile(_oneof_location).match("one-of(6,9)..101")
assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)")
assert re.compile(_oneof_location).match("6..one-of(101,104)")

assert not _re_oneof_position.match("one-of(3)")
assert _re_oneof_position.match("one-of(3,6)")
assert _re_oneof_position.match("one-of(3,6,9)")


# Exact start..end pair, and join/order/bond compounds of such pairs only
_simple_location = r"\d+\.\.\d+"
_re_simple_location = re.compile(r"^%s$" % _simple_location)
_re_simple_compound = re.compile(
    r"^(join|order|bond)\(%s(,%s)*\)$" % (_simple_location, _simple_location)
)
# Any single location form, optionally prefixed with an external
# reference like "AL121804.2:" (accession, possibly with dots/pipes)
_complex_location = r"([a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)?(%s|%s|%s|%s|%s)" % (
    _pair_location,
    _solo_location,
    _between_location,
    _within_location,
    _oneof_location,
)
_re_complex_location = re.compile(r"^%s$" % _complex_location)
_possibly_complemented_complex_location = r"(%s|complement\(%s\))" % (
    _complex_location,
    _complex_location,
)
_re_complex_compound = re.compile(
    r"^(join|order|bond)\(%s(,%s)*\)$"
    % (_possibly_complemented_complex_location, _possibly_complemented_complex_location)
)


assert _re_simple_location.match("104..160")
assert not _re_simple_location.match("68451760..68452073^68452074")
assert not _re_simple_location.match("<104..>160")
assert not _re_simple_location.match("104")
assert not _re_simple_location.match("<1")
assert not _re_simple_location.match(">99999")
assert not _re_simple_location.match("join(104..160,320..390,504..579)")
assert not _re_simple_compound.match("bond(12,63)")
assert _re_simple_compound.match("join(104..160,320..390,504..579)")
assert _re_simple_compound.match("order(1..69,1308..1465)")
assert not _re_simple_compound.match("order(1..69,1308..1465,1524)")
assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)")
assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)")
assert not _re_simple_compound.match(
    "join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)"
)
assert not _re_simple_compound.match("test(1..69,1308..1465)")
assert not _re_simple_compound.match("complement(1..69)")
assert not _re_simple_compound.match("(1..69)")
assert _re_complex_location.match("(3.9)..10")
assert _re_complex_location.match("26..(30.33)")
assert _re_complex_location.match("(13.19)..(20.28)")
assert _re_complex_location.match("41^42")  # between
assert _re_complex_location.match("AL121804:41^42")
assert _re_complex_location.match("AL121804:41..610")
assert _re_complex_location.match("AL121804.2:41..610")
assert _re_complex_location.match(
    "AL358792.24.1.166931:3274..3461"
)  # lots of dots in external reference
assert _re_complex_location.match("one-of(3,6)..101")
assert _re_complex_compound.match(
    "join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)"
)
assert not _re_simple_compound.match(
    "join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)"
)
assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)")
assert _re_complex_compound.match(
    "join(complement(AL354868.10.1.164018:80837..81016),complement(AL354868.10.1.164018:80539..80835))"
)

# Trans-spliced example from NC_016406, note underscore in reference name:
assert _re_complex_location.match("NC_016402.1:6618..6676")
assert _re_complex_location.match("181647..181905")
assert _re_complex_compound.match(
    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
)
assert not _re_complex_location.match(
    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
)
assert not _re_simple_compound.match(
    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
)
assert not _re_complex_location.match(
    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
)
assert not _re_simple_location.match(
    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
)

# A bond over a single residue, e.g. "bond(196)" (seen in some SwissProt
# derived GenBank files; can also appear inside a join)
_solo_bond = re.compile(r"bond\(%s\)" % _solo_location)
assert _solo_bond.match("bond(196)")
assert _solo_bond.search("bond(196)")
assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))")
def _pos(pos_str, offset=0):
    """Build a Position object (PRIVATE).

    For an end position, leave offset as zero (default):

    >>> _pos("5")
    ExactPosition(5)

    For a start position, set offset to minus one (for Python counting):

    >>> _pos("5", -1)
    ExactPosition(4)

    This also covers fuzzy positions:

    >>> p = _pos("<5")
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5

    >>> _pos(">5")
    AfterPosition(5)

    By default assumes an end position, so note the integer behaviour:

    >>> p = _pos("one-of(5,8,11)")
    >>> p
    OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
    >>> print(p)
    one-of(5,8,11)
    >>> int(p)
    11

    >>> _pos("(8.10)")
    WithinPosition(10, left=8, right=10)

    Fuzzy start positions:

    >>> p = _pos("<5", -1)
    >>> p
    BeforePosition(4)
    >>> print(p)
    <4
    >>> int(p)
    4

    Notice how the integer behaviour changes too!

    >>> p = _pos("one-of(5,8,11)", -1)
    >>> p
    OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
    >>> print(p)
    one-of(4,7,10)
    >>> int(p)
    4

    """
    # Fuzzy "before" position, e.g. "<5"
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
    # Fuzzy "after" position, e.g. ">5"
    if pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
    # Within position, e.g. "(8.10)" - somewhere between the two bases
    if _re_within_position.match(pos_str):
        raw_lo, raw_hi = pos_str[1:-1].split(".")
        lo = int(raw_lo) + offset
        hi = int(raw_hi) + offset
        # The integer value acts as the left bound for a start position
        # (offset -1), otherwise as the right bound (end position).
        default = lo if offset == -1 else hi
        return SeqFeature.WithinPosition(default, left=lo, right=hi)
    # One-of position, e.g. "one-of(5,8,11)"
    if _re_oneof_position.match(pos_str):
        assert pos_str.startswith("one-of(")
        assert pos_str[-1] == ")"
        choices = [
            SeqFeature.ExactPosition(int(num) + offset)
            for num in pos_str[7:-1].split(",")
        ]
        # Use the smallest choice for a start, the largest for an end,
        # so the overall location spans all the alternatives.
        pick = min if offset == -1 else max
        return SeqFeature.OneOfPosition(pick(int(c) for c in choices), choices=choices)
    # Plain exact position, e.g. "5"
    return SeqFeature.ExactPosition(int(pos_str) + offset)
def _loc(loc_str, expected_seq_length, strand, seq_type=None):
    """Make FeatureLocation from non-compound non-complement location (PRIVATE).

    This is also invoked to 'automatically' fix ambiguous formatting of features
    that span the origin of a circular sequence.

    Simple examples,

    >>> _loc("123..456", 1000, +1)
    FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1)
    >>> _loc("<123..>456", 1000, strand = -1)
    FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1)

    A more complex location using within positions,

    >>> _loc("(9.10)..(20.25)", 1000, 1)
    FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)

    Notice how that will act as though it has overall start 8 and end 25.

    Zero length between feature,

    >>> _loc("123^124", 1000, 0)
    FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0)

    The expected sequence length is needed for a special case, a between
    position at the start/end of a circular genome:

    >>> _loc("1000^1", 1000, 1)
    FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1)

    Apart from this special case, between positions P^Q must have P+1==Q,

    >>> _loc("123^456", 1000, 1)
    Traceback (most recent call last):
    ...
    ValueError: Invalid between location '123^456'

    You can optionally provide a reference name:

    >>> _loc("AL391218.9:105173..108462", 2000000, 1)
    FeatureLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')

    >>> _loc("<2644..159", 2868, 1, "circular")
    CompoundLocation([FeatureLocation(BeforePosition(2643), ExactPosition(2868), strand=1), FeatureLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
    """
    if ":" in loc_str:
        # External reference prefix, e.g. "AL391218.9:105173..108462"
        ref, loc_str = loc_str.split(":")
    else:
        ref = None
    try:
        # The common case, a start..end pair
        s, e = loc_str.split("..")
    except ValueError:
        assert ".." not in loc_str
        if "^" in loc_str:
            # A between location like "67^68" (one based counting) is a
            # special case (note it has zero length). In python slice
            # notation this is 67:67, a zero length slice.  See Bug 2622
            # Further more, on a circular genome of length N you can have
            # a location N^1 meaning the junction at the origin. See Bug 3098.
            # NOTE - We can imagine between locations like "2^4", but this
            # is just "3".  Similarly, "2^5" is just "3..4"
            s, e = loc_str.split("^")
            if int(s) + 1 == int(e):
                pos = _pos(s)
            elif int(s) == expected_seq_length and e == "1":
                # Junction at the origin of a circular genome
                pos = _pos(s)
            else:
                raise ValueError("Invalid between location %r" % loc_str) from None
            return SeqFeature.FeatureLocation(pos, pos, strand, ref=ref)
        else:
            # e.g. "123" - a single base is both start and end
            s = loc_str
            e = loc_str

    # Attempt to fix features that span the origin
    s_pos = _pos(s, -1)
    e_pos = _pos(e)
    if int(s_pos) > int(e_pos):
        # Start after end only makes sense as origin wrapping on a
        # circular sequence; otherwise the location is unusable.
        if seq_type is None or "circular" not in seq_type.lower():
            warnings.warn(
                "It appears that %r is a feature that spans "
                "the origin, but the sequence topology is "
                "undefined. Skipping feature." % loc_str,
                BiopythonParserWarning,
            )
            return None
        warnings.warn(
            "Attempting to fix invalid location %r as "
            "it looks like incorrect origin wrapping. "
            "Please fix input file, this could have "
            "unintended behavior." % loc_str,
            BiopythonParserWarning,
        )

        # Split into two parts: start..(end of sequence) and 1..end
        f1 = SeqFeature.FeatureLocation(s_pos, expected_seq_length, strand)
        f2 = SeqFeature.FeatureLocation(0, int(e_pos), strand)

        if strand == -1:
            # For complementary features spanning the origin
            return f2 + f1
        else:
            return f1 + f2

    return SeqFeature.FeatureLocation(_pos(s, -1), _pos(e), strand, ref=ref)
% loc_str, 356 BiopythonParserWarning, 357 ) 358 return None 359 warnings.warn( 360 "Attempting to fix invalid location %r as " 361 "it looks like incorrect origin wrapping. " 362 "Please fix input file, this could have " 363 "unintended behavior." % loc_str, 364 BiopythonParserWarning, 365 ) 366 367 f1 = SeqFeature.FeatureLocation(s_pos, expected_seq_length, strand) 368 f2 = SeqFeature.FeatureLocation(0, int(e_pos), strand) 369 370 if strand == -1: 371 # For complementary features spanning the origin 372 return f2 + f1 373 else: 374 return f1 + f2 375 376 return SeqFeature.FeatureLocation(_pos(s, -1), _pos(e), strand, ref=ref) 377 378 379def _split_compound_loc(compound_loc): 380 """Split a tricky compound location string (PRIVATE). 381 382 >>> list(_split_compound_loc("123..145")) 383 ['123..145'] 384 >>> list(_split_compound_loc("123..145,200..209")) 385 ['123..145', '200..209'] 386 >>> list(_split_compound_loc("one-of(200,203)..300")) 387 ['one-of(200,203)..300'] 388 >>> list(_split_compound_loc("complement(123..145),200..209")) 389 ['complement(123..145)', '200..209'] 390 >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) 391 ['123..145', 'one-of(200,203)..209'] 392 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) 393 ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] 394 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) 395 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] 396 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) 397 ['123..145', '200..one-of(209,211)', '300'] 398 >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) 399 ['123..145', '200..one-of(209,211)'] 400 >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905")) 401 ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905'] 402 """ 403 if "one-of(" 
in compound_loc: 404 # Hard case 405 while "," in compound_loc: 406 assert compound_loc[0] != "," 407 assert compound_loc[0:2] != ".." 408 i = compound_loc.find(",") 409 part = compound_loc[:i] 410 compound_loc = compound_loc[i:] # includes the comma 411 while part.count("(") > part.count(")"): 412 assert "one-of(" in part, (part, compound_loc) 413 i = compound_loc.find(")") 414 part += compound_loc[: i + 1] 415 compound_loc = compound_loc[i + 1 :] 416 if compound_loc.startswith(".."): 417 i = compound_loc.find(",") 418 if i == -1: 419 part += compound_loc 420 compound_loc = "" 421 else: 422 part += compound_loc[:i] 423 compound_loc = compound_loc[i:] # includes the comma 424 while part.count("(") > part.count(")"): 425 assert part.count("one-of(") == 2 426 i = compound_loc.find(")") 427 part += compound_loc[: i + 1] 428 compound_loc = compound_loc[i + 1 :] 429 if compound_loc.startswith(","): 430 compound_loc = compound_loc[1:] 431 assert part 432 yield part 433 if compound_loc: 434 yield compound_loc 435 else: 436 # Easy case 437 yield from compound_loc.split(",") 438 439 440class Iterator: 441 """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE). 442 443 This class is likely to be deprecated in a future release of Biopython. 444 Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...) 445 for SeqRecord and GenBank specific Record objects respectively instead. 446 """ 447 448 def __init__(self, handle, parser=None): 449 """Initialize the iterator. 450 451 Arguments: 452 - handle - A handle with GenBank entries to iterate through. 453 - parser - An optional parser to pass the entries through before 454 returning them. If None, then the raw entry will be returned. 455 456 """ 457 self.handle = handle 458 self._parser = parser 459 460 def __next__(self): 461 """Return the next GenBank record from the handle. 462 463 Will return None if we ran out of records. 
464 """ 465 if self._parser is None: 466 lines = [] 467 while True: 468 line = self.handle.readline() 469 if not line: 470 return None # Premature end of file? 471 lines.append(line) 472 if line.rstrip() == "//": 473 break 474 return "".join(lines) 475 try: 476 return self._parser.parse(self.handle) 477 except StopIteration: 478 return None 479 480 def __iter__(self): 481 """Iterate over the records.""" 482 return iter(self.__next__, None) 483 484 485class ParserFailureError(Exception): 486 """Failure caused by some kind of problem in the parser.""" 487 488 pass 489 490 491class LocationParserError(Exception): 492 """Could not Properly parse out a location from a GenBank file.""" 493 494 pass 495 496 497_cleaner = FeatureValueCleaner() 498 499 500class FeatureParser: 501 """Parse GenBank files into Seq + Feature objects (OBSOLETE). 502 503 Direct use of this class is discouraged, and may be deprecated in 504 a future release of Biopython. 505 506 Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead. 507 """ 508 509 def __init__(self, debug_level=0, use_fuzziness=1, feature_cleaner=None): 510 """Initialize a GenBank parser and Feature consumer. 511 512 Arguments: 513 - debug_level - An optional argument that species the amount of 514 debugging information the parser should spit out. By default we have 515 no debugging info (the fastest way to do things), but if you want 516 you can set this as high as two and see exactly where a parse fails. 517 - use_fuzziness - Specify whether or not to use fuzzy representations. 518 The default is 1 (use fuzziness). 519 - feature_cleaner - A class which will be used to clean out the 520 values of features. This class must implement the function 521 clean_value. GenBank.utils has a "standard" cleaner class, which 522 is used by default. 
523 524 """ 525 self._scanner = GenBankScanner(debug_level) 526 self.use_fuzziness = use_fuzziness 527 if feature_cleaner: 528 self._cleaner = feature_cleaner 529 else: 530 self._cleaner = _cleaner # default 531 532 def parse(self, handle): 533 """Parse the specified handle.""" 534 _consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner) 535 self._scanner.feed(handle, _consumer) 536 return _consumer.data 537 538 539class RecordParser: 540 """Parse GenBank files into Record objects (OBSOLETE). 541 542 Direct use of this class is discouraged, and may be deprecated in 543 a future release of Biopython. 544 545 Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions 546 instead. 547 """ 548 549 def __init__(self, debug_level=0): 550 """Initialize the parser. 551 552 Arguments: 553 - debug_level - An optional argument that species the amount of 554 debugging information the parser should spit out. By default we have 555 no debugging info (the fastest way to do things), but if you want 556 you can set this as high as two and see exactly where a parse fails. 557 558 """ 559 self._scanner = GenBankScanner(debug_level) 560 561 def parse(self, handle): 562 """Parse the specified handle into a GenBank record.""" 563 _consumer = _RecordConsumer() 564 565 self._scanner.feed(handle, _consumer) 566 return _consumer.data 567 568 569class _BaseGenBankConsumer: 570 """Abstract GenBank consumer providing useful general functions (PRIVATE). 571 572 This just helps to eliminate some duplication in things that most 573 GenBank consumers want to do. 574 """ 575 576 # Special keys in GenBank records that we should remove spaces from 577 # For instance, \translation keys have values which are proteins and 578 # should have spaces and newlines removed from them. This class 579 # attribute gives us more control over specific formatting problems. 
580 remove_space_keys = ["translation"] 581 582 def __init__(self): 583 pass 584 585 @staticmethod 586 def _split_keywords(keyword_string): 587 """Split a string of keywords into a nice clean list (PRIVATE).""" 588 # process the keywords into a python list 589 if keyword_string == "" or keyword_string == ".": 590 keywords = "" 591 elif keyword_string[-1] == ".": 592 keywords = keyword_string[:-1] 593 else: 594 keywords = keyword_string 595 keyword_list = keywords.split(";") 596 return [x.strip() for x in keyword_list] 597 598 @staticmethod 599 def _split_accessions(accession_string): 600 """Split a string of accession numbers into a list (PRIVATE).""" 601 # first replace all line feeds with spaces 602 # Also, EMBL style accessions are split with ';' 603 accession = accession_string.replace("\n", " ").replace(";", " ") 604 605 return [x.strip() for x in accession.split() if x.strip()] 606 607 @staticmethod 608 def _split_taxonomy(taxonomy_string): 609 """Split a string with taxonomy info into a list (PRIVATE).""" 610 if not taxonomy_string or taxonomy_string == ".": 611 # Missing data, no taxonomy 612 return [] 613 614 if taxonomy_string[-1] == ".": 615 tax_info = taxonomy_string[:-1] 616 else: 617 tax_info = taxonomy_string 618 tax_list = tax_info.split(";") 619 new_tax_list = [] 620 for tax_item in tax_list: 621 new_items = tax_item.split("\n") 622 new_tax_list.extend(new_items) 623 while "" in new_tax_list: 624 new_tax_list.remove("") 625 return [x.strip() for x in new_tax_list] 626 627 @staticmethod 628 def _clean_location(location_string): 629 """Clean whitespace out of a location string (PRIVATE). 630 631 The location parser isn't a fan of whitespace, so we clean it out 632 before feeding it into the parser. 633 """ 634 # Originally this imported string.whitespace and did a replace 635 # via a loop. It's simpler to just split on whitespace and rejoin 636 # the string - and this avoids importing string too. See Bug 2684. 
637 return "".join(location_string.split()) 638 639 @staticmethod 640 def _remove_newlines(text): 641 """Remove any newlines in the passed text, returning the new string (PRIVATE).""" 642 # get rid of newlines in the qualifier value 643 newlines = ["\n", "\r"] 644 for ws in newlines: 645 text = text.replace(ws, "") 646 647 return text 648 649 @staticmethod 650 def _normalize_spaces(text): 651 """Replace multiple spaces in the passed text with single spaces (PRIVATE).""" 652 # get rid of excessive spaces 653 return " ".join(x for x in text.split(" ") if x) 654 655 @staticmethod 656 def _remove_spaces(text): 657 """Remove all spaces from the passed text (PRIVATE).""" 658 return text.replace(" ", "") 659 660 @staticmethod 661 def _convert_to_python_numbers(start, end): 662 """Convert a start and end range to python notation (PRIVATE). 663 664 In GenBank, starts and ends are defined in "biological" coordinates, 665 where 1 is the first base and [i, j] means to include both i and j. 666 667 In python, 0 is the first base and [i, j] means to include i, but 668 not j. 669 670 So, to convert "biological" to python coordinates, we need to 671 subtract 1 from the start, and leave the end and things should 672 be converted happily. 673 """ 674 new_start = start - 1 675 new_end = end 676 677 return new_start, new_end 678 679 680class _FeatureConsumer(_BaseGenBankConsumer): 681 """Create a SeqRecord object with Features to return (PRIVATE). 682 683 Attributes: 684 - use_fuzziness - specify whether or not to parse with fuzziness in 685 feature locations. 686 - feature_cleaner - a class that will be used to provide specialized 687 cleaning-up of feature values. 
688 689 """ 690 691 def __init__(self, use_fuzziness, feature_cleaner=None): 692 from Bio.SeqRecord import SeqRecord 693 694 _BaseGenBankConsumer.__init__(self) 695 self.data = SeqRecord(None, id=None) 696 self.data.id = None 697 self.data.description = "" 698 699 self._use_fuzziness = use_fuzziness 700 self._feature_cleaner = feature_cleaner 701 702 self._seq_type = "" 703 self._seq_data = [] 704 self._cur_reference = None 705 self._cur_feature = None 706 self._expected_size = None 707 708 def locus(self, locus_name): 709 """Set the locus name is set as the name of the Sequence.""" 710 self.data.name = locus_name 711 712 def size(self, content): 713 """Record the sequence length.""" 714 self._expected_size = int(content) 715 716 def residue_type(self, type): 717 """Record the sequence type (SEMI-OBSOLETE). 718 719 This reflects the fact that the topology (linear/circular) and 720 molecule type (e.g. DNA vs RNA) were a single field in early 721 files. Current GenBank/EMBL files have two fields. 722 """ 723 self._seq_type = type.strip() 724 725 def topology(self, topology): 726 """Validate and record sequence topology. 727 728 The topology argument should be "linear" or "circular" (string). 729 """ 730 if topology: 731 if topology not in ["linear", "circular"]: 732 raise ParserFailureError( 733 "Unexpected topology %r should be linear or circular" % topology 734 ) 735 self.data.annotations["topology"] = topology 736 737 def molecule_type(self, mol_type): 738 """Validate and record the molecule type (for round-trip etc).""" 739 if mol_type: 740 if "circular" in mol_type or "linear" in mol_type: 741 raise ParserFailureError( 742 "Molecule type %r should not include topology" % mol_type 743 ) 744 745 # Writing out records will fail if we have a lower case DNA 746 # or RNA string in here, so upper case it. 747 # This is a bit ugly, but we don't want to upper case e.g. 
748 # the m in mRNA, but thanks to the strip we lost the spaces 749 # so we need to index from the back 750 if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper(): 751 warnings.warn( 752 "Non-upper case molecule type in LOCUS line: %s" % mol_type, 753 BiopythonParserWarning, 754 ) 755 756 self.data.annotations["molecule_type"] = mol_type 757 758 def data_file_division(self, division): 759 self.data.annotations["data_file_division"] = division 760 761 def date(self, submit_date): 762 self.data.annotations["date"] = submit_date 763 764 def definition(self, definition): 765 """Set the definition as the description of the sequence.""" 766 if self.data.description: 767 # Append to any existing description 768 # e.g. EMBL files with two DE lines. 769 self.data.description += " " + definition 770 else: 771 self.data.description = definition 772 773 def accession(self, acc_num): 774 """Set the accession number as the id of the sequence. 775 776 If we have multiple accession numbers, the first one passed is 777 used. 778 """ 779 new_acc_nums = self._split_accessions(acc_num) 780 781 # Also record them ALL in the annotations 782 try: 783 # On the off chance there was more than one accession line: 784 for acc in new_acc_nums: 785 # Prevent repeat entries 786 if acc not in self.data.annotations["accessions"]: 787 self.data.annotations["accessions"].append(acc) 788 except KeyError: 789 self.data.annotations["accessions"] = new_acc_nums 790 791 # if we haven't set the id information yet, add the first acc num 792 if not self.data.id: 793 if len(new_acc_nums) > 0: 794 # self.data.id = new_acc_nums[0] 795 # Use the FIRST accession as the ID, not the first on this line! 
796 self.data.id = self.data.annotations["accessions"][0] 797 798 def tls(self, content): 799 self.data.annotations["tls"] = content.split("-") 800 801 def tsa(self, content): 802 self.data.annotations["tsa"] = content.split("-") 803 804 def wgs(self, content): 805 self.data.annotations["wgs"] = content.split("-") 806 807 def add_wgs_scafld(self, content): 808 self.data.annotations.setdefault("wgs_scafld", []).append(content.split("-")) 809 810 def nid(self, content): 811 self.data.annotations["nid"] = content 812 813 def pid(self, content): 814 self.data.annotations["pid"] = content 815 816 def version(self, version_id): 817 # Want to use the versioned accession as the record.id 818 # This comes from the VERSION line in GenBank files, or the 819 # obsolete SV line in EMBL. For the new EMBL files we need 820 # both the version suffix from the ID line and the accession 821 # from the AC line. 822 if version_id.count(".") == 1 and version_id.split(".")[1].isdigit(): 823 self.accession(version_id.split(".")[0]) 824 self.version_suffix(version_id.split(".")[1]) 825 elif version_id: 826 # For backwards compatibility... 827 self.data.id = version_id 828 829 def project(self, content): 830 """Handle the information from the PROJECT line as a list of projects. 831 832 e.g.:: 833 834 PROJECT GenomeProject:28471 835 836 or:: 837 838 PROJECT GenomeProject:13543 GenomeProject:99999 839 840 This is stored as dbxrefs in the SeqRecord to be consistent with the 841 projected switch of this line to DBLINK in future GenBank versions. 842 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 843 "Project:28471" as part of this transition. 844 """ 845 content = content.replace("GenomeProject:", "Project:") 846 self.data.dbxrefs.extend(p for p in content.split() if p) 847 848 def dblink(self, content): 849 """Store DBLINK cross references as dbxrefs in our record object. 850 851 This line type is expected to replace the PROJECT line in 2009. e.g. 
852 853 During transition:: 854 855 PROJECT GenomeProject:28471 856 DBLINK Project:28471 857 Trace Assembly Archive:123456 858 859 Once the project line is dropped:: 860 861 DBLINK Project:28471 862 Trace Assembly Archive:123456 863 864 Note GenomeProject -> Project. 865 866 We'll have to see some real examples to be sure, but based on the 867 above example we can expect one reference per line. 868 869 Note that at some point the NCBI have included an extra space, e.g.:: 870 871 DBLINK Project: 28471 872 873 """ 874 # During the transition period with both PROJECT and DBLINK lines, 875 # we don't want to add the same cross reference twice. 876 while ": " in content: 877 content = content.replace(": ", ":") 878 if content.strip() not in self.data.dbxrefs: 879 self.data.dbxrefs.append(content.strip()) 880 881 def version_suffix(self, version): 882 """Set the version to overwrite the id. 883 884 Since the version provides the same information as the accession 885 number, plus some extra info, we set this as the id if we have 886 a version. 887 """ 888 # e.g. GenBank line: 889 # VERSION U49845.1 GI:1293613 890 # or the obsolete EMBL line: 891 # SV U49845.1 892 # Scanner calls consumer.version("U49845.1") 893 # which then calls consumer.version_suffix(1) 894 # 895 # e.g. EMBL new line: 896 # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 897 # Scanner calls consumer.version_suffix(1) 898 assert version.isdigit() 899 self.data.annotations["sequence_version"] = int(version) 900 901 def db_source(self, content): 902 self.data.annotations["db_source"] = content.rstrip() 903 904 def gi(self, content): 905 self.data.annotations["gi"] = content 906 907 def keywords(self, content): 908 if "keywords" in self.data.annotations: 909 # Multi-line keywords, append to list 910 # Note EMBL states "A keyword is never split between lines." 
911 self.data.annotations["keywords"].extend(self._split_keywords(content)) 912 else: 913 self.data.annotations["keywords"] = self._split_keywords(content) 914 915 def segment(self, content): 916 self.data.annotations["segment"] = content 917 918 def source(self, content): 919 # Note that some software (e.g. VectorNTI) may produce an empty 920 # source (rather than using a dot/period as might be expected). 921 if content == "": 922 source_info = "" 923 elif content[-1] == ".": 924 source_info = content[:-1] 925 else: 926 source_info = content 927 self.data.annotations["source"] = source_info 928 929 def organism(self, content): 930 self.data.annotations["organism"] = content 931 932 def taxonomy(self, content): 933 """Record (another line of) the taxonomy lineage.""" 934 lineage = self._split_taxonomy(content) 935 try: 936 self.data.annotations["taxonomy"].extend(lineage) 937 except KeyError: 938 self.data.annotations["taxonomy"] = lineage 939 940 def reference_num(self, content): 941 """Signal the beginning of a new reference object.""" 942 # if we have a current reference that hasn't been added to 943 # the list of references, add it. 944 if self._cur_reference is not None: 945 self.data.annotations["references"].append(self._cur_reference) 946 else: 947 self.data.annotations["references"] = [] 948 949 self._cur_reference = SeqFeature.Reference() 950 951 def reference_bases(self, content): 952 """Attempt to determine the sequence region the reference entails. 
953 954 Possible types of information we may have to deal with: 955 956 (bases 1 to 86436) 957 (sites) 958 (bases 1 to 105654; 110423 to 111122) 959 1 (residues 1 to 182) 960 """ 961 # first remove the parentheses 962 assert content.endswith(")"), content 963 ref_base_info = content[1:-1] 964 965 all_locations = [] 966 # parse if we've got 'bases' and 'to' 967 if "bases" in ref_base_info and "to" in ref_base_info: 968 # get rid of the beginning 'bases' 969 ref_base_info = ref_base_info[5:] 970 locations = self._split_reference_locations(ref_base_info) 971 all_locations.extend(locations) 972 elif "residues" in ref_base_info and "to" in ref_base_info: 973 residues_start = ref_base_info.find("residues") 974 # get only the information after "residues" 975 ref_base_info = ref_base_info[(residues_start + len("residues ")) :] 976 locations = self._split_reference_locations(ref_base_info) 977 all_locations.extend(locations) 978 979 # make sure if we are not finding information then we have 980 # the string 'sites' or the string 'bases' 981 elif ref_base_info == "sites" or ref_base_info.strip() == "bases": 982 pass 983 # otherwise raise an error 984 else: 985 raise ValueError( 986 "Could not parse base info %s in record %s" 987 % (ref_base_info, self.data.id) 988 ) 989 990 self._cur_reference.location = all_locations 991 992 def _split_reference_locations(self, location_string): 993 """Get reference locations out of a string of reference information (PRIVATE). 994 995 The passed string should be of the form:: 996 997 1 to 20; 20 to 100 998 999 This splits the information out and returns a list of location objects 1000 based on the reference locations. 
1001 """ 1002 # split possibly multiple locations using the ';' 1003 all_base_info = location_string.split(";") 1004 1005 new_locations = [] 1006 for base_info in all_base_info: 1007 start, end = base_info.split("to") 1008 new_start, new_end = self._convert_to_python_numbers( 1009 int(start.strip()), int(end.strip()) 1010 ) 1011 this_location = SeqFeature.FeatureLocation(new_start, new_end) 1012 new_locations.append(this_location) 1013 return new_locations 1014 1015 def authors(self, content): 1016 if self._cur_reference.authors: 1017 self._cur_reference.authors += " " + content 1018 else: 1019 self._cur_reference.authors = content 1020 1021 def consrtm(self, content): 1022 if self._cur_reference.consrtm: 1023 self._cur_reference.consrtm += " " + content 1024 else: 1025 self._cur_reference.consrtm = content 1026 1027 def title(self, content): 1028 if self._cur_reference is None: 1029 warnings.warn( 1030 "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning 1031 ) 1032 elif self._cur_reference.title: 1033 self._cur_reference.title += " " + content 1034 else: 1035 self._cur_reference.title = content 1036 1037 def journal(self, content): 1038 if self._cur_reference.journal: 1039 self._cur_reference.journal += " " + content 1040 else: 1041 self._cur_reference.journal = content 1042 1043 def medline_id(self, content): 1044 self._cur_reference.medline_id = content 1045 1046 def pubmed_id(self, content): 1047 self._cur_reference.pubmed_id = content 1048 1049 def remark(self, content): 1050 """Deal with a reference comment.""" 1051 if self._cur_reference.comment: 1052 self._cur_reference.comment += " " + content 1053 else: 1054 self._cur_reference.comment = content 1055 1056 def comment(self, content): 1057 try: 1058 self.data.annotations["comment"] += "\n" + "\n".join(content) 1059 except KeyError: 1060 self.data.annotations["comment"] = "\n".join(content) 1061 1062 def structured_comment(self, content): 1063 self.data.annotations["structured_comment"] = 
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line."""
        self.start_feature_table()

    def start_feature_table(self):
        """Indicate we've got to the start of the feature table."""
        # make sure we've added on our last reference object
        if self._cur_reference is not None:
            self.data.annotations["references"].append(self._cur_reference)
            self._cur_reference = None

    def feature_key(self, content):
        # start a new feature
        self._cur_feature = SeqFeature.SeqFeature()
        self._cur_feature.type = content
        self.data.features.append(self._cur_feature)

    def location(self, content):
        """Parse out location information from the location string.

        This uses simple Python code with some regular expressions to do the
        parsing, and then translates the results into appropriate objects.
        """
        # clean up newlines and other whitespace inside the location before
        # parsing - locations should have no whitespace whatsoever
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if "replace" in location_line:
            comma_pos = location_line.find(",")
            # len("replace(") == 8, so this keeps just the position number
            location_line = location_line[8:comma_pos]

        cur_feature = self._cur_feature

        # Handle top level complement here for speed
        if location_line.startswith("complement("):
            assert location_line.endswith(")")
            # len("complement(") == 11; strip the wrapper and the closing ")"
            location_line = location_line[11:-1]
            strand = -1
        elif "PROTEIN" in self._seq_type.upper():
            # Proteins have no strand
            strand = None
        else:
            # Assume nucleotide otherwise feature strand for
            # GenBank files with bad LOCUS lines set to None
            strand = 1

        # Special case handling of the most common cases for speed
        if _re_simple_location.match(location_line):
            # e.g. "123..456"
            s, e = location_line.split("..")
            try:
                # Convert to Python 0-based half-open coordinates
                cur_feature.location = SeqFeature.FeatureLocation(
                    int(s) - 1, int(e), strand
                )
            except ValueError:
                # Could be non-integers, more likely bad origin wrapping
                cur_feature.location = _loc(
                    location_line,
                    self._expected_size,
                    strand,
                    seq_type=self._seq_type.lower(),
                )
            return

        if ",)" in location_line:
            warnings.warn(
                "Dropping trailing comma in malformed feature location",
                BiopythonParserWarning,
            )
            location_line = location_line.replace(",)", ")")

        if _solo_bond.search(location_line):
            # e.g. bond(196)
            # e.g. join(bond(284),bond(305),bond(309),bond(305))
            warnings.warn(
                "Dropping bond qualifier in feature location", BiopythonParserWarning
            )
            # There ought to be a better way to do this...
            # Replace each "bond(NNN)" with just "NNN" (len("bond(") == 5)
            for x in _solo_bond.finditer(location_line):
                x = x.group()
                location_line = location_line.replace(x, x[5:-1])

        if _re_simple_compound.match(location_line):
            # e.g. join(<123..456,480..>500)
            i = location_line.find("(")
            # cur_feature.location_operator = location_line[:i]
            # we can split on the comma because these are simple locations
            locs = []
            for part in location_line[i + 1 : -1].split(","):
                s, e = part.split("..")

                try:
                    locs.append(SeqFeature.FeatureLocation(int(s) - 1, int(e), strand))
                except ValueError:
                    # Could be non-integers, more likely bad origin wrapping

                    # In the case of bad origin wrapping, _loc will return
                    # a CompoundLocation. CompoundLocation.parts returns a
                    # list of the FeatureLocation objects inside the
                    # CompoundLocation.
                    locs.extend(
                        _loc(
                            part, self._expected_size, strand, self._seq_type.lower()
                        ).parts
                    )

            if len(locs) < 2:
                # The CompoundLocation will raise a ValueError here!
                warnings.warn(
                    "Should have at least 2 parts for compound location",
                    BiopythonParserWarning,
                )
                cur_feature.location = None
                return
            if strand == -1:
                cur_feature.location = SeqFeature.CompoundLocation(
                    locs[::-1], operator=location_line[:i]
                )
            else:
                cur_feature.location = SeqFeature.CompoundLocation(
                    locs, operator=location_line[:i]
                )
            return

        # Handle the general case with more complex regular expressions
        if _re_complex_location.match(location_line):
            # e.g. "AL121804.2:41..610"
            cur_feature.location = _loc(
                location_line,
                self._expected_size,
                strand,
                seq_type=self._seq_type.lower(),
            )
            return

        if _re_complex_compound.match(location_line):
            i = location_line.find("(")
            # cur_feature.location_operator = location_line[:i]
            # Can't split on the comma because of positions like one-of(1,2,3)
            locs = []
            for part in _split_compound_loc(location_line[i + 1 : -1]):
                if part.startswith("complement("):
                    assert part[-1] == ")"
                    part = part[11:-1]
                    # Nested complement inside a top-level complement is illegal
                    assert strand != -1, "Double complement?"
                    part_strand = -1
                else:
                    part_strand = strand
                try:
                    # There is likely a problem with origin wrapping.
                    # Using _loc to return a CompoundLocation of the
                    # wrapped feature and returning the two FeatureLocation
                    # objects to extend to the list of feature locations.
                    loc = _loc(
                        part,
                        self._expected_size,
                        part_strand,
                        seq_type=self._seq_type.lower(),
                    ).parts

                except ValueError:
                    print(location_line)
                    print(part)
                    raise
                # loc will be a list of one or two FeatureLocation items.
                locs.extend(loc)
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            if strand == -1:
                # Whole thing was wrapped in complement(...)
                for l in locs:
                    assert l.strand == -1
                # Reverse the backwards order used in GenBank files
                # with complement(join(...))
                cur_feature.location = SeqFeature.CompoundLocation(
                    locs[::-1], operator=location_line[:i]
                )
            else:
                cur_feature.location = SeqFeature.CompoundLocation(
                    locs, operator=location_line[:i]
                )
            return
        # Not recognised
        if "order" in location_line and "join" in location_line:
            # See Bug 3197
            msg = (
                'Combinations of "join" and "order" within the same '
                "location (nested operators) are illegal:\n" + location_line
            )
            raise LocationParserError(msg)
        # This used to be an error....
        cur_feature.location = None
        warnings.warn(
            BiopythonParserWarning(
                "Couldn't parse feature location: %r" % location_line
            )
        )

    def feature_qualifier(self, key, value):
        """When we get a qualifier key and its value.

        Can receive None, since you can have valueless keys such as /pseudo
        """
        # Hack to try to preserve historical behaviour of /pseudo etc
        if value is None:
            # if the key doesn't exist yet, add an empty string
            if key not in self._cur_feature.qualifiers:
                self._cur_feature.qualifiers[key] = [""]
                return
            # otherwise just skip this key
            return

        # Remove enclosing quotation marks
        value = re.sub('^"|"$', "", value)

        # Handle NCBI escaping
        # Warn if escaping is not according to standard
        if re.search(r'[^"]"[^"]|^"[^"]|[^"]"$', value):
            warnings.warn(
                'The NCBI states double-quote characters like " should be escaped as "" '
                "(two double - quotes), but here it was not: %r" % value,
                BiopythonParserWarning,
            )
        # Undo escaping, repeated double quotes -> one double quote
        value = value.replace('""', '"')

        if self._feature_cleaner is not None:
            value = self._feature_cleaner.clean_value(key, value)

        # if the qualifier name exists, append the value
        if key in self._cur_feature.qualifiers:
            self._cur_feature.qualifiers[key].append(value)
        # otherwise start a new list of the key with its values
        else:
            self._cur_feature.qualifiers[key] = [value]
name exists, append the value 1303 if key in self._cur_feature.qualifiers: 1304 self._cur_feature.qualifiers[key].append(value) 1305 # otherwise start a new list of the key with its values 1306 else: 1307 self._cur_feature.qualifiers[key] = [value] 1308 1309 def feature_qualifier_name(self, content_list): 1310 """Use feature_qualifier instead (OBSOLETE).""" 1311 raise NotImplementedError("Use the feature_qualifier method instead.") 1312 1313 def feature_qualifier_description(self, content): 1314 """Use feature_qualifier instead (OBSOLETE).""" 1315 raise NotImplementedError("Use the feature_qualifier method instead.") 1316 1317 def contig_location(self, content): 1318 """Deal with CONTIG information.""" 1319 # Historically this was stored as a SeqFeature object, but it was 1320 # stored under record.annotations["contig"] and not under 1321 # record.features with the other SeqFeature objects. 1322 # 1323 # The CONTIG location line can include additional tokens like 1324 # Gap(), Gap(100) or Gap(unk100) which are not used in the feature 1325 # location lines, so storing it using SeqFeature based location 1326 # objects is difficult. 1327 # 1328 # We now store this a string, which means for BioSQL we are now in 1329 # much better agreement with how BioPerl records the CONTIG line 1330 # in the database. 1331 # 1332 # NOTE - This code assumes the scanner will return all the CONTIG 1333 # lines already combined into one long string! 1334 self.data.annotations["contig"] = content 1335 1336 def origin_name(self, content): 1337 pass 1338 1339 def base_count(self, content): 1340 pass 1341 1342 def base_number(self, content): 1343 pass 1344 1345 def sequence(self, content): 1346 """Add up sequence information as we get it. 1347 1348 To try and make things speedier, this puts all of the strings 1349 into a list of strings, and then uses string.join later to put 1350 them together. 
Supposedly, this is a big time savings 1351 """ 1352 assert " " not in content 1353 self._seq_data.append(content.upper()) 1354 1355 def record_end(self, content): 1356 """Clean up when we've finished the record.""" 1357 # Try and append the version number to the accession for the full id 1358 if not self.data.id: 1359 if "accessions" in self.data.annotations: 1360 raise ValueError( 1361 "Problem adding version number to accession: " 1362 + str(self.data.annotations["accessions"]) 1363 ) 1364 self.data.id = self.data.name # Good fall back? 1365 elif self.data.id.count(".") == 0: 1366 try: 1367 self.data.id += ".%i" % self.data.annotations["sequence_version"] 1368 except KeyError: 1369 pass 1370 1371 # add the sequence information 1372 1373 sequence = "".join(self._seq_data) 1374 1375 if ( 1376 self._expected_size is not None 1377 and len(sequence) != 0 1378 and self._expected_size != len(sequence) 1379 ): 1380 warnings.warn( 1381 "Expected sequence length %i, found %i (%s)." 1382 % (self._expected_size, len(sequence), self.data.id), 1383 BiopythonParserWarning, 1384 ) 1385 1386 molecule_type = None 1387 if self._seq_type: 1388 # mRNA is really also DNA, since it is actually cDNA 1389 if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper(): 1390 molecule_type = "DNA" 1391 # are there ever really RNA sequences in GenBank? 1392 elif "RNA" in self._seq_type.upper(): 1393 # Even for data which was from RNA, the sequence string 1394 # is usually given as DNA (T not U). 
Bug 3010 1395 molecule_type = "RNA" 1396 elif ( 1397 "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT" 1398 ): # PRT is used in EMBL-bank for patents 1399 molecule_type = "protein" 1400 # work around ugly GenBank records which have circular or 1401 # linear but no indication of sequence type 1402 elif self._seq_type in ["circular", "linear", "unspecified"]: 1403 pass 1404 # we have a bug if we get here 1405 else: 1406 raise ValueError( 1407 "Could not determine molecule_type for seq_type %s" % self._seq_type 1408 ) 1409 # Don't overwrite molecule_type 1410 if molecule_type is not None: 1411 self.data.annotations["molecule_type"] = self.data.annotations.get( 1412 "molecule_type", molecule_type 1413 ) 1414 if not sequence and self._expected_size: 1415 self.data.seq = Seq(None, length=self._expected_size) 1416 else: 1417 self.data.seq = Seq(sequence) 1418 1419 1420class _RecordConsumer(_BaseGenBankConsumer): 1421 """Create a GenBank Record object from scanner generated information (PRIVATE).""" 1422 1423 def __init__(self): 1424 _BaseGenBankConsumer.__init__(self) 1425 from . import Record 1426 1427 self.data = Record.Record() 1428 1429 self._seq_data = [] 1430 self._cur_reference = None 1431 self._cur_feature = None 1432 self._cur_qualifier = None 1433 1434 def tls(self, content): 1435 self.data.tls = content.split("-") 1436 1437 def tsa(self, content): 1438 self.data.tsa = content.split("-") 1439 1440 def wgs(self, content): 1441 self.data.wgs = content.split("-") 1442 1443 def add_wgs_scafld(self, content): 1444 self.data.wgs_scafld.append(content.split("-")) 1445 1446 def locus(self, content): 1447 self.data.locus = content 1448 1449 def size(self, content): 1450 self.data.size = content 1451 1452 def residue_type(self, content): 1453 # Be lenient about parsing, but technically lowercase residue types are malformed. 1454 if "dna" in content or "rna" in content: 1455 warnings.warn( 1456 "Invalid seq_type (%s): DNA/RNA should be uppercase." 
% content, 1457 BiopythonParserWarning, 1458 ) 1459 self.data.residue_type = content 1460 1461 def data_file_division(self, content): 1462 self.data.data_file_division = content 1463 1464 def date(self, content): 1465 self.data.date = content 1466 1467 def definition(self, content): 1468 self.data.definition = content 1469 1470 def accession(self, content): 1471 for acc in self._split_accessions(content): 1472 if acc not in self.data.accession: 1473 self.data.accession.append(acc) 1474 1475 def molecule_type(self, mol_type): 1476 """Validate and record the molecule type (for round-trip etc).""" 1477 if mol_type: 1478 if "circular" in mol_type or "linear" in mol_type: 1479 raise ParserFailureError( 1480 "Molecule type %r should not include topology" % mol_type 1481 ) 1482 1483 # Writing out records will fail if we have a lower case DNA 1484 # or RNA string in here, so upper case it. 1485 # This is a bit ugly, but we don't want to upper case e.g. 1486 # the m in mRNA, but thanks to the strip we lost the spaces 1487 # so we need to index from the back 1488 if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper(): 1489 warnings.warn( 1490 "Non-upper case molecule type in LOCUS line: %s" % mol_type, 1491 BiopythonParserWarning, 1492 ) 1493 1494 self.data.molecule_type = mol_type 1495 1496 def topology(self, topology): 1497 """Validate and record sequence topology. 1498 1499 The topology argument should be "linear" or "circular" (string). 
1500 """ 1501 if topology: 1502 if topology not in ["linear", "circular"]: 1503 raise ParserFailureError( 1504 "Unexpected topology %r should be linear or circular" % topology 1505 ) 1506 self.data.topology = topology 1507 1508 def nid(self, content): 1509 self.data.nid = content 1510 1511 def pid(self, content): 1512 self.data.pid = content 1513 1514 def version(self, content): 1515 self.data.version = content 1516 1517 def db_source(self, content): 1518 self.data.db_source = content.rstrip() 1519 1520 def gi(self, content): 1521 self.data.gi = content 1522 1523 def keywords(self, content): 1524 self.data.keywords = self._split_keywords(content) 1525 1526 def project(self, content): 1527 self.data.projects.extend(p for p in content.split() if p) 1528 1529 def dblink(self, content): 1530 self.data.dblinks.append(content) 1531 1532 def segment(self, content): 1533 self.data.segment = content 1534 1535 def source(self, content): 1536 self.data.source = content 1537 1538 def organism(self, content): 1539 self.data.organism = content 1540 1541 def taxonomy(self, content): 1542 self.data.taxonomy = self._split_taxonomy(content) 1543 1544 def reference_num(self, content): 1545 """Grab the reference number and signal the start of a new reference.""" 1546 # check if we have a reference to add 1547 if self._cur_reference is not None: 1548 self.data.references.append(self._cur_reference) 1549 1550 from . 
import Record 1551 1552 self._cur_reference = Record.Reference() 1553 self._cur_reference.number = content 1554 1555 def reference_bases(self, content): 1556 self._cur_reference.bases = content 1557 1558 def authors(self, content): 1559 self._cur_reference.authors = content 1560 1561 def consrtm(self, content): 1562 self._cur_reference.consrtm = content 1563 1564 def title(self, content): 1565 if self._cur_reference is None: 1566 warnings.warn( 1567 "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning 1568 ) 1569 return 1570 self._cur_reference.title = content 1571 1572 def journal(self, content): 1573 self._cur_reference.journal = content 1574 1575 def medline_id(self, content): 1576 self._cur_reference.medline_id = content 1577 1578 def pubmed_id(self, content): 1579 self._cur_reference.pubmed_id = content 1580 1581 def remark(self, content): 1582 self._cur_reference.remark = content 1583 1584 def comment(self, content): 1585 self.data.comment += "\n".join(content) 1586 1587 def structured_comment(self, content): 1588 self.data.structured_comment = content 1589 1590 def primary_ref_line(self, content): 1591 """Save reference data for the PRIMARY line.""" 1592 self.data.primary.append(content) 1593 1594 def primary(self, content): 1595 pass 1596 1597 def features_line(self, content): 1598 """Get ready for the feature table when we reach the FEATURE line.""" 1599 self.start_feature_table() 1600 1601 def start_feature_table(self): 1602 """Signal the start of the feature table.""" 1603 # we need to add on the last reference 1604 if self._cur_reference is not None: 1605 self.data.references.append(self._cur_reference) 1606 1607 def feature_key(self, content): 1608 """Grab the key of the feature and signal the start of a new feature.""" 1609 # first add on feature information if we've got any 1610 self._add_feature() 1611 1612 from . 
import Record 1613 1614 self._cur_feature = Record.Feature() 1615 self._cur_feature.key = content 1616 1617 def _add_feature(self): 1618 """Add a feature to the record, with relevant checks (PRIVATE). 1619 1620 This does all of the appropriate checking to make sure we haven't 1621 left any info behind, and that we are only adding info if it 1622 exists. 1623 """ 1624 if self._cur_feature is not None: 1625 # if we have a left over qualifier, add it to the qualifiers 1626 # on the current feature 1627 if self._cur_qualifier is not None: 1628 self._cur_feature.qualifiers.append(self._cur_qualifier) 1629 1630 self._cur_qualifier = None 1631 self.data.features.append(self._cur_feature) 1632 1633 def location(self, content): 1634 self._cur_feature.location = self._clean_location(content) 1635 1636 def feature_qualifier(self, key, value): 1637 self.feature_qualifier_name([key]) 1638 if value is not None: 1639 self.feature_qualifier_description(value) 1640 1641 def feature_qualifier_name(self, content_list): 1642 """Deal with qualifier names. 1643 1644 We receive a list of keys, since you can have valueless keys such as 1645 /pseudo which would be passed in with the next key (since no other 1646 tags separate them in the file) 1647 """ 1648 from . 
import Record 1649 1650 for content in content_list: 1651 # the record parser keeps the /s -- add them if we don't have 'em 1652 if not content.startswith("/"): 1653 content = "/%s" % content 1654 # add on a qualifier if we've got one 1655 if self._cur_qualifier is not None: 1656 self._cur_feature.qualifiers.append(self._cur_qualifier) 1657 1658 self._cur_qualifier = Record.Qualifier() 1659 self._cur_qualifier.key = content 1660 1661 def feature_qualifier_description(self, content): 1662 # if we have info then the qualifier key should have a ='s 1663 if "=" not in self._cur_qualifier.key: 1664 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1665 cur_content = self._remove_newlines(content) 1666 # remove all spaces from the value if it is a type where spaces 1667 # are not important 1668 for remove_space_key in self.__class__.remove_space_keys: 1669 if remove_space_key in self._cur_qualifier.key: 1670 cur_content = self._remove_spaces(cur_content) 1671 self._cur_qualifier.value = self._normalize_spaces(cur_content) 1672 1673 def base_count(self, content): 1674 self.data.base_counts = content 1675 1676 def origin_name(self, content): 1677 self.data.origin = content 1678 1679 def contig_location(self, content): 1680 """Signal that we have contig information to add to the record.""" 1681 self.data.contig = self._clean_location(content) 1682 1683 def sequence(self, content): 1684 """Add sequence information to a list of sequence strings. 1685 1686 This removes spaces in the data and uppercases the sequence, and 1687 then adds it to a list of sequences. Later on we'll join this 1688 list together to make the final sequence. This is faster than 1689 adding on the new string every time. 
1690 """ 1691 assert " " not in content 1692 self._seq_data.append(content.upper()) 1693 1694 def record_end(self, content): 1695 """Signal the end of the record and do any necessary clean-up.""" 1696 # add together all of the sequence parts to create the 1697 # final sequence string 1698 self.data.sequence = "".join(self._seq_data) 1699 # add on the last feature 1700 self._add_feature() 1701 1702 1703def parse(handle): 1704 """Iterate over GenBank formatted entries as Record objects. 1705 1706 >>> from Bio import GenBank 1707 >>> with open("GenBank/NC_000932.gb") as handle: 1708 ... for record in GenBank.parse(handle): 1709 ... print(record.accession) 1710 ['NC_000932'] 1711 1712 To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb") 1713 instead. 1714 """ 1715 return iter(Iterator(handle, RecordParser())) 1716 1717 1718def read(handle): 1719 """Read a handle containing a single GenBank entry as a Record object. 1720 1721 >>> from Bio import GenBank 1722 >>> with open("GenBank/NC_000932.gb") as handle: 1723 ... record = GenBank.read(handle) 1724 ... print(record.accession) 1725 ['NC_000932'] 1726 1727 To get a SeqRecord object use Bio.SeqIO.read(..., format="gb") 1728 instead. 1729 """ 1730 iterator = parse(handle) 1731 try: 1732 record = next(iterator) 1733 except StopIteration: 1734 raise ValueError("No records found in handle") from None 1735 try: 1736 next(iterator) 1737 raise ValueError("More than one record found in handle") 1738 except StopIteration: 1739 pass 1740 return record 1741 1742 1743if __name__ == "__main__": 1744 from Bio._utils import run_doctest 1745 1746 run_doctest() 1747