1# Copyright 2000-2003 Jeff Chang. 2# Copyright 2001-2008 Brad Chapman. 3# Copyright 2005-2016 by Peter Cock. 4# Copyright 2006-2009 Michiel de Hoon. 5# All rights reserved. 6# 7# This file is part of the Biopython distribution and governed by your 8# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 9# Please see the LICENSE file that should have been included as part of this 10# package. 11"""Represent a Sequence Feature holding info about a part of a sequence. 12 13This is heavily modeled after the Biocorba SeqFeature objects, and 14may be pretty biased towards GenBank stuff since I'm writing it 15for the GenBank parser output... 16 17What's here: 18 19Base class to hold a Feature 20---------------------------- 21 22Classes: 23 - SeqFeature 24 25Hold information about a Reference 26---------------------------------- 27 28This is an attempt to create a General class to hold Reference type 29information. 30 31Classes: 32 - Reference 33 34Specify locations of a feature on a Sequence 35-------------------------------------------- 36 37This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'. 38This has the advantages of allowing us to handle fuzzy stuff in case anyone 39needs it, and also be compatible with BioPerl etc and BioSQL. 40 41Classes: 42 - FeatureLocation - Specify the start and end location of a feature. 43 - CompoundLocation - Collection of FeatureLocation objects (for joins etc). 44 - ExactPosition - Specify the position as being exact. 45 - WithinPosition - Specify a position occurring within some range. 46 - BetweenPosition - Specify a position occurring between a range (OBSOLETE?). 47 - BeforePosition - Specify the position as being found before some base. 48 - AfterPosition - Specify the position as being found after some base. 49 - OneOfPosition - Specify a position where the location can be multiple positions. 50 - UncertainPosition - Specify a specific position which is uncertain. 51 - UnknownPosition - Represents missing information like '?' in UniProt. 52 53""" 54import functools 55 56from collections import OrderedDict 57 58from Bio.Seq import MutableSeq 59from Bio.Seq import reverse_complement 60from Bio.Seq import Seq 61 62 63class SeqFeature: 64 """Represent a Sequence Feature on an object. 65 66 Attributes: 67 - location - the location of the feature on the sequence (FeatureLocation) 68 - type - the specified type of the feature (ie. CDS, exon, repeat...) 69 - location_operator - a string specifying how this SeqFeature may 70 be related to others. For example, in the example GenBank feature 71 shown below, the location_operator would be "join". This is a proxy 72 for feature.location.operator and only applies to compound locations. 73 - strand - A value specifying on which strand (of a DNA sequence, for 74 instance) the feature deals with. 1 indicates the plus strand, -1 75 indicates the minus strand, 0 indicates stranded but unknown (? in GFF3), 76 while the default of None indicates that strand doesn't apply (dot in GFF3, 77 e.g. features on proteins). Note this is a shortcut for accessing the 78 strand property of the feature's location. 79 - id - A string identifier for the feature. 80 - ref - A reference to another sequence. This could be an accession 81 number for some different sequence. Note this is a shortcut for the 82 reference property of the feature's location. 83 - ref_db - A different database for the reference accession number. 84 Note this is a shortcut for the reference property of the location 85 - qualifiers - A dictionary of qualifiers on the feature. These are 86 analogous to the qualifiers from a GenBank feature table. The keys of 87 the dictionary are qualifier names, the values are the qualifier 88 values. As of Biopython 1.69 this is an ordered dictionary. 89 90 """ 91 92 def __init__( 93 self, 94 location=None, 95 type="", 96 location_operator="", 97 strand=None, 98 id="<unknown id>", 99 qualifiers=None, 100 sub_features=None, 101 ref=None, 102 ref_db=None, 103 ): 104 """Initialize a SeqFeature on a Sequence. 105 106 location can either be a FeatureLocation (with strand argument also 107 given if required), or None. 108 109 e.g. With no strand, on the forward strand, and on the reverse strand: 110 111 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 112 >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain") 113 >>> f1.strand == f1.location.strand == None 114 True 115 >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS") 116 >>> f2.strand == f2.location.strand == +1 117 True 118 >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS") 119 >>> f3.strand == f3.location.strand == -1 120 True 121 122 An invalid strand will trigger an exception: 123 124 >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2) 125 Traceback (most recent call last): 126 ... 127 ValueError: Strand should be +1, -1, 0 or None, not 2 128 129 Similarly if set via the FeatureLocation directly: 130 131 >>> loc4 = FeatureLocation(50, 60, strand=2) 132 Traceback (most recent call last): 133 ... 134 ValueError: Strand should be +1, -1, 0 or None, not 2 135 136 For exact start/end positions, an integer can be used (as shown above) 137 as shorthand for the ExactPosition object. For non-exact locations, the 138 FeatureLocation must be specified via the appropriate position objects. 139 140 Note that the strand, ref and ref_db arguments to the SeqFeature are 141 now obsolete and will be deprecated in a future release (which will 142 give warning messages) and later removed. Set them via the location 143 object instead. 144 145 Note that location_operator and sub_features arguments can no longer 146 be used, instead do this via the CompoundLocation object. 147 """ 148 if ( 149 location is not None 150 and not isinstance(location, FeatureLocation) 151 and not isinstance(location, CompoundLocation) 152 ): 153 raise TypeError( 154 "FeatureLocation, CompoundLocation (or None) required for the location" 155 ) 156 self.location = location 157 self.type = type 158 if location_operator: 159 # TODO - Deprecation warning 160 self.location_operator = location_operator 161 if strand is not None: 162 # TODO - Deprecation warning 163 self.strand = strand 164 self.id = id 165 if qualifiers is None: 166 qualifiers = OrderedDict() 167 self.qualifiers = qualifiers 168 if sub_features is not None: 169 raise TypeError("Rather than sub_features, use a CompoundFeatureLocation") 170 if ref is not None: 171 # TODO - Deprecation warning 172 self.ref = ref 173 if ref_db is not None: 174 # TODO - Deprecation warning 175 self.ref_db = ref_db 176 177 def _get_strand(self): 178 """Get function for the strand property (PRIVATE).""" 179 return self.location.strand 180 181 def _set_strand(self, value): 182 """Set function for the strand property (PRIVATE).""" 183 try: 184 self.location.strand = value 185 except AttributeError: 186 if self.location is None: 187 if value is not None: 188 raise ValueError("Can't set strand without a location.") from None 189 else: 190 raise 191 192 strand = property( 193 fget=_get_strand, 194 fset=_set_strand, 195 doc="""Feature's strand 196 197 This is a shortcut for feature.location.strand 198 """, 199 ) 200 201 def _get_ref(self): 202 """Get function for the reference property (PRIVATE).""" 203 try: 204 return self.location.ref 205 except AttributeError: 206 return None 207 208 def _set_ref(self, value): 209 """Set function for the reference property (PRIVATE).""" 210 try: 211 self.location.ref = value 212 except AttributeError: 213 if self.location is None: 214 if value is not None: 215 raise ValueError("Can't set ref without a location.") from None 216 else: 217 raise 218 219 ref = property( 220 fget=_get_ref, 221 fset=_set_ref, 222 doc="""Feature location reference (e.g. accession). 223 224 This is a shortcut for feature.location.ref 225 """, 226 ) 227 228 def _get_ref_db(self): 229 """Get function for the database reference property (PRIVATE).""" 230 try: 231 return self.location.ref_db 232 except AttributeError: 233 return None 234 235 def _set_ref_db(self, value): 236 """Set function for the database reference property (PRIVATE).""" 237 self.location.ref_db = value 238 239 ref_db = property( 240 fget=_get_ref_db, 241 fset=_set_ref_db, 242 doc="""Feature location reference's database. 243 244 This is a shortcut for feature.location.ref_db 245 """, 246 ) 247 248 def _get_location_operator(self): 249 """Get function for the location operator property (PRIVATE).""" 250 try: 251 return self.location.operator 252 except AttributeError: 253 return None 254 255 def _set_location_operator(self, value): 256 """Set function for the location operator property (PRIVATE).""" 257 if value: 258 if isinstance(self.location, CompoundLocation): 259 self.location.operator = value 260 elif self.location is None: 261 raise ValueError( 262 "Location is None so can't set its operator (to %r)" % value 263 ) 264 else: 265 raise ValueError("Only CompoundLocation gets an operator (%r)" % value) 266 267 location_operator = property( 268 fget=_get_location_operator, 269 fset=_set_location_operator, 270 doc="Location operator for compound locations (e.g. join).", 271 ) 272 273 def __repr__(self): 274 """Represent the feature as a string for debugging.""" 275 answer = "%s(%r" % (self.__class__.__name__, self.location) 276 if self.type: 277 answer += ", type=%r" % self.type 278 if self.location_operator: 279 answer += ", location_operator=%r" % self.location_operator 280 if self.id and self.id != "<unknown id>": 281 answer += ", id=%r" % self.id 282 if self.ref: 283 answer += ", ref=%r" % self.ref 284 if self.ref_db: 285 answer += ", ref_db=%r" % self.ref_db 286 answer += ")" 287 return answer 288 289 def __str__(self): 290 """Return the full feature as a python string.""" 291 out = "type: %s\n" % self.type 292 out += "location: %s\n" % self.location 293 if self.id and self.id != "<unknown id>": 294 out += "id: %s\n" % self.id 295 out += "qualifiers:\n" 296 for qual_key in sorted(self.qualifiers): 297 out += " Key: %s, Value: %s\n" % (qual_key, self.qualifiers[qual_key]) 298 return out 299 300 def _shift(self, offset): 301 """Return a copy of the feature with its location shifted (PRIVATE). 302 303 The annotation qaulifiers are copied. 304 """ 305 return SeqFeature( 306 location=self.location._shift(offset), 307 type=self.type, 308 location_operator=self.location_operator, 309 id=self.id, 310 qualifiers=OrderedDict(self.qualifiers.items()), 311 ) 312 313 def _flip(self, length): 314 """Return a copy of the feature with its location flipped (PRIVATE). 315 316 The argument length gives the length of the parent sequence. For 317 example a location 0..20 (+1 strand) with parent length 30 becomes 318 after flipping 10..30 (-1 strand). Strandless (None) or unknown 319 strand (0) remain like that - just their end points are changed. 320 321 The annotation qaulifiers are copied. 322 """ 323 return SeqFeature( 324 location=self.location._flip(length), 325 type=self.type, 326 location_operator=self.location_operator, 327 id=self.id, 328 qualifiers=OrderedDict(self.qualifiers.items()), 329 ) 330 331 def extract(self, parent_sequence, references=None): 332 """Extract the feature's sequence from supplied parent sequence. 333 334 The parent_sequence can be a Seq like object or a string, and will 335 generally return an object of the same type. The exception to this is 336 a MutableSeq as the parent sequence will return a Seq object. 337 338 This should cope with complex locations including complements, joins 339 and fuzzy positions. Even mixed strand features should work! This 340 also covers features on protein sequences (e.g. domains), although 341 here reverse strand features are not permitted. If the 342 location refers to other records, they must be supplied in the 343 optional dictionary references. 344 345 >>> from Bio.Seq import Seq 346 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 347 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL") 348 >>> f = SeqFeature(FeatureLocation(8, 15), type="domain") 349 >>> f.extract(seq) 350 Seq('VALIVIC') 351 352 If the FeatureLocation is None, e.g. when parsing invalid locus 353 locations in the GenBank parser, extract() will raise a ValueError. 354 355 >>> from Bio.Seq import Seq 356 >>> from Bio.SeqFeature import SeqFeature 357 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL") 358 >>> f = SeqFeature(None, type="domain") 359 >>> f.extract(seq) 360 Traceback (most recent call last): 361 ... 362 ValueError: The feature's .location is None. Check the sequence file for a valid location. 363 364 Note - currently only compound features of type "join" are supported. 365 """ 366 if self.location is None: 367 raise ValueError( 368 "The feature's .location is None. Check the " 369 "sequence file for a valid location." 370 ) 371 return self.location.extract(parent_sequence, references=references) 372 373 def translate( 374 self, 375 parent_sequence, 376 table="Standard", 377 start_offset=None, 378 stop_symbol="*", 379 to_stop=False, 380 cds=None, 381 gap=None, 382 ): 383 """Get a translation of the feature's sequence. 384 385 This method is intended for CDS or other features that code proteins 386 and is a shortcut that will both extract the feature and 387 translate it, taking into account the codon_start and transl_table 388 qualifiers, if they are present. If they are not present the 389 value of the arguments "table" and "start_offset" are used. 390 391 The "cds" parameter is set to "True" if the feature is of type 392 "CDS" but can be overridden by giving an explicit argument. 393 394 The arguments stop_symbol, to_stop and gap have the same meaning 395 as Seq.translate, refer to that documentation for further information. 396 397 Arguments: 398 - parent_sequence - A DNA or RNA sequence. 399 - table - Which codon table to use if there is no transl_table 400 qualifier for this feature. This can be either a name 401 (string), an NCBI identifier (integer), or a CodonTable 402 object (useful for non-standard genetic codes). This 403 defaults to the "Standard" table. 404 - start_offset - offset at which the first complete codon of a 405 coding feature can be found, relative to the first base of 406 that feature. Has a valid value of 0, 1 or 2. NOTE: this 407 uses python's 0-based numbering whereas the codon_start 408 qualifier in files from NCBI use 1-based numbering. 409 Will override a codon_start qualifier 410 411 >>> from Bio.Seq import Seq 412 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 413 >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA") 414 >>> f = SeqFeature(FeatureLocation(0, 30), type="CDS") 415 >>> f.qualifiers['transl_table'] = [11] 416 417 Note that features of type CDS are subject to the usual 418 checks at translation. But you can override this behaviour 419 by giving explicit arguments: 420 421 >>> f.translate(seq, cds=False) 422 Seq('GYTYR*CL**') 423 424 Now use the start_offset argument to change the frame. Note 425 this uses python 0-based numbering. 426 427 >>> f.translate(seq, start_offset=1, cds=False) 428 Seq('VTLTDNVSD') 429 430 Alternatively use the codon_start qualifier to do the same 431 thing. Note: this uses 1-based numbering, which is found 432 in files from NCBI. 433 434 >>> f.qualifiers['codon_start'] = [2] 435 >>> f.translate(seq, cds=False) 436 Seq('VTLTDNVSD') 437 """ 438 # see if this feature should be translated in a different 439 # frame using the "codon_start" qualifier 440 if start_offset is None: 441 try: 442 start_offset = int(self.qualifiers["codon_start"][0]) - 1 443 except KeyError: 444 start_offset = 0 445 446 if start_offset not in [0, 1, 2]: 447 raise ValueError( 448 "The start_offset must be 0, 1, or 2. " 449 f"The supplied value is {start_offset}. " 450 "Check the value of either the codon_start qualifier " 451 "or the start_offset argument" 452 ) 453 454 feat_seq = self.extract(parent_sequence)[start_offset:] 455 codon_table = self.qualifiers.get("transl_table", [table])[0] 456 457 if cds is None: 458 cds = self.type == "CDS" 459 460 return feat_seq.translate( 461 table=codon_table, 462 stop_symbol=stop_symbol, 463 to_stop=to_stop, 464 cds=cds, 465 gap=gap, 466 ) 467 468 def __bool__(self): 469 """Boolean value of an instance of this class (True). 470 471 This behaviour is for backwards compatibility, since until the 472 __len__ method was added, a SeqFeature always evaluated as True. 473 474 Note that in comparison, Seq objects, strings, lists, etc, will all 475 evaluate to False if they have length zero. 476 477 WARNING: The SeqFeature may in future evaluate to False when its 478 length is zero (in order to better match normal python behaviour)! 479 """ 480 return True 481 482 def __len__(self): 483 """Return the length of the region where the feature is located. 484 485 >>> from Bio.Seq import Seq 486 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 487 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL") 488 >>> f = SeqFeature(FeatureLocation(8, 15), type="domain") 489 >>> len(f) 490 7 491 >>> f.extract(seq) 492 Seq('VALIVIC') 493 >>> len(f.extract(seq)) 494 7 495 496 This is a proxy for taking the length of the feature's location: 497 498 >>> len(f.location) 499 7 500 501 For simple features this is the same as the region spanned (end 502 position minus start position using Pythonic counting). However, for 503 a compound location (e.g. a CDS as the join of several exons) the 504 gaps are not counted (e.g. introns). This ensures that len(f) matches 505 len(f.extract(parent_seq)), and also makes sure things work properly 506 with features wrapping the origin etc. 507 """ 508 return len(self.location) 509 510 def __iter__(self): 511 """Iterate over the parent positions within the feature. 512 513 The iteration order is strand aware, and can be thought of as moving 514 along the feature using the parent sequence coordinates: 515 516 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 517 >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1) 518 >>> len(f) 519 5 520 >>> for i in f: print(i) 521 9 522 8 523 7 524 6 525 5 526 >>> list(f) 527 [9, 8, 7, 6, 5] 528 529 This is a proxy for iterating over the location, 530 531 >>> list(f.location) 532 [9, 8, 7, 6, 5] 533 """ 534 return iter(self.location) 535 536 def __contains__(self, value): 537 """Check if an integer position is within the feature. 538 539 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 540 >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1) 541 >>> len(f) 542 5 543 >>> [i for i in range(15) if i in f] 544 [5, 6, 7, 8, 9] 545 546 For example, to see which features include a SNP position, you could 547 use this: 548 549 >>> from Bio import SeqIO 550 >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb") 551 >>> for f in record.features: 552 ... if 1750 in f: 553 ... print("%s %s" % (f.type, f.location)) 554 source [0:154478](+) 555 gene [1716:4347](-) 556 tRNA join{[4310:4347](-), [1716:1751](-)} 557 558 Note that for a feature defined as a join of several subfeatures (e.g. 559 the union of several exons) the gaps are not checked (e.g. introns). 560 In this example, the tRNA location is defined in the GenBank file as 561 complement(join(1717..1751,4311..4347)), so that position 1760 falls 562 in the gap: 563 564 >>> for f in record.features: 565 ... if 1760 in f: 566 ... print("%s %s" % (f.type, f.location)) 567 source [0:154478](+) 568 gene [1716:4347](-) 569 570 Note that additional care may be required with fuzzy locations, for 571 example just before a BeforePosition: 572 573 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 574 >>> from Bio.SeqFeature import BeforePosition 575 >>> f = SeqFeature(FeatureLocation(BeforePosition(3), 8), type="domain") 576 >>> len(f) 577 5 578 >>> [i for i in range(10) if i in f] 579 [3, 4, 5, 6, 7] 580 581 Note that is is a proxy for testing membership on the location. 582 583 >>> [i for i in range(10) if i in f.location] 584 [3, 4, 5, 6, 7] 585 """ 586 return value in self.location 587 588 589# --- References 590 591 592# TODO -- Will this hold PubMed and Medline information decently? 593class Reference: 594 """Represent a Generic Reference object. 595 596 Attributes: 597 - location - A list of Location objects specifying regions of 598 the sequence that the references correspond to. If no locations are 599 specified, the entire sequence is assumed. 600 - authors - A big old string, or a list split by author, of authors 601 for the reference. 602 - title - The title of the reference. 603 - journal - Journal the reference was published in. 604 - medline_id - A medline reference for the article. 605 - pubmed_id - A pubmed reference for the article. 606 - comment - A place to stick any comments about the reference. 607 608 """ 609 610 def __init__(self): 611 """Initialize the class.""" 612 self.location = [] 613 self.authors = "" 614 self.consrtm = "" 615 self.title = "" 616 self.journal = "" 617 self.medline_id = "" 618 self.pubmed_id = "" 619 self.comment = "" 620 621 def __str__(self): 622 """Return the full Reference object as a python string.""" 623 out = "" 624 for single_location in self.location: 625 out += "location: %s\n" % single_location 626 out += "authors: %s\n" % self.authors 627 if self.consrtm: 628 out += "consrtm: %s\n" % self.consrtm 629 out += "title: %s\n" % self.title 630 out += "journal: %s\n" % self.journal 631 out += "medline id: %s\n" % self.medline_id 632 out += "pubmed id: %s\n" % self.pubmed_id 633 out += "comment: %s\n" % self.comment 634 return out 635 636 def __repr__(self): 637 """Represent the Reference object as a string for debugging.""" 638 # TODO - Update this is __init__ later accepts values 639 return "%s(title=%r, ...)" % (self.__class__.__name__, self.title) 640 641 def __eq__(self, other): 642 """Check if two Reference objects should be considered equal. 643 644 Note prior to Biopython 1.70 the location was not compared, as 645 until then __eq__ for the FeatureLocation class was not defined. 646 """ 647 return ( 648 self.authors == other.authors 649 and self.consrtm == other.consrtm 650 and self.title == other.title 651 and self.journal == other.journal 652 and self.medline_id == other.medline_id 653 and self.pubmed_id == other.pubmed_id 654 and self.comment == other.comment 655 and self.location == other.location 656 ) 657 658 659# --- Handling feature locations 660 661 662class FeatureLocation: 663 """Specify the location of a feature along a sequence. 664 665 The FeatureLocation is used for simple continuous features, which can 666 be described as running from a start position to and end position 667 (optionally with a strand and reference information). More complex 668 locations made up from several non-continuous parts (e.g. a coding 669 sequence made up of several exons) are described using a SeqFeature 670 with a CompoundLocation. 671 672 Note that the start and end location numbering follow Python's scheme, 673 thus a GenBank entry of 123..150 (one based counting) becomes a location 674 of [122:150] (zero based counting). 675 676 >>> from Bio.SeqFeature import FeatureLocation 677 >>> f = FeatureLocation(122, 150) 678 >>> print(f) 679 [122:150] 680 >>> print(f.start) 681 122 682 >>> print(f.end) 683 150 684 >>> print(f.strand) 685 None 686 687 Note the strand defaults to None. If you are working with nucleotide 688 sequences you'd want to be explicit if it is the forward strand: 689 690 >>> from Bio.SeqFeature import FeatureLocation 691 >>> f = FeatureLocation(122, 150, strand=+1) 692 >>> print(f) 693 [122:150](+) 694 >>> print(f.strand) 695 1 696 697 Note that for a parent sequence of length n, the FeatureLocation 698 start and end must satisfy the inequality 0 <= start <= end <= n. 699 This means even for features on the reverse strand of a nucleotide 700 sequence, we expect the 'start' coordinate to be less than the 701 'end'. 702 703 >>> from Bio.SeqFeature import FeatureLocation 704 >>> r = FeatureLocation(122, 150, strand=-1) 705 >>> print(r) 706 [122:150](-) 707 >>> print(r.start) 708 122 709 >>> print(r.end) 710 150 711 >>> print(r.strand) 712 -1 713 714 i.e. Rather than thinking of the 'start' and 'end' biologically in a 715 strand aware manner, think of them as the 'left most' or 'minimum' 716 boundary, and the 'right most' or 'maximum' boundary of the region 717 being described. This is particularly important with compound 718 locations describing non-continuous regions. 719 720 In the example above we have used standard exact positions, but there 721 are also specialised position objects used to represent fuzzy positions 722 as well, for example a GenBank location like complement(<123..150) 723 would use a BeforePosition object for the start. 724 """ 725 726 def __init__(self, start, end, strand=None, ref=None, ref_db=None): 727 """Initialize the class. 728 729 start and end arguments specify the values where the feature begins 730 and ends. These can either by any of the ``*Position`` objects that 731 inherit from AbstractPosition, or can just be integers specifying the 732 position. In the case of integers, the values are assumed to be 733 exact and are converted in ExactPosition arguments. This is meant 734 to make it easy to deal with non-fuzzy ends. 735 736 i.e. Short form: 737 738 >>> from Bio.SeqFeature import FeatureLocation 739 >>> loc = FeatureLocation(5, 10, strand=-1) 740 >>> print(loc) 741 [5:10](-) 742 743 Explicit form: 744 745 >>> from Bio.SeqFeature import FeatureLocation, ExactPosition 746 >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1) 747 >>> print(loc) 748 [5:10](-) 749 750 Other fuzzy positions are used similarly, 751 752 >>> from Bio.SeqFeature import FeatureLocation 753 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 754 >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1) 755 >>> print(loc2) 756 [<5:>10](-) 757 758 For nucleotide features you will also want to specify the strand, 759 use 1 for the forward (plus) strand, -1 for the reverse (negative) 760 strand, 0 for stranded but strand unknown (? in GFF3), or None for 761 when the strand does not apply (dot in GFF3), e.g. features on 762 proteins. 763 764 >>> loc = FeatureLocation(5, 10, strand=+1) 765 >>> print(loc) 766 [5:10](+) 767 >>> print(loc.strand) 768 1 769 770 Normally feature locations are given relative to the parent 771 sequence you are working with, but an explicit accession can 772 be given with the optional ref and db_ref strings: 773 774 >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1) 775 >>> print(loc) 776 AL391218.9[105172:108462](+) 777 >>> print(loc.ref) 778 AL391218.9 779 780 """ 781 # TODO - Check 0 <= start <= end (<= length of reference) 782 if isinstance(start, AbstractPosition): 783 self._start = start 784 elif isinstance(start, int): 785 self._start = ExactPosition(start) 786 else: 787 raise TypeError("start=%r %s" % (start, type(start))) 788 if isinstance(end, AbstractPosition): 789 self._end = end 790 elif isinstance(end, int): 791 self._end = ExactPosition(end) 792 else: 793 raise TypeError("end=%r %s" % (end, type(end))) 794 if ( 795 isinstance(self.start.position, int) 796 and isinstance(self.end.position, int) 797 and self.start > self.end 798 ): 799 raise ValueError( 800 f"End location ({self.end}) must be greater than " 801 f"or equal to start location ({self.start})" 802 ) 803 self.strand = strand 804 self.ref = ref 805 self.ref_db = ref_db 806 807 def _get_strand(self): 808 """Get function for the strand property (PRIVATE).""" 809 return self._strand 810 811 def _set_strand(self, value): 812 """Set function for the strand property (PRIVATE).""" 813 if value not in [+1, -1, 0, None]: 814 raise ValueError("Strand should be +1, -1, 0 or None, not %r" % value) 815 self._strand = value 816 817 strand = property( 818 fget=_get_strand, 819 fset=_set_strand, 820 doc="Strand of the location (+1, -1, 0 or None).", 821 ) 822 823 def __str__(self): 824 """Return a representation of the FeatureLocation object (with python counting). 825 826 For the simple case this uses the python splicing syntax, [122:150] 827 (zero based counting) which GenBank would call 123..150 (one based 828 counting). 829 """ 830 answer = "[%s:%s]" % (self._start, self._end) 831 if self.ref and self.ref_db: 832 answer = "%s:%s%s" % (self.ref_db, self.ref, answer) 833 elif self.ref: 834 answer = self.ref + answer 835 # Is ref_db without ref meaningful? 836 if self.strand is None: 837 return answer 838 elif self.strand == +1: 839 return answer + "(+)" 840 elif self.strand == -1: 841 return answer + "(-)" 842 else: 843 # strand = 0, stranded but strand unknown, ? in GFF3 844 return answer + "(?)" 845 846 def __repr__(self): 847 """Represent the FeatureLocation object as a string for debugging.""" 848 optional = "" 849 if self.strand is not None: 850 optional += ", strand=%r" % self.strand 851 if self.ref is not None: 852 optional += ", ref=%r" % self.ref 853 if self.ref_db is not None: 854 optional += ", ref_db=%r" % self.ref_db 855 return "%s(%r, %r%s)" % ( 856 self.__class__.__name__, 857 self.start, 858 self.end, 859 optional, 860 ) 861 862 def __add__(self, other): 863 """Combine location with another FeatureLocation object, or shift it. 864 865 You can add two feature locations to make a join CompoundLocation: 866 867 >>> from Bio.SeqFeature import FeatureLocation 868 >>> f1 = FeatureLocation(5, 10) 869 >>> f2 = FeatureLocation(20, 30) 870 >>> combined = f1 + f2 871 >>> print(combined) 872 join{[5:10], [20:30]} 873 874 This is thus equivalent to: 875 876 >>> from Bio.SeqFeature import CompoundLocation 877 >>> join = CompoundLocation([f1, f2]) 878 >>> print(join) 879 join{[5:10], [20:30]} 880 881 You can also use sum(...) in this way: 882 883 >>> join = sum([f1, f2]) 884 >>> print(join) 885 join{[5:10], [20:30]} 886 887 Furthermore, you can combine a FeatureLocation with a CompoundLocation 888 in this way. 889 890 Separately, adding an integer will give a new FeatureLocation with 891 its start and end offset by that amount. For example: 892 893 >>> print(f1) 894 [5:10] 895 >>> print(f1 + 100) 896 [105:110] 897 >>> print(200 + f1) 898 [205:210] 899 900 This can be useful when editing annotation. 901 """ 902 if isinstance(other, FeatureLocation): 903 return CompoundLocation([self, other]) 904 elif isinstance(other, int): 905 return self._shift(other) 906 else: 907 # This will allow CompoundLocation's __radd__ to be called: 908 return NotImplemented 909 910 def __radd__(self, other): 911 """Add a feature locationanother FeatureLocation object to the left.""" 912 if isinstance(other, int): 913 return self._shift(other) 914 else: 915 return NotImplemented 916 917 def __nonzero__(self): 918 """Return True regardless of the length of the feature. 919 920 This behaviour is for backwards compatibility, since until the 921 __len__ method was added, a FeatureLocation always evaluated as True. 922 923 Note that in comparison, Seq objects, strings, lists, etc, will all 924 evaluate to False if they have length zero. 925 926 WARNING: The FeatureLocation may in future evaluate to False when its 927 length is zero (in order to better match normal python behaviour)! 928 """ 929 return True 930 931 def __len__(self): 932 """Return the length of the region described by the FeatureLocation object. 933 934 Note that extra care may be needed for fuzzy locations, e.g. 935 936 >>> from Bio.SeqFeature import FeatureLocation 937 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 938 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 939 >>> len(loc) 940 5 941 """ 942 return int(self._end) - int(self._start) 943 944 def __contains__(self, value): 945 """Check if an integer position is within the FeatureLocation object. 946 947 Note that extra care may be needed for fuzzy locations, e.g. 948 949 >>> from Bio.SeqFeature import FeatureLocation 950 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 951 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 952 >>> len(loc) 953 5 954 >>> [i for i in range(15) if i in loc] 955 [5, 6, 7, 8, 9] 956 """ 957 if not isinstance(value, int): 958 raise ValueError( 959 "Currently we only support checking for integer " 960 "positions being within a FeatureLocation." 961 ) 962 if value < self._start or value >= self._end: 963 return False 964 else: 965 return True 966 967 def __iter__(self): 968 """Iterate over the parent positions within the FeatureLocation object. 969 970 >>> from Bio.SeqFeature import FeatureLocation 971 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 972 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 973 >>> len(loc) 974 5 975 >>> for i in loc: print(i) 976 5 977 6 978 7 979 8 980 9 981 >>> list(loc) 982 [5, 6, 7, 8, 9] 983 >>> [i for i in range(15) if i in loc] 984 [5, 6, 7, 8, 9] 985 986 Note this is strand aware: 987 988 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1) 989 >>> list(loc) 990 [9, 8, 7, 6, 5] 991 """ 992 if self.strand == -1: 993 yield from range(self._end - 1, self._start - 1, -1) 994 else: 995 yield from range(self._start, self._end) 996 997 def __eq__(self, other): 998 """Implement equality by comparing all the location attributes.""" 999 if not isinstance(other, FeatureLocation): 1000 return False 1001 return ( 1002 self._start == other.start 1003 and self._end == other.end 1004 and self._strand == other.strand 1005 and self.ref == other.ref 1006 and self.ref_db == other.ref_db 1007 ) 1008 1009 def _shift(self, offset): 1010 """Return a copy of the FeatureLocation shifted by an offset (PRIVATE). 1011 1012 Returns self when location is relative to an external reference. 1013 """ 1014 # TODO - What if offset is a fuzzy position? 1015 if self.ref or self.ref_db: 1016 return self 1017 return FeatureLocation( 1018 start=self._start._shift(offset), 1019 end=self._end._shift(offset), 1020 strand=self.strand, 1021 ) 1022 1023 def _flip(self, length): 1024 """Return a copy of the location after the parent is reversed (PRIVATE). 1025 1026 Returns self when location is relative to an external reference. 1027 """ 1028 if self.ref or self.ref_db: 1029 return self 1030 # Note this will flip the start and end too! 1031 if self.strand == +1: 1032 flip_strand = -1 1033 elif self.strand == -1: 1034 flip_strand = +1 1035 else: 1036 # 0 or None 1037 flip_strand = self.strand 1038 return FeatureLocation( 1039 start=self._end._flip(length), 1040 end=self._start._flip(length), 1041 strand=flip_strand, 1042 ) 1043 1044 @property 1045 def parts(self): 1046 """Read only list of sections (always one, the FeatureLocation object). 1047 1048 This is a convenience property allowing you to write code handling 1049 both simple FeatureLocation objects (with one part) and more complex 1050 CompoundLocation objects (with multiple parts) interchangeably. 1051 """ 1052 return [self] 1053 1054 @property 1055 def start(self): 1056 """Start location - left most (minimum) value, regardless of strand. 1057 1058 Read only, returns an integer like position object, possibly a fuzzy 1059 position. 1060 """ 1061 return self._start 1062 1063 @property 1064 def end(self): 1065 """End location - right most (maximum) value, regardless of strand. 1066 1067 Read only, returns an integer like position object, possibly a fuzzy 1068 position. 1069 """ 1070 return self._end 1071 1072 @property 1073 def nofuzzy_start(self): 1074 """Start position (integer, approximated if fuzzy, read only) (OBSOLETE). 1075 1076 This is now an alias for int(feature.start), which should be 1077 used in preference -- unless you are trying to support old 1078 versions of Biopython. 1079 """ 1080 try: 1081 return int(self._start) 1082 except TypeError: 1083 if isinstance(self._start, UnknownPosition): 1084 return None 1085 raise 1086 1087 @property 1088 def nofuzzy_end(self): 1089 """End position (integer, approximated if fuzzy, read only) (OBSOLETE). 1090 1091 This is now an alias for int(feature.end), which should be 1092 used in preference -- unless you are trying to support old 1093 versions of Biopython. 1094 """ 1095 try: 1096 return int(self._end) 1097 except TypeError: 1098 if isinstance(self._end, UnknownPosition): 1099 return None 1100 raise 1101 1102 def extract(self, parent_sequence, references=None): 1103 """Extract the sequence from supplied parent sequence using the FeatureLocation object. 1104 1105 The parent_sequence can be a Seq like object or a string, and will 1106 generally return an object of the same type. The exception to this is 1107 a MutableSeq as the parent sequence will return a Seq object. 1108 If the location refers to other records, they must be supplied 1109 in the optional dictionary references. 1110 1111 >>> from Bio.Seq import Seq 1112 >>> from Bio.SeqFeature import FeatureLocation 1113 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL") 1114 >>> feature_loc = FeatureLocation(8, 15) 1115 >>> feature_loc.extract(seq) 1116 Seq('VALIVIC') 1117 1118 """ 1119 if self.ref or self.ref_db: 1120 if not references: 1121 raise ValueError( 1122 f"Feature references another sequence ({self.ref})," 1123 " references mandatory" 1124 ) 1125 elif self.ref not in references: 1126 # KeyError? 1127 raise ValueError( 1128 f"Feature references another sequence ({self.ref})," 1129 " not found in references" 1130 ) 1131 parent_sequence = references[self.ref] 1132 try: 1133 # If was a SeqRecord, just take the sequence 1134 # (should focus on the annotation of the feature) 1135 parent_sequence = parent_sequence.seq 1136 except AttributeError: 1137 pass 1138 if isinstance(parent_sequence, MutableSeq): 1139 # This avoids complications with reverse complements 1140 # (the MutableSeq reverse complement acts in situ) 1141 parent_sequence = Seq(parent_sequence) 1142 f_seq = parent_sequence[self.nofuzzy_start : self.nofuzzy_end] 1143 if self.strand == -1: 1144 try: 1145 f_seq = f_seq.reverse_complement() 1146 except AttributeError: 1147 assert isinstance(f_seq, str) 1148 f_seq = reverse_complement(f_seq) 1149 return f_seq 1150 1151 1152class CompoundLocation: 1153 """For handling joins etc where a feature location has several parts.""" 1154 1155 def __init__(self, parts, operator="join"): 1156 """Initialize the class. 1157 1158 >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation 1159 >>> f1 = FeatureLocation(10, 40, strand=+1) 1160 >>> f2 = FeatureLocation(50, 59, strand=+1) 1161 >>> f = CompoundLocation([f1, f2]) 1162 >>> len(f) == len(f1) + len(f2) == 39 == len(list(f)) 1163 True 1164 >>> print(f.operator) 1165 join 1166 >>> 5 in f 1167 False 1168 >>> 15 in f 1169 True 1170 >>> f.strand 1171 1 1172 1173 Notice that the strand of the compound location is computed 1174 automatically - in the case of mixed strands on the sub-locations 1175 the overall strand is set to None. 1176 1177 >>> f = CompoundLocation([FeatureLocation(3, 6, strand=+1), 1178 ... FeatureLocation(10, 13, strand=-1)]) 1179 >>> print(f.strand) 1180 None 1181 >>> len(f) 1182 6 1183 >>> list(f) 1184 [3, 4, 5, 12, 11, 10] 1185 1186 The example above doing list(f) iterates over the coordinates within the 1187 feature. This allows you to use max and min on the location, to find the 1188 range covered: 1189 1190 >>> min(f) 1191 3 1192 >>> max(f) 1193 12 1194 1195 More generally, you can use the compound location's start and end which 1196 give the full span covered, 0 <= start <= end <= full sequence length. 1197 1198 >>> f.start == min(f) 1199 True 1200 >>> f.end == max(f) + 1 1201 True 1202 1203 This is consistent with the behaviour of the simple FeatureLocation for 1204 a single region, where again the 'start' and 'end' do not necessarily 1205 give the biological start and end, but rather the 'minimal' and 'maximal' 1206 coordinate boundaries. 1207 1208 Note that adding locations provides a more intuitive method of 1209 construction: 1210 1211 >>> f = FeatureLocation(3, 6, strand=+1) + FeatureLocation(10, 13, strand=-1) 1212 >>> len(f) 1213 6 1214 >>> list(f) 1215 [3, 4, 5, 12, 11, 10] 1216 """ 1217 self.operator = operator 1218 self.parts = list(parts) 1219 for loc in self.parts: 1220 if not isinstance(loc, FeatureLocation): 1221 raise ValueError( 1222 "CompoundLocation should be given a list of " 1223 "FeatureLocation objects, not %s" % loc.__class__ 1224 ) 1225 if len(parts) < 2: 1226 raise ValueError( 1227 "CompoundLocation should have at least 2 parts, not %r" % parts 1228 ) 1229 1230 def __str__(self): 1231 """Return a representation of the CompoundLocation object (with python counting).""" 1232 return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts)) 1233 1234 def __repr__(self): 1235 """Represent the CompoundLocation object as string for debugging.""" 1236 return "%s(%r, %r)" % (self.__class__.__name__, self.parts, self.operator) 1237 1238 def _get_strand(self): 1239 """Get function for the strand property (PRIVATE).""" 1240 # Historically a join on the reverse strand has been represented 1241 # in Biopython with both the parent SeqFeature and its children 1242 # (the exons for a CDS) all given a strand of -1. Likewise, for 1243 # a join feature on the forward strand they all have strand +1. 1244 # However, we must also consider evil mixed strand examples like 1245 # this, join(complement(69611..69724),139856..140087,140625..140650) 1246 if len({loc.strand for loc in self.parts}) == 1: 1247 return self.parts[0].strand 1248 else: 1249 return None # i.e. mixed strands 1250 1251 def _set_strand(self, value): 1252 """Set function for the strand property (PRIVATE).""" 1253 # Should this be allowed/encouraged? 1254 for loc in self.parts: 1255 loc.strand = value 1256 1257 strand = property( 1258 fget=_get_strand, 1259 fset=_set_strand, 1260 doc="""Overall strand of the compound location. 1261 1262 If all the parts have the same strand, that is returned. Otherwise 1263 for mixed strands, this returns None. 1264 1265 >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation 1266 >>> f1 = FeatureLocation(15, 17, strand=1) 1267 >>> f2 = FeatureLocation(20, 30, strand=-1) 1268 >>> f = f1 + f2 1269 >>> f1.strand 1270 1 1271 >>> f2.strand 1272 -1 1273 >>> f.strand 1274 >>> f.strand is None 1275 True 1276 1277 If you set the strand of a CompoundLocation, this is applied to 1278 all the parts - use with caution: 1279 1280 >>> f.strand = 1 1281 >>> f1.strand 1282 1 1283 >>> f2.strand 1284 1 1285 >>> f.strand 1286 1 1287 1288 """, 1289 ) 1290 1291 def __add__(self, other): 1292 """Combine locations, or shift the location by an integer offset. 1293 1294 >>> from Bio.SeqFeature import FeatureLocation 1295 >>> f1 = FeatureLocation(15, 17) + FeatureLocation(20, 30) 1296 >>> print(f1) 1297 join{[15:17], [20:30]} 1298 1299 You can add another FeatureLocation: 1300 1301 >>> print(f1 + FeatureLocation(40, 50)) 1302 join{[15:17], [20:30], [40:50]} 1303 >>> print(FeatureLocation(5, 10) + f1) 1304 join{[5:10], [15:17], [20:30]} 1305 1306 You can also add another CompoundLocation: 1307 1308 >>> f2 = FeatureLocation(40, 50) + FeatureLocation(60, 70) 1309 >>> print(f2) 1310 join{[40:50], [60:70]} 1311 >>> print(f1 + f2) 1312 join{[15:17], [20:30], [40:50], [60:70]} 1313 1314 Also, as with the FeatureLocation, adding an integer shifts the 1315 location's co-ordinates by that offset: 1316 1317 >>> print(f1 + 100) 1318 join{[115:117], [120:130]} 1319 >>> print(200 + f1) 1320 join{[215:217], [220:230]} 1321 >>> print(f1 + (-5)) 1322 join{[10:12], [15:25]} 1323 """ 1324 if isinstance(other, FeatureLocation): 1325 return CompoundLocation(self.parts + [other], self.operator) 1326 elif isinstance(other, CompoundLocation): 1327 if self.operator != other.operator: 1328 # Handle join+order -> order as a special case? 1329 raise ValueError( 1330 "Mixed operators %s and %s" % (self.operator, other.operator) 1331 ) 1332 return CompoundLocation(self.parts + other.parts, self.operator) 1333 elif isinstance(other, int): 1334 return self._shift(other) 1335 else: 1336 raise NotImplementedError 1337 1338 def __radd__(self, other): 1339 """Add a feature to the left.""" 1340 if isinstance(other, FeatureLocation): 1341 return CompoundLocation([other] + self.parts, self.operator) 1342 elif isinstance(other, int): 1343 return self._shift(other) 1344 else: 1345 raise NotImplementedError 1346 1347 def __contains__(self, value): 1348 """Check if an integer position is within the CompoundLocation object.""" 1349 for loc in self.parts: 1350 if value in loc: 1351 return True 1352 return False 1353 1354 def __nonzero__(self): 1355 """Return True regardless of the length of the feature. 1356 1357 This behaviour is for backwards compatibility, since until the 1358 __len__ method was added, a FeatureLocation always evaluated as True. 1359 1360 Note that in comparison, Seq objects, strings, lists, etc, will all 1361 evaluate to False if they have length zero. 1362 1363 WARNING: The FeatureLocation may in future evaluate to False when its 1364 length is zero (in order to better match normal python behaviour)! 1365 """ 1366 return True 1367 1368 def __len__(self): 1369 """Return the length of the CompoundLocation object.""" 1370 return sum(len(loc) for loc in self.parts) 1371 1372 def __iter__(self): 1373 """Iterate over the parent positions within the CompoundLocation object.""" 1374 for loc in self.parts: 1375 yield from loc 1376 1377 def __eq__(self, other): 1378 """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation.""" 1379 if not isinstance(other, CompoundLocation): 1380 return False 1381 if len(self.parts) != len(other.parts): 1382 return False 1383 if self.operator != other.operator: 1384 return False 1385 for self_part, other_part in zip(self.parts, other.parts): 1386 if self_part != other_part: 1387 return False 1388 return True 1389 1390 def _shift(self, offset): 1391 """Return a copy of the CompoundLocation shifted by an offset (PRIVATE).""" 1392 return CompoundLocation( 1393 [loc._shift(offset) for loc in self.parts], self.operator 1394 ) 1395 1396 def _flip(self, length): 1397 """Return a copy of the locations after the parent is reversed (PRIVATE). 1398 1399 Note that the order of the parts is NOT reversed too. Consider a CDS 1400 on the forward strand with exons small, medium and large (in length). 1401 Once we change the frame of reference to the reverse complement strand, 1402 the start codon is still part of the small exon, and the stop codon 1403 still part of the large exon - so the part order remains the same! 1404 1405 Here is an artificial example, were the features map to the two upper 1406 case regions and the lower case runs of n are not used: 1407 1408 >>> from Bio.Seq import Seq 1409 >>> from Bio.SeqFeature import FeatureLocation 1410 >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn") 1411 >>> small = FeatureLocation(5, 20, strand=1) 1412 >>> large = FeatureLocation(28, 52, strand=1) 1413 >>> location = small + large 1414 >>> print(small) 1415 [5:20](+) 1416 >>> print(large) 1417 [28:52](+) 1418 >>> print(location) 1419 join{[5:20](+), [28:52](+)} 1420 >>> for part in location.parts: 1421 ... print(len(part)) 1422 ... 1423 15 1424 24 1425 1426 As you can see, this is a silly example where each "exon" is a word: 1427 1428 >>> print(small.extract(dna).translate()) 1429 SILLY 1430 >>> print(large.extract(dna).translate()) 1431 EXAMPLE* 1432 >>> print(location.extract(dna).translate()) 1433 SILLYEXAMPLE* 1434 >>> for part in location.parts: 1435 ... print(part.extract(dna).translate()) 1436 ... 1437 SILLY 1438 EXAMPLE* 1439 1440 Now, let's look at this from the reverse strand frame of reference: 1441 1442 >>> flipped_dna = dna.reverse_complement() 1443 >>> flipped_location = location._flip(len(dna)) 1444 >>> print(flipped_location.extract(flipped_dna).translate()) 1445 SILLYEXAMPLE* 1446 >>> for part in flipped_location.parts: 1447 ... print(part.extract(flipped_dna).translate()) 1448 ... 1449 SILLY 1450 EXAMPLE* 1451 1452 The key point here is the first part of the CompoundFeature is still the 1453 small exon, while the second part is still the large exon: 1454 1455 >>> for part in flipped_location.parts: 1456 ... print(len(part)) 1457 ... 1458 15 1459 24 1460 >>> print(flipped_location) 1461 join{[37:52](-), [5:29](-)} 1462 1463 Notice the parts are not reversed. However, there was a bug here in older 1464 versions of Biopython which would have given join{[5:29](-), [37:52](-)} 1465 and the translation would have wrongly been "EXAMPLE*SILLY" instead. 1466 1467 """ 1468 return CompoundLocation( 1469 [loc._flip(length) for loc in self.parts], self.operator 1470 ) 1471 1472 @property 1473 def start(self): 1474 """Start location - left most (minimum) value, regardless of strand. 1475 1476 Read only, returns an integer like position object, possibly a fuzzy 1477 position. 1478 1479 For the special case of a CompoundLocation wrapping the origin of a 1480 circular genome, this will return zero. 1481 """ 1482 return min(loc.start for loc in self.parts) 1483 1484 @property 1485 def end(self): 1486 """End location - right most (maximum) value, regardless of strand. 1487 1488 Read only, returns an integer like position object, possibly a fuzzy 1489 position. 1490 1491 For the special case of a CompoundLocation wrapping the origin of 1492 a circular genome this will match the genome length (minus one 1493 given how Python counts from zero). 1494 """ 1495 return max(loc.end for loc in self.parts) 1496 1497 @property 1498 def nofuzzy_start(self): 1499 """Start position (integer, approximated if fuzzy, read only) (OBSOLETE). 1500 1501 This is an alias for int(feature.start), which should be used in 1502 preference -- unless you are trying to support old versions of 1503 Biopython. 1504 """ 1505 try: 1506 return int(self.start) 1507 except TypeError: 1508 if isinstance(self.start, UnknownPosition): 1509 return None 1510 raise 1511 1512 @property 1513 def nofuzzy_end(self): 1514 """End position (integer, approximated if fuzzy, read only) (OBSOLETE). 1515 1516 This is an alias for int(feature.end), which should be used in 1517 preference -- unless you are trying to support old versions of 1518 Biopython. 1519 """ 1520 try: 1521 return int(self.end) 1522 except TypeError: 1523 if isinstance(self.end, UnknownPosition): 1524 return None 1525 raise 1526 1527 @property 1528 def ref(self): 1529 """Not present in CompoundLocation, dummy method for API compatibility.""" 1530 return None 1531 1532 @property 1533 def ref_db(self): 1534 """Not present in CompoundLocation, dummy method for API compatibility.""" 1535 return None 1536 1537 def extract(self, parent_sequence, references=None): 1538 """Extract the sequence from supplied parent sequence using the CompoundLocation object. 1539 1540 The parent_sequence can be a Seq like object or a string, and will 1541 generally return an object of the same type. The exception to this is 1542 a MutableSeq as the parent sequence will return a Seq object. 1543 If the location refers to other records, they must be supplied 1544 in the optional dictionary references. 1545 1546 >>> from Bio.Seq import Seq 1547 >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation 1548 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL") 1549 >>> fl1 = FeatureLocation(2, 8) 1550 >>> fl2 = FeatureLocation(10, 15) 1551 >>> fl3 = CompoundLocation([fl1,fl2]) 1552 >>> fl3.extract(seq) 1553 Seq('QHKAMILIVIC') 1554 1555 """ 1556 # This copes with mixed strand features & all on reverse: 1557 parts = [ 1558 loc.extract(parent_sequence, references=references) for loc in self.parts 1559 ] 1560 f_seq = functools.reduce(lambda x, y: x + y, parts) 1561 return f_seq 1562 1563 1564class AbstractPosition: 1565 """Abstract base class representing a position.""" 1566 1567 def __repr__(self): 1568 """Represent the AbstractPosition object as a string for debugging.""" 1569 return "%s(...)" % (self.__class__.__name__) 1570 1571 1572class ExactPosition(int, AbstractPosition): 1573 """Specify the specific position of a boundary. 1574 1575 Arguments: 1576 - position - The position of the boundary. 1577 - extension - An optional argument which must be zero since we don't 1578 have an extension. The argument is provided so that the same number 1579 of arguments can be passed to all position types. 1580 1581 In this case, there is no fuzziness associated with the position. 1582 1583 >>> p = ExactPosition(5) 1584 >>> p 1585 ExactPosition(5) 1586 >>> print(p) 1587 5 1588 1589 >>> isinstance(p, AbstractPosition) 1590 True 1591 >>> isinstance(p, int) 1592 True 1593 1594 Integer comparisons and operations should work as expected: 1595 1596 >>> p == 5 1597 True 1598 >>> p < 6 1599 True 1600 >>> p <= 5 1601 True 1602 >>> p + 10 1603 15 1604 1605 """ 1606 1607 def __new__(cls, position, extension=0): 1608 """Create an ExactPosition object.""" 1609 if extension != 0: 1610 raise AttributeError( 1611 "Non-zero extension %s for exact position." % extension 1612 ) 1613 return int.__new__(cls, position) 1614 1615 # Must define this on Python 3.8 onwards because we redefine __repr__ 1616 def __str__(self): 1617 """Return a representation of the ExactPosition object (with python counting).""" 1618 return str(int(self)) 1619 1620 def __repr__(self): 1621 """Represent the ExactPosition object as a string for debugging.""" 1622 return "%s(%i)" % (self.__class__.__name__, int(self)) 1623 1624 @property 1625 def position(self): 1626 """Legacy attribute to get position as integer (OBSOLETE).""" 1627 return int(self) 1628 1629 @property 1630 def extension(self): 1631 """Not present in this object, return zero (OBSOLETE).""" 1632 return 0 1633 1634 def _shift(self, offset): 1635 """Return a copy of the position object with its location shifted (PRIVATE).""" 1636 # By default preserve any subclass 1637 return self.__class__(int(self) + offset) 1638 1639 def _flip(self, length): 1640 """Return a copy of the location after the parent is reversed (PRIVATE).""" 1641 # By default preserve any subclass 1642 return self.__class__(length - int(self)) 1643 1644 1645class UncertainPosition(ExactPosition): 1646 """Specify a specific position which is uncertain. 1647 1648 This is used in UniProt, e.g. ?222 for uncertain position 222, or in the 1649 XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL. 1650 """ 1651 1652 pass 1653 1654 1655class UnknownPosition(AbstractPosition): 1656 """Specify a specific position which is unknown (has no position). 1657 1658 This is used in UniProt, e.g. ? or in the XML as unknown. 1659 """ 1660 1661 def __repr__(self): 1662 """Represent the UnknownPosition object as a string for debugging.""" 1663 return "%s()" % self.__class__.__name__ 1664 1665 def __hash__(self): 1666 """Return the hash value of the UnknownPosition object.""" 1667 return hash(None) 1668 1669 @property 1670 def position(self): 1671 """Legacy attribute to get location (None) (OBSOLETE).""" 1672 return None 1673 1674 @property 1675 def extension(self): # noqa: D402 1676 """Legacy attribute to get extension (zero) as integer (OBSOLETE).""" # noqa: D402 1677 return 0 1678 1679 def _shift(self, offset): 1680 """Return a copy of the position object with its location shifted (PRIVATE).""" 1681 return self 1682 1683 def _flip(self, length): 1684 """Return a copy of the location after the parent is reversed (PRIVATE).""" 1685 return self 1686 1687 1688class WithinPosition(int, AbstractPosition): 1689 """Specify the position of a boundary within some coordinates. 1690 1691 Arguments: 1692 - position - The default integer position 1693 - left - The start (left) position of the boundary 1694 - right - The end (right) position of the boundary 1695 1696 This allows dealing with a location like ((1.4)..100). This 1697 indicates that the start of the sequence is somewhere between 1 1698 and 4. Since this is a start coordinate, it should acts like 1699 it is at position 1 (or in Python counting, 0). 1700 1701 >>> p = WithinPosition(10, 10, 13) 1702 >>> p 1703 WithinPosition(10, left=10, right=13) 1704 >>> print(p) 1705 (10.13) 1706 >>> int(p) 1707 10 1708 1709 Basic integer comparisons and operations should work as though 1710 this were a plain integer: 1711 1712 >>> p == 10 1713 True 1714 >>> p in [9, 10, 11] 1715 True 1716 >>> p < 11 1717 True 1718 >>> p + 10 1719 20 1720 1721 >>> isinstance(p, WithinPosition) 1722 True 1723 >>> isinstance(p, AbstractPosition) 1724 True 1725 >>> isinstance(p, int) 1726 True 1727 1728 Note this also applies for comparison to other position objects, 1729 where again the integer behaviour is used: 1730 1731 >>> p == 10 1732 True 1733 >>> p == ExactPosition(10) 1734 True 1735 >>> p == BeforePosition(10) 1736 True 1737 >>> p == AfterPosition(10) 1738 True 1739 1740 If this were an end point, you would want the position to be 13: 1741 1742 >>> p2 = WithinPosition(13, 10, 13) 1743 >>> p2 1744 WithinPosition(13, left=10, right=13) 1745 >>> print(p2) 1746 (10.13) 1747 >>> int(p2) 1748 13 1749 >>> p2 == 13 1750 True 1751 >>> p2 == ExactPosition(13) 1752 True 1753 1754 The old legacy properties of position and extension give the 1755 starting/lower/left position as an integer, and the distance 1756 to the ending/higher/right position as an integer. Note that 1757 the position object will act like either the left or the right 1758 end-point depending on how it was created: 1759 1760 >>> p.position == p2.position == 10 1761 True 1762 >>> p.extension == p2.extension == 3 1763 True 1764 >>> int(p) == int(p2) 1765 False 1766 >>> p == 10 1767 True 1768 >>> p2 == 13 1769 True 1770 1771 """ 1772 1773 def __new__(cls, position, left, right): 1774 """Create a WithinPosition object.""" 1775 if not (position == left or position == right): 1776 raise RuntimeError( 1777 "WithinPosition: %r should match left %r or " 1778 "right %r" % (position, left, right) 1779 ) 1780 obj = int.__new__(cls, position) 1781 obj._left = left 1782 obj._right = right 1783 return obj 1784 1785 def __getnewargs__(self): 1786 """Return the arguments accepted by __new__. 1787 1788 Necessary to allow pickling and unpickling of class instances. 1789 """ 1790 return (int(self), self._left, self._right) 1791 1792 def __repr__(self): 1793 """Represent the WithinPosition object as a string for debugging.""" 1794 return "%s(%i, left=%i, right=%i)" % ( 1795 self.__class__.__name__, 1796 int(self), 1797 self._left, 1798 self._right, 1799 ) 1800 1801 def __str__(self): 1802 """Return a representation of the WithinPosition object (with python counting).""" 1803 return "(%s.%s)" % (self._left, self._right) 1804 1805 @property 1806 def position(self): 1807 """Legacy attribute to get (left) position as integer (OBSOLETE).""" 1808 return self._left 1809 1810 @property 1811 def extension(self): # noqa: D402 1812 """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE).""" # noqa: D402 1813 return self._right - self._left 1814 1815 def _shift(self, offset): 1816 """Return a copy of the position object with its location shifted (PRIVATE).""" 1817 return self.__class__( 1818 int(self) + offset, self._left + offset, self._right + offset 1819 ) 1820 1821 def _flip(self, length): 1822 """Return a copy of the location after the parent is reversed (PRIVATE).""" 1823 return self.__class__( 1824 length - int(self), length - self._right, length - self._left 1825 ) 1826 1827 1828class BetweenPosition(int, AbstractPosition): 1829 """Specify the position of a boundary between two coordinates (OBSOLETE?). 1830 1831 Arguments: 1832 - position - The default integer position 1833 - left - The start (left) position of the boundary 1834 - right - The end (right) position of the boundary 1835 1836 This allows dealing with a position like 123^456. This 1837 indicates that the start of the sequence is somewhere between 1838 123 and 456. It is up to the parser to set the position argument 1839 to either boundary point (depending on if this is being used as 1840 a start or end of the feature). For example as a feature end: 1841 1842 >>> p = BetweenPosition(456, 123, 456) 1843 >>> p 1844 BetweenPosition(456, left=123, right=456) 1845 >>> print(p) 1846 (123^456) 1847 >>> int(p) 1848 456 1849 1850 Integer equality and comparison use the given position, 1851 1852 >>> p == 456 1853 True 1854 >>> p in [455, 456, 457] 1855 True 1856 >>> p > 300 1857 True 1858 1859 The old legacy properties of position and extension give the 1860 starting/lower/left position as an integer, and the distance 1861 to the ending/higher/right position as an integer. Note that 1862 the position object will act like either the left or the right 1863 end-point depending on how it was created: 1864 1865 >>> p2 = BetweenPosition(123, left=123, right=456) 1866 >>> p.position == p2.position == 123 1867 True 1868 >>> p.extension 1869 333 1870 >>> p2.extension 1871 333 1872 >>> p.extension == p2.extension == 333 1873 True 1874 >>> int(p) == int(p2) 1875 False 1876 >>> p == 456 1877 True 1878 >>> p2 == 123 1879 True 1880 1881 Note this potentially surprising behaviour: 1882 1883 >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123) 1884 True 1885 >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123) 1886 True 1887 >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123) 1888 True 1889 1890 i.e. For equality (and sorting) the position objects behave like 1891 integers. 1892 1893 """ 1894 1895 def __new__(cls, position, left, right): 1896 """Create a new instance in BetweenPosition object.""" 1897 assert position == left or position == right 1898 obj = int.__new__(cls, position) 1899 obj._left = left 1900 obj._right = right 1901 return obj 1902 1903 def __getnewargs__(self): 1904 """Return the arguments accepted by __new__. 1905 1906 Necessary to allow pickling and unpickling of class instances. 1907 """ 1908 return (int(self), self._left, self._right) 1909 1910 def __repr__(self): 1911 """Represent the BetweenPosition object as a string for debugging.""" 1912 return "%s(%i, left=%i, right=%i)" % ( 1913 self.__class__.__name__, 1914 int(self), 1915 self._left, 1916 self._right, 1917 ) 1918 1919 def __str__(self): 1920 """Return a representation of the BetweenPosition object (with python counting).""" 1921 return "(%s^%s)" % (self._left, self._right) 1922 1923 @property 1924 def position(self): 1925 """Legacy attribute to get (left) position as integer (OBSOLETE).""" 1926 return self._left 1927 1928 @property 1929 def extension(self): # noqa: D402 1930 """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE).""" # noqa: D402 1931 return self._right - self._left 1932 1933 def _shift(self, offset): 1934 """Return a copy of the position object with its location shifted (PRIVATE).""" 1935 return self.__class__( 1936 int(self) + offset, self._left + offset, self._right + offset 1937 ) 1938 1939 def _flip(self, length): 1940 """Return a copy of the location after the parent is reversed (PRIVATE).""" 1941 return self.__class__( 1942 length - int(self), length - self._right, length - self._left 1943 ) 1944 1945 1946class BeforePosition(int, AbstractPosition): 1947 """Specify a position where the actual location occurs before it. 1948 1949 Arguments: 1950 - position - The upper boundary of where the location can occur. 1951 - extension - An optional argument which must be zero since we don't 1952 have an extension. The argument is provided so that the same number 1953 of arguments can be passed to all position types. 1954 1955 This is used to specify positions like (<10..100) where the location 1956 occurs somewhere before position 10. 1957 1958 >>> p = BeforePosition(5) 1959 >>> p 1960 BeforePosition(5) 1961 >>> print(p) 1962 <5 1963 >>> int(p) 1964 5 1965 >>> p + 10 1966 15 1967 1968 Note this potentially surprising behaviour: 1969 1970 >>> p == ExactPosition(5) 1971 True 1972 >>> p == AfterPosition(5) 1973 True 1974 1975 Just remember that for equality and sorting the position objects act 1976 like integers. 1977 """ 1978 1979 # Subclasses int so can't use __init__ 1980 def __new__(cls, position, extension=0): 1981 """Create a new instance in BeforePosition object.""" 1982 if extension != 0: 1983 raise AttributeError( 1984 "Non-zero extension %s for exact position." % extension 1985 ) 1986 return int.__new__(cls, position) 1987 1988 @property 1989 def position(self): 1990 """Legacy attribute to get position as integer (OBSOLETE).""" 1991 return int(self) 1992 1993 @property 1994 def extension(self): # noqa: D402 1995 """Legacy attribute to get extension (zero) as integer (OBSOLETE).""" # noqa: D402 1996 return 0 1997 1998 def __repr__(self): 1999 """Represent the location as a string for debugging.""" 2000 return "%s(%i)" % (self.__class__.__name__, int(self)) 2001 2002 def __str__(self): 2003 """Return a representation of the BeforePosition object (with python counting).""" 2004 return "<%s" % self.position 2005 2006 def _shift(self, offset): 2007 """Return a copy of the position object with its location shifted (PRIVATE).""" 2008 return self.__class__(int(self) + offset) 2009 2010 def _flip(self, length): 2011 """Return a copy of the location after the parent is reversed (PRIVATE).""" 2012 return AfterPosition(length - int(self)) 2013 2014 2015class AfterPosition(int, AbstractPosition): 2016 """Specify a position where the actual location is found after it. 2017 2018 Arguments: 2019 - position - The lower boundary of where the location can occur. 2020 - extension - An optional argument which must be zero since we don't 2021 have an extension. The argument is provided so that the same number 2022 of arguments can be passed to all position types. 2023 2024 This is used to specify positions like (>10..100) where the location 2025 occurs somewhere after position 10. 2026 2027 >>> p = AfterPosition(7) 2028 >>> p 2029 AfterPosition(7) 2030 >>> print(p) 2031 >7 2032 >>> int(p) 2033 7 2034 >>> p + 10 2035 17 2036 2037 >>> isinstance(p, AfterPosition) 2038 True 2039 >>> isinstance(p, AbstractPosition) 2040 True 2041 >>> isinstance(p, int) 2042 True 2043 2044 Note this potentially surprising behaviour: 2045 2046 >>> p == ExactPosition(7) 2047 True 2048 >>> p == BeforePosition(7) 2049 True 2050 2051 Just remember that for equality and sorting the position objects act 2052 like integers. 2053 """ 2054 2055 # Subclasses int so can't use __init__ 2056 def __new__(cls, position, extension=0): 2057 """Create a new instance of the AfterPosition object.""" 2058 if extension != 0: 2059 raise AttributeError( 2060 "Non-zero extension %s for exact position." % extension 2061 ) 2062 return int.__new__(cls, position) 2063 2064 @property 2065 def position(self): 2066 """Legacy attribute to get position as integer (OBSOLETE).""" 2067 return int(self) 2068 2069 @property 2070 def extension(self): # noqa: D402 2071 """Legacy attribute to get extension (zero) as integer (OBSOLETE).""" # noqa: D402 2072 return 0 2073 2074 def __repr__(self): 2075 """Represent the location as a string for debugging.""" 2076 return "%s(%i)" % (self.__class__.__name__, int(self)) 2077 2078 def __str__(self): 2079 """Return a representation of the AfterPosition object (with python counting).""" 2080 return ">%s" % self.position 2081 2082 def _shift(self, offset): 2083 """Return a copy of the position object with its location shifted (PRIVATE).""" 2084 return self.__class__(int(self) + offset) 2085 2086 def _flip(self, length): 2087 """Return a copy of the location after the parent is reversed (PRIVATE).""" 2088 return BeforePosition(length - int(self)) 2089 2090 2091class OneOfPosition(int, AbstractPosition): 2092 """Specify a position where the location can be multiple positions. 2093 2094 This models the GenBank 'one-of(1888,1901)' function, and tries 2095 to make this fit within the Biopython Position models. If this was 2096 a start position it should act like 1888, but as an end position 1901. 2097 2098 >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)]) 2099 >>> p 2100 OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)]) 2101 >>> int(p) 2102 1888 2103 2104 Integer comparisons and operators act like using int(p), 2105 2106 >>> p == 1888 2107 True 2108 >>> p <= 1888 2109 True 2110 >>> p > 1888 2111 False 2112 >>> p + 100 2113 1988 2114 2115 >>> isinstance(p, OneOfPosition) 2116 True 2117 >>> isinstance(p, AbstractPosition) 2118 True 2119 >>> isinstance(p, int) 2120 True 2121 2122 The old legacy properties of position and extension give the 2123 starting/lowest/left-most position as an integer, and the 2124 distance to the ending/highest/right-most position as an integer. 2125 Note that the position object will act like one of the list of 2126 possible locations depending on how it was created: 2127 2128 >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)]) 2129 >>> p.position == p2.position == 1888 2130 True 2131 >>> p.extension == p2.extension == 13 2132 True 2133 >>> int(p) == int(p2) 2134 False 2135 >>> p == 1888 2136 True 2137 >>> p2 == 1901 2138 True 2139 2140 """ 2141 2142 def __new__(cls, position, choices): 2143 """Initialize with a set of possible positions. 2144 2145 position_list is a list of AbstractPosition derived objects, 2146 specifying possible locations. 2147 2148 position is an integer specifying the default behaviour. 2149 """ 2150 if position not in choices: 2151 raise ValueError( 2152 "OneOfPosition: %r should match one of %r" % (position, choices) 2153 ) 2154 obj = int.__new__(cls, position) 2155 obj.position_choices = choices 2156 return obj 2157 2158 def __getnewargs__(self): 2159 """Return the arguments accepted by __new__. 2160 2161 Necessary to allow pickling and unpickling of class instances. 2162 """ 2163 return (int(self), self.position_choices) 2164 2165 @property 2166 def position(self): 2167 """Legacy attribute to get (left) position as integer (OBSOLETE).""" 2168 return min(int(pos) for pos in self.position_choices) 2169 2170 @property 2171 def extension(self): 2172 """Legacy attribute to get extension as integer (OBSOLETE).""" 2173 positions = [int(pos) for pos in self.position_choices] 2174 return max(positions) - min(positions) 2175 2176 def __repr__(self): 2177 """Represent the OneOfPosition object as a string for debugging.""" 2178 return "%s(%i, choices=%r)" % ( 2179 self.__class__.__name__, 2180 int(self), 2181 self.position_choices, 2182 ) 2183 2184 def __str__(self): 2185 """Return a representation of the OneOfPosition object (with python counting).""" 2186 out = "one-of(" 2187 for position in self.position_choices: 2188 out += "%s," % position 2189 # replace the last comma with the closing parenthesis 2190 return out[:-1] + ")" 2191 2192 def _shift(self, offset): 2193 """Return a copy of the position object with its location shifted (PRIVATE).""" 2194 return self.__class__( 2195 int(self) + offset, [p._shift(offset) for p in self.position_choices] 2196 ) 2197 2198 def _flip(self, length): 2199 """Return a copy of the location after the parent is reversed (PRIVATE).""" 2200 return self.__class__( 2201 length - int(self), [p._flip(length) for p in self.position_choices[::-1]] 2202 ) 2203 2204 2205class PositionGap: 2206 """Simple class to hold information about a gap between positions.""" 2207 2208 def __init__(self, gap_size): 2209 """Intialize with a position object containing the gap information.""" 2210 self.gap_size = gap_size 2211 2212 def __repr__(self): 2213 """Represent the position gap as a string for debugging.""" 2214 return "%s(%r)" % (self.__class__.__name__, self.gap_size) 2215 2216 def __str__(self): 2217 """Return a representation of the PositionGap object (with python counting).""" 2218 return "gap(%s)" % self.gap_size 2219 2220 2221if __name__ == "__main__": 2222 from Bio._utils import run_doctest 2223 2224 run_doctest() 2225