1# Copyright 2000-2003 Jeff Chang.
2# Copyright 2001-2008 Brad Chapman.
3# Copyright 2005-2016 by Peter Cock.
4# Copyright 2006-2009 Michiel de Hoon.
5# All rights reserved.
6#
7# This file is part of the Biopython distribution and governed by your
8# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
9# Please see the LICENSE file that should have been included as part of this
10# package.
11"""Represent a Sequence Feature holding info about a part of a sequence.
12
13This is heavily modeled after the Biocorba SeqFeature objects, and
14may be pretty biased towards GenBank stuff since I'm writing it
15for the GenBank parser output...
16
17What's here:
18
19Base class to hold a Feature
20----------------------------
21
22Classes:
23 - SeqFeature
24
25Hold information about a Reference
26----------------------------------
27
28This is an attempt to create a General class to hold Reference type
29information.
30
31Classes:
32 - Reference
33
34Specify locations of a feature on a Sequence
35--------------------------------------------
36
37This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'.
38This has the advantages of allowing us to handle fuzzy stuff in case anyone
39needs it, and also be compatible with BioPerl etc and BioSQL.
40
41Classes:
42 - FeatureLocation - Specify the start and end location of a feature.
43 - CompoundLocation - Collection of FeatureLocation objects (for joins etc).
44 - ExactPosition - Specify the position as being exact.
45 - WithinPosition - Specify a position occurring within some range.
46 - BetweenPosition - Specify a position occurring between a range (OBSOLETE?).
47 - BeforePosition - Specify the position as being found before some base.
48 - AfterPosition - Specify the position as being found after some base.
49 - OneOfPosition - Specify a position where the location can be multiple positions.
50 - UncertainPosition - Specify a specific position which is uncertain.
51 - UnknownPosition - Represents missing information like '?' in UniProt.
52
53"""
54import functools
55
56from collections import OrderedDict
57
58from Bio.Seq import MutableSeq
59from Bio.Seq import reverse_complement
60from Bio.Seq import Seq
61
62
63class SeqFeature:
64    """Represent a Sequence Feature on an object.
65
66    Attributes:
67     - location - the location of the feature on the sequence (FeatureLocation)
68     - type - the specified type of the feature (ie. CDS, exon, repeat...)
69     - location_operator - a string specifying how this SeqFeature may
70       be related to others. For example, in the example GenBank feature
71       shown below, the location_operator would be "join". This is a proxy
72       for feature.location.operator and only applies to compound locations.
73     - strand - A value specifying on which strand (of a DNA sequence, for
74       instance) the feature deals with. 1 indicates the plus strand, -1
75       indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
76       while the default of None indicates that strand doesn't apply (dot in GFF3,
77       e.g. features on proteins). Note this is a shortcut for accessing the
78       strand property of the feature's location.
79     - id - A string identifier for the feature.
80     - ref - A reference to another sequence. This could be an accession
81       number for some different sequence. Note this is a shortcut for the
82       reference property of the feature's location.
83     - ref_db - A different database for the reference accession number.
84       Note this is a shortcut for the reference property of the location
85     - qualifiers - A dictionary of qualifiers on the feature. These are
86       analogous to the qualifiers from a GenBank feature table. The keys of
87       the dictionary are qualifier names, the values are the qualifier
88       values. As of Biopython 1.69 this is an ordered dictionary.
89
90    """
91
92    def __init__(
93        self,
94        location=None,
95        type="",
96        location_operator="",
97        strand=None,
98        id="<unknown id>",
99        qualifiers=None,
100        sub_features=None,
101        ref=None,
102        ref_db=None,
103    ):
104        """Initialize a SeqFeature on a Sequence.
105
106        location can either be a FeatureLocation (with strand argument also
107        given if required), or None.
108
109        e.g. With no strand, on the forward strand, and on the reverse strand:
110
111        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
112        >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
113        >>> f1.strand == f1.location.strand == None
114        True
115        >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
116        >>> f2.strand == f2.location.strand == +1
117        True
118        >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
119        >>> f3.strand == f3.location.strand == -1
120        True
121
122        An invalid strand will trigger an exception:
123
124        >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
125        Traceback (most recent call last):
126           ...
127        ValueError: Strand should be +1, -1, 0 or None, not 2
128
129        Similarly if set via the FeatureLocation directly:
130
131        >>> loc4 = FeatureLocation(50, 60, strand=2)
132        Traceback (most recent call last):
133           ...
134        ValueError: Strand should be +1, -1, 0 or None, not 2
135
136        For exact start/end positions, an integer can be used (as shown above)
137        as shorthand for the ExactPosition object. For non-exact locations, the
138        FeatureLocation must be specified via the appropriate position objects.
139
140        Note that the strand, ref and ref_db arguments to the SeqFeature are
141        now obsolete and will be deprecated in a future release (which will
142        give warning messages) and later removed. Set them via the location
143        object instead.
144
145        Note that location_operator and sub_features arguments can no longer
146        be used, instead do this via the CompoundLocation object.
147        """
148        if (
149            location is not None
150            and not isinstance(location, FeatureLocation)
151            and not isinstance(location, CompoundLocation)
152        ):
153            raise TypeError(
154                "FeatureLocation, CompoundLocation (or None) required for the location"
155            )
156        self.location = location
157        self.type = type
158        if location_operator:
159            # TODO - Deprecation warning
160            self.location_operator = location_operator
161        if strand is not None:
162            # TODO - Deprecation warning
163            self.strand = strand
164        self.id = id
165        if qualifiers is None:
166            qualifiers = OrderedDict()
167        self.qualifiers = qualifiers
168        if sub_features is not None:
169            raise TypeError("Rather than sub_features, use a CompoundFeatureLocation")
170        if ref is not None:
171            # TODO - Deprecation warning
172            self.ref = ref
173        if ref_db is not None:
174            # TODO - Deprecation warning
175            self.ref_db = ref_db
176
177    def _get_strand(self):
178        """Get function for the strand property (PRIVATE)."""
179        return self.location.strand
180
181    def _set_strand(self, value):
182        """Set function for the strand property (PRIVATE)."""
183        try:
184            self.location.strand = value
185        except AttributeError:
186            if self.location is None:
187                if value is not None:
188                    raise ValueError("Can't set strand without a location.") from None
189            else:
190                raise
191
192    strand = property(
193        fget=_get_strand,
194        fset=_set_strand,
195        doc="""Feature's strand
196
197                          This is a shortcut for feature.location.strand
198                          """,
199    )
200
201    def _get_ref(self):
202        """Get function for the reference property (PRIVATE)."""
203        try:
204            return self.location.ref
205        except AttributeError:
206            return None
207
208    def _set_ref(self, value):
209        """Set function for the reference property (PRIVATE)."""
210        try:
211            self.location.ref = value
212        except AttributeError:
213            if self.location is None:
214                if value is not None:
215                    raise ValueError("Can't set ref without a location.") from None
216            else:
217                raise
218
219    ref = property(
220        fget=_get_ref,
221        fset=_set_ref,
222        doc="""Feature location reference (e.g. accession).
223
224                       This is a shortcut for feature.location.ref
225                       """,
226    )
227
228    def _get_ref_db(self):
229        """Get function for the database reference property (PRIVATE)."""
230        try:
231            return self.location.ref_db
232        except AttributeError:
233            return None
234
235    def _set_ref_db(self, value):
236        """Set function for the database reference property (PRIVATE)."""
237        self.location.ref_db = value
238
239    ref_db = property(
240        fget=_get_ref_db,
241        fset=_set_ref_db,
242        doc="""Feature location reference's database.
243
244                          This is a shortcut for feature.location.ref_db
245                          """,
246    )
247
248    def _get_location_operator(self):
249        """Get function for the location operator property (PRIVATE)."""
250        try:
251            return self.location.operator
252        except AttributeError:
253            return None
254
255    def _set_location_operator(self, value):
256        """Set function for the location operator property (PRIVATE)."""
257        if value:
258            if isinstance(self.location, CompoundLocation):
259                self.location.operator = value
260            elif self.location is None:
261                raise ValueError(
262                    "Location is None so can't set its operator (to %r)" % value
263                )
264            else:
265                raise ValueError("Only CompoundLocation gets an operator (%r)" % value)
266
267    location_operator = property(
268        fget=_get_location_operator,
269        fset=_set_location_operator,
270        doc="Location operator for compound locations (e.g. join).",
271    )
272
273    def __repr__(self):
274        """Represent the feature as a string for debugging."""
275        answer = "%s(%r" % (self.__class__.__name__, self.location)
276        if self.type:
277            answer += ", type=%r" % self.type
278        if self.location_operator:
279            answer += ", location_operator=%r" % self.location_operator
280        if self.id and self.id != "<unknown id>":
281            answer += ", id=%r" % self.id
282        if self.ref:
283            answer += ", ref=%r" % self.ref
284        if self.ref_db:
285            answer += ", ref_db=%r" % self.ref_db
286        answer += ")"
287        return answer
288
289    def __str__(self):
290        """Return the full feature as a python string."""
291        out = "type: %s\n" % self.type
292        out += "location: %s\n" % self.location
293        if self.id and self.id != "<unknown id>":
294            out += "id: %s\n" % self.id
295        out += "qualifiers:\n"
296        for qual_key in sorted(self.qualifiers):
297            out += "    Key: %s, Value: %s\n" % (qual_key, self.qualifiers[qual_key])
298        return out
299
300    def _shift(self, offset):
301        """Return a copy of the feature with its location shifted (PRIVATE).
302
303        The annotation qaulifiers are copied.
304        """
305        return SeqFeature(
306            location=self.location._shift(offset),
307            type=self.type,
308            location_operator=self.location_operator,
309            id=self.id,
310            qualifiers=OrderedDict(self.qualifiers.items()),
311        )
312
313    def _flip(self, length):
314        """Return a copy of the feature with its location flipped (PRIVATE).
315
316        The argument length gives the length of the parent sequence. For
317        example a location 0..20 (+1 strand) with parent length 30 becomes
318        after flipping 10..30 (-1 strand). Strandless (None) or unknown
319        strand (0) remain like that - just their end points are changed.
320
321        The annotation qaulifiers are copied.
322        """
323        return SeqFeature(
324            location=self.location._flip(length),
325            type=self.type,
326            location_operator=self.location_operator,
327            id=self.id,
328            qualifiers=OrderedDict(self.qualifiers.items()),
329        )
330
331    def extract(self, parent_sequence, references=None):
332        """Extract the feature's sequence from supplied parent sequence.
333
334        The parent_sequence can be a Seq like object or a string, and will
335        generally return an object of the same type. The exception to this is
336        a MutableSeq as the parent sequence will return a Seq object.
337
338        This should cope with complex locations including complements, joins
339        and fuzzy positions. Even mixed strand features should work! This
340        also covers features on protein sequences (e.g. domains), although
341        here reverse strand features are not permitted. If the
342        location refers to other records, they must be supplied in the
343        optional dictionary references.
344
345        >>> from Bio.Seq import Seq
346        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
347        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
348        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
349        >>> f.extract(seq)
350        Seq('VALIVIC')
351
352        If the FeatureLocation is None, e.g. when parsing invalid locus
353        locations in the GenBank parser, extract() will raise a ValueError.
354
355        >>> from Bio.Seq import Seq
356        >>> from Bio.SeqFeature import SeqFeature
357        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
358        >>> f = SeqFeature(None, type="domain")
359        >>> f.extract(seq)
360        Traceback (most recent call last):
361           ...
362        ValueError: The feature's .location is None. Check the sequence file for a valid location.
363
364        Note - currently only compound features of type "join" are supported.
365        """
366        if self.location is None:
367            raise ValueError(
368                "The feature's .location is None. Check the "
369                "sequence file for a valid location."
370            )
371        return self.location.extract(parent_sequence, references=references)
372
373    def translate(
374        self,
375        parent_sequence,
376        table="Standard",
377        start_offset=None,
378        stop_symbol="*",
379        to_stop=False,
380        cds=None,
381        gap=None,
382    ):
383        """Get a translation of the feature's sequence.
384
385        This method is intended for CDS or other features that code proteins
386        and is a shortcut that will both extract the feature and
387        translate it, taking into account the codon_start and transl_table
388        qualifiers, if they are present. If they are not present the
389        value of the arguments "table" and "start_offset" are used.
390
391        The "cds" parameter is set to "True" if the feature is of type
392        "CDS" but can be overridden by giving an explicit argument.
393
394        The arguments stop_symbol, to_stop and gap have the same meaning
395        as Seq.translate, refer to that documentation for further information.
396
397        Arguments:
398         - parent_sequence - A DNA or RNA sequence.
399         - table - Which codon table to use if there is no transl_table
400           qualifier for this feature. This can be either a name
401           (string), an NCBI identifier (integer), or a CodonTable
402           object (useful for non-standard genetic codes).  This
403           defaults to the "Standard" table.
404         - start_offset - offset at which the first complete codon of a
405           coding feature can be found, relative to the first base of
406           that feature. Has a valid value of 0, 1 or 2. NOTE: this
407           uses python's 0-based numbering whereas the codon_start
408           qualifier in files from NCBI use 1-based numbering.
409           Will override a codon_start qualifier
410
411        >>> from Bio.Seq import Seq
412        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
413        >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA")
414        >>> f = SeqFeature(FeatureLocation(0, 30), type="CDS")
415        >>> f.qualifiers['transl_table'] = [11]
416
417        Note that features of type CDS are subject to the usual
418        checks at translation. But you can override this behaviour
419        by giving explicit arguments:
420
421        >>> f.translate(seq, cds=False)
422        Seq('GYTYR*CL**')
423
424        Now use the start_offset argument to change the frame. Note
425        this uses python 0-based numbering.
426
427        >>> f.translate(seq, start_offset=1, cds=False)
428        Seq('VTLTDNVSD')
429
430        Alternatively use the codon_start qualifier to do the same
431        thing. Note: this uses 1-based numbering, which is found
432        in files from NCBI.
433
434        >>> f.qualifiers['codon_start'] = [2]
435        >>> f.translate(seq, cds=False)
436        Seq('VTLTDNVSD')
437        """
438        # see if this feature should be translated in a different
439        # frame using the "codon_start" qualifier
440        if start_offset is None:
441            try:
442                start_offset = int(self.qualifiers["codon_start"][0]) - 1
443            except KeyError:
444                start_offset = 0
445
446        if start_offset not in [0, 1, 2]:
447            raise ValueError(
448                "The start_offset must be 0, 1, or 2. "
449                f"The supplied value is {start_offset}. "
450                "Check the value of either the codon_start qualifier "
451                "or the start_offset argument"
452            )
453
454        feat_seq = self.extract(parent_sequence)[start_offset:]
455        codon_table = self.qualifiers.get("transl_table", [table])[0]
456
457        if cds is None:
458            cds = self.type == "CDS"
459
460        return feat_seq.translate(
461            table=codon_table,
462            stop_symbol=stop_symbol,
463            to_stop=to_stop,
464            cds=cds,
465            gap=gap,
466        )
467
468    def __bool__(self):
469        """Boolean value of an instance of this class (True).
470
471        This behaviour is for backwards compatibility, since until the
472        __len__ method was added, a SeqFeature always evaluated as True.
473
474        Note that in comparison, Seq objects, strings, lists, etc, will all
475        evaluate to False if they have length zero.
476
477        WARNING: The SeqFeature may in future evaluate to False when its
478        length is zero (in order to better match normal python behaviour)!
479        """
480        return True
481
482    def __len__(self):
483        """Return the length of the region where the feature is located.
484
485        >>> from Bio.Seq import Seq
486        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
487        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
488        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
489        >>> len(f)
490        7
491        >>> f.extract(seq)
492        Seq('VALIVIC')
493        >>> len(f.extract(seq))
494        7
495
496        This is a proxy for taking the length of the feature's location:
497
498        >>> len(f.location)
499        7
500
501        For simple features this is the same as the region spanned (end
502        position minus start position using Pythonic counting). However, for
503        a compound location (e.g. a CDS as the join of several exons) the
504        gaps are not counted (e.g. introns). This ensures that len(f) matches
505        len(f.extract(parent_seq)), and also makes sure things work properly
506        with features wrapping the origin etc.
507        """
508        return len(self.location)
509
510    def __iter__(self):
511        """Iterate over the parent positions within the feature.
512
513        The iteration order is strand aware, and can be thought of as moving
514        along the feature using the parent sequence coordinates:
515
516        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
517        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
518        >>> len(f)
519        5
520        >>> for i in f: print(i)
521        9
522        8
523        7
524        6
525        5
526        >>> list(f)
527        [9, 8, 7, 6, 5]
528
529        This is a proxy for iterating over the location,
530
531        >>> list(f.location)
532        [9, 8, 7, 6, 5]
533        """
534        return iter(self.location)
535
536    def __contains__(self, value):
537        """Check if an integer position is within the feature.
538
539        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
540        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
541        >>> len(f)
542        5
543        >>> [i for i in range(15) if i in f]
544        [5, 6, 7, 8, 9]
545
546        For example, to see which features include a SNP position, you could
547        use this:
548
549        >>> from Bio import SeqIO
550        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
551        >>> for f in record.features:
552        ...     if 1750 in f:
553        ...         print("%s %s" % (f.type, f.location))
554        source [0:154478](+)
555        gene [1716:4347](-)
556        tRNA join{[4310:4347](-), [1716:1751](-)}
557
558        Note that for a feature defined as a join of several subfeatures (e.g.
559        the union of several exons) the gaps are not checked (e.g. introns).
560        In this example, the tRNA location is defined in the GenBank file as
561        complement(join(1717..1751,4311..4347)), so that position 1760 falls
562        in the gap:
563
564        >>> for f in record.features:
565        ...     if 1760 in f:
566        ...         print("%s %s" % (f.type, f.location))
567        source [0:154478](+)
568        gene [1716:4347](-)
569
570        Note that additional care may be required with fuzzy locations, for
571        example just before a BeforePosition:
572
573        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
574        >>> from Bio.SeqFeature import BeforePosition
575        >>> f = SeqFeature(FeatureLocation(BeforePosition(3), 8), type="domain")
576        >>> len(f)
577        5
578        >>> [i for i in range(10) if i in f]
579        [3, 4, 5, 6, 7]
580
581        Note that is is a proxy for testing membership on the location.
582
583        >>> [i for i in range(10) if i in f.location]
584        [3, 4, 5, 6, 7]
585        """
586        return value in self.location
587
588
589# --- References
590
591
592# TODO -- Will this hold PubMed and Medline information decently?
593class Reference:
594    """Represent a Generic Reference object.
595
596    Attributes:
597     - location - A list of Location objects specifying regions of
598       the sequence that the references correspond to. If no locations are
599       specified, the entire sequence is assumed.
600     - authors - A big old string, or a list split by author, of authors
601       for the reference.
602     - title - The title of the reference.
603     - journal - Journal the reference was published in.
604     - medline_id - A medline reference for the article.
605     - pubmed_id - A pubmed reference for the article.
606     - comment - A place to stick any comments about the reference.
607
608    """
609
610    def __init__(self):
611        """Initialize the class."""
612        self.location = []
613        self.authors = ""
614        self.consrtm = ""
615        self.title = ""
616        self.journal = ""
617        self.medline_id = ""
618        self.pubmed_id = ""
619        self.comment = ""
620
621    def __str__(self):
622        """Return the full Reference object as a python string."""
623        out = ""
624        for single_location in self.location:
625            out += "location: %s\n" % single_location
626        out += "authors: %s\n" % self.authors
627        if self.consrtm:
628            out += "consrtm: %s\n" % self.consrtm
629        out += "title: %s\n" % self.title
630        out += "journal: %s\n" % self.journal
631        out += "medline id: %s\n" % self.medline_id
632        out += "pubmed id: %s\n" % self.pubmed_id
633        out += "comment: %s\n" % self.comment
634        return out
635
636    def __repr__(self):
637        """Represent the Reference object as a string for debugging."""
638        # TODO - Update this is __init__ later accepts values
639        return "%s(title=%r, ...)" % (self.__class__.__name__, self.title)
640
641    def __eq__(self, other):
642        """Check if two Reference objects should be considered equal.
643
644        Note prior to Biopython 1.70 the location was not compared, as
645        until then __eq__ for the FeatureLocation class was not defined.
646        """
647        return (
648            self.authors == other.authors
649            and self.consrtm == other.consrtm
650            and self.title == other.title
651            and self.journal == other.journal
652            and self.medline_id == other.medline_id
653            and self.pubmed_id == other.pubmed_id
654            and self.comment == other.comment
655            and self.location == other.location
656        )
657
658
659# --- Handling feature locations
660
661
662class FeatureLocation:
663    """Specify the location of a feature along a sequence.
664
665    The FeatureLocation is used for simple continuous features, which can
666    be described as running from a start position to and end position
667    (optionally with a strand and reference information).  More complex
668    locations made up from several non-continuous parts (e.g. a coding
669    sequence made up of several exons) are described using a SeqFeature
670    with a CompoundLocation.
671
672    Note that the start and end location numbering follow Python's scheme,
673    thus a GenBank entry of 123..150 (one based counting) becomes a location
674    of [122:150] (zero based counting).
675
676    >>> from Bio.SeqFeature import FeatureLocation
677    >>> f = FeatureLocation(122, 150)
678    >>> print(f)
679    [122:150]
680    >>> print(f.start)
681    122
682    >>> print(f.end)
683    150
684    >>> print(f.strand)
685    None
686
687    Note the strand defaults to None. If you are working with nucleotide
688    sequences you'd want to be explicit if it is the forward strand:
689
690    >>> from Bio.SeqFeature import FeatureLocation
691    >>> f = FeatureLocation(122, 150, strand=+1)
692    >>> print(f)
693    [122:150](+)
694    >>> print(f.strand)
695    1
696
697    Note that for a parent sequence of length n, the FeatureLocation
698    start and end must satisfy the inequality 0 <= start <= end <= n.
699    This means even for features on the reverse strand of a nucleotide
700    sequence, we expect the 'start' coordinate to be less than the
701    'end'.
702
703    >>> from Bio.SeqFeature import FeatureLocation
704    >>> r = FeatureLocation(122, 150, strand=-1)
705    >>> print(r)
706    [122:150](-)
707    >>> print(r.start)
708    122
709    >>> print(r.end)
710    150
711    >>> print(r.strand)
712    -1
713
714    i.e. Rather than thinking of the 'start' and 'end' biologically in a
715    strand aware manner, think of them as the 'left most' or 'minimum'
716    boundary, and the 'right most' or 'maximum' boundary of the region
717    being described. This is particularly important with compound
718    locations describing non-continuous regions.
719
720    In the example above we have used standard exact positions, but there
721    are also specialised position objects used to represent fuzzy positions
722    as well, for example a GenBank location like complement(<123..150)
723    would use a BeforePosition object for the start.
724    """
725
726    def __init__(self, start, end, strand=None, ref=None, ref_db=None):
727        """Initialize the class.
728
729        start and end arguments specify the values where the feature begins
730        and ends. These can either by any of the ``*Position`` objects that
731        inherit from AbstractPosition, or can just be integers specifying the
732        position. In the case of integers, the values are assumed to be
733        exact and are converted in ExactPosition arguments. This is meant
734        to make it easy to deal with non-fuzzy ends.
735
736        i.e. Short form:
737
738        >>> from Bio.SeqFeature import FeatureLocation
739        >>> loc = FeatureLocation(5, 10, strand=-1)
740        >>> print(loc)
741        [5:10](-)
742
743        Explicit form:
744
745        >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
746        >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1)
747        >>> print(loc)
748        [5:10](-)
749
750        Other fuzzy positions are used similarly,
751
752        >>> from Bio.SeqFeature import FeatureLocation
753        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
754        >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1)
755        >>> print(loc2)
756        [<5:>10](-)
757
758        For nucleotide features you will also want to specify the strand,
759        use 1 for the forward (plus) strand, -1 for the reverse (negative)
760        strand, 0 for stranded but strand unknown (? in GFF3), or None for
761        when the strand does not apply (dot in GFF3), e.g. features on
762        proteins.
763
764        >>> loc = FeatureLocation(5, 10, strand=+1)
765        >>> print(loc)
766        [5:10](+)
767        >>> print(loc.strand)
768        1
769
770        Normally feature locations are given relative to the parent
771        sequence you are working with, but an explicit accession can
772        be given with the optional ref and db_ref strings:
773
774        >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1)
775        >>> print(loc)
776        AL391218.9[105172:108462](+)
777        >>> print(loc.ref)
778        AL391218.9
779
780        """
781        # TODO - Check 0 <= start <= end (<= length of reference)
782        if isinstance(start, AbstractPosition):
783            self._start = start
784        elif isinstance(start, int):
785            self._start = ExactPosition(start)
786        else:
787            raise TypeError("start=%r %s" % (start, type(start)))
788        if isinstance(end, AbstractPosition):
789            self._end = end
790        elif isinstance(end, int):
791            self._end = ExactPosition(end)
792        else:
793            raise TypeError("end=%r %s" % (end, type(end)))
794        if (
795            isinstance(self.start.position, int)
796            and isinstance(self.end.position, int)
797            and self.start > self.end
798        ):
799            raise ValueError(
800                f"End location ({self.end}) must be greater than "
801                f"or equal to start location ({self.start})"
802            )
803        self.strand = strand
804        self.ref = ref
805        self.ref_db = ref_db
806
807    def _get_strand(self):
808        """Get function for the strand property (PRIVATE)."""
809        return self._strand
810
811    def _set_strand(self, value):
812        """Set function for the strand property (PRIVATE)."""
813        if value not in [+1, -1, 0, None]:
814            raise ValueError("Strand should be +1, -1, 0 or None, not %r" % value)
815        self._strand = value
816
817    strand = property(
818        fget=_get_strand,
819        fset=_set_strand,
820        doc="Strand of the location (+1, -1, 0 or None).",
821    )
822
823    def __str__(self):
824        """Return a representation of the FeatureLocation object (with python counting).
825
826        For the simple case this uses the python splicing syntax, [122:150]
827        (zero based counting) which GenBank would call 123..150 (one based
828        counting).
829        """
830        answer = "[%s:%s]" % (self._start, self._end)
831        if self.ref and self.ref_db:
832            answer = "%s:%s%s" % (self.ref_db, self.ref, answer)
833        elif self.ref:
834            answer = self.ref + answer
835        # Is ref_db without ref meaningful?
836        if self.strand is None:
837            return answer
838        elif self.strand == +1:
839            return answer + "(+)"
840        elif self.strand == -1:
841            return answer + "(-)"
842        else:
843            # strand = 0, stranded but strand unknown, ? in GFF3
844            return answer + "(?)"
845
846    def __repr__(self):
847        """Represent the FeatureLocation object as a string for debugging."""
848        optional = ""
849        if self.strand is not None:
850            optional += ", strand=%r" % self.strand
851        if self.ref is not None:
852            optional += ", ref=%r" % self.ref
853        if self.ref_db is not None:
854            optional += ", ref_db=%r" % self.ref_db
855        return "%s(%r, %r%s)" % (
856            self.__class__.__name__,
857            self.start,
858            self.end,
859            optional,
860        )
861
862    def __add__(self, other):
863        """Combine location with another FeatureLocation object, or shift it.
864
865        You can add two feature locations to make a join CompoundLocation:
866
867        >>> from Bio.SeqFeature import FeatureLocation
868        >>> f1 = FeatureLocation(5, 10)
869        >>> f2 = FeatureLocation(20, 30)
870        >>> combined = f1 + f2
871        >>> print(combined)
872        join{[5:10], [20:30]}
873
874        This is thus equivalent to:
875
876        >>> from Bio.SeqFeature import CompoundLocation
877        >>> join = CompoundLocation([f1, f2])
878        >>> print(join)
879        join{[5:10], [20:30]}
880
881        You can also use sum(...) in this way:
882
883        >>> join = sum([f1, f2])
884        >>> print(join)
885        join{[5:10], [20:30]}
886
887        Furthermore, you can combine a FeatureLocation with a CompoundLocation
888        in this way.
889
890        Separately, adding an integer will give a new FeatureLocation with
891        its start and end offset by that amount. For example:
892
893        >>> print(f1)
894        [5:10]
895        >>> print(f1 + 100)
896        [105:110]
897        >>> print(200 + f1)
898        [205:210]
899
900        This can be useful when editing annotation.
901        """
902        if isinstance(other, FeatureLocation):
903            return CompoundLocation([self, other])
904        elif isinstance(other, int):
905            return self._shift(other)
906        else:
907            # This will allow CompoundLocation's __radd__ to be called:
908            return NotImplemented
909
910    def __radd__(self, other):
911        """Add a feature locationanother FeatureLocation object to the left."""
912        if isinstance(other, int):
913            return self._shift(other)
914        else:
915            return NotImplemented
916
917    def __nonzero__(self):
918        """Return True regardless of the length of the feature.
919
920        This behaviour is for backwards compatibility, since until the
921        __len__ method was added, a FeatureLocation always evaluated as True.
922
923        Note that in comparison, Seq objects, strings, lists, etc, will all
924        evaluate to False if they have length zero.
925
926        WARNING: The FeatureLocation may in future evaluate to False when its
927        length is zero (in order to better match normal python behaviour)!
928        """
929        return True
930
931    def __len__(self):
932        """Return the length of the region described by the FeatureLocation object.
933
934        Note that extra care may be needed for fuzzy locations, e.g.
935
936        >>> from Bio.SeqFeature import FeatureLocation
937        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
938        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
939        >>> len(loc)
940        5
941        """
942        return int(self._end) - int(self._start)
943
944    def __contains__(self, value):
945        """Check if an integer position is within the FeatureLocation object.
946
947        Note that extra care may be needed for fuzzy locations, e.g.
948
949        >>> from Bio.SeqFeature import FeatureLocation
950        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
951        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
952        >>> len(loc)
953        5
954        >>> [i for i in range(15) if i in loc]
955        [5, 6, 7, 8, 9]
956        """
957        if not isinstance(value, int):
958            raise ValueError(
959                "Currently we only support checking for integer "
960                "positions being within a FeatureLocation."
961            )
962        if value < self._start or value >= self._end:
963            return False
964        else:
965            return True
966
967    def __iter__(self):
968        """Iterate over the parent positions within the FeatureLocation object.
969
970        >>> from Bio.SeqFeature import FeatureLocation
971        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
972        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
973        >>> len(loc)
974        5
975        >>> for i in loc: print(i)
976        5
977        6
978        7
979        8
980        9
981        >>> list(loc)
982        [5, 6, 7, 8, 9]
983        >>> [i for i in range(15) if i in loc]
984        [5, 6, 7, 8, 9]
985
986        Note this is strand aware:
987
988        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1)
989        >>> list(loc)
990        [9, 8, 7, 6, 5]
991        """
992        if self.strand == -1:
993            yield from range(self._end - 1, self._start - 1, -1)
994        else:
995            yield from range(self._start, self._end)
996
997    def __eq__(self, other):
998        """Implement equality by comparing all the location attributes."""
999        if not isinstance(other, FeatureLocation):
1000            return False
1001        return (
1002            self._start == other.start
1003            and self._end == other.end
1004            and self._strand == other.strand
1005            and self.ref == other.ref
1006            and self.ref_db == other.ref_db
1007        )
1008
1009    def _shift(self, offset):
1010        """Return a copy of the FeatureLocation shifted by an offset (PRIVATE).
1011
1012        Returns self when location is relative to an external reference.
1013        """
1014        # TODO - What if offset is a fuzzy position?
1015        if self.ref or self.ref_db:
1016            return self
1017        return FeatureLocation(
1018            start=self._start._shift(offset),
1019            end=self._end._shift(offset),
1020            strand=self.strand,
1021        )
1022
1023    def _flip(self, length):
1024        """Return a copy of the location after the parent is reversed (PRIVATE).
1025
1026        Returns self when location is relative to an external reference.
1027        """
1028        if self.ref or self.ref_db:
1029            return self
1030        # Note this will flip the start and end too!
1031        if self.strand == +1:
1032            flip_strand = -1
1033        elif self.strand == -1:
1034            flip_strand = +1
1035        else:
1036            # 0 or None
1037            flip_strand = self.strand
1038        return FeatureLocation(
1039            start=self._end._flip(length),
1040            end=self._start._flip(length),
1041            strand=flip_strand,
1042        )
1043
1044    @property
1045    def parts(self):
1046        """Read only list of sections (always one, the FeatureLocation object).
1047
1048        This is a convenience property allowing you to write code handling
1049        both simple FeatureLocation objects (with one part) and more complex
1050        CompoundLocation objects (with multiple parts) interchangeably.
1051        """
1052        return [self]
1053
1054    @property
1055    def start(self):
1056        """Start location - left most (minimum) value, regardless of strand.
1057
1058        Read only, returns an integer like position object, possibly a fuzzy
1059        position.
1060        """
1061        return self._start
1062
1063    @property
1064    def end(self):
1065        """End location - right most (maximum) value, regardless of strand.
1066
1067        Read only, returns an integer like position object, possibly a fuzzy
1068        position.
1069        """
1070        return self._end
1071
1072    @property
1073    def nofuzzy_start(self):
1074        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).
1075
1076        This is now an alias for int(feature.start), which should be
1077        used in preference -- unless you are trying to support old
1078        versions of Biopython.
1079        """
1080        try:
1081            return int(self._start)
1082        except TypeError:
1083            if isinstance(self._start, UnknownPosition):
1084                return None
1085            raise
1086
1087    @property
1088    def nofuzzy_end(self):
1089        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).
1090
1091        This is now an alias for int(feature.end), which should be
1092        used in preference -- unless you are trying to support old
1093        versions of Biopython.
1094        """
1095        try:
1096            return int(self._end)
1097        except TypeError:
1098            if isinstance(self._end, UnknownPosition):
1099                return None
1100            raise
1101
1102    def extract(self, parent_sequence, references=None):
1103        """Extract the sequence from supplied parent sequence using the FeatureLocation object.
1104
1105        The parent_sequence can be a Seq like object or a string, and will
1106        generally return an object of the same type. The exception to this is
1107        a MutableSeq as the parent sequence will return a Seq object.
1108        If the location refers to other records, they must be supplied
1109        in the optional dictionary references.
1110
1111        >>> from Bio.Seq import Seq
1112        >>> from Bio.SeqFeature import FeatureLocation
1113        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
1114        >>> feature_loc = FeatureLocation(8, 15)
1115        >>> feature_loc.extract(seq)
1116        Seq('VALIVIC')
1117
1118        """
1119        if self.ref or self.ref_db:
1120            if not references:
1121                raise ValueError(
1122                    f"Feature references another sequence ({self.ref}),"
1123                    " references mandatory"
1124                )
1125            elif self.ref not in references:
1126                # KeyError?
1127                raise ValueError(
1128                    f"Feature references another sequence ({self.ref}),"
1129                    " not found in references"
1130                )
1131            parent_sequence = references[self.ref]
1132            try:
1133                # If was a SeqRecord, just take the sequence
1134                # (should focus on the annotation of the feature)
1135                parent_sequence = parent_sequence.seq
1136            except AttributeError:
1137                pass
1138        if isinstance(parent_sequence, MutableSeq):
1139            # This avoids complications with reverse complements
1140            # (the MutableSeq reverse complement acts in situ)
1141            parent_sequence = Seq(parent_sequence)
1142        f_seq = parent_sequence[self.nofuzzy_start : self.nofuzzy_end]
1143        if self.strand == -1:
1144            try:
1145                f_seq = f_seq.reverse_complement()
1146            except AttributeError:
1147                assert isinstance(f_seq, str)
1148                f_seq = reverse_complement(f_seq)
1149        return f_seq
1150
1151
1152class CompoundLocation:
1153    """For handling joins etc where a feature location has several parts."""
1154
1155    def __init__(self, parts, operator="join"):
1156        """Initialize the class.
1157
1158        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
1159        >>> f1 = FeatureLocation(10, 40, strand=+1)
1160        >>> f2 = FeatureLocation(50, 59, strand=+1)
1161        >>> f = CompoundLocation([f1, f2])
1162        >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
1163        True
1164        >>> print(f.operator)
1165        join
1166        >>> 5 in f
1167        False
1168        >>> 15 in f
1169        True
1170        >>> f.strand
1171        1
1172
1173        Notice that the strand of the compound location is computed
1174        automatically - in the case of mixed strands on the sub-locations
1175        the overall strand is set to None.
1176
1177        >>> f = CompoundLocation([FeatureLocation(3, 6, strand=+1),
1178        ...                       FeatureLocation(10, 13, strand=-1)])
1179        >>> print(f.strand)
1180        None
1181        >>> len(f)
1182        6
1183        >>> list(f)
1184        [3, 4, 5, 12, 11, 10]
1185
1186        The example above doing list(f) iterates over the coordinates within the
1187        feature. This allows you to use max and min on the location, to find the
1188        range covered:
1189
1190        >>> min(f)
1191        3
1192        >>> max(f)
1193        12
1194
1195        More generally, you can use the compound location's start and end which
1196        give the full span covered, 0 <= start <= end <= full sequence length.
1197
1198        >>> f.start == min(f)
1199        True
1200        >>> f.end == max(f) + 1
1201        True
1202
1203        This is consistent with the behaviour of the simple FeatureLocation for
1204        a single region, where again the 'start' and 'end' do not necessarily
1205        give the biological start and end, but rather the 'minimal' and 'maximal'
1206        coordinate boundaries.
1207
1208        Note that adding locations provides a more intuitive method of
1209        construction:
1210
1211        >>> f = FeatureLocation(3, 6, strand=+1) + FeatureLocation(10, 13, strand=-1)
1212        >>> len(f)
1213        6
1214        >>> list(f)
1215        [3, 4, 5, 12, 11, 10]
1216        """
1217        self.operator = operator
1218        self.parts = list(parts)
1219        for loc in self.parts:
1220            if not isinstance(loc, FeatureLocation):
1221                raise ValueError(
1222                    "CompoundLocation should be given a list of "
1223                    "FeatureLocation objects, not %s" % loc.__class__
1224                )
1225        if len(parts) < 2:
1226            raise ValueError(
1227                "CompoundLocation should have at least 2 parts, not %r" % parts
1228            )
1229
1230    def __str__(self):
1231        """Return a representation of the CompoundLocation object (with python counting)."""
1232        return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))
1233
1234    def __repr__(self):
1235        """Represent the CompoundLocation object as string for debugging."""
1236        return "%s(%r, %r)" % (self.__class__.__name__, self.parts, self.operator)
1237
1238    def _get_strand(self):
1239        """Get function for the strand property (PRIVATE)."""
1240        # Historically a join on the reverse strand has been represented
1241        # in Biopython with both the parent SeqFeature and its children
1242        # (the exons for a CDS) all given a strand of -1.  Likewise, for
1243        # a join feature on the forward strand they all have strand +1.
1244        # However, we must also consider evil mixed strand examples like
1245        # this, join(complement(69611..69724),139856..140087,140625..140650)
1246        if len({loc.strand for loc in self.parts}) == 1:
1247            return self.parts[0].strand
1248        else:
1249            return None  # i.e. mixed strands
1250
1251    def _set_strand(self, value):
1252        """Set function for the strand property (PRIVATE)."""
1253        # Should this be allowed/encouraged?
1254        for loc in self.parts:
1255            loc.strand = value
1256
1257    strand = property(
1258        fget=_get_strand,
1259        fset=_set_strand,
1260        doc="""Overall strand of the compound location.
1261
1262        If all the parts have the same strand, that is returned. Otherwise
1263        for mixed strands, this returns None.
1264
1265        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
1266        >>> f1 = FeatureLocation(15, 17, strand=1)
1267        >>> f2 = FeatureLocation(20, 30, strand=-1)
1268        >>> f = f1 + f2
1269        >>> f1.strand
1270        1
1271        >>> f2.strand
1272        -1
1273        >>> f.strand
1274        >>> f.strand is None
1275        True
1276
1277        If you set the strand of a CompoundLocation, this is applied to
1278        all the parts - use with caution:
1279
1280        >>> f.strand = 1
1281        >>> f1.strand
1282        1
1283        >>> f2.strand
1284        1
1285        >>> f.strand
1286        1
1287
1288        """,
1289    )
1290
1291    def __add__(self, other):
1292        """Combine locations, or shift the location by an integer offset.
1293
1294        >>> from Bio.SeqFeature import FeatureLocation
1295        >>> f1 = FeatureLocation(15, 17) + FeatureLocation(20, 30)
1296        >>> print(f1)
1297        join{[15:17], [20:30]}
1298
1299        You can add another FeatureLocation:
1300
1301        >>> print(f1 + FeatureLocation(40, 50))
1302        join{[15:17], [20:30], [40:50]}
1303        >>> print(FeatureLocation(5, 10) + f1)
1304        join{[5:10], [15:17], [20:30]}
1305
1306        You can also add another CompoundLocation:
1307
1308        >>> f2 = FeatureLocation(40, 50) + FeatureLocation(60, 70)
1309        >>> print(f2)
1310        join{[40:50], [60:70]}
1311        >>> print(f1 + f2)
1312        join{[15:17], [20:30], [40:50], [60:70]}
1313
1314        Also, as with the FeatureLocation, adding an integer shifts the
1315        location's co-ordinates by that offset:
1316
1317        >>> print(f1 + 100)
1318        join{[115:117], [120:130]}
1319        >>> print(200 + f1)
1320        join{[215:217], [220:230]}
1321        >>> print(f1 + (-5))
1322        join{[10:12], [15:25]}
1323        """
1324        if isinstance(other, FeatureLocation):
1325            return CompoundLocation(self.parts + [other], self.operator)
1326        elif isinstance(other, CompoundLocation):
1327            if self.operator != other.operator:
1328                # Handle join+order -> order as a special case?
1329                raise ValueError(
1330                    "Mixed operators %s and %s" % (self.operator, other.operator)
1331                )
1332            return CompoundLocation(self.parts + other.parts, self.operator)
1333        elif isinstance(other, int):
1334            return self._shift(other)
1335        else:
1336            raise NotImplementedError
1337
1338    def __radd__(self, other):
1339        """Add a feature to the left."""
1340        if isinstance(other, FeatureLocation):
1341            return CompoundLocation([other] + self.parts, self.operator)
1342        elif isinstance(other, int):
1343            return self._shift(other)
1344        else:
1345            raise NotImplementedError
1346
1347    def __contains__(self, value):
1348        """Check if an integer position is within the CompoundLocation object."""
1349        for loc in self.parts:
1350            if value in loc:
1351                return True
1352        return False
1353
1354    def __nonzero__(self):
1355        """Return True regardless of the length of the feature.
1356
1357        This behaviour is for backwards compatibility, since until the
1358        __len__ method was added, a FeatureLocation always evaluated as True.
1359
1360        Note that in comparison, Seq objects, strings, lists, etc, will all
1361        evaluate to False if they have length zero.
1362
1363        WARNING: The FeatureLocation may in future evaluate to False when its
1364        length is zero (in order to better match normal python behaviour)!
1365        """
1366        return True
1367
1368    def __len__(self):
1369        """Return the length of the CompoundLocation object."""
1370        return sum(len(loc) for loc in self.parts)
1371
1372    def __iter__(self):
1373        """Iterate over the parent positions within the CompoundLocation object."""
1374        for loc in self.parts:
1375            yield from loc
1376
1377    def __eq__(self, other):
1378        """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation."""
1379        if not isinstance(other, CompoundLocation):
1380            return False
1381        if len(self.parts) != len(other.parts):
1382            return False
1383        if self.operator != other.operator:
1384            return False
1385        for self_part, other_part in zip(self.parts, other.parts):
1386            if self_part != other_part:
1387                return False
1388        return True
1389
1390    def _shift(self, offset):
1391        """Return a copy of the CompoundLocation shifted by an offset (PRIVATE)."""
1392        return CompoundLocation(
1393            [loc._shift(offset) for loc in self.parts], self.operator
1394        )
1395
1396    def _flip(self, length):
1397        """Return a copy of the locations after the parent is reversed (PRIVATE).
1398
1399        Note that the order of the parts is NOT reversed too. Consider a CDS
1400        on the forward strand with exons small, medium and large (in length).
1401        Once we change the frame of reference to the reverse complement strand,
1402        the start codon is still part of the small exon, and the stop codon
1403        still part of the large exon - so the part order remains the same!
1404
1405        Here is an artificial example, were the features map to the two upper
1406        case regions and the lower case runs of n are not used:
1407
1408        >>> from Bio.Seq import Seq
1409        >>> from Bio.SeqFeature import FeatureLocation
1410        >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
1411        >>> small = FeatureLocation(5, 20, strand=1)
1412        >>> large = FeatureLocation(28, 52, strand=1)
1413        >>> location = small + large
1414        >>> print(small)
1415        [5:20](+)
1416        >>> print(large)
1417        [28:52](+)
1418        >>> print(location)
1419        join{[5:20](+), [28:52](+)}
1420        >>> for part in location.parts:
1421        ...     print(len(part))
1422        ...
1423        15
1424        24
1425
1426        As you can see, this is a silly example where each "exon" is a word:
1427
1428        >>> print(small.extract(dna).translate())
1429        SILLY
1430        >>> print(large.extract(dna).translate())
1431        EXAMPLE*
1432        >>> print(location.extract(dna).translate())
1433        SILLYEXAMPLE*
1434        >>> for part in location.parts:
1435        ...     print(part.extract(dna).translate())
1436        ...
1437        SILLY
1438        EXAMPLE*
1439
1440        Now, let's look at this from the reverse strand frame of reference:
1441
1442        >>> flipped_dna = dna.reverse_complement()
1443        >>> flipped_location = location._flip(len(dna))
1444        >>> print(flipped_location.extract(flipped_dna).translate())
1445        SILLYEXAMPLE*
1446        >>> for part in flipped_location.parts:
1447        ...     print(part.extract(flipped_dna).translate())
1448        ...
1449        SILLY
1450        EXAMPLE*
1451
1452        The key point here is the first part of the CompoundFeature is still the
1453        small exon, while the second part is still the large exon:
1454
1455        >>> for part in flipped_location.parts:
1456        ...     print(len(part))
1457        ...
1458        15
1459        24
1460        >>> print(flipped_location)
1461        join{[37:52](-), [5:29](-)}
1462
1463        Notice the parts are not reversed. However, there was a bug here in older
1464        versions of Biopython which would have given join{[5:29](-), [37:52](-)}
1465        and the translation would have wrongly been "EXAMPLE*SILLY" instead.
1466
1467        """
1468        return CompoundLocation(
1469            [loc._flip(length) for loc in self.parts], self.operator
1470        )
1471
1472    @property
1473    def start(self):
1474        """Start location - left most (minimum) value, regardless of strand.
1475
1476        Read only, returns an integer like position object, possibly a fuzzy
1477        position.
1478
1479        For the special case of a CompoundLocation wrapping the origin of a
1480        circular genome, this will return zero.
1481        """
1482        return min(loc.start for loc in self.parts)
1483
1484    @property
1485    def end(self):
1486        """End location - right most (maximum) value, regardless of strand.
1487
1488        Read only, returns an integer like position object, possibly a fuzzy
1489        position.
1490
1491        For the special case of a CompoundLocation wrapping the origin of
1492        a circular genome this will match the genome length (minus one
1493        given how Python counts from zero).
1494        """
1495        return max(loc.end for loc in self.parts)
1496
1497    @property
1498    def nofuzzy_start(self):
1499        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).
1500
1501        This is an alias for int(feature.start), which should be used in
1502        preference -- unless you are trying to support old versions of
1503        Biopython.
1504        """
1505        try:
1506            return int(self.start)
1507        except TypeError:
1508            if isinstance(self.start, UnknownPosition):
1509                return None
1510            raise
1511
1512    @property
1513    def nofuzzy_end(self):
1514        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).
1515
1516        This is an alias for int(feature.end), which should be used in
1517        preference -- unless you are trying to support old versions of
1518        Biopython.
1519        """
1520        try:
1521            return int(self.end)
1522        except TypeError:
1523            if isinstance(self.end, UnknownPosition):
1524                return None
1525            raise
1526
1527    @property
1528    def ref(self):
1529        """Not present in CompoundLocation, dummy method for API compatibility."""
1530        return None
1531
1532    @property
1533    def ref_db(self):
1534        """Not present in CompoundLocation, dummy method for API compatibility."""
1535        return None
1536
1537    def extract(self, parent_sequence, references=None):
1538        """Extract the sequence from supplied parent sequence using the CompoundLocation object.
1539
1540        The parent_sequence can be a Seq like object or a string, and will
1541        generally return an object of the same type. The exception to this is
1542        a MutableSeq as the parent sequence will return a Seq object.
1543        If the location refers to other records, they must be supplied
1544        in the optional dictionary references.
1545
1546        >>> from Bio.Seq import Seq
1547        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
1548        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL")
1549        >>> fl1 = FeatureLocation(2, 8)
1550        >>> fl2 = FeatureLocation(10, 15)
1551        >>> fl3 = CompoundLocation([fl1,fl2])
1552        >>> fl3.extract(seq)
1553        Seq('QHKAMILIVIC')
1554
1555        """
1556        # This copes with mixed strand features & all on reverse:
1557        parts = [
1558            loc.extract(parent_sequence, references=references) for loc in self.parts
1559        ]
1560        f_seq = functools.reduce(lambda x, y: x + y, parts)
1561        return f_seq
1562
1563
1564class AbstractPosition:
1565    """Abstract base class representing a position."""
1566
1567    def __repr__(self):
1568        """Represent the AbstractPosition object as a string for debugging."""
1569        return "%s(...)" % (self.__class__.__name__)
1570
1571
1572class ExactPosition(int, AbstractPosition):
1573    """Specify the specific position of a boundary.
1574
1575    Arguments:
1576     - position - The position of the boundary.
1577     - extension - An optional argument which must be zero since we don't
1578       have an extension. The argument is provided so that the same number
1579       of arguments can be passed to all position types.
1580
1581    In this case, there is no fuzziness associated with the position.
1582
1583    >>> p = ExactPosition(5)
1584    >>> p
1585    ExactPosition(5)
1586    >>> print(p)
1587    5
1588
1589    >>> isinstance(p, AbstractPosition)
1590    True
1591    >>> isinstance(p, int)
1592    True
1593
1594    Integer comparisons and operations should work as expected:
1595
1596    >>> p == 5
1597    True
1598    >>> p < 6
1599    True
1600    >>> p <= 5
1601    True
1602    >>> p + 10
1603    15
1604
1605    """
1606
1607    def __new__(cls, position, extension=0):
1608        """Create an ExactPosition object."""
1609        if extension != 0:
1610            raise AttributeError(
1611                "Non-zero extension %s for exact position." % extension
1612            )
1613        return int.__new__(cls, position)
1614
1615    # Must define this on Python 3.8 onwards because we redefine __repr__
1616    def __str__(self):
1617        """Return a representation of the ExactPosition object (with python counting)."""
1618        return str(int(self))
1619
1620    def __repr__(self):
1621        """Represent the ExactPosition object as a string for debugging."""
1622        return "%s(%i)" % (self.__class__.__name__, int(self))
1623
1624    @property
1625    def position(self):
1626        """Legacy attribute to get position as integer (OBSOLETE)."""
1627        return int(self)
1628
1629    @property
1630    def extension(self):
1631        """Not present in this object, return zero (OBSOLETE)."""
1632        return 0
1633
1634    def _shift(self, offset):
1635        """Return a copy of the position object with its location shifted (PRIVATE)."""
1636        # By default preserve any subclass
1637        return self.__class__(int(self) + offset)
1638
1639    def _flip(self, length):
1640        """Return a copy of the location after the parent is reversed (PRIVATE)."""
1641        # By default preserve any subclass
1642        return self.__class__(length - int(self))
1643
1644
1645class UncertainPosition(ExactPosition):
1646    """Specify a specific position which is uncertain.
1647
1648    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
1649    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.
1650    """
1651
1652    pass
1653
1654
1655class UnknownPosition(AbstractPosition):
1656    """Specify a specific position which is unknown (has no position).
1657
1658    This is used in UniProt, e.g. ? or in the XML as unknown.
1659    """
1660
1661    def __repr__(self):
1662        """Represent the UnknownPosition object as a string for debugging."""
1663        return "%s()" % self.__class__.__name__
1664
1665    def __hash__(self):
1666        """Return the hash value of the UnknownPosition object."""
1667        return hash(None)
1668
1669    @property
1670    def position(self):
1671        """Legacy attribute to get location (None) (OBSOLETE)."""
1672        return None
1673
1674    @property
1675    def extension(self):  # noqa: D402
1676        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""  # noqa: D402
1677        return 0
1678
1679    def _shift(self, offset):
1680        """Return a copy of the position object with its location shifted (PRIVATE)."""
1681        return self
1682
1683    def _flip(self, length):
1684        """Return a copy of the location after the parent is reversed (PRIVATE)."""
1685        return self
1686
1687
1688class WithinPosition(int, AbstractPosition):
1689    """Specify the position of a boundary within some coordinates.
1690
1691    Arguments:
1692    - position - The default integer position
1693    - left - The start (left) position of the boundary
1694    - right - The end (right) position of the boundary
1695
1696    This allows dealing with a location like ((1.4)..100). This
1697    indicates that the start of the sequence is somewhere between 1
1698    and 4. Since this is a start coordinate, it should acts like
1699    it is at position 1 (or in Python counting, 0).
1700
1701    >>> p = WithinPosition(10, 10, 13)
1702    >>> p
1703    WithinPosition(10, left=10, right=13)
1704    >>> print(p)
1705    (10.13)
1706    >>> int(p)
1707    10
1708
1709    Basic integer comparisons and operations should work as though
1710    this were a plain integer:
1711
1712    >>> p == 10
1713    True
1714    >>> p in [9, 10, 11]
1715    True
1716    >>> p < 11
1717    True
1718    >>> p + 10
1719    20
1720
1721    >>> isinstance(p, WithinPosition)
1722    True
1723    >>> isinstance(p, AbstractPosition)
1724    True
1725    >>> isinstance(p, int)
1726    True
1727
1728    Note this also applies for comparison to other position objects,
1729    where again the integer behaviour is used:
1730
1731    >>> p == 10
1732    True
1733    >>> p == ExactPosition(10)
1734    True
1735    >>> p == BeforePosition(10)
1736    True
1737    >>> p == AfterPosition(10)
1738    True
1739
1740    If this were an end point, you would want the position to be 13:
1741
1742    >>> p2 = WithinPosition(13, 10, 13)
1743    >>> p2
1744    WithinPosition(13, left=10, right=13)
1745    >>> print(p2)
1746    (10.13)
1747    >>> int(p2)
1748    13
1749    >>> p2 == 13
1750    True
1751    >>> p2 == ExactPosition(13)
1752    True
1753
1754    The old legacy properties of position and extension give the
1755    starting/lower/left position as an integer, and the distance
1756    to the ending/higher/right position as an integer. Note that
1757    the position object will act like either the left or the right
1758    end-point depending on how it was created:
1759
1760    >>> p.position == p2.position == 10
1761    True
1762    >>> p.extension == p2.extension == 3
1763    True
1764    >>> int(p) == int(p2)
1765    False
1766    >>> p == 10
1767    True
1768    >>> p2 == 13
1769    True
1770
1771    """
1772
1773    def __new__(cls, position, left, right):
1774        """Create a WithinPosition object."""
1775        if not (position == left or position == right):
1776            raise RuntimeError(
1777                "WithinPosition: %r should match left %r or "
1778                "right %r" % (position, left, right)
1779            )
1780        obj = int.__new__(cls, position)
1781        obj._left = left
1782        obj._right = right
1783        return obj
1784
1785    def __getnewargs__(self):
1786        """Return the arguments accepted by __new__.
1787
1788        Necessary to allow pickling and unpickling of class instances.
1789        """
1790        return (int(self), self._left, self._right)
1791
1792    def __repr__(self):
1793        """Represent the WithinPosition object as a string for debugging."""
1794        return "%s(%i, left=%i, right=%i)" % (
1795            self.__class__.__name__,
1796            int(self),
1797            self._left,
1798            self._right,
1799        )
1800
1801    def __str__(self):
1802        """Return a representation of the WithinPosition object (with python counting)."""
1803        return "(%s.%s)" % (self._left, self._right)
1804
1805    @property
1806    def position(self):
1807        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
1808        return self._left
1809
1810    @property
1811    def extension(self):  # noqa: D402
1812        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""  # noqa: D402
1813        return self._right - self._left
1814
1815    def _shift(self, offset):
1816        """Return a copy of the position object with its location shifted (PRIVATE)."""
1817        return self.__class__(
1818            int(self) + offset, self._left + offset, self._right + offset
1819        )
1820
1821    def _flip(self, length):
1822        """Return a copy of the location after the parent is reversed (PRIVATE)."""
1823        return self.__class__(
1824            length - int(self), length - self._right, length - self._left
1825        )
1826
1827
1828class BetweenPosition(int, AbstractPosition):
1829    """Specify the position of a boundary between two coordinates (OBSOLETE?).
1830
1831    Arguments:
1832     - position - The default integer position
1833     - left - The start (left) position of the boundary
1834     - right - The end (right) position of the boundary
1835
1836    This allows dealing with a position like 123^456. This
1837    indicates that the start of the sequence is somewhere between
1838    123 and 456. It is up to the parser to set the position argument
1839    to either boundary point (depending on if this is being used as
1840    a start or end of the feature). For example as a feature end:
1841
1842    >>> p = BetweenPosition(456, 123, 456)
1843    >>> p
1844    BetweenPosition(456, left=123, right=456)
1845    >>> print(p)
1846    (123^456)
1847    >>> int(p)
1848    456
1849
1850    Integer equality and comparison use the given position,
1851
1852    >>> p == 456
1853    True
1854    >>> p in [455, 456, 457]
1855    True
1856    >>> p > 300
1857    True
1858
1859    The old legacy properties of position and extension give the
1860    starting/lower/left position as an integer, and the distance
1861    to the ending/higher/right position as an integer. Note that
1862    the position object will act like either the left or the right
1863    end-point depending on how it was created:
1864
1865    >>> p2 = BetweenPosition(123, left=123, right=456)
1866    >>> p.position == p2.position == 123
1867    True
1868    >>> p.extension
1869    333
1870    >>> p2.extension
1871    333
1872    >>> p.extension == p2.extension == 333
1873    True
1874    >>> int(p) == int(p2)
1875    False
1876    >>> p == 456
1877    True
1878    >>> p2 == 123
1879    True
1880
1881    Note this potentially surprising behaviour:
1882
1883    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
1884    True
1885    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
1886    True
1887    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
1888    True
1889
1890    i.e. For equality (and sorting) the position objects behave like
1891    integers.
1892
1893    """
1894
1895    def __new__(cls, position, left, right):
1896        """Create a new instance in BetweenPosition object."""
1897        assert position == left or position == right
1898        obj = int.__new__(cls, position)
1899        obj._left = left
1900        obj._right = right
1901        return obj
1902
1903    def __getnewargs__(self):
1904        """Return the arguments accepted by __new__.
1905
1906        Necessary to allow pickling and unpickling of class instances.
1907        """
1908        return (int(self), self._left, self._right)
1909
1910    def __repr__(self):
1911        """Represent the BetweenPosition object as a string for debugging."""
1912        return "%s(%i, left=%i, right=%i)" % (
1913            self.__class__.__name__,
1914            int(self),
1915            self._left,
1916            self._right,
1917        )
1918
1919    def __str__(self):
1920        """Return a representation of the BetweenPosition object (with python counting)."""
1921        return "(%s^%s)" % (self._left, self._right)
1922
1923    @property
1924    def position(self):
1925        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
1926        return self._left
1927
1928    @property
1929    def extension(self):  # noqa: D402
1930        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""  # noqa: D402
1931        return self._right - self._left
1932
1933    def _shift(self, offset):
1934        """Return a copy of the position object with its location shifted (PRIVATE)."""
1935        return self.__class__(
1936            int(self) + offset, self._left + offset, self._right + offset
1937        )
1938
1939    def _flip(self, length):
1940        """Return a copy of the location after the parent is reversed (PRIVATE)."""
1941        return self.__class__(
1942            length - int(self), length - self._right, length - self._left
1943        )
1944
1945
1946class BeforePosition(int, AbstractPosition):
1947    """Specify a position where the actual location occurs before it.
1948
1949    Arguments:
1950     - position - The upper boundary of where the location can occur.
1951     - extension - An optional argument which must be zero since we don't
1952       have an extension. The argument is provided so that the same number
1953       of arguments can be passed to all position types.
1954
1955    This is used to specify positions like (<10..100) where the location
1956    occurs somewhere before position 10.
1957
1958    >>> p = BeforePosition(5)
1959    >>> p
1960    BeforePosition(5)
1961    >>> print(p)
1962    <5
1963    >>> int(p)
1964    5
1965    >>> p + 10
1966    15
1967
1968    Note this potentially surprising behaviour:
1969
1970    >>> p == ExactPosition(5)
1971    True
1972    >>> p == AfterPosition(5)
1973    True
1974
1975    Just remember that for equality and sorting the position objects act
1976    like integers.
1977    """
1978
1979    # Subclasses int so can't use __init__
1980    def __new__(cls, position, extension=0):
1981        """Create a new instance in BeforePosition object."""
1982        if extension != 0:
1983            raise AttributeError(
1984                "Non-zero extension %s for exact position." % extension
1985            )
1986        return int.__new__(cls, position)
1987
1988    @property
1989    def position(self):
1990        """Legacy attribute to get position as integer (OBSOLETE)."""
1991        return int(self)
1992
1993    @property
1994    def extension(self):  # noqa: D402
1995        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""  # noqa: D402
1996        return 0
1997
1998    def __repr__(self):
1999        """Represent the location as a string for debugging."""
2000        return "%s(%i)" % (self.__class__.__name__, int(self))
2001
2002    def __str__(self):
2003        """Return a representation of the BeforePosition object (with python counting)."""
2004        return "<%s" % self.position
2005
2006    def _shift(self, offset):
2007        """Return a copy of the position object with its location shifted (PRIVATE)."""
2008        return self.__class__(int(self) + offset)
2009
2010    def _flip(self, length):
2011        """Return a copy of the location after the parent is reversed (PRIVATE)."""
2012        return AfterPosition(length - int(self))
2013
2014
2015class AfterPosition(int, AbstractPosition):
2016    """Specify a position where the actual location is found after it.
2017
2018    Arguments:
2019     - position - The lower boundary of where the location can occur.
2020     - extension - An optional argument which must be zero since we don't
2021       have an extension. The argument is provided so that the same number
2022       of arguments can be passed to all position types.
2023
2024    This is used to specify positions like (>10..100) where the location
2025    occurs somewhere after position 10.
2026
2027    >>> p = AfterPosition(7)
2028    >>> p
2029    AfterPosition(7)
2030    >>> print(p)
2031    >7
2032    >>> int(p)
2033    7
2034    >>> p + 10
2035    17
2036
2037    >>> isinstance(p, AfterPosition)
2038    True
2039    >>> isinstance(p, AbstractPosition)
2040    True
2041    >>> isinstance(p, int)
2042    True
2043
2044    Note this potentially surprising behaviour:
2045
2046    >>> p == ExactPosition(7)
2047    True
2048    >>> p == BeforePosition(7)
2049    True
2050
2051    Just remember that for equality and sorting the position objects act
2052    like integers.
2053    """
2054
2055    # Subclasses int so can't use __init__
2056    def __new__(cls, position, extension=0):
2057        """Create a new instance of the AfterPosition object."""
2058        if extension != 0:
2059            raise AttributeError(
2060                "Non-zero extension %s for exact position." % extension
2061            )
2062        return int.__new__(cls, position)
2063
2064    @property
2065    def position(self):
2066        """Legacy attribute to get position as integer (OBSOLETE)."""
2067        return int(self)
2068
2069    @property
2070    def extension(self):  # noqa: D402
2071        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""  # noqa: D402
2072        return 0
2073
2074    def __repr__(self):
2075        """Represent the location as a string for debugging."""
2076        return "%s(%i)" % (self.__class__.__name__, int(self))
2077
2078    def __str__(self):
2079        """Return a representation of the AfterPosition object (with python counting)."""
2080        return ">%s" % self.position
2081
2082    def _shift(self, offset):
2083        """Return a copy of the position object with its location shifted (PRIVATE)."""
2084        return self.__class__(int(self) + offset)
2085
2086    def _flip(self, length):
2087        """Return a copy of the location after the parent is reversed (PRIVATE)."""
2088        return BeforePosition(length - int(self))
2089
2090
2091class OneOfPosition(int, AbstractPosition):
2092    """Specify a position where the location can be multiple positions.
2093
2094    This models the GenBank 'one-of(1888,1901)' function, and tries
2095    to make this fit within the Biopython Position models. If this was
2096    a start position it should act like 1888, but as an end position 1901.
2097
2098    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
2099    >>> p
2100    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
2101    >>> int(p)
2102    1888
2103
2104    Integer comparisons and operators act like using int(p),
2105
2106    >>> p == 1888
2107    True
2108    >>> p <= 1888
2109    True
2110    >>> p > 1888
2111    False
2112    >>> p + 100
2113    1988
2114
2115    >>> isinstance(p, OneOfPosition)
2116    True
2117    >>> isinstance(p, AbstractPosition)
2118    True
2119    >>> isinstance(p, int)
2120    True
2121
2122    The old legacy properties of position and extension give the
2123    starting/lowest/left-most position as an integer, and the
2124    distance to the ending/highest/right-most position as an integer.
2125    Note that the position object will act like one of the list of
2126    possible locations depending on how it was created:
2127
2128    >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
2129    >>> p.position == p2.position == 1888
2130    True
2131    >>> p.extension == p2.extension == 13
2132    True
2133    >>> int(p) == int(p2)
2134    False
2135    >>> p == 1888
2136    True
2137    >>> p2 == 1901
2138    True
2139
2140    """
2141
2142    def __new__(cls, position, choices):
2143        """Initialize with a set of possible positions.
2144
2145        position_list is a list of AbstractPosition derived objects,
2146        specifying possible locations.
2147
2148        position is an integer specifying the default behaviour.
2149        """
2150        if position not in choices:
2151            raise ValueError(
2152                "OneOfPosition: %r should match one of %r" % (position, choices)
2153            )
2154        obj = int.__new__(cls, position)
2155        obj.position_choices = choices
2156        return obj
2157
2158    def __getnewargs__(self):
2159        """Return the arguments accepted by __new__.
2160
2161        Necessary to allow pickling and unpickling of class instances.
2162        """
2163        return (int(self), self.position_choices)
2164
2165    @property
2166    def position(self):
2167        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
2168        return min(int(pos) for pos in self.position_choices)
2169
2170    @property
2171    def extension(self):
2172        """Legacy attribute to get extension as integer (OBSOLETE)."""
2173        positions = [int(pos) for pos in self.position_choices]
2174        return max(positions) - min(positions)
2175
2176    def __repr__(self):
2177        """Represent the OneOfPosition object as a string for debugging."""
2178        return "%s(%i, choices=%r)" % (
2179            self.__class__.__name__,
2180            int(self),
2181            self.position_choices,
2182        )
2183
2184    def __str__(self):
2185        """Return a representation of the OneOfPosition object (with python counting)."""
2186        out = "one-of("
2187        for position in self.position_choices:
2188            out += "%s," % position
2189        # replace the last comma with the closing parenthesis
2190        return out[:-1] + ")"
2191
2192    def _shift(self, offset):
2193        """Return a copy of the position object with its location shifted (PRIVATE)."""
2194        return self.__class__(
2195            int(self) + offset, [p._shift(offset) for p in self.position_choices]
2196        )
2197
2198    def _flip(self, length):
2199        """Return a copy of the location after the parent is reversed (PRIVATE)."""
2200        return self.__class__(
2201            length - int(self), [p._flip(length) for p in self.position_choices[::-1]]
2202        )
2203
2204
2205class PositionGap:
2206    """Simple class to hold information about a gap between positions."""
2207
2208    def __init__(self, gap_size):
2209        """Intialize with a position object containing the gap information."""
2210        self.gap_size = gap_size
2211
2212    def __repr__(self):
2213        """Represent the position gap as a string for debugging."""
2214        return "%s(%r)" % (self.__class__.__name__, self.gap_size)
2215
2216    def __str__(self):
2217        """Return a representation of the PositionGap object (with python counting)."""
2218        return "gap(%s)" % self.gap_size
2219
2220
2221if __name__ == "__main__":
2222    from Bio._utils import run_doctest
2223
2224    run_doctest()
2225