1# ----------------------------------------------------------------------------
2# Copyright (c) 2013--, scikit-bio development team.
3#
4# Distributed under the terms of the Modified BSD License.
5#
6# The full license is in the file COPYING.txt, distributed with this software.
7# ----------------------------------------------------------------------------
8
9import operator
10import copy
11import functools
12
13from ._intersection import IntervalTree
14from skbio.util._decorator import experimental, classonlymethod
15
16
17class Interval:
18    """Stores the bounds and metadata of an interval feature.
19
20    This class stores an interval feature. An interval feature
21    is defined as a sub-region of a biological sequence or sequence
22    alignment that is a functional entity, e.g., a gene, a riboswitch,
23    an exon, etc. It can span a single contiguous region or multiple
24    non-contiguous regions (e.g. multiple exons in a transcript, or
25    multiple genes in an operon).
26
27    Parameters
28    ----------
29    interval_metadata : object
30        A reference to the ``IntervalMetadata`` object that this
31        ``Interval`` object is associated to.
32    bounds : iterable of tuple of int
33        Tuples representing start and end coordinates. It is *zero-based*
34        numbering. It is always inclusive on start bound and exclusive on
35        end bound.
36    fuzzy : iterable of tuple of bool, optional
37        Tuples representing the fuzziness of each bound coordinates.
38        If this isn't specified, then the fuzziness of all bound
39        coordinates are ``False``. If any of the coordinate fuzziness
40        is ``True``, it indicates that the exact bound point of a
41        interval feature is unknown. The bound may begin or end at
42        some points outside the specified coordinates. This
43        accommodates the location format [1]_ of INSDC.
44    metadata : dict, optional
45        Dictionary of attributes storing information of the feature
46        such as "strand", "gene_name", or "product".
47
48    See Also
49    --------
50    skbio.metadata.IntervalMetadata
51
52    Notes
53    -----
54    While the construction of an ``Interval`` object automatically add
55    itself to its associated ``IntervalMetadata`` object,
56    ``IntervalMetadata.add`` is the typical/easier way to
57    create and add it to ``IntervalMetadata``.
58
59    References
60    ----------
61    .. [1] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html#3.4.3
62
63    Examples
64    --------
65    Hypothetically, let's say we have a gene called "genA" with 10 nt
66    as shown in the following diagram.  The second row represents the
67    two exons (indicated by "=") on this gene:
68
69    ::
70
71        TGGATTCTGC
72        -====--==-
73        0123456789
74
75    We can create an ``Interval`` object to represent the exons of the gene:
76
77    >>> from skbio.metadata import Interval, IntervalMetadata
78    >>> interval_metadata = IntervalMetadata(10)
79
80    Remember the coordinates are inclusive in lower bound and exclusive on
81    upper bound:
82
83    >>> gene = Interval(interval_metadata,
84    ...                 bounds=[(1, 5), (7, 9)],
85    ...                 metadata={'name': 'genA'})
86    >>> gene    # doctest: +ELLIPSIS
87    Interval(interval_metadata=..., bounds=[(1, 5), (7, 9)], \
88fuzzy=[(False, False), (False, False)], metadata={'name': 'genA'})
89
90    """
91    def __init__(self, interval_metadata, bounds,
92                 fuzzy=None, metadata=None):
93        if not isinstance(interval_metadata, IntervalMetadata):
94            raise TypeError('You need to provide an IntervalMetadata'
95                            'object, not %r' % interval_metadata)
96        # Intervals
97        self._interval_metadata = interval_metadata
98
99        self._bounds_fuzzy_setter(bounds, fuzzy)
100
101        # Metadata
102        if metadata is None:
103            metadata = {}
104        self.metadata = metadata
105
106        # add this interval feature to the associated IntervalMetadata
107        self._add()
108
109    def _add(self):
110        """Add the current ``Interval`` to the IntervalMetadata object."""
111        for bound in self.bounds:
112            start, end = bound
113            self._interval_metadata._interval_tree.add(start, end, self)
114        self._interval_metadata._intervals.append(self)
115
116    @experimental(as_of='0.5.1')
117    def __eq__(self, other):
118        '''Test if this ``Interval`` object is equal to another.
119
120        The equality is performed by checking if the ``metadata``,
121        ``bounds`` and ``fuzzy`` are equal. Since the ``bounds``
122        and the ``fuzzy`` are sorted, the permutations of them during
123        the ``Interval`` construction or assignment won't matter.
124
125        Parameters
126        ----------
127        other : Interval
128            Interval to test for equality against.
129
130        Returns
131        -------
132        bool
133            Indicates if the two objects are equal.
134        '''
135        return ((self.metadata == other.metadata) and
136                (self.bounds == other.bounds) and
137                (self.fuzzy == other.fuzzy))
138
139    @experimental(as_of='0.5.1')
140    def __ne__(self, other):
141        '''Test if this ``Interval`` object is not equal to another.
142
143        Parameters
144        ----------
145        other : Interval
146            Interval to test for inequality against.
147
148        Returns
149        -------
150        bool
151            Indicates if the two objects are not equal.
152        '''
153        return not (self == other)
154
155    @experimental(as_of='0.5.1')
156    def __repr__(self):
157        '''Return a string representation of this ``Interval`` object.
158
159        Returns
160        -------
161        str
162            String representation of this ``Interval`` object.
163        '''
164        if self.dropped:
165            s = ('{}(dropped=True, bounds={!r}, '
166                 'fuzzy={!r}, metadata={!r})')
167            return s.format(self.__class__.__name__,
168                            self.bounds, self.fuzzy, self.metadata)
169        else:
170            s = ('{}(interval_metadata=<{!r}>, bounds={!r}, '
171                 'fuzzy={!r}, metadata={!r})')
172            return s.format(self.__class__.__name__,
173                            id(self._interval_metadata),
174                            self.bounds, self.fuzzy, self.metadata)
175
176    @experimental(as_of='0.5.1')
177    def drop(self):
178        '''Drop this ``Interval`` object from the interval metadata it links to.
179
180        If the ``Interval`` object is dropped, you can still get values of
181        ``bounds``, ``fuzzy``, and ``metadata`` attributes, but you
182        can not change their values with the setters.
183
184        See Also
185        --------
186        skbio.metadata.IntervalMetadata.drop
187        '''
188        if not self.dropped:
189            self._interval_metadata.drop([self])
190
191    def _bounds_fuzzy_setter(self, bounds=None, fuzzy=None):
192        if self.dropped:
193            raise RuntimeError('Cannot change `bounds` or `fuzzy` '
194                               'on a dropped Interval object.')
195        # Casts to `list`, validation, sorting, and setting of `bounds`
196        # and `fuzzy` happen here.
197        if bounds is not None:
198            # check iterability
199            try:
200                # check iterability
201                bounds = list(bounds)
202            except TypeError:
203                raise TypeError('Cannot give an non-iterable (%r) '
204                                'to `bounds`.' % bounds)
205
206            # check it is not empty
207            if not bounds:
208                raise ValueError('Cannot give empty `bounds`.')
209            # check each contiguous span is in right format
210            for bound in bounds:
211                _assert_valid_bound(bound)
212
213            spans = len(bounds)
214        else:
215            spans = len(self.bounds)
216
217        if fuzzy is not None:
218            try:
219                fuzzy = list(fuzzy)
220            except TypeError:
221                raise TypeError(
222                    'Cannot give a non-iterable (%r) '
223                    'to `fuzzy`.' % fuzzy)
224
225            if len(fuzzy) != spans:
226                raise ValueError(
227                    'The length of fuzzy must '
228                    'be equal to the length of bounds.')
229
230            for fuzzy_i in fuzzy:
231                _assert_valid_fuzzy(fuzzy_i)
232
233        if bounds is None:
234            # `bounds` and `fuzzy` cannot both be omitted.
235            if fuzzy is None:
236                raise ValueError('Cannot give `None` to both `bounds` '
237                                 'and `fuzzy`.')
238            # If only `fuzzy` is provided, set `self.fuzzy` and don't
239            # change `self.bounds`.
240            else:
241                self._fuzzy = fuzzy
242        else:
243            # If only `bounds` is provided, reset `self.fuzzy` to
244            # all `False`.
245            if fuzzy is None:
246                bounds.sort()
247                self._check_bounds(bounds)
248                self._bounds = bounds
249                # reset all the fuzzy to False!!
250                del self.fuzzy
251
252            # If both `bounds` and `fuzzy` are provided, set
253            # `self.bounds` and `self.fuzzy`.
254            else:
255                bounds, fuzzy = [
256                    list(e) for e in zip(*sorted(zip(bounds, fuzzy)))]
257                self._check_bounds(bounds)
258                self._bounds = bounds
259                self._fuzzy = fuzzy
260
261            self._interval_metadata._is_stale_tree = True
262
263    def _check_bounds(self, bounds):
264        '''input `bounds` must be sorted.'''
265        upper_bound = self._interval_metadata.upper_bound
266        lower_bound = self._interval_metadata.lower_bound
267        if upper_bound is not None and bounds[-1][-1] > upper_bound:
268            raise ValueError('Cannot set `bounds` (%r) with coordinate '
269                             'larger than upper bound (%r)' %
270                             (bounds, upper_bound))
271
272        if bounds[0][0] < lower_bound:
273            raise ValueError('Cannot set `bounds` (%r) with coordinate '
274                             'smaller than lower bound (%r).' %
275                             (bounds, lower_bound))
276
277    @property
278    @experimental(as_of='0.5.1')
279    def fuzzy(self):
280        '''The openness of each coordinate.
281
282        This indicates that the exact bound of a interval feature
283        is unknown. The bound may begin or end at some points outside
284        the specified coordinates. This accommodates the bound format [1]_
285        of INSDC.
286
287        References
288        ----------
289        .. [1] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html#3.4.3
290        '''
291        return self._fuzzy
292
293    @fuzzy.setter
294    @experimental(as_of='0.5.1')
295    def fuzzy(self, value):
296        '''Set ``fuzzy``.
297
298        The ``value`` should an iterable matching ``self.bounds``.
299        '''
300        self._bounds_fuzzy_setter(fuzzy=value)
301
302    @fuzzy.deleter
303    @experimental(as_of='0.5.1')
304    def fuzzy(self):
305        '''Delete ``fuzzy``.
306
307        This set all fuzzy to be ``False``.
308        '''
309        if self.dropped:
310            raise RuntimeError('Cannot change fuzzy on dropped '
311                               'Interval object.')
312        self._fuzzy = [(False, False)] * len(self.bounds)
313
314    @property
315    @experimental(as_of='0.5.1')
316    def bounds(self):
317        '''The coordinates of the interval feature.
318
319        It should be a list of tuples of int pair. Each tuple stores
320        the start and end coordinates of a span of the interval
321        feature. The coordinates are *zero-based*. They are inclusive on
322        the start and exclusive on the end.
323        '''
324        return self._bounds
325
326    @bounds.setter
327    @experimental(as_of='0.5.1')
328    def bounds(self, value):
329        '''Set ``bounds``.
330
331        WARNING: setting ``bounds`` will reset ``fuzzy`` value to ``False``.
332        This is not totally surprising because it is justifiable your old
333        ``fuzzy`` don't fit the new bounds.
334        '''
335        self._bounds_fuzzy_setter(bounds=value)
336
337    @property
338    @experimental(as_of='0.5.1')
339    def metadata(self):
340        '''The metadata of the interval feature.
341
342        It stores the metadata (eg. gene name, function, ID, etc.) of
343        the interval feature as a ``dict``.
344        '''
345        return self._metadata
346
347    @metadata.setter
348    @experimental(as_of='0.5.1')
349    def metadata(self, value):
350        if self.dropped:
351            raise RuntimeError('Cannot change metadata on dropped '
352                               'Interval object.')
353        if not isinstance(value, dict):
354            raise TypeError("metadata must be a dict, not %r" % value)
355        self._metadata = value
356
357    @metadata.deleter
358    @experimental(as_of='0.5.1')
359    def metadata(self):
360        '''Delete metadata.
361
362        This sets metadata to be empty dict.
363        '''
364        if self.dropped:
365            raise RuntimeError('Cannot change metadata to dropped '
366                               'Interval object.')
367        self._metadata = {}
368
369    @property
370    @experimental(as_of='0.5.1')
371    def dropped(self):
372        '''Boolean value indicating if the ``Interval`` object is dropped.
373
374        If it is dropped, it means it is not associated with IntervalMetadata
375        object any more.
376
377        Notes
378        -----
379        This property is not writable.
380
381        See Also
382        --------
383        skbio.metadata.Interval.drop
384        skbio.metadata.IntervalMetadata.drop
385        '''
386        return self._interval_metadata is None
387
388
389class IntervalMetadata():
390    """Stores the interval features.
391
392    ``IntervalMetadata`` object allows storage, modification, and
393    querying of interval features covering a region of a single coordinate
394    system. For instance, this can be used to store functional annotations
395    about genes across a genome. This object is also applied to the sequence
396    alignment.
397
398    This object is typically coupled with another object, such as a
399    ``Sequence`` object (or its child class), or a ``TabularMSA`` object.
400
401    Parameters
402    ----------
403    upper_bound : int or None
404        Defines the exclusive upper bound of the interval features. No
405        coordinate can be greater than it. ``None``
406        means that the coordinate space is unbounded.
407    copy_from : IntervalMetadata or None, optional
408        Create a new object from the input ``IntervalMetadata`` object by
409        shallow copying if it is not ``None``. The upper bound of the new
410        object will be updated with the ``upper_bound`` parameter specified.
411
412    Notes
413    -----
414    This class stores coordinates of all feature bounds into a interval
415    tree. It allows the speed up of query-by-bound. The building of
416    interval tree is deferred until necessary to save computation. It is
417    updated from all coordinates only when you need to fetch info from
418    the interval tree.
419
420    When you add a method into this class and if you method need to fetch
421    info from ``IntervalMetadata._interval_tree``, you should decorate it with
422    ``_rebuild_tree``. This decorator will check if the current interval tree
423    is stale and will update it if so. Additionally, if your method add,
424    delete, or changes the coordinates of any interval features, you should
425    set ``self._is_stale_tree`` to ``True`` at the end of your method to
426    indicate the interval tree becomes stale.
427
428    See Also
429    --------
430    skbio.metadata.Interval
431
432    Examples
433    --------
434    Let's say we have a sequence of length 10 and want to add annotation
435    to it. Create an ``IntervalMetadata`` object:
436
437    >>> from skbio.metadata import Interval, IntervalMetadata
438    >>> im = IntervalMetadata(10)
439
440    Let's add annotations of 3 genes:
441
442    >>> im.add(bounds=[(3, 9)],
443    ...        metadata={'gene': 'sagB'})  # doctest: +ELLIPSIS
444    Interval(interval_metadata=..., bounds=[(3, 9)], \
445fuzzy=[(False, False)], metadata={'gene': 'sagB'})
446    >>> im.add(bounds=[(3, 7)],
447    ...        metadata={'gene': 'sagC'})  # doctest: +ELLIPSIS
448    Interval(interval_metadata=..., bounds=[(3, 7)], \
449fuzzy=[(False, False)], metadata={'gene': 'sagC'})
450    >>> im.add(bounds=[(1, 2), (4, 7)],
451    ...        metadata={'gene': 'sagA'})  # doctest: +ELLIPSIS
452    Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \
453fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})
454
455    Show the object representation:
456
457    >>> im    # doctest: +ELLIPSIS
458    3 interval features
459    -------------------
460    Interval(interval_metadata=..., bounds=[(3, 9)], \
461fuzzy=[(False, False)], metadata={'gene': 'sagB'})
462    Interval(interval_metadata=..., bounds=[(3, 7)], \
463fuzzy=[(False, False)], metadata={'gene': 'sagC'})
464    Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \
465fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})
466
467    We can sort the genes by their bounds:
468
469    >>> im.sort()
470    >>> im    # doctest: +ELLIPSIS
471    3 interval features
472    -------------------
473    Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \
474fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})
475    Interval(interval_metadata=..., bounds=[(3, 7)], \
476fuzzy=[(False, False)], metadata={'gene': 'sagC'})
477    Interval(interval_metadata=..., bounds=[(3, 9)], \
478fuzzy=[(False, False)], metadata={'gene': 'sagB'})
479
480    Query the genes by bound and/or metadata:
481
482    >>> intvls = im.query([(1, 2)], metadata={'gene': 'foo'})
483    >>> list(intvls)
484    []
485    >>> intvls = im.query([(7, 9)])
486    >>> list(intvls)  # doctest: +ELLIPSIS
487    [Interval(interval_metadata=..., bounds=[(3, 9)], \
488fuzzy=[(False, False)], metadata={'gene': 'sagB'})]
489    >>> intvls = im.query(metadata={'gene': 'sagA'})
490    >>> intvls = list(intvls)
491    >>> intvls  # doctest: +ELLIPSIS
492    [Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \
493fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})]
494
495    Drop the gene(s) we get from query:
496
497    >>> im.drop(intvls)
498    >>> im.sort()
499    >>> im   # doctest: +ELLIPSIS
500    2 interval features
501    -------------------
502    Interval(interval_metadata=..., bounds=[(3, 7)], \
503fuzzy=[(False, False)], metadata={'gene': 'sagC'})
504    Interval(interval_metadata=..., bounds=[(3, 9)], \
505fuzzy=[(False, False)], metadata={'gene': 'sagB'})
506
507    """
508    default_write_format = 'gff3'
509
510    def __init__(self, upper_bound, copy_from=None):
511        self._upper_bound = upper_bound
512        if self.upper_bound is not None:
513            if self.upper_bound < self.lower_bound:
514                raise ValueError('Cannot set `upper_bound` (%r) '
515                                 'smaller than `lower_bound` (%r)'
516                                 % (self.upper_bound, self.lower_bound))
517        # List of Interval objects.
518        self._intervals = []
519
520        # IntervalTree object to allow faster querying of interval objects.
521        self._interval_tree = IntervalTree()
522
523        # Indicates if the IntervalTree needs to be rebuilt.
524        self._is_stale_tree = False
525
526        if copy_from is not None:
527            for interval in copy_from._intervals:
528                bounds_cp = interval.bounds[:]
529                fuzzy_cp = interval.fuzzy[:]
530                metadata_cp = copy.copy(interval.metadata)
531
532                self.add(bounds_cp, fuzzy=fuzzy_cp, metadata=metadata_cp)
533
534    @property
535    @experimental(as_of='0.5.1')
536    def upper_bound(self):
537        '''The exclusive upper bound of interval features.'''
538        return self._upper_bound
539
540    @property
541    @experimental(as_of='0.5.1')
542    def lower_bound(self):
543        '''The inclusive lower bound of interval features.'''
544        return 0
545
546    @property
547    @experimental(as_of='0.5.1')
548    def num_interval_features(self):
549        '''The total number of interval features.'''
550        return len(self._intervals)
551
552    def _rebuild_tree(method):
553        """Rebuild the IntervalTree."""
554        @functools.wraps(method)
555        def inner(self, *args, **kwargs):
556            if self._is_stale_tree is False:
557                return method(self, *args, **kwargs)
558            self._interval_tree = IntervalTree()
559            for f in self._intervals:
560                for start, end in f.bounds:
561                    self._interval_tree.add(start, end, f)
562            self._is_stale_tree = False
563            return method(self, *args, **kwargs)
564        return inner
565
566    def _reverse(self):
567        """Reverse ``IntervalMetadata`` object.
568
569        This operation reverses all of the interval coordinates.
570        For instance, this can be used to compare coordinates
571        in the forward strand to coordinates in the reversal strand.
572        """
573
574        for f in self._intervals:
575            try:
576                intvls = [(self.upper_bound - x[1], self.upper_bound - x[0])
577                          for x in reversed(f.bounds)]
578            except TypeError:
579                raise TypeError('You cannot reverse the coordinates '
580                                'when the upper bound is `None`')
581            f.bounds = intvls
582
583        # DON'T forget this!!!
584        self._is_stale_tree = True
585
586    @classonlymethod
587    @experimental(as_of="0.5.1")
588    def concat(cls, interval_metadata):
589        '''Concatenate an iterable of ``IntervalMetadata`` objects.
590
591        It concatenates the multiple ``IntervalMetadata`` objects into
592        one coordinate space. The order of the objects in the input
593        iterable matters. The coordinate of the second
594        ``InterableMetadata`` will be shifted up with the length of
595        the first ``IntervalMetadata`` object.
596
597        This function is useful when you concatenate multiple sequences.
598
599        Parameters
600        ----------
601        interval_metadata : Iterable (IntervalMetadata)
602            The interval metadata to concatenate.
603
604        Returns
605        -------
606        IntervalMetadata
607            Concatenated interval metadata.
608
609        Examples
610        --------
611        >>> from skbio.metadata import IntervalMetadata
612
613        Create two ``IntervalMetadata`` objects:
614
615        >>> im1 = IntervalMetadata(3)
616        >>> _ = im1.add([(0, 2)], [(True, False)], {'gene': 'sagA'})
617        >>> im2 = IntervalMetadata(4)
618        >>> _ = im2.add([(1, 4)], [(True, True)], {'gene': 'sagB'})
619
620        Concatenate them into a single coordinate space. The second
621        ``IntervalMetadata``'s interval features are all shifted
622        up. The resulting ``IntervalMetadata``'s upper bound is the
623        sum of upper bounds of concatenated objects:
624
625        >>> im = IntervalMetadata.concat([im1, im2])
626        >>> im   # doctest: +ELLIPSIS
627        2 interval features
628        -------------------
629        Interval(interval_metadata=<...>, bounds=[(0, 2)], \
630fuzzy=[(True, False)], metadata={'gene': 'sagA'})
631        Interval(interval_metadata=<...>, bounds=[(4, 7)], \
632fuzzy=[(True, True)], metadata={'gene': 'sagB'})
633        >>> im.upper_bound
634        7
635
636        '''
637        interval_metadata = list(interval_metadata)
638
639        if len(interval_metadata) == 0:
640            return cls(0)
641
642        upper_bound = 0
643        for im in interval_metadata:
644            try:
645                upper_bound += im.upper_bound
646            except TypeError:
647                raise TypeError('You cannot concat the interval metadata '
648                                'because its upper bound is `None`:\n%r' % im)
649        new = cls(upper_bound)
650
651        length = 0
652        for i, im in enumerate(interval_metadata):
653            for intvl in im._intervals:
654                bounds = intvl.bounds
655                fuzzy = intvl.fuzzy
656                if i != 0:
657                    bounds = [(start + length, end + length)
658                              for start, end in bounds]
659                new.add(bounds, fuzzy, intvl.metadata)
660            length += im.upper_bound
661
662        return new
663
664    @experimental(as_of='0.5.1')
665    def merge(self, other):
666        '''Merge the interval features of another ``IntervalMetadata`` object.
667
668        It adds all the interval features of the other object into
669        ``self``. Note this will not check if there are any duplicates
670        of interval features after merge.
671
672        Notes
673        -----
674        It will raise error if you merge an unbounded
675        ``IntervalMetadata`` object to the current object if it is
676        bounded. This avoids partially updating the current object if
677        the merge fails in the middle of the process due to the
678        possibility that some interval features to be merged may live
679        outside the current defined upper bound.
680
681        Parameters
682        ----------
683        other : ``IntervalMetadata``
684            The other ``IntervalMetadata`` to be merged.
685
686        '''
687        if self.upper_bound is not None:
688            if other.upper_bound is None:
689                raise ValueError(
690                    'Cannot merge an unbound IntervalMetadata object '
691                    'to a bounded one')
692            elif self.upper_bound != other.upper_bound:
693                raise ValueError(
694                    'The upper bounds of the two IntervalMetadata objects '
695                    'are not equal (%r != %r)' % (
696                        self.upper_bound, other.upper_bound))
697        if self.lower_bound != other.lower_bound:
698            raise ValueError(
699                'The lower bounds of the two IntervalMetadata objects '
700                'are not equal (%d != %d)' % (
701                    self.lower_bound, other.lower_bound))
702        for intvl in other._intervals:
703            self.add(intvl.bounds, intvl.fuzzy, intvl.metadata)
704
705    @experimental(as_of='0.5.1')
706    def sort(self, ascending=True):
707        '''Sort interval features by their coordinates.
708
709        It sorts by the start coordinate first. If they are the same between
710        two interval features, they will be sorted by comparing their end
711        coordinates. For example, an interval feature with [(1, 2), (4, 7)]
712        will be sorted in front of another one with [(1, 2), (3, 8)].
713
714        Parameters
715        ----------
716        ascending : bool, optional
717            sort in ascending or descending coordinates.
718        '''
719        self._intervals.sort(
720            key=lambda i: [i.bounds[0][0], i.bounds[-1][1]],
721            reverse=not ascending)
722
723    @experimental(as_of='0.5.1')
724    def add(self, bounds, fuzzy=None, metadata=None):
725        """Create and add an ``Interval`` to this ``IntervalMetadata``.
726
727        This method creates an ``Interval`` object and inserts it into
728        the ``IntervalMetadata`` object.
729
730        Parameters
731        ----------
732        bounds : iterable of tuple of ints
733            Tuples representing start and end coordinates. It is *zero-based*
734            numbering. It is always inclusive on start bound and exclusive on
735            end bound.
736        fuzzy : iterable of tuple of bool, optional
737            Tuples representing the fuzziness of each bound coordinates.
738        metadata : dict, optional
739            A dictionary of key-value pairs associated with the
740            ``Interval`` object.
741
742        Returns
743        -------
744        Interval
745            The ``Interval`` object added.
746
747        See Also
748        --------
749        skbio.metadata.Interval
750        """
751        # Add an interval to the tree. Note that the add functionality is
752        # built within the Interval constructor.
753        return Interval(interval_metadata=self,
754                        bounds=bounds,
755                        fuzzy=fuzzy,
756                        metadata=metadata)
757
758    @_rebuild_tree
759    def _query_interval(self, bound):
760        """Yield ``Interval`` objects that overlap with the bound."""
761        _assert_valid_bound(bound)
762
763        start, end = bound
764        intvls = self._interval_tree.find(start, end)
765        # if a ``Interval`` has many non-contiguous spans and
766        # multiple of them overlap with the bound, then
767        # this ``Interval`` object will be returned
768        # multiple times. So we need to remove duplicates.
769        seen = set()
770        for intvl in intvls:
771            if id(intvl) not in seen:
772                seen.add(id(intvl))
773                yield intvl
774
775    def _query_attribute(self, metadata, intervals=None):
776        """Yield ``Interval`` objects based on query attributes.
777
778        Parameters
779        ----------
780        metadata : dict or ``None``
781            If it is ``None``, return empty iterator; if it is
782            ``{}``, return an interator of all the ``Interval``
783            objects.
784        intervals : an iterable of ``Interval`` objects
785        """
786        if metadata is None:
787            return
788
789        if intervals is None:
790            intervals = self._intervals
791
792        for intvl in intervals:
793            for (key, value) in metadata.items():
794                if (key not in intvl.metadata or
795                        intvl.metadata[key] != value):
796                    break
797            else:
798                yield intvl
799
800    @experimental(as_of='0.5.1')
801    @_rebuild_tree
802    def query(self, bounds=None, metadata=None):
803        """Yield ``Interval`` object with the bounds and attributes.
804
805        The ``Interval`` objects must meet both requirements: 1) overlap
806        with any of the spans specified by ``bounds``; 2) satisfy
807        ``metadata`` specification. For instance, you can identify
808        all the recA genes that overlap with (10, 100) or (900, 1000)
809        with this code ``interval_metadata.query([(10, 100),
810        (900, 1000)], {'gene': 'recA'})``.
811
812        Parameters
813        ----------
814        bounds : iterable of tuples of int pair, optional
815            Specifies bounds to look for the ``Interval``
816            objects. An satisfying interval feature only need to overlap with
817            one bound. Default (``None``) means all ``Intervals`` meet
818            this requirement.
819
820        metadata : dict, optional
821            A dictionary of key word attributes associated with the
822            ``Interval`` object. It specifies what metadata keywords and
823            values to look for. Default (``None``) means all ``Intervals``
824            meet this requirement.
825
826        Yields
827        ------
828        Interval
829            ``Interval`` object satisfying the search criteria.
830        """
831        if bounds is None:
832            for intvl in self._query_attribute(metadata):
833                yield intvl
834        else:
835            for loc in bounds:
836                intvls = self._query_interval(loc)
837                if metadata is None:
838                    metadata = {}
839                for intvl in self._query_attribute(metadata, intvls):
840                    yield intvl
841
842    @experimental(as_of='0.5.1')
843    def drop(self, intervals, negate=False):
844        """Drops Interval objects.
845
846        The given ``Interval`` objects will be removed and their
847        associated ``IntervalMetadata`` will be set to ``None``.
848
849        Parameters
850        ----------
851        intervals : iterable of ``Interval``
852            ``Interval`` objects to drop from this object.
853        negate : bool
854            Negate the drop operation, i.e. keeping the specified intervals
855            instead of dropping them.
856        """
857        to_delete = {id(f) for f in intervals}
858
859        new_intvls = []
860        # iterate through queries and drop them
861        for intvl in self._intervals:
862            drop = id(intvl) in to_delete
863            if negate is True:
864                drop = not drop
865            if drop:
866                intvl._interval_metadata = None
867            else:
868                new_intvls.append(intvl)
869
870        self._intervals = new_intvls
871        self._is_stale_tree = True
872
873    @experimental(as_of='0.5.1')
874    def __eq__(self, other):
875        '''Test if this object is equal to another.
876
877        It checks if the coordinate spaces are the same between the
878        two objects. If so, then check if all the interval features
879        are equal between the two objects after sorting them by
880        bounds.
881
882        Parameters
883        ----------
884        other : IntervalMetadata
885            Interval metadata to test for equality against.
886
887        Returns
888        -------
889        bool
890            Indicates if the two objects are equal.
891        '''
892        if self.upper_bound != other.upper_bound or \
893           self.lower_bound != other.lower_bound:
894            return False
895        else:
896            self_intervals = sorted(self._intervals,
897                                    key=operator.attrgetter('bounds'))
898            other_intervals = sorted(other._intervals,
899                                     key=operator.attrgetter('bounds'))
900            return self_intervals == other_intervals
901
902    @experimental(as_of='0.5.1')
903    def __ne__(self, other):
904        '''Test if this object is not equal to another.
905
906        Parameters
907        ----------
908        other : IntervalMetadata
909            Interval metadata to test for inequality against.
910
911        Returns
912        -------
913        bool
914            Indicates if the two objects are not equal.
915
916        See Also
917        --------
918        skbio.metadata.IntervalMetadata.__eq__
919        '''
920        return not (self == other)
921
922    @experimental(as_of='0.5.1')
923    def __repr__(self):
924        '''Return a string representation of this object.
925
926        Returns
927        -------
928        str
929            String representation of this ``IntervalMetadata`` object.
930        '''
931        n = self.num_interval_features
932        l1 = '{} interval feature'.format(n)
933        if n != 1:
934            l1 += 's'
935        l2 = '-' * len(l1)
936
937        if n <= 5:
938            items = [repr(i) for i in self._intervals]
939        else:
940            # intentionally overwrite items[2] to make code cleaner
941            items = [repr(self._intervals[i]) for i in [0, 1, 2, n-2, n-1]]
942            items[2] = '...'
943
944        return '\n'.join([l1, l2] + items)
945
946    @experimental(as_of='0.5.1')
947    def __copy__(self):
948        '''Return a shallow copy.
949
950        Notes
951        -----
952        The ``IntervalMetadata`` copy will have copies of the
953        ``Interval`` objects present in this object.  The ``metadata``
954        dictionary of each ``Interval`` object will be a shallow copy.
955
956        See Also
957        --------
958        __deepcopy__
959        '''
960        return self._copy(False, {})
961
962    @experimental(as_of='0.5.1')
963    def __deepcopy__(self, memo):
964        '''Return a deep copy.
965
966        Notes
967        -----
968        The ``IntervalMetadata`` copy will have copies of the
969        ``Interval`` objects present in this object.  The ``metadata``
970        dictionary of each ``Interval`` object will be a deep copy.
971
972        See Also
973        --------
974        __copy__
975        '''
976        return self._copy(True, memo)
977
978    def _copy(self, deep, memo):
979        cp = IntervalMetadata(self.upper_bound)
980
981        for interval in self._intervals:
982            # Only need to shallow-copy `bounds` and `fuzzy`
983            # because their elements are immutable.
984            bounds_cp = interval.bounds[:]
985            fuzzy_cp = interval.fuzzy[:]
986            if deep:
987                metadata_cp = copy.deepcopy(interval.metadata, memo)
988            else:
989                metadata_cp = copy.copy(interval.metadata)
990
991            cp.add(bounds_cp,
992                   fuzzy=fuzzy_cp,
993                   metadata=metadata_cp)
994
995        return cp
996
997
998def _assert_valid_bound(bound):
999    if isinstance(bound, tuple):
1000        try:
1001            start, end = bound
1002        except ValueError:
1003            raise ValueError("A `bound` must be a tuple of exactly "
1004                             "two coordinates, not {!r}".format(bound))
1005        if not (isinstance(start, int) and
1006                isinstance(end, int)) or start > end:
1007            raise ValueError('`start` (%r) cannot be a larger int '
1008                             'than `end` (%r).' % (start, end))
1009    else:
1010        raise TypeError("Each `bound` must be a tuple, not {!r}".format(
1011            bound))
1012
1013
1014def _assert_valid_fuzzy(fuzzy):
1015    if isinstance(fuzzy, tuple):
1016        try:
1017            start, end = fuzzy
1018        except ValueError:
1019            raise ValueError("A `fuzzy` must be a tuple of exactly "
1020                             "two, not {!r}".format(fuzzy))
1021        if not (isinstance(start, bool) and isinstance(end, bool)):
1022            raise TypeError('A `fuzzy` must be a tuple of two booleans')
1023    else:
1024        raise TypeError("Each `fuzzy` must be a tuple, not {!r}".format(
1025            fuzzy))
1026