1# ---------------------------------------------------------------------------- 2# Copyright (c) 2013--, scikit-bio development team. 3# 4# Distributed under the terms of the Modified BSD License. 5# 6# The full license is in the file COPYING.txt, distributed with this software. 7# ---------------------------------------------------------------------------- 8 9import operator 10import copy 11import functools 12 13from ._intersection import IntervalTree 14from skbio.util._decorator import experimental, classonlymethod 15 16 17class Interval: 18 """Stores the bounds and metadata of an interval feature. 19 20 This class stores an interval feature. An interval feature 21 is defined as a sub-region of a biological sequence or sequence 22 alignment that is a functional entity, e.g., a gene, a riboswitch, 23 an exon, etc. It can span a single contiguous region or multiple 24 non-contiguous regions (e.g. multiple exons in a transcript, or 25 multiple genes in an operon). 26 27 Parameters 28 ---------- 29 interval_metadata : object 30 A reference to the ``IntervalMetadata`` object that this 31 ``Interval`` object is associated to. 32 bounds : iterable of tuple of int 33 Tuples representing start and end coordinates. It is *zero-based* 34 numbering. It is always inclusive on start bound and exclusive on 35 end bound. 36 fuzzy : iterable of tuple of bool, optional 37 Tuples representing the fuzziness of each bound coordinates. 38 If this isn't specified, then the fuzziness of all bound 39 coordinates are ``False``. If any of the coordinate fuzziness 40 is ``True``, it indicates that the exact bound point of a 41 interval feature is unknown. The bound may begin or end at 42 some points outside the specified coordinates. This 43 accommodates the location format [1]_ of INSDC. 44 metadata : dict, optional 45 Dictionary of attributes storing information of the feature 46 such as "strand", "gene_name", or "product". 47 48 See Also 49 -------- 50 skbio.metadata.IntervalMetadata 51 52 Notes 53 ----- 54 While the construction of an ``Interval`` object automatically add 55 itself to its associated ``IntervalMetadata`` object, 56 ``IntervalMetadata.add`` is the typical/easier way to 57 create and add it to ``IntervalMetadata``. 58 59 References 60 ---------- 61 .. [1] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html#3.4.3 62 63 Examples 64 -------- 65 Hypothetically, let's say we have a gene called "genA" with 10 nt 66 as shown in the following diagram. The second row represents the 67 two exons (indicated by "=") on this gene: 68 69 :: 70 71 TGGATTCTGC 72 -====--==- 73 0123456789 74 75 We can create an ``Interval`` object to represent the exons of the gene: 76 77 >>> from skbio.metadata import Interval, IntervalMetadata 78 >>> interval_metadata = IntervalMetadata(10) 79 80 Remember the coordinates are inclusive in lower bound and exclusive on 81 upper bound: 82 83 >>> gene = Interval(interval_metadata, 84 ... bounds=[(1, 5), (7, 9)], 85 ... metadata={'name': 'genA'}) 86 >>> gene # doctest: +ELLIPSIS 87 Interval(interval_metadata=..., bounds=[(1, 5), (7, 9)], \ 88fuzzy=[(False, False), (False, False)], metadata={'name': 'genA'}) 89 90 """ 91 def __init__(self, interval_metadata, bounds, 92 fuzzy=None, metadata=None): 93 if not isinstance(interval_metadata, IntervalMetadata): 94 raise TypeError('You need to provide an IntervalMetadata' 95 'object, not %r' % interval_metadata) 96 # Intervals 97 self._interval_metadata = interval_metadata 98 99 self._bounds_fuzzy_setter(bounds, fuzzy) 100 101 # Metadata 102 if metadata is None: 103 metadata = {} 104 self.metadata = metadata 105 106 # add this interval feature to the associated IntervalMetadata 107 self._add() 108 109 def _add(self): 110 """Add the current ``Interval`` to the IntervalMetadata object.""" 111 for bound in self.bounds: 112 start, end = bound 113 self._interval_metadata._interval_tree.add(start, end, self) 114 self._interval_metadata._intervals.append(self) 115 116 @experimental(as_of='0.5.1') 117 def __eq__(self, other): 118 '''Test if this ``Interval`` object is equal to another. 119 120 The equality is performed by checking if the ``metadata``, 121 ``bounds`` and ``fuzzy`` are equal. Since the ``bounds`` 122 and the ``fuzzy`` are sorted, the permutations of them during 123 the ``Interval`` construction or assignment won't matter. 124 125 Parameters 126 ---------- 127 other : Interval 128 Interval to test for equality against. 129 130 Returns 131 ------- 132 bool 133 Indicates if the two objects are equal. 134 ''' 135 return ((self.metadata == other.metadata) and 136 (self.bounds == other.bounds) and 137 (self.fuzzy == other.fuzzy)) 138 139 @experimental(as_of='0.5.1') 140 def __ne__(self, other): 141 '''Test if this ``Interval`` object is not equal to another. 142 143 Parameters 144 ---------- 145 other : Interval 146 Interval to test for inequality against. 147 148 Returns 149 ------- 150 bool 151 Indicates if the two objects are not equal. 152 ''' 153 return not (self == other) 154 155 @experimental(as_of='0.5.1') 156 def __repr__(self): 157 '''Return a string representation of this ``Interval`` object. 158 159 Returns 160 ------- 161 str 162 String representation of this ``Interval`` object. 163 ''' 164 if self.dropped: 165 s = ('{}(dropped=True, bounds={!r}, ' 166 'fuzzy={!r}, metadata={!r})') 167 return s.format(self.__class__.__name__, 168 self.bounds, self.fuzzy, self.metadata) 169 else: 170 s = ('{}(interval_metadata=<{!r}>, bounds={!r}, ' 171 'fuzzy={!r}, metadata={!r})') 172 return s.format(self.__class__.__name__, 173 id(self._interval_metadata), 174 self.bounds, self.fuzzy, self.metadata) 175 176 @experimental(as_of='0.5.1') 177 def drop(self): 178 '''Drop this ``Interval`` object from the interval metadata it links to. 179 180 If the ``Interval`` object is dropped, you can still get values of 181 ``bounds``, ``fuzzy``, and ``metadata`` attributes, but you 182 can not change their values with the setters. 183 184 See Also 185 -------- 186 skbio.metadata.IntervalMetadata.drop 187 ''' 188 if not self.dropped: 189 self._interval_metadata.drop([self]) 190 191 def _bounds_fuzzy_setter(self, bounds=None, fuzzy=None): 192 if self.dropped: 193 raise RuntimeError('Cannot change `bounds` or `fuzzy` ' 194 'on a dropped Interval object.') 195 # Casts to `list`, validation, sorting, and setting of `bounds` 196 # and `fuzzy` happen here. 197 if bounds is not None: 198 # check iterability 199 try: 200 # check iterability 201 bounds = list(bounds) 202 except TypeError: 203 raise TypeError('Cannot give an non-iterable (%r) ' 204 'to `bounds`.' % bounds) 205 206 # check it is not empty 207 if not bounds: 208 raise ValueError('Cannot give empty `bounds`.') 209 # check each contiguous span is in right format 210 for bound in bounds: 211 _assert_valid_bound(bound) 212 213 spans = len(bounds) 214 else: 215 spans = len(self.bounds) 216 217 if fuzzy is not None: 218 try: 219 fuzzy = list(fuzzy) 220 except TypeError: 221 raise TypeError( 222 'Cannot give a non-iterable (%r) ' 223 'to `fuzzy`.' % fuzzy) 224 225 if len(fuzzy) != spans: 226 raise ValueError( 227 'The length of fuzzy must ' 228 'be equal to the length of bounds.') 229 230 for fuzzy_i in fuzzy: 231 _assert_valid_fuzzy(fuzzy_i) 232 233 if bounds is None: 234 # `bounds` and `fuzzy` cannot both be omitted. 235 if fuzzy is None: 236 raise ValueError('Cannot give `None` to both `bounds` ' 237 'and `fuzzy`.') 238 # If only `fuzzy` is provided, set `self.fuzzy` and don't 239 # change `self.bounds`. 240 else: 241 self._fuzzy = fuzzy 242 else: 243 # If only `bounds` is provided, reset `self.fuzzy` to 244 # all `False`. 245 if fuzzy is None: 246 bounds.sort() 247 self._check_bounds(bounds) 248 self._bounds = bounds 249 # reset all the fuzzy to False!! 250 del self.fuzzy 251 252 # If both `bounds` and `fuzzy` are provided, set 253 # `self.bounds` and `self.fuzzy`. 254 else: 255 bounds, fuzzy = [ 256 list(e) for e in zip(*sorted(zip(bounds, fuzzy)))] 257 self._check_bounds(bounds) 258 self._bounds = bounds 259 self._fuzzy = fuzzy 260 261 self._interval_metadata._is_stale_tree = True 262 263 def _check_bounds(self, bounds): 264 '''input `bounds` must be sorted.''' 265 upper_bound = self._interval_metadata.upper_bound 266 lower_bound = self._interval_metadata.lower_bound 267 if upper_bound is not None and bounds[-1][-1] > upper_bound: 268 raise ValueError('Cannot set `bounds` (%r) with coordinate ' 269 'larger than upper bound (%r)' % 270 (bounds, upper_bound)) 271 272 if bounds[0][0] < lower_bound: 273 raise ValueError('Cannot set `bounds` (%r) with coordinate ' 274 'smaller than lower bound (%r).' % 275 (bounds, lower_bound)) 276 277 @property 278 @experimental(as_of='0.5.1') 279 def fuzzy(self): 280 '''The openness of each coordinate. 281 282 This indicates that the exact bound of a interval feature 283 is unknown. The bound may begin or end at some points outside 284 the specified coordinates. This accommodates the bound format [1]_ 285 of INSDC. 286 287 References 288 ---------- 289 .. [1] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html#3.4.3 290 ''' 291 return self._fuzzy 292 293 @fuzzy.setter 294 @experimental(as_of='0.5.1') 295 def fuzzy(self, value): 296 '''Set ``fuzzy``. 297 298 The ``value`` should an iterable matching ``self.bounds``. 299 ''' 300 self._bounds_fuzzy_setter(fuzzy=value) 301 302 @fuzzy.deleter 303 @experimental(as_of='0.5.1') 304 def fuzzy(self): 305 '''Delete ``fuzzy``. 306 307 This set all fuzzy to be ``False``. 308 ''' 309 if self.dropped: 310 raise RuntimeError('Cannot change fuzzy on dropped ' 311 'Interval object.') 312 self._fuzzy = [(False, False)] * len(self.bounds) 313 314 @property 315 @experimental(as_of='0.5.1') 316 def bounds(self): 317 '''The coordinates of the interval feature. 318 319 It should be a list of tuples of int pair. Each tuple stores 320 the start and end coordinates of a span of the interval 321 feature. The coordinates are *zero-based*. They are inclusive on 322 the start and exclusive on the end. 323 ''' 324 return self._bounds 325 326 @bounds.setter 327 @experimental(as_of='0.5.1') 328 def bounds(self, value): 329 '''Set ``bounds``. 330 331 WARNING: setting ``bounds`` will reset ``fuzzy`` value to ``False``. 332 This is not totally surprising because it is justifiable your old 333 ``fuzzy`` don't fit the new bounds. 334 ''' 335 self._bounds_fuzzy_setter(bounds=value) 336 337 @property 338 @experimental(as_of='0.5.1') 339 def metadata(self): 340 '''The metadata of the interval feature. 341 342 It stores the metadata (eg. gene name, function, ID, etc.) of 343 the interval feature as a ``dict``. 344 ''' 345 return self._metadata 346 347 @metadata.setter 348 @experimental(as_of='0.5.1') 349 def metadata(self, value): 350 if self.dropped: 351 raise RuntimeError('Cannot change metadata on dropped ' 352 'Interval object.') 353 if not isinstance(value, dict): 354 raise TypeError("metadata must be a dict, not %r" % value) 355 self._metadata = value 356 357 @metadata.deleter 358 @experimental(as_of='0.5.1') 359 def metadata(self): 360 '''Delete metadata. 361 362 This sets metadata to be empty dict. 363 ''' 364 if self.dropped: 365 raise RuntimeError('Cannot change metadata to dropped ' 366 'Interval object.') 367 self._metadata = {} 368 369 @property 370 @experimental(as_of='0.5.1') 371 def dropped(self): 372 '''Boolean value indicating if the ``Interval`` object is dropped. 373 374 If it is dropped, it means it is not associated with IntervalMetadata 375 object any more. 376 377 Notes 378 ----- 379 This property is not writable. 380 381 See Also 382 -------- 383 skbio.metadata.Interval.drop 384 skbio.metadata.IntervalMetadata.drop 385 ''' 386 return self._interval_metadata is None 387 388 389class IntervalMetadata(): 390 """Stores the interval features. 391 392 ``IntervalMetadata`` object allows storage, modification, and 393 querying of interval features covering a region of a single coordinate 394 system. For instance, this can be used to store functional annotations 395 about genes across a genome. This object is also applied to the sequence 396 alignment. 397 398 This object is typically coupled with another object, such as a 399 ``Sequence`` object (or its child class), or a ``TabularMSA`` object. 400 401 Parameters 402 ---------- 403 upper_bound : int or None 404 Defines the exclusive upper bound of the interval features. No 405 coordinate can be greater than it. ``None`` 406 means that the coordinate space is unbounded. 407 copy_from : IntervalMetadata or None, optional 408 Create a new object from the input ``IntervalMetadata`` object by 409 shallow copying if it is not ``None``. The upper bound of the new 410 object will be updated with the ``upper_bound`` parameter specified. 411 412 Notes 413 ----- 414 This class stores coordinates of all feature bounds into a interval 415 tree. It allows the speed up of query-by-bound. The building of 416 interval tree is deferred until necessary to save computation. It is 417 updated from all coordinates only when you need to fetch info from 418 the interval tree. 419 420 When you add a method into this class and if you method need to fetch 421 info from ``IntervalMetadata._interval_tree``, you should decorate it with 422 ``_rebuild_tree``. This decorator will check if the current interval tree 423 is stale and will update it if so. Additionally, if your method add, 424 delete, or changes the coordinates of any interval features, you should 425 set ``self._is_stale_tree`` to ``True`` at the end of your method to 426 indicate the interval tree becomes stale. 427 428 See Also 429 -------- 430 skbio.metadata.Interval 431 432 Examples 433 -------- 434 Let's say we have a sequence of length 10 and want to add annotation 435 to it. Create an ``IntervalMetadata`` object: 436 437 >>> from skbio.metadata import Interval, IntervalMetadata 438 >>> im = IntervalMetadata(10) 439 440 Let's add annotations of 3 genes: 441 442 >>> im.add(bounds=[(3, 9)], 443 ... metadata={'gene': 'sagB'}) # doctest: +ELLIPSIS 444 Interval(interval_metadata=..., bounds=[(3, 9)], \ 445fuzzy=[(False, False)], metadata={'gene': 'sagB'}) 446 >>> im.add(bounds=[(3, 7)], 447 ... metadata={'gene': 'sagC'}) # doctest: +ELLIPSIS 448 Interval(interval_metadata=..., bounds=[(3, 7)], \ 449fuzzy=[(False, False)], metadata={'gene': 'sagC'}) 450 >>> im.add(bounds=[(1, 2), (4, 7)], 451 ... metadata={'gene': 'sagA'}) # doctest: +ELLIPSIS 452 Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \ 453fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'}) 454 455 Show the object representation: 456 457 >>> im # doctest: +ELLIPSIS 458 3 interval features 459 ------------------- 460 Interval(interval_metadata=..., bounds=[(3, 9)], \ 461fuzzy=[(False, False)], metadata={'gene': 'sagB'}) 462 Interval(interval_metadata=..., bounds=[(3, 7)], \ 463fuzzy=[(False, False)], metadata={'gene': 'sagC'}) 464 Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \ 465fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'}) 466 467 We can sort the genes by their bounds: 468 469 >>> im.sort() 470 >>> im # doctest: +ELLIPSIS 471 3 interval features 472 ------------------- 473 Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \ 474fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'}) 475 Interval(interval_metadata=..., bounds=[(3, 7)], \ 476fuzzy=[(False, False)], metadata={'gene': 'sagC'}) 477 Interval(interval_metadata=..., bounds=[(3, 9)], \ 478fuzzy=[(False, False)], metadata={'gene': 'sagB'}) 479 480 Query the genes by bound and/or metadata: 481 482 >>> intvls = im.query([(1, 2)], metadata={'gene': 'foo'}) 483 >>> list(intvls) 484 [] 485 >>> intvls = im.query([(7, 9)]) 486 >>> list(intvls) # doctest: +ELLIPSIS 487 [Interval(interval_metadata=..., bounds=[(3, 9)], \ 488fuzzy=[(False, False)], metadata={'gene': 'sagB'})] 489 >>> intvls = im.query(metadata={'gene': 'sagA'}) 490 >>> intvls = list(intvls) 491 >>> intvls # doctest: +ELLIPSIS 492 [Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \ 493fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})] 494 495 Drop the gene(s) we get from query: 496 497 >>> im.drop(intvls) 498 >>> im.sort() 499 >>> im # doctest: +ELLIPSIS 500 2 interval features 501 ------------------- 502 Interval(interval_metadata=..., bounds=[(3, 7)], \ 503fuzzy=[(False, False)], metadata={'gene': 'sagC'}) 504 Interval(interval_metadata=..., bounds=[(3, 9)], \ 505fuzzy=[(False, False)], metadata={'gene': 'sagB'}) 506 507 """ 508 default_write_format = 'gff3' 509 510 def __init__(self, upper_bound, copy_from=None): 511 self._upper_bound = upper_bound 512 if self.upper_bound is not None: 513 if self.upper_bound < self.lower_bound: 514 raise ValueError('Cannot set `upper_bound` (%r) ' 515 'smaller than `lower_bound` (%r)' 516 % (self.upper_bound, self.lower_bound)) 517 # List of Interval objects. 518 self._intervals = [] 519 520 # IntervalTree object to allow faster querying of interval objects. 521 self._interval_tree = IntervalTree() 522 523 # Indicates if the IntervalTree needs to be rebuilt. 524 self._is_stale_tree = False 525 526 if copy_from is not None: 527 for interval in copy_from._intervals: 528 bounds_cp = interval.bounds[:] 529 fuzzy_cp = interval.fuzzy[:] 530 metadata_cp = copy.copy(interval.metadata) 531 532 self.add(bounds_cp, fuzzy=fuzzy_cp, metadata=metadata_cp) 533 534 @property 535 @experimental(as_of='0.5.1') 536 def upper_bound(self): 537 '''The exclusive upper bound of interval features.''' 538 return self._upper_bound 539 540 @property 541 @experimental(as_of='0.5.1') 542 def lower_bound(self): 543 '''The inclusive lower bound of interval features.''' 544 return 0 545 546 @property 547 @experimental(as_of='0.5.1') 548 def num_interval_features(self): 549 '''The total number of interval features.''' 550 return len(self._intervals) 551 552 def _rebuild_tree(method): 553 """Rebuild the IntervalTree.""" 554 @functools.wraps(method) 555 def inner(self, *args, **kwargs): 556 if self._is_stale_tree is False: 557 return method(self, *args, **kwargs) 558 self._interval_tree = IntervalTree() 559 for f in self._intervals: 560 for start, end in f.bounds: 561 self._interval_tree.add(start, end, f) 562 self._is_stale_tree = False 563 return method(self, *args, **kwargs) 564 return inner 565 566 def _reverse(self): 567 """Reverse ``IntervalMetadata`` object. 568 569 This operation reverses all of the interval coordinates. 570 For instance, this can be used to compare coordinates 571 in the forward strand to coordinates in the reversal strand. 572 """ 573 574 for f in self._intervals: 575 try: 576 intvls = [(self.upper_bound - x[1], self.upper_bound - x[0]) 577 for x in reversed(f.bounds)] 578 except TypeError: 579 raise TypeError('You cannot reverse the coordinates ' 580 'when the upper bound is `None`') 581 f.bounds = intvls 582 583 # DON'T forget this!!! 584 self._is_stale_tree = True 585 586 @classonlymethod 587 @experimental(as_of="0.5.1") 588 def concat(cls, interval_metadata): 589 '''Concatenate an iterable of ``IntervalMetadata`` objects. 590 591 It concatenates the multiple ``IntervalMetadata`` objects into 592 one coordinate space. The order of the objects in the input 593 iterable matters. The coordinate of the second 594 ``InterableMetadata`` will be shifted up with the length of 595 the first ``IntervalMetadata`` object. 596 597 This function is useful when you concatenate multiple sequences. 598 599 Parameters 600 ---------- 601 interval_metadata : Iterable (IntervalMetadata) 602 The interval metadata to concatenate. 603 604 Returns 605 ------- 606 IntervalMetadata 607 Concatenated interval metadata. 608 609 Examples 610 -------- 611 >>> from skbio.metadata import IntervalMetadata 612 613 Create two ``IntervalMetadata`` objects: 614 615 >>> im1 = IntervalMetadata(3) 616 >>> _ = im1.add([(0, 2)], [(True, False)], {'gene': 'sagA'}) 617 >>> im2 = IntervalMetadata(4) 618 >>> _ = im2.add([(1, 4)], [(True, True)], {'gene': 'sagB'}) 619 620 Concatenate them into a single coordinate space. The second 621 ``IntervalMetadata``'s interval features are all shifted 622 up. The resulting ``IntervalMetadata``'s upper bound is the 623 sum of upper bounds of concatenated objects: 624 625 >>> im = IntervalMetadata.concat([im1, im2]) 626 >>> im # doctest: +ELLIPSIS 627 2 interval features 628 ------------------- 629 Interval(interval_metadata=<...>, bounds=[(0, 2)], \ 630fuzzy=[(True, False)], metadata={'gene': 'sagA'}) 631 Interval(interval_metadata=<...>, bounds=[(4, 7)], \ 632fuzzy=[(True, True)], metadata={'gene': 'sagB'}) 633 >>> im.upper_bound 634 7 635 636 ''' 637 interval_metadata = list(interval_metadata) 638 639 if len(interval_metadata) == 0: 640 return cls(0) 641 642 upper_bound = 0 643 for im in interval_metadata: 644 try: 645 upper_bound += im.upper_bound 646 except TypeError: 647 raise TypeError('You cannot concat the interval metadata ' 648 'because its upper bound is `None`:\n%r' % im) 649 new = cls(upper_bound) 650 651 length = 0 652 for i, im in enumerate(interval_metadata): 653 for intvl in im._intervals: 654 bounds = intvl.bounds 655 fuzzy = intvl.fuzzy 656 if i != 0: 657 bounds = [(start + length, end + length) 658 for start, end in bounds] 659 new.add(bounds, fuzzy, intvl.metadata) 660 length += im.upper_bound 661 662 return new 663 664 @experimental(as_of='0.5.1') 665 def merge(self, other): 666 '''Merge the interval features of another ``IntervalMetadata`` object. 667 668 It adds all the interval features of the other object into 669 ``self``. Note this will not check if there are any duplicates 670 of interval features after merge. 671 672 Notes 673 ----- 674 It will raise error if you merge an unbounded 675 ``IntervalMetadata`` object to the current object if it is 676 bounded. This avoids partially updating the current object if 677 the merge fails in the middle of the process due to the 678 possibility that some interval features to be merged may live 679 outside the current defined upper bound. 680 681 Parameters 682 ---------- 683 other : ``IntervalMetadata`` 684 The other ``IntervalMetadata`` to be merged. 685 686 ''' 687 if self.upper_bound is not None: 688 if other.upper_bound is None: 689 raise ValueError( 690 'Cannot merge an unbound IntervalMetadata object ' 691 'to a bounded one') 692 elif self.upper_bound != other.upper_bound: 693 raise ValueError( 694 'The upper bounds of the two IntervalMetadata objects ' 695 'are not equal (%r != %r)' % ( 696 self.upper_bound, other.upper_bound)) 697 if self.lower_bound != other.lower_bound: 698 raise ValueError( 699 'The lower bounds of the two IntervalMetadata objects ' 700 'are not equal (%d != %d)' % ( 701 self.lower_bound, other.lower_bound)) 702 for intvl in other._intervals: 703 self.add(intvl.bounds, intvl.fuzzy, intvl.metadata) 704 705 @experimental(as_of='0.5.1') 706 def sort(self, ascending=True): 707 '''Sort interval features by their coordinates. 708 709 It sorts by the start coordinate first. If they are the same between 710 two interval features, they will be sorted by comparing their end 711 coordinates. For example, an interval feature with [(1, 2), (4, 7)] 712 will be sorted in front of another one with [(1, 2), (3, 8)]. 713 714 Parameters 715 ---------- 716 ascending : bool, optional 717 sort in ascending or descending coordinates. 718 ''' 719 self._intervals.sort( 720 key=lambda i: [i.bounds[0][0], i.bounds[-1][1]], 721 reverse=not ascending) 722 723 @experimental(as_of='0.5.1') 724 def add(self, bounds, fuzzy=None, metadata=None): 725 """Create and add an ``Interval`` to this ``IntervalMetadata``. 726 727 This method creates an ``Interval`` object and inserts it into 728 the ``IntervalMetadata`` object. 729 730 Parameters 731 ---------- 732 bounds : iterable of tuple of ints 733 Tuples representing start and end coordinates. It is *zero-based* 734 numbering. It is always inclusive on start bound and exclusive on 735 end bound. 736 fuzzy : iterable of tuple of bool, optional 737 Tuples representing the fuzziness of each bound coordinates. 738 metadata : dict, optional 739 A dictionary of key-value pairs associated with the 740 ``Interval`` object. 741 742 Returns 743 ------- 744 Interval 745 The ``Interval`` object added. 746 747 See Also 748 -------- 749 skbio.metadata.Interval 750 """ 751 # Add an interval to the tree. Note that the add functionality is 752 # built within the Interval constructor. 753 return Interval(interval_metadata=self, 754 bounds=bounds, 755 fuzzy=fuzzy, 756 metadata=metadata) 757 758 @_rebuild_tree 759 def _query_interval(self, bound): 760 """Yield ``Interval`` objects that overlap with the bound.""" 761 _assert_valid_bound(bound) 762 763 start, end = bound 764 intvls = self._interval_tree.find(start, end) 765 # if a ``Interval`` has many non-contiguous spans and 766 # multiple of them overlap with the bound, then 767 # this ``Interval`` object will be returned 768 # multiple times. So we need to remove duplicates. 769 seen = set() 770 for intvl in intvls: 771 if id(intvl) not in seen: 772 seen.add(id(intvl)) 773 yield intvl 774 775 def _query_attribute(self, metadata, intervals=None): 776 """Yield ``Interval`` objects based on query attributes. 777 778 Parameters 779 ---------- 780 metadata : dict or ``None`` 781 If it is ``None``, return empty iterator; if it is 782 ``{}``, return an interator of all the ``Interval`` 783 objects. 784 intervals : an iterable of ``Interval`` objects 785 """ 786 if metadata is None: 787 return 788 789 if intervals is None: 790 intervals = self._intervals 791 792 for intvl in intervals: 793 for (key, value) in metadata.items(): 794 if (key not in intvl.metadata or 795 intvl.metadata[key] != value): 796 break 797 else: 798 yield intvl 799 800 @experimental(as_of='0.5.1') 801 @_rebuild_tree 802 def query(self, bounds=None, metadata=None): 803 """Yield ``Interval`` object with the bounds and attributes. 804 805 The ``Interval`` objects must meet both requirements: 1) overlap 806 with any of the spans specified by ``bounds``; 2) satisfy 807 ``metadata`` specification. For instance, you can identify 808 all the recA genes that overlap with (10, 100) or (900, 1000) 809 with this code ``interval_metadata.query([(10, 100), 810 (900, 1000)], {'gene': 'recA'})``. 811 812 Parameters 813 ---------- 814 bounds : iterable of tuples of int pair, optional 815 Specifies bounds to look for the ``Interval`` 816 objects. An satisfying interval feature only need to overlap with 817 one bound. Default (``None``) means all ``Intervals`` meet 818 this requirement. 819 820 metadata : dict, optional 821 A dictionary of key word attributes associated with the 822 ``Interval`` object. It specifies what metadata keywords and 823 values to look for. Default (``None``) means all ``Intervals`` 824 meet this requirement. 825 826 Yields 827 ------ 828 Interval 829 ``Interval`` object satisfying the search criteria. 830 """ 831 if bounds is None: 832 for intvl in self._query_attribute(metadata): 833 yield intvl 834 else: 835 for loc in bounds: 836 intvls = self._query_interval(loc) 837 if metadata is None: 838 metadata = {} 839 for intvl in self._query_attribute(metadata, intvls): 840 yield intvl 841 842 @experimental(as_of='0.5.1') 843 def drop(self, intervals, negate=False): 844 """Drops Interval objects. 845 846 The given ``Interval`` objects will be removed and their 847 associated ``IntervalMetadata`` will be set to ``None``. 848 849 Parameters 850 ---------- 851 intervals : iterable of ``Interval`` 852 ``Interval`` objects to drop from this object. 853 negate : bool 854 Negate the drop operation, i.e. keeping the specified intervals 855 instead of dropping them. 856 """ 857 to_delete = {id(f) for f in intervals} 858 859 new_intvls = [] 860 # iterate through queries and drop them 861 for intvl in self._intervals: 862 drop = id(intvl) in to_delete 863 if negate is True: 864 drop = not drop 865 if drop: 866 intvl._interval_metadata = None 867 else: 868 new_intvls.append(intvl) 869 870 self._intervals = new_intvls 871 self._is_stale_tree = True 872 873 @experimental(as_of='0.5.1') 874 def __eq__(self, other): 875 '''Test if this object is equal to another. 876 877 It checks if the coordinate spaces are the same between the 878 two objects. If so, then check if all the interval features 879 are equal between the two objects after sorting them by 880 bounds. 881 882 Parameters 883 ---------- 884 other : IntervalMetadata 885 Interval metadata to test for equality against. 886 887 Returns 888 ------- 889 bool 890 Indicates if the two objects are equal. 891 ''' 892 if self.upper_bound != other.upper_bound or \ 893 self.lower_bound != other.lower_bound: 894 return False 895 else: 896 self_intervals = sorted(self._intervals, 897 key=operator.attrgetter('bounds')) 898 other_intervals = sorted(other._intervals, 899 key=operator.attrgetter('bounds')) 900 return self_intervals == other_intervals 901 902 @experimental(as_of='0.5.1') 903 def __ne__(self, other): 904 '''Test if this object is not equal to another. 905 906 Parameters 907 ---------- 908 other : IntervalMetadata 909 Interval metadata to test for inequality against. 910 911 Returns 912 ------- 913 bool 914 Indicates if the two objects are not equal. 915 916 See Also 917 -------- 918 skbio.metadata.IntervalMetadata.__eq__ 919 ''' 920 return not (self == other) 921 922 @experimental(as_of='0.5.1') 923 def __repr__(self): 924 '''Return a string representation of this object. 925 926 Returns 927 ------- 928 str 929 String representation of this ``IntervalMetadata`` object. 930 ''' 931 n = self.num_interval_features 932 l1 = '{} interval feature'.format(n) 933 if n != 1: 934 l1 += 's' 935 l2 = '-' * len(l1) 936 937 if n <= 5: 938 items = [repr(i) for i in self._intervals] 939 else: 940 # intentionally overwrite items[2] to make code cleaner 941 items = [repr(self._intervals[i]) for i in [0, 1, 2, n-2, n-1]] 942 items[2] = '...' 943 944 return '\n'.join([l1, l2] + items) 945 946 @experimental(as_of='0.5.1') 947 def __copy__(self): 948 '''Return a shallow copy. 949 950 Notes 951 ----- 952 The ``IntervalMetadata`` copy will have copies of the 953 ``Interval`` objects present in this object. The ``metadata`` 954 dictionary of each ``Interval`` object will be a shallow copy. 955 956 See Also 957 -------- 958 __deepcopy__ 959 ''' 960 return self._copy(False, {}) 961 962 @experimental(as_of='0.5.1') 963 def __deepcopy__(self, memo): 964 '''Return a deep copy. 965 966 Notes 967 ----- 968 The ``IntervalMetadata`` copy will have copies of the 969 ``Interval`` objects present in this object. The ``metadata`` 970 dictionary of each ``Interval`` object will be a deep copy. 971 972 See Also 973 -------- 974 __copy__ 975 ''' 976 return self._copy(True, memo) 977 978 def _copy(self, deep, memo): 979 cp = IntervalMetadata(self.upper_bound) 980 981 for interval in self._intervals: 982 # Only need to shallow-copy `bounds` and `fuzzy` 983 # because their elements are immutable. 984 bounds_cp = interval.bounds[:] 985 fuzzy_cp = interval.fuzzy[:] 986 if deep: 987 metadata_cp = copy.deepcopy(interval.metadata, memo) 988 else: 989 metadata_cp = copy.copy(interval.metadata) 990 991 cp.add(bounds_cp, 992 fuzzy=fuzzy_cp, 993 metadata=metadata_cp) 994 995 return cp 996 997 998def _assert_valid_bound(bound): 999 if isinstance(bound, tuple): 1000 try: 1001 start, end = bound 1002 except ValueError: 1003 raise ValueError("A `bound` must be a tuple of exactly " 1004 "two coordinates, not {!r}".format(bound)) 1005 if not (isinstance(start, int) and 1006 isinstance(end, int)) or start > end: 1007 raise ValueError('`start` (%r) cannot be a larger int ' 1008 'than `end` (%r).' % (start, end)) 1009 else: 1010 raise TypeError("Each `bound` must be a tuple, not {!r}".format( 1011 bound)) 1012 1013 1014def _assert_valid_fuzzy(fuzzy): 1015 if isinstance(fuzzy, tuple): 1016 try: 1017 start, end = fuzzy 1018 except ValueError: 1019 raise ValueError("A `fuzzy` must be a tuple of exactly " 1020 "two, not {!r}".format(fuzzy)) 1021 if not (isinstance(start, bool) and isinstance(end, bool)): 1022 raise TypeError('A `fuzzy` must be a tuple of two booleans') 1023 else: 1024 raise TypeError("Each `fuzzy` must be a tuple, not {!r}".format( 1025 fuzzy)) 1026