1# Copyright 2012 by Wibowo Arindrarto. All rights reserved. 2# This file is part of the Biopython distribution and governed by your 3# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 4# Please see the LICENSE file that should have been included as part of this 5# package. 6"""Bio.SearchIO objects to model high scoring regions between query and hit.""" 7 8import warnings 9from operator import ge, le 10 11from Bio import BiopythonWarning 12from Bio.Align import MultipleSeqAlignment 13from Bio.Seq import Seq 14from Bio.SeqRecord import SeqRecord 15 16from Bio.SearchIO._utils import ( 17 singleitem, 18 allitems, 19 fullcascade, 20 fragcascade, 21 getattr_str, 22) 23 24from ._base import _BaseHSP 25 26 27class HSP(_BaseHSP): 28 """Class representing high-scoring region(s) between query and hit. 29 30 HSP (high-scoring pair) objects are contained by Hit objects (see Hit). 31 In most cases, HSP objects store the bulk of the statistics and results 32 (e.g. e-value, bitscores, query sequence, etc.) produced by a search 33 program. 34 35 Depending on the search output file format, a given HSP will contain one 36 or more HSPFragment object(s). Examples of search programs that produce HSP 37 with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as 38 BLAT or Exonerate may produce HSPs containing more than one HSPFragment. 39 However, their native terminologies may differ: in BLAT these fragments 40 are called 'blocks' while in Exonerate they are called exons or NER. 41 42 Here are examples from each type of HSP. The first one comes from a BLAST 43 search:: 44 45 >>> from Bio import SearchIO 46 >>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 47 >>> blast_hsp = blast_qresult[1][0] # the first HSP from the second hit 48 >>> blast_hsp 49 HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments) 50 >>> print(blast_hsp) 51 Query: 33211 mir_1 52 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 53 Query range: [1:61] (1) 54 Hit range: [0:60] (1) 55 Quick stats: evalue 1.7e-22; bitscore 109.49 56 Fragments: 1 (60 columns) 57 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 58 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 59 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 60 61 For HSPs with a single HSPFragment, you can invoke ``print`` on it and see the 62 underlying sequence alignment, if it exists. This is not the case for HSPs 63 with more than one HSPFragment. Below is an example, using an HSP from a 64 BLAT search. Invoking ``print`` on these HSPs will instead show a table of the 65 HSPFragment objects it contains:: 66 67 >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True) 68 >>> blat_hsp = blat_qresult[1][0] # the first HSP from the second hit 69 >>> blat_hsp 70 HSP(hit_id='chr11', query_id='blat_1', 2 fragments) 71 >>> print(blat_hsp) 72 Query: blat_1 <unknown description> 73 Hit: chr11 <unknown description> 74 Query range: [42:67] (-1) 75 Hit range: [59018929:59018955] (1) 76 Quick stats: evalue ?; bitscore ? 77 Fragments: --- -------------- ---------------------- ---------------------- 78 # Span Query range Hit range 79 --- -------------- ---------------------- ---------------------- 80 0 6 [61:67] [59018929:59018935] 81 1 16 [42:58] [59018939:59018955] 82 83 Notice that in HSPs with more than one HSPFragments, the HSP's ``query_range`` 84 ``hit_range`` properties encompasses all fragments it contains. 85 86 You can check whether an HSP has more than one HSPFragments or not using the 87 ``is_fragmented`` property:: 88 89 >>> blast_hsp.is_fragmented 90 False 91 >>> blat_hsp.is_fragmented 92 True 93 94 Since HSP objects are also containers similar to Python lists, you can 95 access a single fragment in an HSP using its integer index:: 96 97 >>> blat_fragment = blat_hsp[0] 98 >>> print(blat_fragment) 99 Query: blat_1 <unknown description> 100 Hit: chr11 <unknown description> 101 Query range: [61:67] (-1) 102 Hit range: [59018929:59018935] (1) 103 Fragments: 1 (6 columns) 104 Query - tatagt 105 Hit - tatagt 106 107 This applies to HSPs objects with a single fragment as well:: 108 109 >>> blast_fragment = blast_hsp[0] 110 >>> print(blast_fragment) 111 Query: 33211 mir_1 112 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 113 Query range: [1:61] (1) 114 Hit range: [0:60] (1) 115 Fragments: 1 (60 columns) 116 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 117 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 118 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 119 120 Regardless of the search output file format, HSP objects provide the 121 properties listed below. These properties always return values in a list, 122 due to the HSP object itself being a list-like container. However, for 123 HSP objects with a single HSPFragment, shortcut properties that fetches 124 the item from the list are also provided. 125 126 +----------------------+---------------------+-----------------------------+ 127 | Property | Shortcut | Value | 128 +======================+=====================+=============================+ 129 | aln_all | aln | HSP alignments as | 130 | | | MultipleSeqAlignment object | 131 +----------------------+---------------------+-----------------------------+ 132 | aln_annotation_all | aln_annotation | dictionary of annotation(s) | 133 | | | of all fragments' alignments| 134 +----------------------+---------------------+-----------------------------+ 135 | fragments | fragment | HSPFragment objects | 136 +----------------------+---------------------+-----------------------------+ 137 | hit_all | hit | hit sequence as SeqRecord | 138 | | | objects | 139 +----------------------+---------------------+-----------------------------+ 140 | hit_features_all | hit_features | SeqFeatures of all hit | 141 | | | fragments | 142 +----------------------+---------------------+-----------------------------+ 143 | hit_start_all | hit_start* | start coordinates of the | 144 | | | hit fragments | 145 +----------------------+---------------------+-----------------------------+ 146 | hit_end_all | hit_end* | end coordinates of the hit | 147 | | | fragments | 148 +----------------------+---------------------+-----------------------------+ 149 | hit_span_all | hit_span* | sizes of each hit fragments | 150 +----------------------+---------------------+-----------------------------+ 151 | hit_strand_all | hit_strand | strand orientations of the | 152 | | | hit fragments | 153 +----------------------+---------------------+-----------------------------+ 154 | hit_frame_all | hit_frame | reading frames of the hit | 155 | | | fragments | 156 +----------------------+---------------------+-----------------------------+ 157 | hit_range_all | hit_range | tuples of start and end | 158 | | | coordinates of each hit | 159 | | | fragment | 160 +----------------------+---------------------+-----------------------------+ 161 | query_all | query | query sequence as SeqRecord | 162 | | | object | 163 +----------------------+---------------------+-----------------------------+ 164 | query_features_all | query_features | SeqFeatures of all query | 165 | | | fragments | 166 +----------------------+---------------------+-----------------------------+ 167 | query_start_all | query_start* | start coordinates of the | 168 | | | fragments | 169 +----------------------+---------------------+-----------------------------+ 170 | query_end_all | query_end* | end coordinates of the | 171 | | | query fragments | 172 +----------------------+---------------------+-----------------------------+ 173 | query_span_all | query_span* | sizes of each query | 174 | | | fragments | 175 +----------------------+---------------------+-----------------------------+ 176 | query_strand_all | query_strand | strand orientations of the | 177 | | | query fragments | 178 +----------------------+---------------------+-----------------------------+ 179 | query_frame_all | query_frame | reading frames of the query | 180 | | | fragments | 181 +----------------------+---------------------+-----------------------------+ 182 | query_range_all | query_range | tuples of start and end | 183 | | | coordinates of each query | 184 | | | fragment | 185 +----------------------+---------------------+-----------------------------+ 186 187 For all types of HSP objects, the property will return the values in a list. 188 Shorcuts are only applicable for HSPs with one fragment. Except the ones 189 noted, if they are used on an HSP with more than one fragments, an exception 190 will be raised. 191 192 For properties that may be used in HSPs with multiple or single fragments 193 (``*_start``, ``*_end``, and ``*_span`` properties), their interpretation depends 194 on how many fragment the HSP has: 195 196 +------------+---------------------------------------------------+ 197 | Property | Value | 198 +============+===================================================+ 199 | hit_start | smallest coordinate value of all hit fragments | 200 +------------+---------------------------------------------------+ 201 | hit_end | largest coordinate value of all hit fragments | 202 +------------+---------------------------------------------------+ 203 | hit_span | difference between ``hit_start`` and ``hit_end`` | 204 +------------+---------------------------------------------------+ 205 | query_start| smallest coordinate value of all query fragments | 206 +------------+---------------------------------------------------+ 207 | query_end | largest coordinate value of all query fragments | 208 +------------+---------------------------------------------------+ 209 | query_span | difference between ``query_start`` and | 210 | | ``query_end`` | 211 +------------+---------------------------------------------------+ 212 213 In addition to the objects listed above, HSP objects also provide the 214 following properties and/or attributes: 215 216 +--------------------+------------------------------------------------------+ 217 | Property | Value | 218 +====================+======================================================+ 219 | aln_span | total number of residues in all HSPFragment objects | 220 +--------------------+------------------------------------------------------+ 221 | molecule_type | molecule_type of the hit and query SeqRecord objects | 222 +--------------------+------------------------------------------------------+ 223 | is_fragmented | boolean, whether there are multiple fragments or not | 224 +--------------------+------------------------------------------------------+ 225 | hit_id | ID of the hit sequence | 226 +--------------------+------------------------------------------------------+ 227 | hit_description | description of the hit sequence | 228 +--------------------+------------------------------------------------------+ 229 | hit_inter_ranges | list of hit sequence coordinates of the regions | 230 | | between fragments | 231 +--------------------+------------------------------------------------------+ 232 | hit_inter_spans | list of lengths of the regions between hit fragments | 233 +--------------------+------------------------------------------------------+ 234 | output_index | 0-based index for storing the order by which the HSP | 235 | | appears in the output file (default: -1). | 236 +--------------------+------------------------------------------------------+ 237 | query_id | ID of the query sequence | 238 +--------------------+------------------------------------------------------+ 239 | query_description | description of the query sequence | 240 +--------------------+------------------------------------------------------+ 241 | query_inter_ranges | list of query sequence coordinates of the regions | 242 | | between fragments | 243 +--------------------+------------------------------------------------------+ 244 | query_inter_spans | list of lengths of the regions between query | 245 | | fragments | 246 +--------------------+------------------------------------------------------+ 247 248 .. [1] may be used in HSPs with multiple fragments 249 250 """ 251 252 # attributes we don't want to transfer when creating a new Hit class 253 # from this one 254 _NON_STICKY_ATTRS = ("_items",) 255 256 def __init__(self, fragments=(), output_index=-1): 257 """Initialize an HSP object. 258 259 :param fragments: fragments contained in the HSP object 260 :type fragments: iterable yielding HSPFragment 261 :param output_index: optional index / ordering of the HSP fragment in 262 the original input file. 263 :type output_index: integer 264 265 HSP objects must be initialized with a list containing at least one 266 HSPFragment object. If multiple HSPFragment objects are used for 267 initialization, they must all have the same ``query_id``, 268 ``query_description``, ``hit_id``, ``hit_description``, and 269 ``molecule_type`` properties. 270 271 """ 272 if not fragments: 273 raise ValueError("HSP objects must have at least one HSPFragment object.") 274 # TODO - Move this into the for look in case hsps is a single use 275 # iterable? 276 # check that all fragments contain the same IDs, descriptions, 277 # molecule_type 278 for attr in ( 279 "query_id", 280 "query_description", 281 "hit_id", 282 "hit_description", 283 "molecule_type", 284 ): 285 if len({getattr(frag, attr) for frag in fragments}) != 1: 286 raise ValueError( 287 "HSP object can not contain fragments with more than one %s." % attr 288 ) 289 290 self.output_index = output_index 291 self._items = [] 292 for fragment in fragments: 293 self._validate_fragment(fragment) 294 self._items.append(fragment) 295 296 def __repr__(self): 297 """Return string representation of HSP object.""" 298 return "%s(hit_id=%r, query_id=%r, %r fragments)" % ( 299 self.__class__.__name__, 300 self.hit_id, 301 self.query_id, 302 len(self), 303 ) 304 305 def __iter__(self): 306 """Iterate over HSP items.""" 307 return iter(self._items) 308 309 def __contains__(self, fragment): 310 """Return True if HSPFragment is on HSP items.""" 311 return fragment in self._items 312 313 def __len__(self): 314 """Return number of HSPs items.""" 315 return len(self._items) 316 317 def __bool__(self): 318 """Return True if it has HSPs.""" 319 return bool(self._items) 320 321 def __str__(self): 322 """Return a human readable summary of the HSP object.""" 323 lines = [] 324 # set hsp info line 325 statline = [] 326 # evalue 327 evalue = getattr_str(self, "evalue", fmt="%.2g") 328 statline.append("evalue " + evalue) 329 # bitscore 330 bitscore = getattr_str(self, "bitscore", fmt="%.2f") 331 statline.append("bitscore " + bitscore) 332 lines.append("Quick stats: " + "; ".join(statline)) 333 334 if len(self.fragments) == 1: 335 return "\n".join( 336 [self._str_hsp_header(), "\n".join(lines), self.fragments[0]._str_aln()] 337 ) 338 else: 339 lines.append( 340 " Fragments: %s %s %s %s" % ("-" * 3, "-" * 14, "-" * 22, "-" * 22) 341 ) 342 pattern = "%16s %14s %22s %22s" 343 lines.append(pattern % ("#", "Span", "Query range", "Hit range")) 344 lines.append(pattern % ("-" * 3, "-" * 14, "-" * 22, "-" * 22)) 345 for idx, block in enumerate(self.fragments): 346 # set hsp line and table 347 # alignment span 348 aln_span = getattr_str(block, "aln_span") 349 # query region 350 query_start = getattr_str(block, "query_start") 351 query_end = getattr_str(block, "query_end") 352 query_range = "[%s:%s]" % (query_start, query_end) 353 # max column length is 20 354 query_range = ( 355 query_range[:20] + "~]" if len(query_range) > 22 else query_range 356 ) 357 # hit region 358 hit_start = getattr_str(block, "hit_start") 359 hit_end = getattr_str(block, "hit_end") 360 hit_range = "[%s:%s]" % (hit_start, hit_end) 361 hit_range = hit_range[:20] + "~]" if len(hit_range) > 22 else hit_range 362 # append the hsp row 363 lines.append(pattern % (str(idx), aln_span, query_range, hit_range)) 364 365 return self._str_hsp_header() + "\n" + "\n".join(lines) 366 367 def __getitem__(self, idx): 368 """Return object of index idx.""" 369 # if key is slice, return a new HSP instance 370 if isinstance(idx, slice): 371 obj = self.__class__(self._items[idx]) 372 self._transfer_attrs(obj) 373 return obj 374 return self._items[idx] 375 376 def __setitem__(self, idx, fragments): 377 """Set an item of index idx with the given fragments.""" 378 # handle case if hsps is a list of hsp 379 if isinstance(fragments, (list, tuple)): 380 for fragment in fragments: 381 self._validate_fragment(fragment) 382 else: 383 self._validate_fragment(fragments) 384 385 self._items[idx] = fragments 386 387 def __delitem__(self, idx): 388 """Delete item of index idx.""" 389 # note that this may result in an empty HSP object, which should be 390 # invalid 391 del self._items[idx] 392 393 def _validate_fragment(self, fragment): 394 if not isinstance(fragment, HSPFragment): 395 raise TypeError("HSP objects can only contain HSPFragment objects.") 396 # HACK: to make validation during __init__ work 397 if self._items: 398 if fragment.hit_id != self.hit_id: 399 raise ValueError( 400 "Expected HSPFragment with hit ID %r, found %r instead." 401 % (self.id, fragment.hit_id) 402 ) 403 404 if fragment.hit_description != self.hit_description: 405 raise ValueError( 406 "Expected HSPFragment with hit description %r, found %r instead." 407 % (self.description, fragment.hit_description) 408 ) 409 410 if fragment.query_id != self.query_id: 411 raise ValueError( 412 "Expected HSPFragment with query ID %r, found %r instead." 413 % (self.query_id, fragment.query_id) 414 ) 415 416 if fragment.query_description != self.query_description: 417 raise ValueError( 418 "Expected HSP with query description %r, found %r instead." 419 % (self.query_description, fragment.query_description) 420 ) 421 422 def _aln_span_get(self): 423 # length of all alignments 424 # alignment span can be its own attribute, or computed from 425 # query / hit length 426 return sum(frg.aln_span for frg in self.fragments) 427 428 aln_span = property( 429 fget=_aln_span_get, doc="Total number of columns in all HSPFragment objects." 430 ) 431 432 # coordinate properties # 433 def _get_coords(self, seq_type, coord_type): 434 assert seq_type in ("hit", "query") 435 assert coord_type in ("start", "end") 436 coord_name = "%s_%s" % (seq_type, coord_type) 437 coords = [getattr(frag, coord_name) for frag in self.fragments] 438 if None in coords: 439 warnings.warn( 440 "'None' exist in %s coordinates; ignored" % (coord_name), 441 BiopythonWarning, 442 ) 443 return coords 444 445 def _hit_start_get(self): 446 return min(self._get_coords("hit", "start")) 447 448 hit_start = property( 449 fget=_hit_start_get, doc="Smallest coordinate value of all hit fragments." 450 ) 451 452 def _query_start_get(self): 453 return min(self._get_coords("query", "start")) 454 455 query_start = property( 456 fget=_query_start_get, doc="Smallest coordinate value of all query fragments." 457 ) 458 459 def _hit_end_get(self): 460 return max(self._get_coords("hit", "end")) 461 462 hit_end = property( 463 fget=_hit_end_get, doc="Largest coordinate value of all hit fragments." 464 ) 465 466 def _query_end_get(self): 467 return max(self._get_coords("query", "end")) 468 469 query_end = property( 470 fget=_query_end_get, doc="Largest coordinate value of all hit fragments." 471 ) 472 473 # coordinate-dependent properties # 474 def _hit_span_get(self): 475 try: 476 return self.hit_end - self.hit_start 477 except TypeError: # triggered if any of the coordinates are None 478 return None 479 480 hit_span = property( 481 fget=_hit_span_get, doc="The number of hit residues covered by the HSP." 482 ) 483 484 def _query_span_get(self): 485 try: 486 return self.query_end - self.query_start 487 except TypeError: # triggered if any of the coordinates are None 488 return None 489 490 query_span = property( 491 fget=_query_span_get, doc="The number of query residues covered by the HSP." 492 ) 493 494 def _hit_range_get(self): 495 return (self.hit_start, self.hit_end) 496 497 hit_range = property( 498 fget=_hit_range_get, doc="Tuple of HSP hit start and end coordinates." 499 ) 500 501 def _query_range_get(self): 502 return (self.query_start, self.query_end) 503 504 query_range = property( 505 fget=_query_range_get, doc="Tuple of HSP query start and end coordinates." 506 ) 507 508 def _inter_ranges_get(self, seq_type): 509 # this property assumes that there are no mixed strands in a hit/query 510 assert seq_type in ("query", "hit") 511 strand = getattr(self, "%s_strand_all" % seq_type)[0] 512 coords = getattr(self, "%s_range_all" % seq_type) 513 # determine function used to set inter range 514 # start and end coordinates, given two pairs 515 # of fragment start and end coordinates 516 if strand == -1: 517 startfunc, endfunc = min, max 518 else: 519 startfunc, endfunc = max, min 520 inter_coords = [] 521 for idx, coord in enumerate(coords[:-1]): 522 start = startfunc(coords[idx]) 523 end = endfunc(coords[idx + 1]) 524 inter_coords.append((min(start, end), max(start, end))) 525 526 return inter_coords 527 528 def _hit_inter_ranges_get(self): 529 return self._inter_ranges_get("hit") 530 531 hit_inter_ranges = property( 532 fget=_hit_inter_ranges_get, 533 doc="Hit sequence coordinates of the regions between fragments.", 534 ) 535 536 def _query_inter_ranges_get(self): 537 return self._inter_ranges_get("query") 538 539 query_inter_ranges = property( 540 fget=_query_inter_ranges_get, 541 doc="Query sequence coordinates of the regions between fragments.", 542 ) 543 544 def _inter_spans_get(self, seq_type): 545 assert seq_type in ("query", "hit") 546 attr_name = "%s_inter_ranges" % seq_type 547 return [coord[1] - coord[0] for coord in getattr(self, attr_name)] 548 549 def _hit_inter_spans_get(self): 550 return self._inter_spans_get("hit") 551 552 hit_inter_spans = property( 553 fget=_hit_inter_spans_get, doc="Lengths of regions between hit fragments." 554 ) 555 556 def _query_inter_spans_get(self): 557 return self._inter_spans_get("query") 558 559 query_inter_spans = property( 560 fget=_query_inter_spans_get, doc="Lengths of regions between query fragments." 561 ) 562 563 # shortcuts for fragments' properties # 564 565 # bool check if there's more than one fragments 566 is_fragmented = property( 567 lambda self: len(self) > 1, 568 doc="Whether the HSP has more than one HSPFragment objects.", 569 ) 570 571 # first item properties with setters 572 hit_description = fullcascade( 573 "hit_description", doc="Description of the hit sequence." 574 ) 575 576 query_description = fullcascade( 577 "query_description", doc="Description of the query sequence." 578 ) 579 580 hit_id = fullcascade("hit_id", doc="ID of the hit sequence.") 581 582 query_id = fullcascade("query_id", doc="ID of the query sequence.") 583 584 molecule_type = fullcascade( 585 "molecule_type", doc="molecule_type of the hit and query SeqRecord objects." 586 ) 587 588 # properties for single-fragment HSPs 589 fragment = singleitem(doc="HSPFragment object, first fragment.") 590 591 hit = singleitem("hit", doc="Hit sequence as a SeqRecord object, first fragment.") 592 593 query = singleitem( 594 "query", doc="Query sequence as a SeqRecord object, first fragment." 595 ) 596 597 aln = singleitem( 598 "aln", doc="Alignment of the first fragment as a MultipleSeqAlignment object." 599 ) 600 601 aln_annotation = singleitem( 602 "aln_annotation", 603 doc="Dictionary of annotation(s) of the first fragment's alignment.", 604 ) 605 606 hit_features = singleitem( 607 "hit_features", doc="Hit sequence features, first fragment." 608 ) 609 610 query_features = singleitem( 611 "query_features", doc="Query sequence features, first fragment." 612 ) 613 614 hit_strand = singleitem("hit_strand", doc="Hit strand orientation, first fragment.") 615 616 query_strand = singleitem( 617 "query_strand", doc="Query strand orientation, first fragment." 618 ) 619 620 hit_frame = singleitem( 621 "hit_frame", doc="Hit sequence reading frame, first fragment." 622 ) 623 624 query_frame = singleitem( 625 "query_frame", doc="Query sequence reading frame, first fragment." 626 ) 627 628 # properties for multi-fragment HSPs 629 fragments = allitems(doc="List of all HSPFragment objects.") 630 631 hit_all = allitems( 632 "hit", doc="List of all fragments' hit sequences as SeqRecord objects." 633 ) 634 635 query_all = allitems( 636 "query", doc="List of all fragments' query sequences as SeqRecord objects." 637 ) 638 639 aln_all = allitems( 640 "aln", doc="List of all fragments' alignments as MultipleSeqAlignment objects." 641 ) 642 643 aln_annotation_all = allitems( 644 "aln_annotation", 645 doc="Dictionary of annotation(s) of all fragments' alignments.", 646 ) 647 648 hit_features_all = allitems( 649 "hit_features", doc="List of all hit sequence features." 650 ) 651 652 query_features_all = allitems( 653 "query_features", doc="List of all query sequence features." 654 ) 655 656 hit_strand_all = allitems( 657 "hit_strand", doc="List of all fragments' hit sequence strands." 658 ) 659 660 query_strand_all = allitems( 661 "query_strand", doc="List of all fragments' query sequence strands" 662 ) 663 664 hit_frame_all = allitems( 665 "hit_frame", doc="List of all fragments' hit sequence reading frames." 666 ) 667 668 query_frame_all = allitems( 669 "query_frame", doc="List of all fragments' query sequence reading frames." 670 ) 671 672 hit_start_all = allitems( 673 "hit_start", doc="List of all fragments' hit start coordinates." 674 ) 675 676 query_start_all = allitems( 677 "query_start", doc="List of all fragments' query start coordinates." 678 ) 679 680 hit_end_all = allitems("hit_end", doc="List of all fragments' hit end coordinates.") 681 682 query_end_all = allitems( 683 "query_end", doc="List of all fragments' query end coordinates." 684 ) 685 686 hit_span_all = allitems("hit_span", doc="List of all fragments' hit sequence size.") 687 688 query_span_all = allitems( 689 "query_span", doc="List of all fragments' query sequence size." 690 ) 691 692 hit_range_all = allitems( 693 "hit_range", doc="List of all fragments' hit start and end coordinates." 694 ) 695 696 query_range_all = allitems( 697 "query_range", doc="List of all fragments' query start and end coordinates." 698 ) 699 700 701class HSPFragment(_BaseHSP): 702 """Class representing a contiguous alignment of hit-query sequence. 703 704 HSPFragment forms the core of any parsed search output file. Depending on 705 the search output file format, it may contain the actual query and/or hit 706 sequences that produces the search hits. These sequences are stored as 707 SeqRecord objects (see SeqRecord): 708 709 >>> from Bio import SearchIO 710 >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 711 >>> fragment = qresult[0][0][0] # first hit, first hsp, first fragment 712 >>> print(fragment) 713 Query: 33211 mir_1 714 Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... 715 Query range: [0:61] (1) 716 Hit range: [0:61] (1) 717 Fragments: 1 (61 columns) 718 Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 719 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 720 Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 721 722 # the query sequence is a SeqRecord object 723 >>> fragment.query.__class__ 724 <class 'Bio.SeqRecord.SeqRecord'> 725 >>> print(fragment.query) 726 ID: 33211 727 Name: aligned query sequence 728 Description: mir_1 729 Number of features: 0 730 /molecule_type=DNA 731 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG') 732 733 # the hit sequence is a SeqRecord object as well 734 >>> fragment.hit.__class__ 735 <class 'Bio.SeqRecord.SeqRecord'> 736 >>> print(fragment.hit) 737 ID: gi|262205317|ref|NR_030195.1| 738 Name: aligned hit sequence 739 Description: Homo sapiens microRNA 520b (MIR520B), microRNA 740 Number of features: 0 741 /molecule_type=DNA 742 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG') 743 744 # when both query and hit are present, we get a MultipleSeqAlignment object 745 >>> fragment.aln.__class__ 746 <class 'Bio.Align.MultipleSeqAlignment'> 747 >>> print(fragment.aln) 748 Alignment with 2 rows and 61 columns 749 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211 750 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1| 751 752 """ 753 754 def __init__( 755 self, 756 hit_id="<unknown id>", 757 query_id="<unknown id>", 758 hit=None, 759 query=None, 760 molecule_type=None, 761 ): 762 """Initialize the class.""" 763 self._molecule_type = molecule_type 764 self.aln_annotation = {} 765 766 self._hit_id = hit_id 767 self._query_id = query_id 768 769 for seq_type in ("query", "hit"): 770 # query or hit attributes default attributes 771 setattr(self, "_%s_description" % seq_type, "<unknown description>") 772 setattr(self, "_%s_features" % seq_type, []) 773 # query or hit attributes whose default attribute is None 774 for attr in ("strand", "frame", "start", "end"): 775 setattr(self, "%s_%s" % (seq_type, attr), None) 776 # self.query or self.hit 777 if eval(seq_type): 778 setattr(self, seq_type, eval(seq_type)) 779 else: 780 setattr(self, seq_type, None) 781 782 def __repr__(self): 783 """Return HSPFragment info; hit id, query id, number of columns.""" 784 info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id) 785 try: 786 info += ", %i columns" % len(self) 787 except AttributeError: 788 pass 789 return "%s(%s)" % (self.__class__.__name__, info) 790 791 def __len__(self): 792 """Return alignment span.""" 793 return self.aln_span 794 795 def __str__(self): 796 """Return string of HSP header and alignments.""" 797 return self._str_hsp_header() + "\n" + self._str_aln() 798 799 def __getitem__(self, idx): 800 """Return object of index idx.""" 801 if self.aln is not None: 802 obj = self.__class__( 803 hit_id=self.hit_id, 804 query_id=self.query_id, 805 molecule_type=self.molecule_type, 806 ) 807 # transfer query and hit attributes 808 # let SeqRecord handle feature slicing, then retrieve the sliced 809 # features into the sliced HSPFragment 810 if self.query is not None: 811 obj.query = self.query[idx] 812 obj.query_features = obj.query.features 813 if self.hit is not None: 814 obj.hit = self.hit[idx] 815 obj.hit_features = obj.hit.features 816 # description, strand, frame 817 for attr in ("description", "strand", "frame"): 818 for seq_type in ("hit", "query"): 819 attr_name = "%s_%s" % (seq_type, attr) 820 self_val = getattr(self, attr_name) 821 setattr(obj, attr_name, self_val) 822 # alignment annotation should be transferred, since we can compute 823 # the resulting annotation 824 obj.aln_annotation = {} 825 for key, value in self.aln_annotation.items(): 826 assert len(value[idx]) == len(obj) 827 obj.aln_annotation[key] = value[idx] 828 return obj 829 else: 830 raise TypeError( 831 "Slicing for HSP objects without alignment is not supported." 832 ) 833 834 def _str_aln(self): 835 lines = [] 836 # alignment length 837 aln_span = getattr_str(self, "aln_span") 838 lines.append(" Fragments: 1 (%s columns)" % aln_span) 839 # sequences 840 if self.query is not None and self.hit is not None: 841 try: 842 qseq = self.query.seq 843 except AttributeError: # query is None 844 qseq = "?" 845 try: 846 hseq = self.hit.seq 847 except AttributeError: # hit is None 848 hseq = "?" 849 850 # similarity line 851 simil = "" 852 if "similarity" in self.aln_annotation and isinstance( 853 self.aln_annotation.get("similarity"), str 854 ): 855 simil = self.aln_annotation["similarity"] 856 857 if self.aln_span <= 67: 858 lines.append("%10s - %s" % ("Query", qseq)) 859 if simil: 860 lines.append(" %s" % simil) 861 lines.append("%10s - %s" % ("Hit", hseq)) 862 else: 863 # adjust continuation character length, so we don't display 864 # the same residues twice 865 if self.aln_span - 66 > 3: 866 cont = "~" * 3 867 else: 868 cont = "~" * (self.aln_span - 66) 869 lines.append("%10s - %s%s%s" % ("Query", qseq[:59], cont, qseq[-5:])) 870 if simil: 871 lines.append(" %s%s%s" % (simil[:59], cont, simil[-5:])) 872 lines.append("%10s - %s%s%s" % ("Hit", hseq[:59], cont, hseq[-5:])) 873 874 return "\n".join(lines) 875 876 # sequence properties # 877 def _set_seq(self, seq, seq_type): 878 """Check the given sequence for attribute setting (PRIVATE). 879 880 :param seq: sequence to check 881 :type seq: string or SeqRecord 882 :param seq_type: sequence type 883 :type seq_type: string, choice of 'hit' or 'query' 884 885 """ 886 assert seq_type in ("hit", "query") 887 if seq is None: 888 return seq # return immediately if seq is None 889 else: 890 if not isinstance(seq, (str, SeqRecord)): 891 raise TypeError( 892 "%s sequence must be a string or a SeqRecord object." % seq_type 893 ) 894 # check length if the opposite sequence is not None 895 opp_type = "hit" if seq_type == "query" else "query" 896 opp_seq = getattr(self, "_%s" % opp_type, None) 897 if opp_seq is not None: 898 if len(seq) != len(opp_seq): 899 raise ValueError( 900 "Sequence lengths do not match. Expected: %r (%s); found: %r (%s)." 901 % (len(opp_seq), opp_type, len(seq), seq_type) 902 ) 903 904 seq_id = getattr(self, "%s_id" % seq_type) 905 seq_desc = getattr(self, "%s_description" % seq_type) 906 seq_feats = getattr(self, "%s_features" % seq_type) 907 seq_name = "aligned %s sequence" % seq_type 908 909 if isinstance(seq, SeqRecord): 910 seq.id = seq_id 911 seq.description = seq_desc 912 seq.name = seq_name 913 seq.features = seq_feats 914 seq.annotations["molecule_type"] = self.molecule_type 915 elif isinstance(seq, str): 916 seq = SeqRecord( 917 Seq(seq), 918 id=seq_id, 919 name=seq_name, 920 description=seq_desc, 921 features=seq_feats, 922 annotations={"molecule_type": self.molecule_type}, 923 ) 924 925 return seq 926 927 def _hit_get(self): 928 return self._hit 929 930 def _hit_set(self, value): 931 self._hit = self._set_seq(value, "hit") 932 933 hit = property( 934 fget=_hit_get, 935 fset=_hit_set, 936 doc="Hit sequence as a SeqRecord object, defaults to None.", 937 ) 938 939 def _query_get(self): 940 return self._query 941 942 def _query_set(self, value): 943 self._query = self._set_seq(value, "query") 944 945 query = property( 946 fget=_query_get, 947 fset=_query_set, 948 doc="Query sequence as a SeqRecord object, defaults to None.", 949 ) 950 951 def _aln_get(self): 952 if self.query is None and self.hit is None: 953 return None 954 if self.hit is None: 955 msa = MultipleSeqAlignment([self.query]) 956 elif self.query is None: 957 msa = MultipleSeqAlignment([self.hit]) 958 else: 959 msa = MultipleSeqAlignment([self.query, self.hit]) 960 molecule_type = self.molecule_type 961 if molecule_type is not None: 962 msa.molecule_type = molecule_type 963 return msa 964 965 aln = property( 966 fget=_aln_get, 967 doc="Query-hit alignment as a MultipleSeqAlignment object, defaults to None.", 968 ) 969 970 def _molecule_type_get(self): 971 return self._molecule_type 972 973 def _molecule_type_set(self, value): 974 self._molecule_type = value 975 try: 976 self.query.annotations["molecule_type"] = value 977 except AttributeError: 978 pass 979 try: 980 self.hit.annotations["molecule_type"] = value 981 except AttributeError: 982 pass 983 984 molecule_type = property( 985 fget=_molecule_type_get, 986 fset=_molecule_type_set, 987 doc="molecule type used in the fragment's " 988 "sequence records and alignment, defaults to None.", 989 ) 990 991 def _aln_span_get(self): 992 # length of alignment (gaps included) 993 # alignment span can be its own attribute, or computed from 994 # query / hit length 995 try: 996 self._aln_span 997 except AttributeError: 998 if self.query is not None: 999 self._aln_span = len(self.query) 1000 elif self.hit is not None: 1001 self._aln_span = len(self.hit) 1002 1003 return self._aln_span 1004 1005 def _aln_span_set(self, value): 1006 self._aln_span = value 1007 1008 aln_span = property( 1009 fget=_aln_span_get, 1010 fset=_aln_span_set, 1011 doc="The number of alignment columns covered by the fragment.", 1012 ) 1013 1014 # id, description, and features properties # 1015 hit_description = fragcascade("description", "hit", doc="Hit sequence description.") 1016 1017 query_description = fragcascade( 1018 "description", "query", doc="Query sequence description." 1019 ) 1020 1021 hit_id = fragcascade("id", "hit", doc="Hit sequence ID.") 1022 1023 query_id = fragcascade("id", "query", doc="Query sequence ID.") 1024 1025 hit_features = fragcascade("features", "hit", doc="Hit sequence features.") 1026 1027 query_features = fragcascade("features", "query", doc="Query sequence features.") 1028 1029 # strand properties # 1030 def _prep_strand(self, strand): 1031 # follow SeqFeature's convention 1032 if strand not in (-1, 0, 1, None): 1033 raise ValueError("Strand should be -1, 0, 1, or None; not %r" % strand) 1034 return strand 1035 1036 def _get_strand(self, seq_type): 1037 assert seq_type in ("hit", "query") 1038 strand = getattr(self, "_%s_strand" % seq_type) 1039 1040 if strand is None: 1041 # try to compute strand from frame 1042 frame = getattr(self, "%s_frame" % seq_type) 1043 if frame is not None: 1044 try: 1045 strand = frame // abs(frame) 1046 except ZeroDivisionError: 1047 strand = 0 1048 setattr(self, "%s_strand" % seq_type, strand) 1049 1050 return strand 1051 1052 def _hit_strand_get(self): 1053 return self._get_strand("hit") 1054 1055 def _hit_strand_set(self, value): 1056 self._hit_strand = self._prep_strand(value) 1057 1058 hit_strand = property( 1059 fget=_hit_strand_get, 1060 fset=_hit_strand_set, 1061 doc="Hit sequence strand, defaults to None.", 1062 ) 1063 1064 def _query_strand_get(self): 1065 return self._get_strand("query") 1066 1067 def _query_strand_set(self, value): 1068 self._query_strand = self._prep_strand(value) 1069 1070 query_strand = property( 1071 fget=_query_strand_get, 1072 fset=_query_strand_set, 1073 doc="Query sequence strand, defaults to None.", 1074 ) 1075 1076 # frame properties # 1077 def _prep_frame(self, frame): 1078 if frame not in (-3, -2, -1, 0, 1, 2, 3, None): 1079 raise ValueError( 1080 "Strand should be an integer between -3 and 3, or None; not %r" % frame 1081 ) 1082 return frame 1083 1084 def _hit_frame_get(self): 1085 return self._hit_frame 1086 1087 def _hit_frame_set(self, value): 1088 self._hit_frame = self._prep_frame(value) 1089 1090 hit_frame = property( 1091 fget=_hit_frame_get, 1092 fset=_hit_frame_set, 1093 doc="Hit sequence reading frame, defaults to None.", 1094 ) 1095 1096 def _query_frame_get(self): 1097 """Get query sequence reading frame (PRIVATE).""" 1098 return self._query_frame 1099 1100 def _query_frame_set(self, value): 1101 """Set query sequence reading frame (PRIVATE).""" 1102 self._query_frame = self._prep_frame(value) 1103 1104 query_frame = property( 1105 fget=_query_frame_get, 1106 fset=_query_frame_set, 1107 doc="Query sequence reading frame, defaults to None.", 1108 ) 1109 1110 # coordinate properties # 1111 def _prep_coord(self, coord, opp_coord_name, op): 1112 # coord must either be None or int 1113 if coord is None: 1114 return coord 1115 assert isinstance(coord, int) 1116 # try to get opposite coordinate, if it's not present, return 1117 try: 1118 opp_coord = getattr(self, opp_coord_name) 1119 except AttributeError: 1120 return coord 1121 # if opposite coordinate is None, return 1122 if opp_coord is None: 1123 return coord 1124 # otherwise compare it to coord ('>=' or '<=') 1125 else: 1126 assert op(coord, opp_coord) 1127 return coord 1128 1129 def _hit_start_get(self): 1130 """Get the sequence hit start coordinate (PRIVATE).""" 1131 return self._hit_start 1132 1133 def _hit_start_set(self, value): 1134 """Set the sequence hit start coordinate (PRIVATE).""" 1135 self._hit_start = self._prep_coord(value, "hit_end", le) 1136 1137 hit_start = property( 1138 fget=_hit_start_get, 1139 fset=_hit_start_set, 1140 doc="Hit sequence start coordinate, defaults to None.", 1141 ) 1142 1143 def _query_start_get(self): 1144 """Get the query sequence start coordinate (PRIVATE).""" 1145 return self._query_start 1146 1147 def _query_start_set(self, value): 1148 """Set the query sequence start coordinate (PRIVATE).""" 1149 self._query_start = self._prep_coord(value, "query_end", le) 1150 1151 query_start = property( 1152 fget=_query_start_get, 1153 fset=_query_start_set, 1154 doc="Query sequence start coordinate, defaults to None.", 1155 ) 1156 1157 def _hit_end_get(self): 1158 """Get the hit sequence end coordinate (PRIVATE).""" 1159 return self._hit_end 1160 1161 def _hit_end_set(self, value): 1162 """Set the hit sequence end coordinate (PRIVATE).""" 1163 self._hit_end = self._prep_coord(value, "hit_start", ge) 1164 1165 hit_end = property( 1166 fget=_hit_end_get, 1167 fset=_hit_end_set, 1168 doc="Hit sequence end coordinate, defaults to None.", 1169 ) 1170 1171 def _query_end_get(self): 1172 """Get the query sequence end coordinate (PRIVATE).""" 1173 return self._query_end 1174 1175 def _query_end_set(self, value): 1176 """Set the query sequence end coordinate (PRIVATE).""" 1177 self._query_end = self._prep_coord(value, "query_start", ge) 1178 1179 query_end = property( 1180 fget=_query_end_get, 1181 fset=_query_end_set, 1182 doc="Query sequence end coordinate, defaults to None.", 1183 ) 1184 1185 # coordinate-dependent properties # 1186 def _hit_span_get(self): 1187 """Return the number of residues covered by the hit sequence (PRIVATE).""" 1188 try: 1189 return self.hit_end - self.hit_start 1190 except TypeError: # triggered if any of the coordinates are None 1191 return None 1192 1193 hit_span = property( 1194 fget=_hit_span_get, doc="The number of residues covered by the hit sequence." 1195 ) 1196 1197 def _query_span_get(self): 1198 """Return the number or residues covered by the query (PRIVATE).""" 1199 try: 1200 return self.query_end - self.query_start 1201 except TypeError: # triggered if any of the coordinates are None 1202 return None 1203 1204 query_span = property( 1205 fget=_query_span_get, 1206 doc="The number of residues covered by the query sequence.", 1207 ) 1208 1209 def _hit_range_get(self): 1210 """Return the start and end of a hit (PRIVATE).""" 1211 return (self.hit_start, self.hit_end) 1212 1213 hit_range = property( 1214 fget=_hit_range_get, doc="Tuple of hit start and end coordinates." 1215 ) 1216 1217 def _query_range_get(self): 1218 """Return the start and end of a query (PRIVATE).""" 1219 return (self.query_start, self.query_end) 1220 1221 query_range = property( 1222 fget=_query_range_get, doc="Tuple of query start and end coordinates." 1223 ) 1224 1225 1226# if not used as a module, run the doctest 1227if __name__ == "__main__": 1228 from Bio._utils import run_doctest 1229 1230 run_doctest() 1231