1# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
2# This file is part of the Biopython distribution and governed by your
3# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
4# Please see the LICENSE file that should have been included as part of this
5# package.
6"""Bio.SearchIO objects to model high scoring regions between query and hit."""
7
8import warnings
9from operator import ge, le
10
11from Bio import BiopythonWarning
12from Bio.Align import MultipleSeqAlignment
13from Bio.Seq import Seq
14from Bio.SeqRecord import SeqRecord
15
16from Bio.SearchIO._utils import (
17    singleitem,
18    allitems,
19    fullcascade,
20    fragcascade,
21    getattr_str,
22)
23
24from ._base import _BaseHSP
25
26
27class HSP(_BaseHSP):
28    """Class representing high-scoring region(s) between query and hit.
29
30    HSP (high-scoring pair) objects are contained by Hit objects (see Hit).
31    In most cases, HSP objects store the bulk of the statistics and results
32    (e.g. e-value, bitscores, query sequence, etc.) produced by a search
33    program.
34
35    Depending on the search output file format, a given HSP will contain one
36    or more HSPFragment object(s). Examples of search programs that produce HSP
37    with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as
38    BLAT or Exonerate may produce HSPs containing more than one HSPFragment.
39    However, their native terminologies may differ: in BLAT these fragments
40    are called 'blocks' while in Exonerate they are called exons or NER.
41
42    Here are examples from each type of HSP. The first one comes from a BLAST
43    search::
44
45        >>> from Bio import SearchIO
46        >>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
47        >>> blast_hsp = blast_qresult[1][0]     # the first HSP from the second hit
48        >>> blast_hsp
49        HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments)
50        >>> print(blast_hsp)
51              Query: 33211 mir_1
52                Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ...
53        Query range: [1:61] (1)
54          Hit range: [0:60] (1)
55        Quick stats: evalue 1.7e-22; bitscore 109.49
56          Fragments: 1 (60 columns)
57             Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
58                     ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
59               Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
60
61    For HSPs with a single HSPFragment, you can invoke ``print`` on it and see the
62    underlying sequence alignment, if it exists. This is not the case for HSPs
63    with more than one HSPFragment. Below is an example, using an HSP from a
64    BLAT search. Invoking ``print`` on these HSPs will instead show a table of the
65    HSPFragment objects it contains::
66
67        >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True)
68        >>> blat_hsp = blat_qresult[1][0]       # the first HSP from the second hit
69        >>> blat_hsp
70        HSP(hit_id='chr11', query_id='blat_1', 2 fragments)
71        >>> print(blat_hsp)
72              Query: blat_1 <unknown description>
73                Hit: chr11 <unknown description>
74        Query range: [42:67] (-1)
75          Hit range: [59018929:59018955] (1)
76        Quick stats: evalue ?; bitscore ?
77          Fragments: ---  --------------  ----------------------  ----------------------
78                       #            Span             Query range               Hit range
79                     ---  --------------  ----------------------  ----------------------
80                       0               6                 [61:67]     [59018929:59018935]
81                       1              16                 [42:58]     [59018939:59018955]
82
83    Notice that in HSPs with more than one HSPFragments, the HSP's ``query_range``
84    ``hit_range`` properties encompasses all fragments it contains.
85
86    You can check whether an HSP has more than one HSPFragments or not using the
87    ``is_fragmented`` property::
88
89        >>> blast_hsp.is_fragmented
90        False
91        >>> blat_hsp.is_fragmented
92        True
93
94    Since HSP objects are also containers similar to Python lists, you can
95    access a single fragment in an HSP using its integer index::
96
97        >>> blat_fragment = blat_hsp[0]
98        >>> print(blat_fragment)
99              Query: blat_1 <unknown description>
100                Hit: chr11 <unknown description>
101        Query range: [61:67] (-1)
102          Hit range: [59018929:59018935] (1)
103          Fragments: 1 (6 columns)
104             Query - tatagt
105               Hit - tatagt
106
107    This applies to HSPs objects with a single fragment as well::
108
109        >>> blast_fragment = blast_hsp[0]
110        >>> print(blast_fragment)
111              Query: 33211 mir_1
112                Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ...
113        Query range: [1:61] (1)
114          Hit range: [0:60] (1)
115          Fragments: 1 (60 columns)
116             Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
117                     ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
118               Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
119
120    Regardless of the search output file format, HSP objects provide the
121    properties listed below. These properties always return values in a list,
122    due to the HSP object itself being a list-like container. However, for
123    HSP objects with a single HSPFragment, shortcut properties that fetches
124    the item from the list are also provided.
125
126    +----------------------+---------------------+-----------------------------+
127    | Property             | Shortcut            | Value                       |
128    +======================+=====================+=============================+
129    | aln_all              | aln                 | HSP alignments as           |
130    |                      |                     | MultipleSeqAlignment object |
131    +----------------------+---------------------+-----------------------------+
132    | aln_annotation_all   | aln_annotation      | dictionary of annotation(s) |
133    |                      |                     | of all fragments' alignments|
134    +----------------------+---------------------+-----------------------------+
135    | fragments            | fragment            | HSPFragment objects         |
136    +----------------------+---------------------+-----------------------------+
137    | hit_all              | hit                 | hit sequence as SeqRecord   |
138    |                      |                     | objects                     |
139    +----------------------+---------------------+-----------------------------+
140    | hit_features_all     | hit_features        | SeqFeatures of all hit      |
141    |                      |                     | fragments                   |
142    +----------------------+---------------------+-----------------------------+
143    | hit_start_all        | hit_start*          | start coordinates of the    |
144    |                      |                     | hit fragments               |
145    +----------------------+---------------------+-----------------------------+
146    | hit_end_all          | hit_end*            | end coordinates of the hit  |
147    |                      |                     | fragments                   |
148    +----------------------+---------------------+-----------------------------+
149    | hit_span_all         | hit_span*           | sizes of each hit fragments |
150    +----------------------+---------------------+-----------------------------+
151    | hit_strand_all       | hit_strand          | strand orientations of the  |
152    |                      |                     | hit fragments               |
153    +----------------------+---------------------+-----------------------------+
154    | hit_frame_all        | hit_frame           | reading frames of the hit   |
155    |                      |                     | fragments                   |
156    +----------------------+---------------------+-----------------------------+
157    | hit_range_all        | hit_range           | tuples of start and end     |
158    |                      |                     | coordinates of each hit     |
159    |                      |                     | fragment                    |
160    +----------------------+---------------------+-----------------------------+
161    | query_all            | query               | query sequence as SeqRecord |
162    |                      |                     | object                      |
163    +----------------------+---------------------+-----------------------------+
164    | query_features_all   | query_features      | SeqFeatures of all query    |
165    |                      |                     | fragments                   |
166    +----------------------+---------------------+-----------------------------+
167    | query_start_all      | query_start*        | start coordinates of the    |
168    |                      |                     | fragments                   |
169    +----------------------+---------------------+-----------------------------+
170    | query_end_all        | query_end*          | end coordinates of the      |
171    |                      |                     | query fragments             |
172    +----------------------+---------------------+-----------------------------+
173    | query_span_all       | query_span*         | sizes of each query         |
174    |                      |                     | fragments                   |
175    +----------------------+---------------------+-----------------------------+
176    | query_strand_all     | query_strand        | strand orientations of the  |
177    |                      |                     | query fragments             |
178    +----------------------+---------------------+-----------------------------+
179    | query_frame_all      | query_frame         | reading frames of the query |
180    |                      |                     | fragments                   |
181    +----------------------+---------------------+-----------------------------+
182    | query_range_all      | query_range         | tuples of start and end     |
183    |                      |                     | coordinates of each query   |
184    |                      |                     | fragment                    |
185    +----------------------+---------------------+-----------------------------+
186
187    For all types of HSP objects, the property will return the values in a list.
188    Shorcuts are only applicable for HSPs with one fragment. Except the ones
189    noted, if they are used on an HSP with more than one fragments, an exception
190    will be raised.
191
192    For properties that may be used in HSPs with multiple or single fragments
193    (``*_start``, ``*_end``, and ``*_span`` properties), their interpretation depends
194    on how many fragment the HSP has:
195
196    +------------+---------------------------------------------------+
197    | Property   | Value                                             |
198    +============+===================================================+
199    | hit_start  | smallest coordinate value of all hit fragments    |
200    +------------+---------------------------------------------------+
201    | hit_end    | largest coordinate value of all hit fragments     |
202    +------------+---------------------------------------------------+
203    | hit_span   | difference between ``hit_start`` and ``hit_end``  |
204    +------------+---------------------------------------------------+
205    | query_start| smallest coordinate value of all query fragments  |
206    +------------+---------------------------------------------------+
207    | query_end  | largest coordinate value of all query fragments   |
208    +------------+---------------------------------------------------+
209    | query_span | difference between ``query_start`` and            |
210    |            | ``query_end``                                     |
211    +------------+---------------------------------------------------+
212
213    In addition to the objects listed above, HSP objects also provide the
214    following properties and/or attributes:
215
216    +--------------------+------------------------------------------------------+
217    | Property           | Value                                                |
218    +====================+======================================================+
219    | aln_span           | total number of residues in all HSPFragment objects  |
220    +--------------------+------------------------------------------------------+
221    | molecule_type      | molecule_type of the hit and query SeqRecord objects |
222    +--------------------+------------------------------------------------------+
223    | is_fragmented      | boolean, whether there are multiple fragments or not |
224    +--------------------+------------------------------------------------------+
225    | hit_id             | ID of the hit sequence                               |
226    +--------------------+------------------------------------------------------+
227    | hit_description    | description of the hit sequence                      |
228    +--------------------+------------------------------------------------------+
229    | hit_inter_ranges   | list of hit sequence coordinates of the regions      |
230    |                    | between fragments                                    |
231    +--------------------+------------------------------------------------------+
232    | hit_inter_spans    | list of lengths of the regions between hit fragments |
233    +--------------------+------------------------------------------------------+
234    | output_index       | 0-based index for storing the order by which the HSP |
235    |                    | appears in the output file (default: -1).            |
236    +--------------------+------------------------------------------------------+
237    | query_id           | ID of the query sequence                             |
238    +--------------------+------------------------------------------------------+
239    | query_description  | description of the query sequence                    |
240    +--------------------+------------------------------------------------------+
241    | query_inter_ranges | list of query sequence coordinates of the regions    |
242    |                    | between fragments                                    |
243    +--------------------+------------------------------------------------------+
244    | query_inter_spans  | list of lengths of the regions between query         |
245    |                    | fragments                                            |
246    +--------------------+------------------------------------------------------+
247
248    .. [1] may be used in HSPs with multiple fragments
249
250    """
251
252    # attributes we don't want to transfer when creating a new Hit class
253    # from this one
254    _NON_STICKY_ATTRS = ("_items",)
255
256    def __init__(self, fragments=(), output_index=-1):
257        """Initialize an HSP object.
258
259        :param fragments: fragments contained in the HSP object
260        :type fragments: iterable yielding HSPFragment
261        :param output_index: optional index / ordering of the HSP fragment in
262            the original input file.
263        :type output_index: integer
264
265        HSP objects must be initialized with a list containing at least one
266        HSPFragment object. If multiple HSPFragment objects are used for
267        initialization, they must all have the same ``query_id``,
268        ``query_description``, ``hit_id``, ``hit_description``, and
269        ``molecule_type`` properties.
270
271        """
272        if not fragments:
273            raise ValueError("HSP objects must have at least one HSPFragment object.")
274        # TODO - Move this into the for look in case hsps is a single use
275        # iterable?
276        # check that all fragments contain the same IDs, descriptions,
277        # molecule_type
278        for attr in (
279            "query_id",
280            "query_description",
281            "hit_id",
282            "hit_description",
283            "molecule_type",
284        ):
285            if len({getattr(frag, attr) for frag in fragments}) != 1:
286                raise ValueError(
287                    "HSP object can not contain fragments with more than one %s." % attr
288                )
289
290        self.output_index = output_index
291        self._items = []
292        for fragment in fragments:
293            self._validate_fragment(fragment)
294            self._items.append(fragment)
295
296    def __repr__(self):
297        """Return string representation of HSP object."""
298        return "%s(hit_id=%r, query_id=%r, %r fragments)" % (
299            self.__class__.__name__,
300            self.hit_id,
301            self.query_id,
302            len(self),
303        )
304
305    def __iter__(self):
306        """Iterate over HSP items."""
307        return iter(self._items)
308
309    def __contains__(self, fragment):
310        """Return True if HSPFragment is on HSP items."""
311        return fragment in self._items
312
313    def __len__(self):
314        """Return number of HSPs items."""
315        return len(self._items)
316
317    def __bool__(self):
318        """Return True if it has HSPs."""
319        return bool(self._items)
320
321    def __str__(self):
322        """Return a human readable summary of the HSP object."""
323        lines = []
324        # set hsp info line
325        statline = []
326        # evalue
327        evalue = getattr_str(self, "evalue", fmt="%.2g")
328        statline.append("evalue " + evalue)
329        # bitscore
330        bitscore = getattr_str(self, "bitscore", fmt="%.2f")
331        statline.append("bitscore " + bitscore)
332        lines.append("Quick stats: " + "; ".join(statline))
333
334        if len(self.fragments) == 1:
335            return "\n".join(
336                [self._str_hsp_header(), "\n".join(lines), self.fragments[0]._str_aln()]
337            )
338        else:
339            lines.append(
340                "  Fragments: %s  %s  %s  %s" % ("-" * 3, "-" * 14, "-" * 22, "-" * 22)
341            )
342            pattern = "%16s  %14s  %22s  %22s"
343            lines.append(pattern % ("#", "Span", "Query range", "Hit range"))
344            lines.append(pattern % ("-" * 3, "-" * 14, "-" * 22, "-" * 22))
345            for idx, block in enumerate(self.fragments):
346                # set hsp line and table
347                # alignment span
348                aln_span = getattr_str(block, "aln_span")
349                # query region
350                query_start = getattr_str(block, "query_start")
351                query_end = getattr_str(block, "query_end")
352                query_range = "[%s:%s]" % (query_start, query_end)
353                # max column length is 20
354                query_range = (
355                    query_range[:20] + "~]" if len(query_range) > 22 else query_range
356                )
357                # hit region
358                hit_start = getattr_str(block, "hit_start")
359                hit_end = getattr_str(block, "hit_end")
360                hit_range = "[%s:%s]" % (hit_start, hit_end)
361                hit_range = hit_range[:20] + "~]" if len(hit_range) > 22 else hit_range
362                # append the hsp row
363                lines.append(pattern % (str(idx), aln_span, query_range, hit_range))
364
365            return self._str_hsp_header() + "\n" + "\n".join(lines)
366
367    def __getitem__(self, idx):
368        """Return object of index idx."""
369        # if key is slice, return a new HSP instance
370        if isinstance(idx, slice):
371            obj = self.__class__(self._items[idx])
372            self._transfer_attrs(obj)
373            return obj
374        return self._items[idx]
375
376    def __setitem__(self, idx, fragments):
377        """Set an item of index idx with the given fragments."""
378        # handle case if hsps is a list of hsp
379        if isinstance(fragments, (list, tuple)):
380            for fragment in fragments:
381                self._validate_fragment(fragment)
382        else:
383            self._validate_fragment(fragments)
384
385        self._items[idx] = fragments
386
387    def __delitem__(self, idx):
388        """Delete item of index idx."""
389        # note that this may result in an empty HSP object, which should be
390        # invalid
391        del self._items[idx]
392
393    def _validate_fragment(self, fragment):
394        if not isinstance(fragment, HSPFragment):
395            raise TypeError("HSP objects can only contain HSPFragment objects.")
396        # HACK: to make validation during __init__ work
397        if self._items:
398            if fragment.hit_id != self.hit_id:
399                raise ValueError(
400                    "Expected HSPFragment with hit ID %r, found %r instead."
401                    % (self.id, fragment.hit_id)
402                )
403
404            if fragment.hit_description != self.hit_description:
405                raise ValueError(
406                    "Expected HSPFragment with hit description %r, found %r instead."
407                    % (self.description, fragment.hit_description)
408                )
409
410            if fragment.query_id != self.query_id:
411                raise ValueError(
412                    "Expected HSPFragment with query ID %r, found %r instead."
413                    % (self.query_id, fragment.query_id)
414                )
415
416            if fragment.query_description != self.query_description:
417                raise ValueError(
418                    "Expected HSP with query description %r, found %r instead."
419                    % (self.query_description, fragment.query_description)
420                )
421
422    def _aln_span_get(self):
423        # length of all alignments
424        # alignment span can be its own attribute, or computed from
425        # query / hit length
426        return sum(frg.aln_span for frg in self.fragments)
427
428    aln_span = property(
429        fget=_aln_span_get, doc="Total number of columns in all HSPFragment objects."
430    )
431
432    # coordinate properties #
433    def _get_coords(self, seq_type, coord_type):
434        assert seq_type in ("hit", "query")
435        assert coord_type in ("start", "end")
436        coord_name = "%s_%s" % (seq_type, coord_type)
437        coords = [getattr(frag, coord_name) for frag in self.fragments]
438        if None in coords:
439            warnings.warn(
440                "'None' exist in %s coordinates; ignored" % (coord_name),
441                BiopythonWarning,
442            )
443        return coords
444
445    def _hit_start_get(self):
446        return min(self._get_coords("hit", "start"))
447
448    hit_start = property(
449        fget=_hit_start_get, doc="Smallest coordinate value of all hit fragments."
450    )
451
452    def _query_start_get(self):
453        return min(self._get_coords("query", "start"))
454
455    query_start = property(
456        fget=_query_start_get, doc="Smallest coordinate value of all query fragments."
457    )
458
459    def _hit_end_get(self):
460        return max(self._get_coords("hit", "end"))
461
462    hit_end = property(
463        fget=_hit_end_get, doc="Largest coordinate value of all hit fragments."
464    )
465
466    def _query_end_get(self):
467        return max(self._get_coords("query", "end"))
468
469    query_end = property(
470        fget=_query_end_get, doc="Largest coordinate value of all hit fragments."
471    )
472
473    # coordinate-dependent properties #
474    def _hit_span_get(self):
475        try:
476            return self.hit_end - self.hit_start
477        except TypeError:  # triggered if any of the coordinates are None
478            return None
479
480    hit_span = property(
481        fget=_hit_span_get, doc="The number of hit residues covered by the HSP."
482    )
483
484    def _query_span_get(self):
485        try:
486            return self.query_end - self.query_start
487        except TypeError:  # triggered if any of the coordinates are None
488            return None
489
490    query_span = property(
491        fget=_query_span_get, doc="The number of query residues covered by the HSP."
492    )
493
494    def _hit_range_get(self):
495        return (self.hit_start, self.hit_end)
496
497    hit_range = property(
498        fget=_hit_range_get, doc="Tuple of HSP hit start and end coordinates."
499    )
500
501    def _query_range_get(self):
502        return (self.query_start, self.query_end)
503
504    query_range = property(
505        fget=_query_range_get, doc="Tuple of HSP query start and end coordinates."
506    )
507
508    def _inter_ranges_get(self, seq_type):
509        # this property assumes that there are no mixed strands in a hit/query
510        assert seq_type in ("query", "hit")
511        strand = getattr(self, "%s_strand_all" % seq_type)[0]
512        coords = getattr(self, "%s_range_all" % seq_type)
513        # determine function used to set inter range
514        # start and end coordinates, given two pairs
515        # of fragment start and end coordinates
516        if strand == -1:
517            startfunc, endfunc = min, max
518        else:
519            startfunc, endfunc = max, min
520        inter_coords = []
521        for idx, coord in enumerate(coords[:-1]):
522            start = startfunc(coords[idx])
523            end = endfunc(coords[idx + 1])
524            inter_coords.append((min(start, end), max(start, end)))
525
526        return inter_coords
527
528    def _hit_inter_ranges_get(self):
529        return self._inter_ranges_get("hit")
530
531    hit_inter_ranges = property(
532        fget=_hit_inter_ranges_get,
533        doc="Hit sequence coordinates of the regions between fragments.",
534    )
535
536    def _query_inter_ranges_get(self):
537        return self._inter_ranges_get("query")
538
539    query_inter_ranges = property(
540        fget=_query_inter_ranges_get,
541        doc="Query sequence coordinates of the regions between fragments.",
542    )
543
544    def _inter_spans_get(self, seq_type):
545        assert seq_type in ("query", "hit")
546        attr_name = "%s_inter_ranges" % seq_type
547        return [coord[1] - coord[0] for coord in getattr(self, attr_name)]
548
549    def _hit_inter_spans_get(self):
550        return self._inter_spans_get("hit")
551
552    hit_inter_spans = property(
553        fget=_hit_inter_spans_get, doc="Lengths of regions between hit fragments."
554    )
555
556    def _query_inter_spans_get(self):
557        return self._inter_spans_get("query")
558
559    query_inter_spans = property(
560        fget=_query_inter_spans_get, doc="Lengths of regions between query fragments."
561    )
562
563    # shortcuts for fragments' properties #
564
565    # bool check if there's more than one fragments
566    is_fragmented = property(
567        lambda self: len(self) > 1,
568        doc="Whether the HSP has more than one HSPFragment objects.",
569    )
570
571    # first item properties with setters
572    hit_description = fullcascade(
573        "hit_description", doc="Description of the hit sequence."
574    )
575
576    query_description = fullcascade(
577        "query_description", doc="Description of the query sequence."
578    )
579
580    hit_id = fullcascade("hit_id", doc="ID of the hit sequence.")
581
582    query_id = fullcascade("query_id", doc="ID of the query sequence.")
583
584    molecule_type = fullcascade(
585        "molecule_type", doc="molecule_type of the hit and query SeqRecord objects."
586    )
587
588    # properties for single-fragment HSPs
589    fragment = singleitem(doc="HSPFragment object, first fragment.")
590
591    hit = singleitem("hit", doc="Hit sequence as a SeqRecord object, first fragment.")
592
593    query = singleitem(
594        "query", doc="Query sequence as a SeqRecord object, first fragment."
595    )
596
597    aln = singleitem(
598        "aln", doc="Alignment of the first fragment as a MultipleSeqAlignment object."
599    )
600
601    aln_annotation = singleitem(
602        "aln_annotation",
603        doc="Dictionary of annotation(s) of the first fragment's alignment.",
604    )
605
606    hit_features = singleitem(
607        "hit_features", doc="Hit sequence features, first fragment."
608    )
609
610    query_features = singleitem(
611        "query_features", doc="Query sequence features, first fragment."
612    )
613
614    hit_strand = singleitem("hit_strand", doc="Hit strand orientation, first fragment.")
615
616    query_strand = singleitem(
617        "query_strand", doc="Query strand orientation, first fragment."
618    )
619
620    hit_frame = singleitem(
621        "hit_frame", doc="Hit sequence reading frame, first fragment."
622    )
623
624    query_frame = singleitem(
625        "query_frame", doc="Query sequence reading frame, first fragment."
626    )
627
628    # properties for multi-fragment HSPs
629    fragments = allitems(doc="List of all HSPFragment objects.")
630
631    hit_all = allitems(
632        "hit", doc="List of all fragments' hit sequences as SeqRecord objects."
633    )
634
635    query_all = allitems(
636        "query", doc="List of all fragments' query sequences as SeqRecord objects."
637    )
638
639    aln_all = allitems(
640        "aln", doc="List of all fragments' alignments as MultipleSeqAlignment objects."
641    )
642
643    aln_annotation_all = allitems(
644        "aln_annotation",
645        doc="Dictionary of annotation(s) of all fragments' alignments.",
646    )
647
648    hit_features_all = allitems(
649        "hit_features", doc="List of all hit sequence features."
650    )
651
652    query_features_all = allitems(
653        "query_features", doc="List of all query sequence features."
654    )
655
656    hit_strand_all = allitems(
657        "hit_strand", doc="List of all fragments' hit sequence strands."
658    )
659
660    query_strand_all = allitems(
661        "query_strand", doc="List of all fragments' query sequence strands"
662    )
663
664    hit_frame_all = allitems(
665        "hit_frame", doc="List of all fragments' hit sequence reading frames."
666    )
667
668    query_frame_all = allitems(
669        "query_frame", doc="List of all fragments' query sequence reading frames."
670    )
671
672    hit_start_all = allitems(
673        "hit_start", doc="List of all fragments' hit start coordinates."
674    )
675
676    query_start_all = allitems(
677        "query_start", doc="List of all fragments' query start coordinates."
678    )
679
680    hit_end_all = allitems("hit_end", doc="List of all fragments' hit end coordinates.")
681
682    query_end_all = allitems(
683        "query_end", doc="List of all fragments' query end coordinates."
684    )
685
686    hit_span_all = allitems("hit_span", doc="List of all fragments' hit sequence size.")
687
688    query_span_all = allitems(
689        "query_span", doc="List of all fragments' query sequence size."
690    )
691
692    hit_range_all = allitems(
693        "hit_range", doc="List of all fragments' hit start and end coordinates."
694    )
695
696    query_range_all = allitems(
697        "query_range", doc="List of all fragments' query start and end coordinates."
698    )
699
700
701class HSPFragment(_BaseHSP):
702    """Class representing a contiguous alignment of hit-query sequence.
703
704    HSPFragment forms the core of any parsed search output file. Depending on
705    the search output file format, it may contain the actual query and/or hit
706    sequences that produces the search hits. These sequences are stored as
707    SeqRecord objects (see SeqRecord):
708
709    >>> from Bio import SearchIO
710    >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
711    >>> fragment = qresult[0][0][0]   # first hit, first hsp, first fragment
712    >>> print(fragment)
713          Query: 33211 mir_1
714            Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520...
715    Query range: [0:61] (1)
716      Hit range: [0:61] (1)
717      Fragments: 1 (61 columns)
718         Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
719                 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
720           Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG
721
722    # the query sequence is a SeqRecord object
723    >>> fragment.query.__class__
724    <class 'Bio.SeqRecord.SeqRecord'>
725    >>> print(fragment.query)
726    ID: 33211
727    Name: aligned query sequence
728    Description: mir_1
729    Number of features: 0
730    /molecule_type=DNA
731    Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG')
732
733    # the hit sequence is a SeqRecord object as well
734    >>> fragment.hit.__class__
735    <class 'Bio.SeqRecord.SeqRecord'>
736    >>> print(fragment.hit)
737    ID: gi|262205317|ref|NR_030195.1|
738    Name: aligned hit sequence
739    Description: Homo sapiens microRNA 520b (MIR520B), microRNA
740    Number of features: 0
741    /molecule_type=DNA
742    Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG')
743
744    # when both query and hit are present, we get a MultipleSeqAlignment object
745    >>> fragment.aln.__class__
746    <class 'Bio.Align.MultipleSeqAlignment'>
747    >>> print(fragment.aln)
748    Alignment with 2 rows and 61 columns
749    CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211
750    CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1|
751
752    """
753
754    def __init__(
755        self,
756        hit_id="<unknown id>",
757        query_id="<unknown id>",
758        hit=None,
759        query=None,
760        molecule_type=None,
761    ):
762        """Initialize the class."""
763        self._molecule_type = molecule_type
764        self.aln_annotation = {}
765
766        self._hit_id = hit_id
767        self._query_id = query_id
768
769        for seq_type in ("query", "hit"):
770            # query or hit attributes default attributes
771            setattr(self, "_%s_description" % seq_type, "<unknown description>")
772            setattr(self, "_%s_features" % seq_type, [])
773            # query or hit attributes whose default attribute is None
774            for attr in ("strand", "frame", "start", "end"):
775                setattr(self, "%s_%s" % (seq_type, attr), None)
776            # self.query or self.hit
777            if eval(seq_type):
778                setattr(self, seq_type, eval(seq_type))
779            else:
780                setattr(self, seq_type, None)
781
782    def __repr__(self):
783        """Return HSPFragment info; hit id, query id, number of columns."""
784        info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id)
785        try:
786            info += ", %i columns" % len(self)
787        except AttributeError:
788            pass
789        return "%s(%s)" % (self.__class__.__name__, info)
790
791    def __len__(self):
792        """Return alignment span."""
793        return self.aln_span
794
795    def __str__(self):
796        """Return string of HSP header and alignments."""
797        return self._str_hsp_header() + "\n" + self._str_aln()
798
799    def __getitem__(self, idx):
800        """Return object of index idx."""
801        if self.aln is not None:
802            obj = self.__class__(
803                hit_id=self.hit_id,
804                query_id=self.query_id,
805                molecule_type=self.molecule_type,
806            )
807            # transfer query and hit attributes
808            # let SeqRecord handle feature slicing, then retrieve the sliced
809            # features into the sliced HSPFragment
810            if self.query is not None:
811                obj.query = self.query[idx]
812                obj.query_features = obj.query.features
813            if self.hit is not None:
814                obj.hit = self.hit[idx]
815                obj.hit_features = obj.hit.features
816            # description, strand, frame
817            for attr in ("description", "strand", "frame"):
818                for seq_type in ("hit", "query"):
819                    attr_name = "%s_%s" % (seq_type, attr)
820                    self_val = getattr(self, attr_name)
821                    setattr(obj, attr_name, self_val)
822            # alignment annotation should be transferred, since we can compute
823            # the resulting annotation
824            obj.aln_annotation = {}
825            for key, value in self.aln_annotation.items():
826                assert len(value[idx]) == len(obj)
827                obj.aln_annotation[key] = value[idx]
828            return obj
829        else:
830            raise TypeError(
831                "Slicing for HSP objects without alignment is not supported."
832            )
833
834    def _str_aln(self):
835        lines = []
836        # alignment length
837        aln_span = getattr_str(self, "aln_span")
838        lines.append("  Fragments: 1 (%s columns)" % aln_span)
839        # sequences
840        if self.query is not None and self.hit is not None:
841            try:
842                qseq = self.query.seq
843            except AttributeError:  # query is None
844                qseq = "?"
845            try:
846                hseq = self.hit.seq
847            except AttributeError:  # hit is None
848                hseq = "?"
849
850            # similarity line
851            simil = ""
852            if "similarity" in self.aln_annotation and isinstance(
853                self.aln_annotation.get("similarity"), str
854            ):
855                simil = self.aln_annotation["similarity"]
856
857            if self.aln_span <= 67:
858                lines.append("%10s - %s" % ("Query", qseq))
859                if simil:
860                    lines.append("             %s" % simil)
861                lines.append("%10s - %s" % ("Hit", hseq))
862            else:
863                # adjust continuation character length, so we don't display
864                # the same residues twice
865                if self.aln_span - 66 > 3:
866                    cont = "~" * 3
867                else:
868                    cont = "~" * (self.aln_span - 66)
869                lines.append("%10s - %s%s%s" % ("Query", qseq[:59], cont, qseq[-5:]))
870                if simil:
871                    lines.append("             %s%s%s" % (simil[:59], cont, simil[-5:]))
872                lines.append("%10s - %s%s%s" % ("Hit", hseq[:59], cont, hseq[-5:]))
873
874        return "\n".join(lines)
875
876    # sequence properties #
877    def _set_seq(self, seq, seq_type):
878        """Check the given sequence for attribute setting (PRIVATE).
879
880        :param seq: sequence to check
881        :type seq: string or SeqRecord
882        :param seq_type: sequence type
883        :type seq_type: string, choice of 'hit' or 'query'
884
885        """
886        assert seq_type in ("hit", "query")
887        if seq is None:
888            return seq  # return immediately if seq is None
889        else:
890            if not isinstance(seq, (str, SeqRecord)):
891                raise TypeError(
892                    "%s sequence must be a string or a SeqRecord object." % seq_type
893                )
894        # check length if the opposite sequence is not None
895        opp_type = "hit" if seq_type == "query" else "query"
896        opp_seq = getattr(self, "_%s" % opp_type, None)
897        if opp_seq is not None:
898            if len(seq) != len(opp_seq):
899                raise ValueError(
900                    "Sequence lengths do not match. Expected: %r (%s); found: %r (%s)."
901                    % (len(opp_seq), opp_type, len(seq), seq_type)
902                )
903
904        seq_id = getattr(self, "%s_id" % seq_type)
905        seq_desc = getattr(self, "%s_description" % seq_type)
906        seq_feats = getattr(self, "%s_features" % seq_type)
907        seq_name = "aligned %s sequence" % seq_type
908
909        if isinstance(seq, SeqRecord):
910            seq.id = seq_id
911            seq.description = seq_desc
912            seq.name = seq_name
913            seq.features = seq_feats
914            seq.annotations["molecule_type"] = self.molecule_type
915        elif isinstance(seq, str):
916            seq = SeqRecord(
917                Seq(seq),
918                id=seq_id,
919                name=seq_name,
920                description=seq_desc,
921                features=seq_feats,
922                annotations={"molecule_type": self.molecule_type},
923            )
924
925        return seq
926
927    def _hit_get(self):
928        return self._hit
929
930    def _hit_set(self, value):
931        self._hit = self._set_seq(value, "hit")
932
933    hit = property(
934        fget=_hit_get,
935        fset=_hit_set,
936        doc="Hit sequence as a SeqRecord object, defaults to None.",
937    )
938
939    def _query_get(self):
940        return self._query
941
942    def _query_set(self, value):
943        self._query = self._set_seq(value, "query")
944
945    query = property(
946        fget=_query_get,
947        fset=_query_set,
948        doc="Query sequence as a SeqRecord object, defaults to None.",
949    )
950
951    def _aln_get(self):
952        if self.query is None and self.hit is None:
953            return None
954        if self.hit is None:
955            msa = MultipleSeqAlignment([self.query])
956        elif self.query is None:
957            msa = MultipleSeqAlignment([self.hit])
958        else:
959            msa = MultipleSeqAlignment([self.query, self.hit])
960        molecule_type = self.molecule_type
961        if molecule_type is not None:
962            msa.molecule_type = molecule_type
963        return msa
964
965    aln = property(
966        fget=_aln_get,
967        doc="Query-hit alignment as a MultipleSeqAlignment object, defaults to None.",
968    )
969
970    def _molecule_type_get(self):
971        return self._molecule_type
972
973    def _molecule_type_set(self, value):
974        self._molecule_type = value
975        try:
976            self.query.annotations["molecule_type"] = value
977        except AttributeError:
978            pass
979        try:
980            self.hit.annotations["molecule_type"] = value
981        except AttributeError:
982            pass
983
984    molecule_type = property(
985        fget=_molecule_type_get,
986        fset=_molecule_type_set,
987        doc="molecule type used in the fragment's "
988        "sequence records and alignment, defaults to None.",
989    )
990
991    def _aln_span_get(self):
992        # length of alignment (gaps included)
993        # alignment span can be its own attribute, or computed from
994        # query / hit length
995        try:
996            self._aln_span
997        except AttributeError:
998            if self.query is not None:
999                self._aln_span = len(self.query)
1000            elif self.hit is not None:
1001                self._aln_span = len(self.hit)
1002
1003        return self._aln_span
1004
1005    def _aln_span_set(self, value):
1006        self._aln_span = value
1007
1008    aln_span = property(
1009        fget=_aln_span_get,
1010        fset=_aln_span_set,
1011        doc="The number of alignment columns covered by the fragment.",
1012    )
1013
1014    # id, description, and features properties #
1015    hit_description = fragcascade("description", "hit", doc="Hit sequence description.")
1016
1017    query_description = fragcascade(
1018        "description", "query", doc="Query sequence description."
1019    )
1020
1021    hit_id = fragcascade("id", "hit", doc="Hit sequence ID.")
1022
1023    query_id = fragcascade("id", "query", doc="Query sequence ID.")
1024
1025    hit_features = fragcascade("features", "hit", doc="Hit sequence features.")
1026
1027    query_features = fragcascade("features", "query", doc="Query sequence features.")
1028
1029    # strand properties #
1030    def _prep_strand(self, strand):
1031        # follow SeqFeature's convention
1032        if strand not in (-1, 0, 1, None):
1033            raise ValueError("Strand should be -1, 0, 1, or None; not %r" % strand)
1034        return strand
1035
1036    def _get_strand(self, seq_type):
1037        assert seq_type in ("hit", "query")
1038        strand = getattr(self, "_%s_strand" % seq_type)
1039
1040        if strand is None:
1041            # try to compute strand from frame
1042            frame = getattr(self, "%s_frame" % seq_type)
1043            if frame is not None:
1044                try:
1045                    strand = frame // abs(frame)
1046                except ZeroDivisionError:
1047                    strand = 0
1048                setattr(self, "%s_strand" % seq_type, strand)
1049
1050        return strand
1051
1052    def _hit_strand_get(self):
1053        return self._get_strand("hit")
1054
1055    def _hit_strand_set(self, value):
1056        self._hit_strand = self._prep_strand(value)
1057
1058    hit_strand = property(
1059        fget=_hit_strand_get,
1060        fset=_hit_strand_set,
1061        doc="Hit sequence strand, defaults to None.",
1062    )
1063
1064    def _query_strand_get(self):
1065        return self._get_strand("query")
1066
1067    def _query_strand_set(self, value):
1068        self._query_strand = self._prep_strand(value)
1069
1070    query_strand = property(
1071        fget=_query_strand_get,
1072        fset=_query_strand_set,
1073        doc="Query sequence strand, defaults to None.",
1074    )
1075
1076    # frame properties #
1077    def _prep_frame(self, frame):
1078        if frame not in (-3, -2, -1, 0, 1, 2, 3, None):
1079            raise ValueError(
1080                "Strand should be an integer between -3 and 3, or None; not %r" % frame
1081            )
1082        return frame
1083
1084    def _hit_frame_get(self):
1085        return self._hit_frame
1086
1087    def _hit_frame_set(self, value):
1088        self._hit_frame = self._prep_frame(value)
1089
1090    hit_frame = property(
1091        fget=_hit_frame_get,
1092        fset=_hit_frame_set,
1093        doc="Hit sequence reading frame, defaults to None.",
1094    )
1095
1096    def _query_frame_get(self):
1097        """Get query sequence reading frame (PRIVATE)."""
1098        return self._query_frame
1099
1100    def _query_frame_set(self, value):
1101        """Set query sequence reading frame (PRIVATE)."""
1102        self._query_frame = self._prep_frame(value)
1103
1104    query_frame = property(
1105        fget=_query_frame_get,
1106        fset=_query_frame_set,
1107        doc="Query sequence reading frame, defaults to None.",
1108    )
1109
1110    # coordinate properties #
1111    def _prep_coord(self, coord, opp_coord_name, op):
1112        # coord must either be None or int
1113        if coord is None:
1114            return coord
1115        assert isinstance(coord, int)
1116        # try to get opposite coordinate, if it's not present, return
1117        try:
1118            opp_coord = getattr(self, opp_coord_name)
1119        except AttributeError:
1120            return coord
1121        # if opposite coordinate is None, return
1122        if opp_coord is None:
1123            return coord
1124        # otherwise compare it to coord ('>=' or '<=')
1125        else:
1126            assert op(coord, opp_coord)
1127        return coord
1128
1129    def _hit_start_get(self):
1130        """Get the sequence hit start coordinate (PRIVATE)."""
1131        return self._hit_start
1132
1133    def _hit_start_set(self, value):
1134        """Set the sequence hit start coordinate (PRIVATE)."""
1135        self._hit_start = self._prep_coord(value, "hit_end", le)
1136
1137    hit_start = property(
1138        fget=_hit_start_get,
1139        fset=_hit_start_set,
1140        doc="Hit sequence start coordinate, defaults to None.",
1141    )
1142
1143    def _query_start_get(self):
1144        """Get the query sequence start coordinate (PRIVATE)."""
1145        return self._query_start
1146
1147    def _query_start_set(self, value):
1148        """Set the query sequence start coordinate (PRIVATE)."""
1149        self._query_start = self._prep_coord(value, "query_end", le)
1150
1151    query_start = property(
1152        fget=_query_start_get,
1153        fset=_query_start_set,
1154        doc="Query sequence start coordinate, defaults to None.",
1155    )
1156
1157    def _hit_end_get(self):
1158        """Get the hit sequence end coordinate (PRIVATE)."""
1159        return self._hit_end
1160
1161    def _hit_end_set(self, value):
1162        """Set the hit sequence end coordinate (PRIVATE)."""
1163        self._hit_end = self._prep_coord(value, "hit_start", ge)
1164
1165    hit_end = property(
1166        fget=_hit_end_get,
1167        fset=_hit_end_set,
1168        doc="Hit sequence end coordinate, defaults to None.",
1169    )
1170
1171    def _query_end_get(self):
1172        """Get the query sequence end coordinate (PRIVATE)."""
1173        return self._query_end
1174
1175    def _query_end_set(self, value):
1176        """Set the query sequence end coordinate (PRIVATE)."""
1177        self._query_end = self._prep_coord(value, "query_start", ge)
1178
1179    query_end = property(
1180        fget=_query_end_get,
1181        fset=_query_end_set,
1182        doc="Query sequence end coordinate, defaults to None.",
1183    )
1184
1185    # coordinate-dependent properties #
1186    def _hit_span_get(self):
1187        """Return the number of residues covered by the hit sequence (PRIVATE)."""
1188        try:
1189            return self.hit_end - self.hit_start
1190        except TypeError:  # triggered if any of the coordinates are None
1191            return None
1192
1193    hit_span = property(
1194        fget=_hit_span_get, doc="The number of residues covered by the hit sequence."
1195    )
1196
1197    def _query_span_get(self):
1198        """Return the number or residues covered by the query (PRIVATE)."""
1199        try:
1200            return self.query_end - self.query_start
1201        except TypeError:  # triggered if any of the coordinates are None
1202            return None
1203
1204    query_span = property(
1205        fget=_query_span_get,
1206        doc="The number of residues covered by the query sequence.",
1207    )
1208
1209    def _hit_range_get(self):
1210        """Return the start and end of a hit (PRIVATE)."""
1211        return (self.hit_start, self.hit_end)
1212
1213    hit_range = property(
1214        fget=_hit_range_get, doc="Tuple of hit start and end coordinates."
1215    )
1216
1217    def _query_range_get(self):
1218        """Return the start and end of a query (PRIVATE)."""
1219        return (self.query_start, self.query_end)
1220
1221    query_range = property(
1222        fget=_query_range_get, doc="Tuple of query start and end coordinates."
1223    )
1224
1225
1226# if not used as a module, run the doctest
1227if __name__ == "__main__":
1228    from Bio._utils import run_doctest
1229
1230    run_doctest()
1231