1# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
2#
3# This file is part of the Biopython distribution and governed by your
4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5# Please see the LICENSE file that should have been included as part of this
6# package.
7
8"""Bio.SearchIO parser for BLAST+ plain text output formats.
9
10At the moment this is a wrapper around Biopython's NCBIStandalone text
11parser (which is now deprecated).
12
13"""
14
15from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
16from Bio.SearchIO._legacy import NCBIStandalone
17
18
19__all__ = ("BlastTextParser",)
20
21
22class BlastTextParser:
23    """Parser for the BLAST text format."""
24
25    def __init__(self, handle):
26        """Initialize the class."""
27        self.handle = handle
28        blast_parser = NCBIStandalone.BlastParser()
29        self.blast_iter = NCBIStandalone.Iterator(handle, blast_parser)
30
31    def __iter__(self):
32        """Iterate over BlastTextParser, yields query results."""
33        for rec in self.blast_iter:
34            # set attributes to SearchIO's
35            # get id and desc
36            if rec.query.startswith(">"):
37                rec.query = rec.query[1:]
38            try:
39                qid, qdesc = rec.query.split(" ", 1)
40            except ValueError:
41                qid, qdesc = rec.query, ""
42            qdesc = qdesc.replace("\n", "").replace("\r", "")
43
44            qresult = QueryResult(id=qid)
45            qresult.program = rec.application.lower()
46            qresult.target = rec.database
47            qresult.seq_len = rec.query_letters
48            qresult.version = rec.version
49
50            # determine molecule_type based on program
51            if qresult.program == "blastn":
52                molecule_type = "DNA"
53            elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]:
54                molecule_type = "protein"
55
56            # iterate over the 'alignments' (hits) and the hit table
57            for idx, aln in enumerate(rec.alignments):
58                # get id and desc
59                if aln.title.startswith("> "):
60                    aln.title = aln.title[2:]
61                elif aln.title.startswith(">"):
62                    aln.title = aln.title[1:]
63                try:
64                    hid, hdesc = aln.title.split(" ", 1)
65                except ValueError:
66                    hid, hdesc = aln.title, ""
67                hdesc = hdesc.replace("\n", "").replace("\r", "")
68
69                # iterate over the hsps and group them in a list
70                hsp_list = []
71                for bhsp in aln.hsps:
72                    frag = HSPFragment(hid, qid)
73                    frag.molecule_type = molecule_type
74                    # set alignment length
75                    frag.aln_span = bhsp.identities[1]
76                    # set frames
77                    try:
78                        frag.query_frame = int(bhsp.frame[0])
79                    except IndexError:
80                        if qresult.program in ("blastp", "tblastn"):
81                            frag.query_frame = 0
82                        else:
83                            frag.query_frame = 1
84                    try:
85                        frag.hit_frame = int(bhsp.frame[1])
86                    except IndexError:
87                        if qresult.program in ("blastp", "tblastn"):
88                            frag.hit_frame = 0
89                        else:
90                            frag.hit_frame = 1
91                    # set query coordinates
92                    frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
93                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
94                    # set hit coordinates
95                    frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
96                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
97                    # set query, hit sequences and its annotation
98                    qseq = ""
99                    hseq = ""
100                    midline = ""
101                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
102                        qchar, hchar, mchar = seqtrio
103                        if qchar == " " or hchar == " ":
104                            assert all(" " == x for x in seqtrio)
105                        else:
106                            qseq += qchar
107                            hseq += hchar
108                            midline += mchar
109                    frag.query, frag.hit = qseq, hseq
110                    frag.aln_annotation["similarity"] = midline
111
112                    # create HSP object with the fragment
113                    hsp = HSP([frag])
114                    hsp.evalue = bhsp.expect
115                    hsp.bitscore = bhsp.bits
116                    hsp.bitscore_raw = bhsp.score
117                    # set gap
118                    try:
119                        hsp.gap_num = bhsp.gaps[0]
120                    except IndexError:
121                        hsp.gap_num = 0
122                    # set identity
123                    hsp.ident_num = bhsp.identities[0]
124                    hsp.pos_num = bhsp.positives[0]
125                    if hsp.pos_num is None:
126                        hsp.pos_num = hsp[0].aln_span
127
128                    hsp_list.append(hsp)
129
130                hit = Hit(hsp_list)
131                hit.seq_len = aln.length
132                hit.description = hdesc
133                qresult.append(hit)
134
135            qresult.description = qdesc
136            yield qresult
137
138
139# if not used as a module, run the doctest
140if __name__ == "__main__":
141    from Bio._utils import run_doctest
142
143    run_doctest()
144