1# Copyright 1999-2000 by Jeffrey Chang.  All rights reserved.
2#
3# This file is part of the Biopython distribution and governed by your
4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5# Please see the LICENSE file that should have been included as part of this
6# package.
7"""Record classes to hold BLAST output.
8
Classes:
Blast              Holds all the information from a blast search.
PSIBlast           Holds all the information from a psi-blast search.

Header             Holds information from the header.
Description        Holds information about one hit description.
DescriptionExt     Holds an extended hit description (BLAST XML version 2).
DescriptionExtItem Holds one record of an extended hit description.
Alignment          Holds information about one alignment hit.
HSP                Holds information about one HSP.
MultipleAlignment  Holds information about a multiple alignment.
Round              Holds information from one PSI-BLAST round.
DatabaseReport     Holds information from the database report.
Parameters         Holds information from the parameters.
20
21"""
22# XXX finish printable BLAST output
23
24from Bio.Seq import Seq
25from Bio.SeqRecord import SeqRecord
26from Bio.Align import MultipleSeqAlignment
27
28
def fmt_(value, format_spec="%s", default_str="<unknown>"):
    """Format a value for display, substituting a placeholder for None.

    Arguments:
     - value - object to format; None means no value is available.
     - format_spec - printf-style format string, applied as
       ``format_spec % value``.
     - default_str - text returned as-is when value is None.

    Returns the formatted string, or default_str when value is None.
    """
    return default_str if value is None else format_spec % value
34
35
class Header:
    """Record for the header section of a BLAST report.

    Members:
    application         Name of the BLAST flavor that produced the data.
    version             BLAST version used.
    date                Date the data was generated.
    reference           Reference for BLAST.

    query               Name of the query sequence.
    query_letters       Number of letters in the query sequence.  (int)

    database            Name of the database.
    database_sequences  Number of sequences in the database.  (int)
    database_letters    Number of letters in the database.  (int)

    Text members start out as empty strings, numeric members as None.

    """

    def __init__(self):
        """Initialize the class."""
        # Program information, then the query name.
        self.application = self.version = self.date = self.reference = ""
        self.query = ""
        self.query_letters = None

        # Database information.
        self.database = ""
        self.database_sequences = self.database_letters = None
67
68
class Description:
    """Record for a single hit listed in the descriptions section.

    Members:
    title           Title of the hit.
    score           Number of bits.  (int)
    bits            Bit score. (float)
    e               E value.  (float)
    num_alignments  Number of alignments for the same subject.  (int)

    The title starts as an empty string; every other member starts as None.
    """

    def __init__(self):
        """Initialize the class."""
        self.title = ""
        self.score = self.bits = self.e = self.num_alignments = None

    def __str__(self):
        """Return the title, score and E value as one formatted line."""
        # !s forces str() first so that None values can still be padded:
        # title left-justified to 66 columns, score right-justified to 5.
        return "{!s:<66} {!s:>5}  {!s}".format(self.title, self.score, self.e)
91
92
class DescriptionExt(Description):
    """Description record extended with per-record items (BLASTXML version 2).

    Members:
    items           List of DescriptionExtItem
    + members inherited from Description
    """

    def __init__(self):
        """Initialize the class."""
        super().__init__()

        self.items = []

    def append_item(self, item):
        """Append an extended description item to this record.

        The first item appended also supplies the record's title, taken
        from the string form of that item.
        """
        if not self.items:
            self.title = str(item)
        self.items.append(item)
111
112
class DescriptionExtItem:
    """One record inside a hit description for BLASTXML version 2.

    Members:
    id              Database identifier
    title           Title of the hit.

    The attributes accession, taxid and sciname are created as well.
    All members start as None.
    """

    def __init__(self):
        """Initialize the class."""
        self.id = self.title = None
        self.accession = self.taxid = self.sciname = None

    def __str__(self):
        """Return the identifier and the title separated by one space."""
        return " ".join(str(part) for part in (self.id, self.title))
132
133
class Alignment:
    """Record for a single hit in the alignments section.

    Members:
    title      Name.
    hit_id     Hit identifier. (str)
    hit_def    Hit definition. (str)
    length     Length.  (int)
    hsps       A list of HSP objects.

    """

    def __init__(self):
        """Initialize the class."""
        self.title = self.hit_id = self.hit_def = ""
        self.length = None
        self.hsps = []

    def __str__(self):
        """Return the title lines followed by a length line.

        Every line after the first is indented by the joining separator;
        the result ends with a newline.
        """
        pieces = [*self.title.split("\n"), "Length = %s\n" % self.length]
        return "\n           ".join(pieces)
159
160
class HSP:
    """Record for one HSP belonging to an alignment hit.

    Members:
        - score           BLAST score of hit.  (float)
        - bits            Number of bits for that score.  (float)
        - expect          Expect value.  (float)
        - num_alignments  Number of alignments for same subject.  (int)
        - identities      Number of identities (int) if using the XML parser.
          Tuple of number of identities/total aligned (int, int)
          if using the (obsolete) plain text parser.
        - positives       Number of positives (int) if using the XML parser.
          Tuple of number of positives/total aligned (int, int)
          if using the (obsolete) plain text parser.
        - gaps            Number of gaps (int) if using the XML parser.
          Tuple of number of gaps/total aligned (int, int) if
          using the (obsolete) plain text parser.
        - align_length    Length of the alignment. (int)
        - strand          Tuple of (query, target) strand.
        - frame           Tuple of 1 or 2 frame shifts, depending on the flavor.

        - query           The query sequence.
        - query_start     The start residue for the query sequence.  (1-based)
        - query_end       The end residue for the query sequence.  (1-based)
        - match           The match sequence.
        - sbjct           The sbjct sequence.
        - sbjct_start     The start residue for the sbjct sequence.  (1-based)
        - sbjct_end       The end residue for the sbjct sequence.  (1-based)

    Which attributes are filled in depends on the BLAST flavor::

                  score     expect     identities   positives    strand  frame
        BLASTP     X          X            X            X
        BLASTN     X          X            X            X          X
        BLASTX     X          X            X            X                  X
        TBLASTN    X          X            X            X                  X
        TBLASTX    X          X            X            X                 X/X

    Note: BLASTX displays the query as a protein sequence while numbering
    it by nucleotide, so the numbering is 3x larger than the number of
    amino acid residues.  The same holds for the sbjct sequence in TBLASTN
    and for both sequences in TBLASTX.

    Also, for negative frames, the sequence numbering starts from
    query_start and counts down.

    """

    def __init__(self):
        """Initialize the class."""
        # Statistics.
        self.score = self.bits = self.expect = self.num_alignments = None
        self.identities = self.positives = self.gaps = (None, None)
        self.align_length = None
        self.strand = (None, None)
        self.frame = ()

        # Aligned sequences and their coordinates.
        self.query = ""
        self.query_start = self.query_end = None
        self.match = self.sbjct = ""
        self.sbjct_start = self.sbjct_end = None

    def __str__(self):
        """Return the BLAST HSP as a formatted string.

        The first line summarises score, bits, expectation and alignment
        length.  When the alignment length is known, three more lines show
        the query, match and sbjct sequences; for lengths of 50 or more
        only the first 45 and the last 3 characters of each are shown.
        """
        summary = "Score %s (%s bits), expectation %s, alignment length %s" % (
            fmt_(self.score, "%i"),
            fmt_(self.bits, "%i"),
            fmt_(self.expect, "%0.1e"),
            fmt_(self.align_length, "%i"),
        )
        length = self.align_length
        if length is None:
            # Without a length there is no way to choose a layout.
            return summary

        if length < 50:
            # Short alignment: print the sequences in full.
            body = [
                "Query:%8s %s %s" % (self.query_start, self.query, self.query_end),
                "               %s" % self.match,
                "Sbjct:%8s %s %s" % (self.sbjct_start, self.sbjct, self.sbjct_end),
            ]
        else:
            # Long alignment: keep the leading 45 and trailing 3 characters.
            head, tail = slice(None, 45), slice(-3, None)
            body = [
                "Query:%8s %s...%s %s"
                % (self.query_start, self.query[head], self.query[tail], self.query_end),
                "               %s...%s" % (self.match[head], self.match[tail]),
                "Sbjct:%8s %s...%s %s"
                % (self.sbjct_start, self.sbjct[head], self.sbjct[tail], self.sbjct_end),
            ]
        return "\n".join([summary] + body)
263
264
class MultipleAlignment:
    """Record for a multiple alignment.

    Members:
    alignment  A list of tuples (name, start residue, sequence, end residue).

    The start residue is 1-based.  It may be blank, if that sequence is
    not aligned in the multiple alignment.

    """

    def __init__(self):
        """Initialize the class."""
        self.alignment = []

    def to_generic(self):
        """Build a MultipleSeqAlignment from the stored tuples.

        Rather than the raw tuples, this returns a MultipleSeqAlignment
        object from Bio.Align, through which you can manipulate and query
        the alignment.

        Entries of the first block (up to the second "QUERY" entry) define
        the sequence names; entries of later blocks are appended, by
        position within their block, to the sequences collected so far.

        Thanks to James Casbon for the code.
        """
        names = []
        chunks = []
        block_count = 0
        position = 0
        for entry in self.alignment:
            name, _start, seq, _end = entry
            if name == "QUERY":  # QUERY opens every alignment block
                block_count += 1
                position = 0

            if block_count != 1:
                # Later block: extend the sequence at the same position.
                chunks[position] += seq
                position += 1
                continue

            # First block: register a new sequence.
            chunks.append(seq)
            names.append(name)

        return MultipleSeqAlignment(
            SeqRecord(Seq(text), label) for (label, text) in zip(names, chunks)
        )
309
310
class Round:
    """Record for a single round of a PSI-BLAST search.

    Members:
    number       Round number.  (int)
    reused_seqs  Sequences in model, found again.  List of Description objects.
    new_seqs     Sequences not found, or below threshold.  List of Description.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.

    The three lists start empty; number and multiple_alignment start as None.
    """

    def __init__(self):
        """Initialize the class."""
        self.number = None
        # Three separate list objects, one per member.
        self.reused_seqs, self.new_seqs, self.alignments = [], [], []
        self.multiple_alignment = None
329
330
class DatabaseReport:
    """Record for the database report section.

    Members:
    database_name              List of database names.  (can have multiple dbs)
    num_letters_in_database    Number of letters in the database; created
                               as an empty list.
    num_sequences_in_database  List of number of sequences in the database.
    posted_date                List of the dates the databases were posted.
    ka_params                  A tuple of (lambda, k, h) values.  (floats)
    gapped                     # XXX this isn't set right!  (created as 0)
    ka_params_gap              A tuple of (lambda, k, h) values.  (floats)

    """

    def __init__(self):
        """Initialize the class."""
        # Four independent lists, one per member.
        self.database_name, self.posted_date = [], []
        self.num_letters_in_database, self.num_sequences_in_database = [], []
        self.ka_params = (None, None, None)
        self.gapped = 0
        self.ka_params_gap = (None, None, None)
354
355
class Parameters:
    """Record for the parameters section of a BLAST report.

    Members:
    matrix              Name of the matrix.
    gap_penalties       Tuple of (open, extend) penalties.  (floats)
    sc_match            Match score for nucleotide-nucleotide comparison
    sc_mismatch         Mismatch penalty for nucleotide-nucleotide comparison
    num_hits            Number of hits to the database.  (int)
    num_sequences       Number of sequences.  (int)
    num_good_extends    Number of extensions.  (int)
    num_seqs_better_e   Number of sequences better than e-value.  (int)
    hsps_no_gap         Number of HSP's better, without gapping.  (int)
    hsps_prelim_gapped  Number of HSP's gapped in prelim test.  (int)
    hsps_prelim_gapped_attemped  Number of HSP's attempted in prelim.  (int)
    hsps_gapped         Total number of HSP's gapped.  (int)
    query_length        Length of the query.  (int)
    query_id            Identifier of the query sequence. (str)
    database_length     Number of letters in the database.  (int)
    effective_hsp_length         Effective HSP length.  (int)
    effective_query_length       Effective length of query.  (int)
    effective_database_length    Effective length of database.  (int)
    effective_search_space       Effective search space.  (int)
    effective_search_space_used  Effective search space used.  (int)
    frameshift          Frameshift window.  Tuple of (int, float)
    threshold           Threshold.  (int)
    window_size         Window size.  (int)
    dropoff_1st_pass    Tuple of (score, bits).  (int, float)
    gap_x_dropoff       Tuple of (score, bits).  (int, float)
    gap_x_dropoff_final Tuple of (score, bits).  (int, float)
    gap_trigger         Tuple of (score, bits).  (int, float)
    blast_cutoff        Tuple of (score, bits).  (int, float)

    The matrix name starts as an empty string, scalar members as None and
    tuple members as (None, None).
    """

    def __init__(self):
        """Initialize the class."""
        # Scoring scheme.
        self.matrix = ""
        self.gap_penalties = (None, None)
        self.sc_match = self.sc_mismatch = None

        # Search counters.
        self.num_hits = self.num_sequences = self.num_good_extends = None
        self.num_seqs_better_e = None
        self.hsps_no_gap = self.hsps_prelim_gapped = None
        self.hsps_prelim_gapped_attemped = self.hsps_gapped = None

        # Query and database sizes.
        self.query_id = self.query_length = self.database_length = None
        self.effective_hsp_length = self.effective_query_length = None
        self.effective_database_length = None
        self.effective_search_space = self.effective_search_space_used = None

        # Thresholds and cutoffs.
        self.frameshift = (None, None)
        self.threshold = self.window_size = None
        self.dropoff_1st_pass = self.gap_x_dropoff = (None, None)
        self.gap_x_dropoff_final = self.gap_trigger = self.blast_cutoff = (None, None)
420
421
422# TODO - Add a friendly __str__ method to BLAST results
class Blast(Header, DatabaseReport, Parameters):
    """Record for the complete results of a blast search.

    Members:
    descriptions        A list of Description objects.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.
    + members inherited from base classes

    """

    def __init__(self):
        """Initialize the class."""
        # Call each base initializer explicitly, in declaration order.
        for base in (Header, DatabaseReport, Parameters):
            base.__init__(self)
        self.descriptions, self.alignments = [], []
        self.multiple_alignment = None
442
443
class PSIBlast(Header, DatabaseReport, Parameters):
    """Record for the complete results of a blastpgp search.

    Members:
    rounds       A list of Round objects.
    converged    Whether the search converged.  (created as 0)
    + members inherited from base classes

    """

    def __init__(self):
        """Initialize the class."""
        # Call each base initializer explicitly, in declaration order.
        for base in (Header, DatabaseReport, Parameters):
            base.__init__(self)
        self.rounds = []
        self.converged = 0
461