# Copyright 1999-2000 by Jeffrey Chang.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Record classes to hold BLAST output.

Classes:
Blast               Holds all the information from a blast search.
PSIBlast            Holds all the information from a psi-blast search.

Header              Holds information from the header.
Description         Holds information about one hit description.
DescriptionExt      Description holding a list of DescriptionExtItem
                    (BLASTXML version 2).
DescriptionExtItem  Holds one record of an extended hit description.
Alignment           Holds information about one alignment hit.
HSP                 Holds information about one HSP.
MultipleAlignment   Holds information about a multiple alignment.
Round               Holds information from one PSI-BLAST round.
DatabaseReport      Holds information from the database report.
Parameters          Holds information from the parameters.

"""
# XXX finish printable BLAST output

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment


def fmt_(value, format_spec="%s", default_str="<unknown>"):
    """Ensure the given value formats to a string correctly.

    Returns ``default_str`` when ``value`` is None; otherwise returns
    ``format_spec % value`` (printf-style formatting).
    """
    if value is None:
        return default_str
    return format_spec % value


class Header:
    """Saves information from a blast header.

    Members:
    application         The name of the BLAST flavor that generated this data.
    version             Version of blast used.
    date                Date this data was generated.
    reference           Reference for blast.

    query               Name of query sequence.
    query_letters       Number of letters in the query sequence.  (int)

    database            Name of the database.
    database_sequences  Number of sequences in the database.  (int)
    database_letters    Number of letters in the database.  (int)

    The string members start out as "" and the numeric members as None.

    """

    def __init__(self):
        """Initialize the class."""
        self.application = ""
        self.version = ""
        self.date = ""
        self.reference = ""

        self.query = ""
        self.query_letters = None

        self.database = ""
        self.database_sequences = None
        self.database_letters = None


class Description:
    """Stores information about one hit in the descriptions section.

    Members:
    title           Title of the hit.
    score           Number of bits.  (int)
    bits            Bit score. (float)
    e               E value.  (float)
    num_alignments  Number of alignments for the same subject.  (int)
    """

    def __init__(self):
        """Initialize the class."""
        self.title = ""
        self.score = None
        self.bits = None
        self.e = None
        self.num_alignments = None

    def __str__(self):
        """Return the description as a string.

        The title is left-justified to 66 characters, followed by the
        score (right-justified to 5 characters) and the E value.
        """
        return "%-66s %5s %s" % (self.title, self.score, self.e)


class DescriptionExt(Description):
    """Extended description record for BLASTXML version 2.

    Members:
    items           List of DescriptionExtItem
    """

    def __init__(self):
        """Initialize the class."""
        super().__init__()

        self.items = []

    def append_item(self, item):
        """Add a description extended record.

        The string form of the first item added also becomes the title of
        this description; later items leave the title untouched.
        """
        if not self.items:
            self.title = str(item)
        self.items.append(item)


class DescriptionExtItem:
    """Stores information about one record in hit description for BLASTXML version 2.

    Members:
    id              Database identifier
    title           Title of the hit.
    accession       Accession of the record (presumably; None until assigned).
    taxid           Taxonomy identifier (presumably; None until assigned).
    sciname         Scientific name (presumably; None until assigned).
    """

    def __init__(self):
        """Initialize the class."""
        self.id = None
        self.title = None
        self.accession = None
        self.taxid = None
        self.sciname = None

    def __str__(self):
        """Return the description identifier and title as a string."""
        return "%s %s" % (self.id, self.title)


class Alignment:
    """Stores information about one hit in the alignments section.

    Members:
    title      Name.
    hit_id     Hit identifier. (str)
    hit_def    Hit definition. (str)
    length     Length.  (int)
    hsps       A list of HSP objects.

    """

    def __init__(self):
        """Initialize the class."""
        self.title = ""
        self.hit_id = ""
        self.hit_def = ""
        self.length = None
        self.hsps = []

    def __str__(self):
        """Return the BLAST alignment as a formatted string.

        The lines of the title are followed by a "Length = ..." line; all
        lines after the first are prefixed with a space.
        """
        lines = self.title.split("\n")
        lines.append("Length = %s\n" % self.length)
        return "\n ".join(lines)


class HSP:
    """Stores information about one hsp in an alignment hit.

    Members:
        - score           BLAST score of hit.  (float)
        - bits            Number of bits for that score.  (float)
        - expect          Expect value.  (float)
        - num_alignments  Number of alignments for same subject.  (int)
        - identities      Number of identities (int) if using the XML parser.
          Tuple of number of identities/total aligned (int, int)
          if using the (obsolete) plain text parser.
        - positives       Number of positives (int) if using the XML parser.
          Tuple of number of positives/total aligned (int, int)
          if using the (obsolete) plain text parser.
        - gaps            Number of gaps (int) if using the XML parser.
          Tuple of number of gaps/total aligned (int, int) if
          using the (obsolete) plain text parser.
        - align_length    Length of the alignment. (int)
        - strand          Tuple of (query, target) strand.
        - frame           Tuple of 1 or 2 frame shifts, depending on the flavor.

        - query           The query sequence.
        - query_start     The start residue for the query sequence.  (1-based)
        - query_end       The end residue for the query sequence.  (1-based)
        - match           The match sequence.
        - sbjct           The sbjct sequence.
        - sbjct_start     The start residue for the sbjct sequence.  (1-based)
        - sbjct_end       The end residue for the sbjct sequence.  (1-based)

    Not all flavors of BLAST return values for every attribute::

                  score  expect  identities  positives  strand  frame
        BLASTP     X       X         X          X
        BLASTN     X       X         X          X         X
        BLASTX     X       X         X          X                  X
        TBLASTN    X       X         X          X                  X
        TBLASTX    X       X         X          X                 X/X

    Note: for BLASTX, the query sequence is shown as a protein sequence,
    but the numbering is based on the nucleotides.  Thus, the numbering
    is 3x larger than the number of amino acid residues.  A similar effect
    can be seen for the sbjct sequence in TBLASTN, and for both sequences
    in TBLASTX.

    Also, for negative frames, the sequence numbering starts from
    query_start and counts down.

    """

    def __init__(self):
        """Initialize the class."""
        self.score = None
        self.bits = None
        self.expect = None
        self.num_alignments = None
        self.identities = (None, None)
        self.positives = (None, None)
        self.gaps = (None, None)
        self.align_length = None
        self.strand = (None, None)
        self.frame = ()

        self.query = ""
        self.query_start = None
        self.query_end = None
        self.match = ""
        self.sbjct = ""
        self.sbjct_start = None
        self.sbjct_end = None

    def __str__(self):
        """Return the BLAST HSP as a formatted string.

        The first line summarises score, bits, expectation and alignment
        length; values that are None are shown as "<unknown>".  If the
        alignment length is None only that line is returned.  Otherwise
        three more lines show the query, match and sbjct strings, with the
        match string lined up under the sequences.  Alignments of 50 or
        more columns are abbreviated to their first 45 and last 3
        characters, joined by "...".
        """
        lines = [
            "Score %s (%s bits), expectation %s, alignment length %s"
            % (
                fmt_(self.score, "%i"),
                fmt_(self.bits, "%i"),
                fmt_(self.expect, "%0.1e"),
                fmt_(self.align_length, "%i"),
            )
        ]
        if self.align_length is None:
            return "\n".join(lines)

        # The sequence on the Query/Sbjct lines starts after a 15-character
        # prefix: the 6-character label, the 8-wide start column and one
        # space.  Pad the match line by the same amount so each match symbol
        # sits under the residues it describes.
        indent = " " * 15
        if self.align_length < 50:
            lines.append(
                "Query:%8s %s %s" % (self.query_start, self.query, self.query_end)
            )
            lines.append("%s%s" % (indent, self.match))
            lines.append(
                "Sbjct:%8s %s %s" % (self.sbjct_start, self.sbjct, self.sbjct_end)
            )
        else:
            lines.append(
                "Query:%8s %s...%s %s"
                % (self.query_start, self.query[:45], self.query[-3:], self.query_end)
            )
            lines.append("%s%s...%s" % (indent, self.match[:45], self.match[-3:]))
            lines.append(
                "Sbjct:%8s %s...%s %s"
                % (self.sbjct_start, self.sbjct[:45], self.sbjct[-3:], self.sbjct_end)
            )
        return "\n".join(lines)


class MultipleAlignment:
    """Holds information about a multiple alignment.

    Members:
    alignment  A list of tuples (name, start residue, sequence, end residue).

    The start residue is 1-based.  It may be blank, if that sequence is
    not aligned in the multiple alignment.

    """

    def __init__(self):
        """Initialize the class."""
        self.alignment = []

    def to_generic(self):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from Bio.Align, through which you can manipulate and query
        the object.

        The tuples are expected to come in blocks, each block starting with
        a row named "QUERY" and listing its rows in the same order as the
        first block: the rows of the first block define the sequence names,
        and the rows of every later block are appended, by position, to the
        sequences collected so far.

        Thanks to James Casbon for the code.
        """
        seq_parts = []
        seq_names = []
        parse_number = 0
        n = 0
        for name, start, seq, end in self.alignment:
            if name == "QUERY":  # QUERY is the first in each alignment block
                parse_number += 1
                n = 0

            if parse_number == 1:  # create on first_parse, append on all others
                seq_parts.append(seq)
                seq_names.append(name)
            else:
                seq_parts[n] += seq
                n += 1

        records = (
            SeqRecord(Seq(seq), name) for (name, seq) in zip(seq_names, seq_parts)
        )
        return MultipleSeqAlignment(records)


class Round:
    """Holds information from a PSI-BLAST round.

    Members:
    number       Round number.  (int)
    reused_seqs  Sequences in model, found again.  List of Description objects.
    new_seqs     Sequences not found, or below threshold.  List of Description.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.
    """

    def __init__(self):
        """Initialize the class."""
        self.number = None
        self.reused_seqs = []
        self.new_seqs = []
        self.alignments = []
        self.multiple_alignment = None


class DatabaseReport:
    """Holds information about a database report.

    Members:
    database_name              List of database names.  (can have multiple dbs)
    num_letters_in_database    List of number of letters in the database.
    num_sequences_in_database  List of number of sequences in the database.
    posted_date                List of the dates the databases were posted.
    ka_params                  A tuple of (lambda, k, h) values.  (floats)
    gapped                     # XXX this isn't set right!
    ka_params_gap              A tuple of (lambda, k, h) values.  (floats)

    """

    def __init__(self):
        """Initialize the class."""
        self.database_name = []
        self.posted_date = []
        self.num_letters_in_database = []
        self.num_sequences_in_database = []
        self.ka_params = (None, None, None)
        self.gapped = 0
        self.ka_params_gap = (None, None, None)


class Parameters:
    """Holds information about the parameters.

    Members:
    matrix              Name of the matrix.
    gap_penalties       Tuple of (open, extend) penalties.  (floats)
    sc_match            Match score for nucleotide-nucleotide comparison
    sc_mismatch         Mismatch penalty for nucleotide-nucleotide comparison
    num_hits            Number of hits to the database.  (int)
    num_sequences       Number of sequences.  (int)
    num_good_extends    Number of extensions.  (int)
    num_seqs_better_e   Number of sequences better than e-value.  (int)
    hsps_no_gap         Number of HSP's better, without gapping.  (int)
    hsps_prelim_gapped  Number of HSP's gapped in prelim test.  (int)
    hsps_prelim_gapped_attemped  Number of HSP's attempted in prelim.  (int)
    hsps_gapped         Total number of HSP's gapped.  (int)
    query_length        Length of the query.  (int)
    query_id            Identifier of the query sequence. (str)
    database_length     Number of letters in the database.  (int)
    effective_hsp_length         Effective HSP length.  (int)
    effective_query_length       Effective length of query.  (int)
    effective_database_length    Effective length of database.  (int)
    effective_search_space       Effective search space.  (int)
    effective_search_space_used  Effective search space used.  (int)
    frameshift          Frameshift window.  Tuple of (int, float)
    threshold           Threshold.  (int)
    window_size         Window size.  (int)
    dropoff_1st_pass    Tuple of (score, bits).  (int, float)
    gap_x_dropoff       Tuple of (score, bits).  (int, float)
    gap_x_dropoff_final Tuple of (score, bits).  (int, float)
    gap_trigger         Tuple of (score, bits).  (int, float)
    blast_cutoff        Tuple of (score, bits).  (int, float)
    """

    def __init__(self):
        """Initialize the class."""
        self.matrix = ""
        self.gap_penalties = (None, None)
        self.sc_match = None
        self.sc_mismatch = None
        self.num_hits = None
        self.num_sequences = None
        self.num_good_extends = None
        self.num_seqs_better_e = None
        self.hsps_no_gap = None
        self.hsps_prelim_gapped = None
        self.hsps_prelim_gapped_attemped = None
        self.hsps_gapped = None
        self.query_id = None
        self.query_length = None
        self.database_length = None
        self.effective_hsp_length = None
        self.effective_query_length = None
        self.effective_database_length = None
        self.effective_search_space = None
        self.effective_search_space_used = None
        self.frameshift = (None, None)
        self.threshold = None
        self.window_size = None
        self.dropoff_1st_pass = (None, None)
        self.gap_x_dropoff = (None, None)
        self.gap_x_dropoff_final = (None, None)
        self.gap_trigger = (None, None)
        self.blast_cutoff = (None, None)


# TODO - Add a friendly __str__ method to BLAST results
class Blast(Header, DatabaseReport, Parameters):
    """Saves the results from a blast search.

    Members:
    descriptions        A list of Description objects.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.
    + members inherited from base classes

    """

    def __init__(self):
        """Initialize the class."""
        # The base classes do not chain to each other through super(), so
        # each one is initialised explicitly.
        Header.__init__(self)
        DatabaseReport.__init__(self)
        Parameters.__init__(self)
        self.descriptions = []
        self.alignments = []
        self.multiple_alignment = None


class PSIBlast(Header, DatabaseReport, Parameters):
    """Saves the results from a blastpgp search.

    Members:
    rounds       A list of Round objects.
    converged    Whether the search converged.
    + members inherited from base classes

    """

    def __init__(self):
        """Initialize the class."""
        # The base classes do not chain to each other through super(), so
        # each one is initialised explicitly.
        Header.__init__(self)
        DatabaseReport.__init__(self)
        Parameters.__init__(self)
        self.rounds = []
        self.converged = 0