1# Copyright 1999-2000 by Jeffrey Chang. All rights reserved. 2# This code is part of the Biopython distribution and governed by its 3# license. Please see the LICENSE file that should have been included 4# as part of this package. 5# Patches by Mike Poidinger to support multiple databases. 6# Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 7 8"""Code for calling standalone BLAST and parsing plain text output (DEPRECATED). 9 10Rather than parsing the human readable plain text BLAST output (which seems to 11change with every update to BLAST), we and the NBCI recommend you parse the 12XML output instead. The plain text parser in this module still works at the 13time of writing, but is considered obsolete and updating it to cope with the 14latest versions of BLAST is not a priority for us. 15 16This module also provides code to work with the "legacy" standalone version of 17NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 18the same name. These functions are very limited for dealing with the output as 19files rather than handles, for which the wrappers in Bio.Blast.Applications are 20preferred. Furthermore, the NCBI themselves regard these command line tools as 21"legacy", and encourage using the new BLAST+ tools instead. Biopython has 22wrappers for these under Bio.Blast.Applications (see the tutorial). 23""" 24 25import re 26 27from io import StringIO 28from Bio.SearchIO._legacy.ParserSupport import ( 29 UndoHandle, 30 AbstractParser, 31 AbstractConsumer, 32 read_and_call, 33 read_and_call_until, 34 read_and_call_while, 35 attempt_read_and_call, 36 is_blank_line, 37 safe_peekline, 38 safe_readline, 39) 40from Bio.Blast import Record 41 42from Bio import BiopythonWarning 43import warnings 44 45_score_e_re = re.compile(r"Score +E") 46 47 48class LowQualityBlastError(Exception): 49 """Error caused by running a low quality sequence through BLAST. 50 51 When low quality sequences (like GenBank entries containing only 52 stretches of a single nucleotide) are BLASTed, they will result in 53 BLAST generating an error and not being able to perform the BLAST. 54 search. This error should be raised for the BLAST reports produced 55 in this case. 56 """ 57 58 pass 59 60 61class ShortQueryBlastError(Exception): 62 """Error caused by running a short query sequence through BLAST. 63 64 If the query sequence is too short, BLAST outputs warnings and errors:: 65 66 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 67 [blastall] ERROR: [000.000] AT1G08320: Blast: 68 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 69 done 70 71 This exception is raised when that condition is detected. 72 """ 73 74 pass 75 76 77class _Scanner: 78 """Scan BLAST output from blastall or blastpgp. 79 80 Tested with blastall and blastpgp v2.0.10, v2.0.11 81 82 Methods: 83 - feed Feed data into the scanner. 84 85 """ 86 87 def __init__(self): 88 """Raise warning that this module is outdated.""" 89 warnings.warn( 90 "Parsing BLAST plain text output file is not a well supported" 91 " functionality anymore. Consider generating your BLAST output for parsing" 92 " as XML or tabular format instead.", 93 BiopythonWarning, 94 ) 95 96 def feed(self, handle, consumer): 97 """Feed in a BLAST report for scanning. 98 99 Arguments: 100 - handle is a file-like object that contains the BLAST report. 101 - consumer is a Consumer object that will receive events as the 102 report is scanned. 103 """ 104 if isinstance(handle, UndoHandle): 105 uhandle = handle 106 else: 107 uhandle = UndoHandle(handle) 108 109 # Try to fast-forward to the beginning of the blast report. 110 read_and_call_until(uhandle, consumer.noevent, contains="BLAST") 111 # Now scan the BLAST report. 112 self._scan_header(uhandle, consumer) 113 self._scan_rounds(uhandle, consumer) 114 self._scan_database_report(uhandle, consumer) 115 self._scan_parameters(uhandle, consumer) 116 117 def _scan_header(self, uhandle, consumer): 118 # BLASTP 2.0.10 [Aug-26-1999] 119 # 120 # 121 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 122 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 123 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 124 # programs", Nucleic Acids Res. 25:3389-3402. 125 # 126 # Query= test 127 # (140 letters) 128 # 129 # Database: sdqib40-1.35.seg.fa 130 # 1323 sequences; 223,339 total letters 131 # 132 # ======================================================== 133 # This next example is from the online version of Blast, 134 # note there are TWO references, an RID line, and also 135 # the database is BEFORE the query line. 136 # Note there possibleuse of non-ASCII in the author names. 137 # ======================================================== 138 # 139 # BLASTP 2.2.15 [Oct-15-2006] 140 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 141 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 142 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 143 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 144 # 145 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 146 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 147 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 148 # protein database searches with composition-based statistics 149 # and other refinements", Nucleic Acids Res. 29:2994-3005. 150 # 151 # RID: 1166022616-19998-65316425856.BLASTQ1 152 # 153 # 154 # Database: All non-redundant GenBank CDS 155 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 156 # 4,254,166 sequences; 1,462,033,012 total letters 157 # Query= gi:16127998 158 # Length=428 159 # 160 161 consumer.start_header() 162 163 read_and_call(uhandle, consumer.version, contains="BLAST") 164 read_and_call_while(uhandle, consumer.noevent, blank=1) 165 166 # There might be a <pre> line, for qblast output. 167 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 168 169 # Read the reference(s) 170 while attempt_read_and_call(uhandle, consumer.reference, start="Reference"): 171 # References are normally multiline terminated by a blank line 172 # (or, based on the old code, the RID line) 173 while True: 174 line = uhandle.readline() 175 if is_blank_line(line): 176 consumer.noevent(line) 177 break 178 elif line.startswith("RID"): 179 break 180 else: 181 # More of the reference 182 consumer.reference(line) 183 184 # Deal with the optional RID: ... 185 read_and_call_while(uhandle, consumer.noevent, blank=1) 186 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 187 read_and_call_while(uhandle, consumer.noevent, blank=1) 188 189 # blastpgp may have a reference for compositional score matrix 190 # adjustment (see Bug 2502): 191 if attempt_read_and_call(uhandle, consumer.reference, start="Reference"): 192 read_and_call_until(uhandle, consumer.reference, blank=1) 193 read_and_call_while(uhandle, consumer.noevent, blank=1) 194 195 # blastpgp has a Reference for composition-based statistics. 196 if attempt_read_and_call(uhandle, consumer.reference, start="Reference"): 197 read_and_call_until(uhandle, consumer.reference, blank=1) 198 read_and_call_while(uhandle, consumer.noevent, blank=1) 199 200 line = uhandle.peekline() 201 assert line.strip() != "" 202 assert not line.startswith("RID:") 203 if line.startswith("Query="): 204 # This is an old style query then database... 205 206 # Read the Query lines and the following blank line. 207 read_and_call(uhandle, consumer.query_info, start="Query=") 208 read_and_call_until(uhandle, consumer.query_info, blank=1) 209 read_and_call_while(uhandle, consumer.noevent, blank=1) 210 211 # Read the database lines and the following blank line. 212 read_and_call_until(uhandle, consumer.database_info, end="total letters") 213 read_and_call(uhandle, consumer.database_info, contains="sequences") 214 read_and_call_while(uhandle, consumer.noevent, blank=1) 215 elif line.startswith("Database:"): 216 # This is a new style database then query... 217 read_and_call_until(uhandle, consumer.database_info, end="total letters") 218 read_and_call(uhandle, consumer.database_info, contains="sequences") 219 read_and_call_while(uhandle, consumer.noevent, blank=1) 220 221 # Read the Query lines and the following blank line. 222 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 223 # the "... Score E" line instead. 224 read_and_call(uhandle, consumer.query_info, start="Query=") 225 # BLAST 2.2.25+ has a blank line before Length= 226 read_and_call_until(uhandle, consumer.query_info, start="Length=") 227 while True: 228 line = uhandle.peekline() 229 if not line.strip() or _score_e_re.search(line) is not None: 230 break 231 # It is more of the query (and its length) 232 read_and_call(uhandle, consumer.query_info) 233 read_and_call_while(uhandle, consumer.noevent, blank=1) 234 else: 235 raise ValueError("Invalid header?") 236 237 consumer.end_header() 238 239 def _scan_rounds(self, uhandle, consumer): 240 # Scan a bunch of rounds. 241 # Each round begins with either a "Searching......" line 242 # or a 'Score E' line followed by descriptions and alignments. 243 # The email server doesn't give the "Searching....." line. 244 # If there is no 'Searching.....' line then you'll first see a 245 # 'Results from round' line 246 247 while not self._eof(uhandle): 248 line = safe_peekline(uhandle) 249 if ( 250 not line.startswith("Searching") 251 and not line.startswith("Results from round") 252 and _score_e_re.search(line) is None 253 and "No hits found" not in line 254 ): 255 break 256 self._scan_descriptions(uhandle, consumer) 257 self._scan_alignments(uhandle, consumer) 258 259 def _scan_descriptions(self, uhandle, consumer): 260 # Searching..................................................done 261 # Results from round 2 262 # 263 # 264 # Sc 265 # Sequences producing significant alignments: (b 266 # Sequences used in model and found again: 267 # 268 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 269 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 270 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 271 # 272 # Sequences not found previously or not previously below threshold: 273 # 274 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 275 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 276 # 277 278 # If PSI-BLAST, may also have: 279 # 280 # CONVERGED! 281 282 consumer.start_descriptions() 283 284 # Read 'Searching' 285 # This line seems to be missing in BLASTN 2.1.2 (others?) 286 attempt_read_and_call(uhandle, consumer.noevent, start="Searching") 287 288 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 289 # If this happens, the handle will yield no more information. 290 if not uhandle.peekline(): 291 raise ValueError( 292 "Unexpected end of blast report. Looks suspiciously like a PSI-BLAST crash." 293 ) 294 295 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 296 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 297 # [blastall] ERROR: [000.000] AT1G08320: Blast: 298 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 299 # done 300 # Reported by David Weisman. 301 # Check for these error lines and ignore them for now. Let 302 # the BlastErrorParser deal with them. 303 line = uhandle.peekline() 304 if "ERROR:" in line or line.startswith("done"): 305 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 306 read_and_call(uhandle, consumer.noevent, start="done") 307 308 # Check to see if this is PSI-BLAST. 309 # If it is, the 'Searching' line will be followed by: 310 # (version 2.0.10) 311 # Searching............................. 312 # Results from round 2 313 # or (version 2.0.11) 314 # Searching............................. 315 # 316 # 317 # Results from round 2 318 319 # Skip a bunch of blank lines. 320 read_and_call_while(uhandle, consumer.noevent, blank=1) 321 # Check for the results line if it's there. 322 if attempt_read_and_call(uhandle, consumer.round, start="Results"): 323 read_and_call_while(uhandle, consumer.noevent, blank=1) 324 325 # Three things can happen here: 326 # 1. line contains 'Score E' 327 # 2. line contains "No hits found" 328 # 3. no descriptions 329 # The first one begins a bunch of descriptions. The last two 330 # indicates that no descriptions follow, and we should go straight 331 # to the alignments. 332 if not attempt_read_and_call( 333 uhandle, consumer.description_header, has_re=_score_e_re 334 ): 335 # Either case 2 or 3. Look for "No hits found". 336 attempt_read_and_call(uhandle, consumer.no_hits, contains="No hits found") 337 try: 338 read_and_call_while(uhandle, consumer.noevent, blank=1) 339 except ValueError as err: 340 if str(err) != "Unexpected end of stream.": 341 raise 342 343 consumer.end_descriptions() 344 # Stop processing. 345 return 346 347 # Read the score header lines 348 read_and_call(uhandle, consumer.description_header, start="Sequences producing") 349 350 # If PSI-BLAST, read the 'Sequences used in model' line. 351 attempt_read_and_call( 352 uhandle, consumer.model_sequences, start="Sequences used in model" 353 ) 354 read_and_call_while(uhandle, consumer.noevent, blank=1) 355 356 # In BLAT, rather than a "No hits found" line, we just 357 # get no descriptions (and no alignments). This can be 358 # spotted because the next line is the database block: 359 if safe_peekline(uhandle).startswith(" Database:"): 360 consumer.end_descriptions() 361 # Stop processing. 362 return 363 364 # Read the descriptions and the following blank lines, making 365 # sure that there are descriptions. 366 if not uhandle.peekline().startswith("Sequences not found"): 367 read_and_call_until(uhandle, consumer.description, blank=1) 368 read_and_call_while(uhandle, consumer.noevent, blank=1) 369 370 # If PSI-BLAST, read the 'Sequences not found' line followed 371 # by more descriptions. However, I need to watch out for the 372 # case where there were no sequences not found previously, in 373 # which case there will be no more descriptions. 374 if attempt_read_and_call( 375 uhandle, consumer.nonmodel_sequences, start="Sequences not found" 376 ): 377 # Read the descriptions and the following blank lines. 378 read_and_call_while(uhandle, consumer.noevent, blank=1) 379 line = safe_peekline(uhandle) 380 # Brad -- added check for QUERY. On some PSI-BLAST outputs 381 # there will be a 'Sequences not found' line followed by no 382 # descriptions. Check for this case since the first thing you'll 383 # get is a blank line and then 'QUERY' 384 if ( 385 not line.startswith("CONVERGED") 386 and line[0] != ">" 387 and not line.startswith("QUERY") 388 ): 389 read_and_call_until(uhandle, consumer.description, blank=1) 390 read_and_call_while(uhandle, consumer.noevent, blank=1) 391 392 attempt_read_and_call(uhandle, consumer.converged, start="CONVERGED") 393 read_and_call_while(uhandle, consumer.noevent, blank=1) 394 395 consumer.end_descriptions() 396 397 def _scan_alignments(self, uhandle, consumer): 398 if self._eof(uhandle): 399 return 400 401 # qblast inserts a helpful line here. 402 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 403 404 # First, check to see if I'm at the database report. 405 line = safe_peekline(uhandle) 406 if not line: 407 # EOF 408 return 409 elif line.startswith(" Database") or line.startswith("Lambda"): 410 return 411 elif line[0] == ">": 412 # XXX make a better check here between pairwise and masterslave 413 self._scan_pairwise_alignments(uhandle, consumer) 414 elif line.startswith("Effective"): 415 return 416 else: 417 # XXX put in a check to make sure I'm in a masterslave alignment 418 self._scan_masterslave_alignment(uhandle, consumer) 419 420 def _scan_pairwise_alignments(self, uhandle, consumer): 421 while not self._eof(uhandle): 422 line = safe_peekline(uhandle) 423 if line[0] != ">": 424 break 425 self._scan_one_pairwise_alignment(uhandle, consumer) 426 427 def _scan_one_pairwise_alignment(self, uhandle, consumer): 428 if self._eof(uhandle): 429 return 430 consumer.start_alignment() 431 432 self._scan_alignment_header(uhandle, consumer) 433 434 # Scan a bunch of score/alignment pairs. 435 while True: 436 if self._eof(uhandle): 437 # Shouldn't have issued that _scan_alignment_header event... 438 break 439 line = safe_peekline(uhandle) 440 if not line.startswith(" Score"): 441 break 442 self._scan_hsp(uhandle, consumer) 443 consumer.end_alignment() 444 445 def _scan_alignment_header(self, uhandle, consumer): 446 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 447 # stearothermophilus] 448 # Length = 81 449 # 450 # Or, more recently with different white space: 451 # 452 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 453 # gi|15829258|ref|NP_308031.1| threonine synthase 454 # ... 455 # Length=428 456 read_and_call(uhandle, consumer.title, start=">") 457 while True: 458 line = safe_readline(uhandle) 459 if line.lstrip().startswith(("Length =", "Length=")): 460 consumer.length(line) 461 break 462 elif is_blank_line(line): 463 # Check to make sure I haven't missed the Length line 464 raise ValueError("I missed the Length in an alignment header") 465 consumer.title(line) 466 467 # Older versions of BLAST will have a line with some spaces. 468 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 469 if not attempt_read_and_call(uhandle, consumer.noevent, start=" "): 470 read_and_call(uhandle, consumer.noevent, blank=1) 471 472 def _scan_hsp(self, uhandle, consumer): 473 consumer.start_hsp() 474 self._scan_hsp_header(uhandle, consumer) 475 self._scan_hsp_alignment(uhandle, consumer) 476 consumer.end_hsp() 477 478 def _scan_hsp_header(self, uhandle, consumer): 479 # Score = 22.7 bits (47), Expect = 2.5 480 # Identities = 10/36 (27%), Positives = 18/36 (49%) 481 # Strand = Plus / Plus 482 # Frame = +3 483 # 484 485 read_and_call(uhandle, consumer.score, start=" Score") 486 read_and_call(uhandle, consumer.identities, start=" Identities") 487 # BLASTN 488 attempt_read_and_call(uhandle, consumer.strand, start=" Strand") 489 # BLASTX, TBLASTN, TBLASTX 490 attempt_read_and_call(uhandle, consumer.frame, start=" Frame") 491 read_and_call(uhandle, consumer.noevent, blank=1) 492 493 def _scan_hsp_alignment(self, uhandle, consumer): 494 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 495 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 496 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 497 # 498 # Query: 64 AEKILIKR 71 499 # I +K 500 # Sbjct: 70 PNIIQLKD 77 501 # 502 503 while True: 504 # Blastn adds an extra line filled with spaces before Query 505 attempt_read_and_call(uhandle, consumer.noevent, start=" ") 506 read_and_call(uhandle, consumer.query, start="Query") 507 read_and_call(uhandle, consumer.align, start=" ") 508 read_and_call(uhandle, consumer.sbjct, start="Sbjct") 509 try: 510 read_and_call_while(uhandle, consumer.noevent, blank=1) 511 except ValueError as err: 512 if str(err) != "Unexpected end of stream.": 513 raise 514 # End of File (well, it looks like it with recent versions 515 # of BLAST for multiple queries after the Iterator class 516 # has broken up the whole file into chunks). 517 break 518 line = safe_peekline(uhandle) 519 # Alignment continues if I see a 'Query' or the spaces for Blastn. 520 if not (line.startswith("Query") or line.startswith(" ")): 521 break 522 523 def _scan_masterslave_alignment(self, uhandle, consumer): 524 consumer.start_alignment() 525 while True: 526 line = safe_readline(uhandle) 527 # Check to see whether I'm finished reading the alignment. 528 # This is indicated by 1) database section, 2) next psi-blast 529 # round, which can also be a 'Results from round' if no 530 # searching line is present 531 # patch by chapmanb 532 if line.startswith("Searching") or line.startswith("Results from round"): 533 uhandle.saveline(line) 534 break 535 elif line.startswith(" Database"): 536 uhandle.saveline(line) 537 break 538 elif is_blank_line(line): 539 consumer.noevent(line) 540 else: 541 consumer.multalign(line) 542 read_and_call_while(uhandle, consumer.noevent, blank=1) 543 consumer.end_alignment() 544 545 def _eof(self, uhandle): 546 try: 547 line = safe_peekline(uhandle) 548 except ValueError as err: 549 if str(err) != "Unexpected end of stream.": 550 raise 551 line = "" 552 return not line 553 554 def _scan_database_report(self, uhandle, consumer): 555 # Database: sdqib40-1.35.seg.fa 556 # Posted date: Nov 1, 1999 4:25 PM 557 # Number of letters in database: 223,339 558 # Number of sequences in database: 1323 559 # 560 # Lambda K H 561 # 0.322 0.133 0.369 562 # 563 # Gapped 564 # Lambda K H 565 # 0.270 0.0470 0.230 566 # 567 ########################################## 568 # Or, more recently Blast 2.2.15 gives less blank lines 569 ########################################## 570 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 571 # environmental samples 572 # Posted date: Dec 12, 2006 5:51 PM 573 # Number of letters in database: 667,088,753 574 # Number of sequences in database: 2,094,974 575 # Lambda K H 576 # 0.319 0.136 0.395 577 # Gapped 578 # Lambda K H 579 # 0.267 0.0410 0.140 580 581 if self._eof(uhandle): 582 return 583 584 consumer.start_database_report() 585 586 # Subset of the database(s) listed below 587 # Number of letters searched: 562,618,960 588 # Number of sequences searched: 228,924 589 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 590 read_and_call(uhandle, consumer.noevent, contains="letters") 591 read_and_call(uhandle, consumer.noevent, contains="sequences") 592 read_and_call(uhandle, consumer.noevent, start=" ") 593 594 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 595 # was missing the "Database" stanza completely. 596 while attempt_read_and_call(uhandle, consumer.database, start=" Database"): 597 # BLAT output ends abruptly here, without any of the other 598 # information. Check to see if this is the case. If so, 599 # then end the database report here gracefully. 600 if not uhandle.peekline().strip() or uhandle.peekline().startswith("BLAST"): 601 consumer.end_database_report() 602 return 603 604 # Database can span multiple lines. 605 read_and_call_until(uhandle, consumer.database, start=" Posted") 606 read_and_call(uhandle, consumer.posted_date, start=" Posted") 607 read_and_call( 608 uhandle, consumer.num_letters_in_database, start=" Number of letters" 609 ) 610 read_and_call( 611 uhandle, 612 consumer.num_sequences_in_database, 613 start=" Number of sequences", 614 ) 615 # There may not be a line starting with spaces... 616 attempt_read_and_call(uhandle, consumer.noevent, start=" ") 617 618 line = safe_readline(uhandle) 619 uhandle.saveline(line) 620 if "Lambda" in line: 621 break 622 623 try: 624 read_and_call(uhandle, consumer.noevent, start="Lambda") 625 read_and_call(uhandle, consumer.ka_params) 626 except Exception: # TODO: ValueError, AttributeError? 627 pass 628 629 # This blank line is optional: 630 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 631 632 # not BLASTP 633 attempt_read_and_call(uhandle, consumer.gapped, start="Gapped") 634 # not TBLASTX 635 if attempt_read_and_call(uhandle, consumer.noevent, start="Lambda"): 636 read_and_call(uhandle, consumer.ka_params_gap) 637 638 # Blast 2.2.4 can sometimes skip the whole parameter section. 639 # Thus, I need to be careful not to read past the end of the 640 # file. 641 try: 642 read_and_call_while(uhandle, consumer.noevent, blank=1) 643 except ValueError as x: 644 if str(x) != "Unexpected end of stream.": 645 raise 646 consumer.end_database_report() 647 648 def _scan_parameters(self, uhandle, consumer): 649 # Matrix: BLOSUM62 650 # Gap Penalties: Existence: 11, Extension: 1 651 # Number of Hits to DB: 50604 652 # Number of Sequences: 1323 653 # Number of extensions: 1526 654 # Number of successful extensions: 6 655 # Number of sequences better than 10.0: 5 656 # Number of HSP's better than 10.0 without gapping: 5 657 # Number of HSP's successfully gapped in prelim test: 0 658 # Number of HSP's that attempted gapping in prelim test: 1 659 # Number of HSP's gapped (non-prelim): 5 660 # length of query: 140 661 # length of database: 223,339 662 # effective HSP length: 39 663 # effective length of query: 101 664 # effective length of database: 171,742 665 # effective search space: 17345942 666 # effective search space used: 17345942 667 # T: 11 668 # A: 40 669 # X1: 16 ( 7.4 bits) 670 # X2: 38 (14.8 bits) 671 # X3: 64 (24.9 bits) 672 # S1: 41 (21.9 bits) 673 # S2: 42 (20.8 bits) 674 ########################################## 675 # Or, more recently Blast(x) 2.2.15 gives 676 ########################################## 677 # Matrix: BLOSUM62 678 # Gap Penalties: Existence: 11, Extension: 1 679 # Number of Sequences: 4535438 680 # Number of Hits to DB: 2,588,844,100 681 # Number of extensions: 60427286 682 # Number of successful extensions: 126433 683 # Number of sequences better than 2.0: 30 684 # Number of HSP's gapped: 126387 685 # Number of HSP's successfully gapped: 35 686 # Length of query: 291 687 # Length of database: 1,573,298,872 688 # Length adjustment: 130 689 # Effective length of query: 161 690 # Effective length of database: 983,691,932 691 # Effective search space: 158374401052 692 # Effective search space used: 158374401052 693 # Neighboring words threshold: 12 694 # Window for multiple hits: 40 695 # X1: 16 ( 7.3 bits) 696 # X2: 38 (14.6 bits) 697 # X3: 64 (24.7 bits) 698 # S1: 41 (21.7 bits) 699 # S2: 32 (16.9 bits) 700 701 # Blast 2.2.4 can sometimes skip the whole parameter section. 702 # BLAT also skips the whole parameter section. 703 # Thus, check to make sure that the parameter section really 704 # exists. 705 if not uhandle.peekline().strip(): 706 return 707 708 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 709 # "Number of Sequences" lines. 710 consumer.start_parameters() 711 712 # Matrix line may be missing in BLASTN 2.2.9 713 attempt_read_and_call(uhandle, consumer.matrix, start="Matrix") 714 # not TBLASTX 715 attempt_read_and_call(uhandle, consumer.gap_penalties, start="Gap") 716 attempt_read_and_call( 717 uhandle, consumer.num_sequences, start="Number of Sequences" 718 ) 719 attempt_read_and_call(uhandle, consumer.num_hits, start="Number of Hits") 720 attempt_read_and_call( 721 uhandle, consumer.num_sequences, start="Number of Sequences" 722 ) 723 attempt_read_and_call( 724 uhandle, consumer.num_extends, start="Number of extensions" 725 ) 726 attempt_read_and_call( 727 uhandle, consumer.num_good_extends, start="Number of successful" 728 ) 729 attempt_read_and_call( 730 uhandle, consumer.num_seqs_better_e, start="Number of sequences" 731 ) 732 733 # not BLASTN, TBLASTX 734 if attempt_read_and_call( 735 uhandle, consumer.hsps_no_gap, start="Number of HSP's better" 736 ): 737 # BLASTN 2.2.9 738 if attempt_read_and_call( 739 uhandle, consumer.noevent, start="Number of HSP's gapped:" 740 ): 741 read_and_call( 742 uhandle, consumer.noevent, start="Number of HSP's successfully" 743 ) 744 # This is omitted in 2.2.15 745 attempt_read_and_call( 746 uhandle, consumer.noevent, start="Number of extra gapped extensions" 747 ) 748 else: 749 read_and_call( 750 uhandle, 751 consumer.hsps_prelim_gapped, 752 start="Number of HSP's successfully", 753 ) 754 read_and_call( 755 uhandle, 756 consumer.hsps_prelim_gap_attempted, 757 start="Number of HSP's that", 758 ) 759 read_and_call( 760 uhandle, consumer.hsps_gapped, start="Number of HSP's gapped" 761 ) 762 # e.g. BLASTX 2.2.15 where the "better" line is missing 763 elif attempt_read_and_call( 764 uhandle, consumer.noevent, start="Number of HSP's gapped" 765 ): 766 read_and_call( 767 uhandle, consumer.noevent, start="Number of HSP's successfully" 768 ) 769 770 # not in blastx 2.2.1 771 attempt_read_and_call( 772 uhandle, consumer.query_length, has_re=re.compile(r"[Ll]ength of query") 773 ) 774 # Not in BLASTX 2.2.22+ 775 attempt_read_and_call( 776 uhandle, 777 consumer.database_length, 778 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"), 779 ) 780 781 # BLASTN 2.2.9 782 attempt_read_and_call(uhandle, consumer.noevent, start="Length adjustment") 783 attempt_read_and_call( 784 uhandle, consumer.effective_hsp_length, start="effective HSP" 785 ) 786 # Not in blastx 2.2.1 787 attempt_read_and_call( 788 uhandle, 789 consumer.effective_query_length, 790 has_re=re.compile(r"[Ee]ffective length of query"), 791 ) 792 793 # This is not in BLASTP 2.2.15 794 attempt_read_and_call( 795 uhandle, 796 consumer.effective_database_length, 797 has_re=re.compile(r"[Ee]ffective length of \s*[Dd]atabase"), 798 ) 799 # Not in blastx 2.2.1, added a ':' to distinguish between 800 # this and the 'effective search space used' line 801 attempt_read_and_call( 802 uhandle, 803 consumer.effective_search_space, 804 has_re=re.compile(r"[Ee]ffective search space:"), 805 ) 806 # Does not appear in BLASTP 2.0.5 807 attempt_read_and_call( 808 uhandle, 809 consumer.effective_search_space_used, 810 has_re=re.compile(r"[Ee]ffective search space used"), 811 ) 812 813 # BLASTX, TBLASTN, TBLASTX 814 attempt_read_and_call(uhandle, consumer.frameshift, start="frameshift") 815 816 # not in BLASTN 2.2.9 817 attempt_read_and_call(uhandle, consumer.threshold, start="T") 818 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 819 attempt_read_and_call( 820 uhandle, consumer.threshold, start="Neighboring words threshold" 821 ) 822 823 # not in BLASTX 2.2.15 824 attempt_read_and_call(uhandle, consumer.window_size, start="A") 825 # get this instead: "Window for multiple hits: 40" 826 attempt_read_and_call( 827 uhandle, consumer.window_size, start="Window for multiple hits" 828 ) 829 830 # not in BLASTX 2.2.22+ 831 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start="X1") 832 # not TBLASTN 833 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start="X2") 834 835 # not BLASTN, TBLASTX 836 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, start="X3") 837 838 # not TBLASTN 839 attempt_read_and_call(uhandle, consumer.gap_trigger, start="S1") 840 # not in blastx 2.2.1 841 # first we make sure we have additional lines to work with, if 842 # not then the file is done and we don't have a final S2 843 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 844 read_and_call(uhandle, consumer.blast_cutoff, start="S2") 845 846 consumer.end_parameters() 847 848 849class BlastParser(AbstractParser): 850 """Parses BLAST data into a Record.Blast object.""" 851 852 def __init__(self): 853 """Initialize the class.""" 854 self._scanner = _Scanner() 855 self._consumer = _BlastConsumer() 856 857 def parse(self, handle): 858 """Parse BLAST handle into a Record.Blast object.""" 859 self._scanner.feed(handle, self._consumer) 860 return self._consumer.data 861 862 863class PSIBlastParser(AbstractParser): 864 """Parses BLAST data into a Record.PSIBlast object.""" 865 866 def __init__(self): 867 """Initialize the class.""" 868 self._scanner = _Scanner() 869 self._consumer = _PSIBlastConsumer() 870 871 def parse(self, handle): 872 """Parse BLAST handle into a Record.PSIBlast object.""" 873 self._scanner.feed(handle, self._consumer) 874 return self._consumer.data 875 876 877class _HeaderConsumer: 878 def start_header(self): 879 self._header = Record.Header() 880 881 def version(self, line): 882 c = line.split() 883 self._header.application = c[0] 884 self._header.version = c[1] 885 if len(c) > 2: 886 # The date is missing in the new C++ output from blastx 2.2.22+ 887 # Just get "BLASTX 2.2.22+\n" and that's all. 888 self._header.date = c[2][1:-1] 889 890 def reference(self, line): 891 if line.startswith("Reference: "): 892 self._header.reference = line[11:] 893 else: 894 self._header.reference += line 895 896 def query_info(self, line): 897 if line.startswith("Query= "): 898 self._header.query = line[7:].lstrip() 899 elif line.startswith("Length="): 900 # New style way to give the query length in BLAST 2.2.22+ (the C++ code) 901 self._header.query_letters = _safe_int(line[7:].strip()) 902 elif not line.startswith(" "): # continuation of query_info 903 self._header.query = "%s%s" % (self._header.query, line) 904 else: 905 # Hope it is the old style way to give the query length: 906 (letters,) = _re_search( 907 r"([0-9,]+) letters", 908 line, 909 "I could not find the number of letters in line\n%s" % line, 910 ) 911 self._header.query_letters = _safe_int(letters) 912 913 def database_info(self, line): 914 line = line.rstrip() 915 if line.startswith("Database: "): 916 self._header.database = line[10:] 917 elif not line.endswith("total letters"): 918 if self._header.database: 919 # Need to include a space when merging multi line datase descr 920 self._header.database += " " + line.strip() 921 else: 922 self._header.database = line.strip() 923 else: 924 sequences, letters = _re_search( 925 r"([0-9,]+) sequences; ([0-9,-]+) total letters", 926 line, 927 "I could not find the sequences and letters in line\n%s" % line, 928 ) 929 self._header.database_sequences = _safe_int(sequences) 930 self._header.database_letters = _safe_int(letters) 931 932 def end_header(self): 933 # Get rid of the trailing newlines 934 self._header.reference = self._header.reference.rstrip() 935 self._header.query = self._header.query.rstrip() 936 937 938class _DescriptionConsumer: 939 def start_descriptions(self): 940 self._descriptions = [] 941 self._model_sequences = [] 942 self._nonmodel_sequences = [] 943 self._converged = 0 944 self._type = None 945 self._roundnum = None 946 947 self.__has_n = 0 # Does the description line contain an N value? 948 949 def description_header(self, line): 950 if line.startswith("Sequences producing"): 951 cols = line.split() 952 if cols[-1] == "N": 953 self.__has_n = 1 954 955 def description(self, line): 956 dh = self._parse(line) 957 if self._type == "model": 958 self._model_sequences.append(dh) 959 elif self._type == "nonmodel": 960 self._nonmodel_sequences.append(dh) 961 else: 962 self._descriptions.append(dh) 963 964 def model_sequences(self, line): 965 self._type = "model" 966 967 def nonmodel_sequences(self, line): 968 self._type = "nonmodel" 969 970 def converged(self, line): 971 self._converged = 1 972 973 def no_hits(self, line): 974 pass 975 976 def round(self, line): 977 if not line.startswith("Results from round"): 978 raise ValueError("I didn't understand the round line\n%s" % line) 979 self._roundnum = _safe_int(line[18:].strip()) 980 981 def end_descriptions(self): 982 pass 983 984 def _parse(self, description_line): 985 line = description_line # for convenience 986 dh = Record.Description() 987 988 # I need to separate the score and p-value from the title. 989 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 990 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 991 # special cases to handle: 992 # - title must be preserved exactly (including whitespaces) 993 # - score could be equal to e-value (not likely, but what if??) 994 # - sometimes there's an "N" score of '1'. 995 cols = line.split() 996 if len(cols) < 3: 997 raise ValueError("Line does not appear to contain description:\n%s" % line) 998 if self.__has_n: 999 i = line.rfind(cols[-1]) # find start of N 1000 i = line.rfind(cols[-2], 0, i) # find start of p-value 1001 i = line.rfind(cols[-3], 0, i) # find start of score 1002 else: 1003 i = line.rfind(cols[-1]) # find start of p-value 1004 i = line.rfind(cols[-2], 0, i) # find start of score 1005 if self.__has_n: 1006 dh.title, dh.score, dh.e, dh.num_alignments = ( 1007 line[:i].rstrip(), 1008 cols[-3], 1009 cols[-2], 1010 cols[-1], 1011 ) 1012 else: 1013 dh.title, dh.score, dh.e, dh.num_alignments = ( 1014 line[:i].rstrip(), 1015 cols[-2], 1016 cols[-1], 1017 1, 1018 ) 1019 dh.num_alignments = _safe_int(dh.num_alignments) 1020 dh.score = _safe_int(dh.score) 1021 dh.e = _safe_float(dh.e) 1022 return dh 1023 1024 1025class _AlignmentConsumer: 1026 # This is a little bit tricky. An alignment can either be a 1027 # pairwise alignment or a multiple alignment. Since it's difficult 1028 # to know a-priori which one the blast record will contain, I'm going 1029 # to make one class that can parse both of them. 1030 def start_alignment(self): 1031 self._alignment = Record.Alignment() 1032 self._multiple_alignment = Record.MultipleAlignment() 1033 1034 def title(self, line): 1035 if self._alignment.title: 1036 self._alignment.title += " " 1037 self._alignment.title += line.strip() 1038 1039 def length(self, line): 1040 # e.g. "Length = 81" or more recently, "Length=428" 1041 parts = line.replace(" ", "").split("=") 1042 if len(parts) != 2: 1043 raise ValueError("Unrecognised format length line: %r" % line) 1044 self._alignment.length = parts[1] 1045 self._alignment.length = _safe_int(self._alignment.length) 1046 1047 def multalign(self, line): 1048 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 1049 if line.startswith("QUERY") or line.startswith("blast_tmp"): 1050 # If this is the first line of the multiple alignment, 1051 # then I need to figure out how the line is formatted. 1052 1053 # Format of line is: 1054 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 1055 try: 1056 name, start, seq, end = line.split() 1057 except ValueError: 1058 raise ValueError("I do not understand the line\n%s" % line) from None 1059 self._start_index = line.index(start, len(name)) 1060 self._seq_index = line.index(seq, self._start_index + len(start)) 1061 # subtract 1 for the space 1062 self._name_length = self._start_index - 1 1063 self._start_length = self._seq_index - self._start_index - 1 1064 self._seq_length = line.rfind(end) - self._seq_index - 1 1065 1066 # self._seq_index = line.index(seq) 1067 # # subtract 1 for the space 1068 # self._seq_length = line.rfind(end) - self._seq_index - 1 1069 # self._start_index = line.index(start) 1070 # self._start_length = self._seq_index - self._start_index - 1 1071 # self._name_length = self._start_index 1072 1073 # Extract the information from the line 1074 name = line[: self._name_length] 1075 name = name.rstrip() 1076 start = line[self._start_index : self._start_index + self._start_length] 1077 start = start.rstrip() 1078 if start: 1079 start = _safe_int(start) 1080 end = line[self._seq_index + self._seq_length :].rstrip() 1081 if end: 1082 end = _safe_int(end) 1083 seq = line[self._seq_index : self._seq_index + self._seq_length].rstrip() 1084 # right pad the sequence with spaces if necessary 1085 if len(seq) < self._seq_length: 1086 seq += " " * (self._seq_length - len(seq)) 1087 1088 # I need to make sure the sequence is aligned correctly with the query. 1089 # First, I will find the length of the query. Then, if necessary, 1090 # I will pad my current sequence with spaces so that they will line 1091 # up correctly. 1092 1093 # Two possible things can happen: 1094 # QUERY 1095 # 504 1096 # 1097 # QUERY 1098 # 403 1099 # 1100 # Sequence 504 will need padding at the end. Since I won't know 1101 # this until the end of the alignment, this will be handled in 1102 # end_alignment. 1103 # Sequence 403 will need padding before being added to the alignment. 1104 1105 align = self._multiple_alignment.alignment # for convenience 1106 align.append((name, start, seq, end)) 1107 1108 # This is old code that tried to line up all the sequences 1109 # in a multiple alignment by using the sequence title's as 1110 # identifiers. The problem with this is that BLAST assigns 1111 # different HSP's from the same sequence the same id. Thus, 1112 # in one alignment block, there may be multiple sequences with 1113 # the same id. I'm not sure how to handle this, so I'm not 1114 # going to. 1115 1116 # # If the sequence is the query, then just add it. 1117 # if name == 'QUERY': 1118 # if len(align) == 0: 1119 # align.append((name, start, seq)) 1120 # else: 1121 # aname, astart, aseq = align[0] 1122 # if name != aname: 1123 # raise ValueError, "Query is not the first sequence" 1124 # aseq = aseq + seq 1125 # align[0] = aname, astart, aseq 1126 # else: 1127 # if len(align) == 0: 1128 # raise ValueError, "I could not find the query sequence" 1129 # qname, qstart, qseq = align[0] 1130 # 1131 # # Now find my sequence in the multiple alignment. 1132 # for i in range(1, len(align)): 1133 # aname, astart, aseq = align[i] 1134 # if name == aname: 1135 # index = i 1136 # break 1137 # else: 1138 # # If I couldn't find it, then add a new one. 1139 # align.append((None, None, None)) 1140 # index = len(align)-1 1141 # # Make sure to left-pad it. 1142 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1143 # 1144 # if len(qseq) != len(aseq) + len(seq): 1145 # # If my sequences are shorter than the query sequence, 1146 # # then I will need to pad some spaces to make them line up. 1147 # # Since I've already right padded seq, that means aseq 1148 # # must be too short. 1149 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1150 # aseq = aseq + seq 1151 # if astart is None: 1152 # astart = start 1153 # align[index] = aname, astart, aseq 1154 1155 def end_alignment(self): 1156 # Remove trailing newlines 1157 if self._alignment: 1158 self._alignment.title = self._alignment.title.rstrip() 1159 1160 # This code is also obsolete. See note above. 1161 # If there's a multiple alignment, I will need to make sure 1162 # all the sequences are aligned. That is, I may need to 1163 # right-pad the sequences. 1164 # if self._multiple_alignment is not None: 1165 # align = self._multiple_alignment.alignment 1166 # seqlen = None 1167 # for i in range(len(align)): 1168 # name, start, seq = align[i] 1169 # if seqlen is None: 1170 # seqlen = len(seq) 1171 # else: 1172 # if len(seq) < seqlen: 1173 # seq = seq + ' '*(seqlen - len(seq)) 1174 # align[i] = name, start, seq 1175 # elif len(seq) > seqlen: 1176 # raise ValueError, \ 1177 # "Sequence %s is longer than the query" % name 1178 1179 # Clean up some variables, if they exist. 1180 try: 1181 del self._seq_index 1182 del self._seq_length 1183 del self._start_index 1184 del self._start_length 1185 del self._name_length 1186 except AttributeError: 1187 pass 1188 1189 1190class _HSPConsumer: 1191 def start_hsp(self): 1192 self._hsp = Record.HSP() 1193 1194 def score(self, line): 1195 self._hsp.bits, self._hsp.score = _re_search( 1196 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", 1197 line, 1198 "I could not find the score in line\n%s" % line, 1199 ) 1200 self._hsp.score = _safe_float(self._hsp.score) 1201 self._hsp.bits = _safe_float(self._hsp.bits) 1202 1203 x, y = _re_search( 1204 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", 1205 line, 1206 "I could not find the expect in line\n%s" % line, 1207 ) 1208 if x: 1209 self._hsp.num_alignments = _safe_int(x) 1210 else: 1211 self._hsp.num_alignments = 1 1212 self._hsp.expect = _safe_float(y) 1213 1214 def identities(self, line): 1215 x, y = _re_search( 1216 r"Identities = (\d+)\/(\d+)", 1217 line, 1218 "I could not find the identities in line\n%s" % line, 1219 ) 1220 self._hsp.identities = _safe_int(x), _safe_int(y) 1221 self._hsp.align_length = _safe_int(y) 1222 1223 if "Positives" in line: 1224 x, y = _re_search( 1225 r"Positives = (\d+)\/(\d+)", 1226 line, 1227 "I could not find the positives in line\n%s" % line, 1228 ) 1229 self._hsp.positives = _safe_int(x), _safe_int(y) 1230 assert self._hsp.align_length == _safe_int(y) 1231 1232 if "Gaps" in line: 1233 x, y = _re_search( 1234 r"Gaps = (\d+)\/(\d+)", 1235 line, 1236 "I could not find the gaps in line\n%s" % line, 1237 ) 1238 self._hsp.gaps = _safe_int(x), _safe_int(y) 1239 assert self._hsp.align_length == _safe_int(y) 1240 1241 def strand(self, line): 1242 self._hsp.strand = _re_search( 1243 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", 1244 line, 1245 "I could not find the strand in line\n%s" % line, 1246 ) 1247 1248 def frame(self, line): 1249 # Frame can be in formats: 1250 # Frame = +1 1251 # Frame = +2 / +2 1252 if "/" in line: 1253 self._hsp.frame = _re_search( 1254 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", 1255 line, 1256 "I could not find the frame in line\n%s" % line, 1257 ) 1258 else: 1259 self._hsp.frame = _re_search( 1260 r"Frame = ([-+][123])", 1261 line, 1262 "I could not find the frame in line\n%s" % line, 1263 ) 1264 1265 # Match a space, if one is available. Masahir Ishikawa found a 1266 # case where there's no space between the start and the sequence: 1267 # Query: 100tt 101 1268 # line below modified by Yair Benita, Sep 2004 1269 # Note that the colon is not always present. 2006 1270 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1271 1272 def query(self, line): 1273 m = self._query_re.search(line) 1274 if m is None: 1275 if ( 1276 line.strip() 1277 == "Query ------------------------------------------------------------" 1278 ): 1279 # Special case - long gap relative to the subject, 1280 # note there is no start/end present, cannot update those 1281 self._hsp.query += "-" * 60 1282 self._query_len = 60 # number of dashes 1283 self._query_start_index = 13 # offset of first dash 1284 return 1285 raise ValueError("I could not find the query in line\n%s" % line) 1286 1287 # line below modified by Yair Benita, Sep 2004. 1288 # added the end attribute for the query 1289 colon, start, seq, end = m.groups() 1290 seq = seq.strip() 1291 self._hsp.query += seq 1292 if self._hsp.query_start is None: 1293 self._hsp.query_start = _safe_int(start) 1294 1295 # line below added by Yair Benita, Sep 2004. 1296 # added the end attribute for the query 1297 self._hsp.query_end = _safe_int(end) 1298 1299 # Get index for sequence start (regular expression element 3) 1300 self._query_start_index = m.start(3) 1301 self._query_len = len(seq) 1302 1303 def align(self, line): 1304 seq = line[self._query_start_index :].rstrip() 1305 if len(seq) < self._query_len: 1306 # Make sure the alignment is the same length as the query 1307 seq += " " * (self._query_len - len(seq)) 1308 if len(seq) < self._query_len: 1309 raise ValueError("Match is longer than the query in line\n%s" % line) 1310 self._hsp.match += seq 1311 1312 # To match how we do the query, cache the regular expression. 1313 # Note that the colon is not always present. 1314 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1315 1316 def sbjct(self, line): 1317 m = self._sbjct_re.search(line) 1318 if m is None: 1319 raise ValueError("I could not find the sbjct in line\n%s" % line) 1320 colon, start, seq, end = m.groups() 1321 # mikep 26/9/00 1322 # On occasion, there is a blast hit with no subject match 1323 # so far, it only occurs with 1-line short "matches" 1324 # I have decided to let these pass as they appear 1325 if not seq.strip(): 1326 seq = " " * self._query_len 1327 else: 1328 seq = seq.strip() 1329 self._hsp.sbjct += seq 1330 if self._hsp.sbjct_start is None: 1331 self._hsp.sbjct_start = _safe_int(start) 1332 1333 self._hsp.sbjct_end = _safe_int(end) 1334 if len(seq) != self._query_len: 1335 raise ValueError( 1336 "QUERY and SBJCT sequence lengths don't match (%i %r vs %i) in line\n%s" 1337 % (self._query_len, self._hsp.query, len(seq), line) 1338 ) 1339 1340 del self._query_start_index # clean up unused variables 1341 del self._query_len 1342 1343 def end_hsp(self): 1344 pass 1345 1346 1347class _DatabaseReportConsumer: 1348 def start_database_report(self): 1349 self._dr = Record.DatabaseReport() 1350 1351 def database(self, line): 1352 m = re.search(r"Database: (.+)$", line) 1353 if m: 1354 self._dr.database_name.append(m.group(1)) 1355 elif self._dr.database_name: 1356 # This must be a continuation of the previous name. 1357 self._dr.database_name[-1] = "%s%s" % ( 1358 self._dr.database_name[-1], 1359 line.strip(), 1360 ) 1361 1362 def posted_date(self, line): 1363 self._dr.posted_date.append( 1364 _re_search( 1365 r"Posted date:\s*(.+)$", 1366 line, 1367 "I could not find the posted date in line\n%s" % line, 1368 ) 1369 ) 1370 1371 def num_letters_in_database(self, line): 1372 (letters,) = _get_cols( 1373 line, (-1,), ncols=6, expected={2: "letters", 4: "database:"} 1374 ) 1375 self._dr.num_letters_in_database.append(_safe_int(letters)) 1376 1377 def num_sequences_in_database(self, line): 1378 (sequences,) = _get_cols( 1379 line, (-1,), ncols=6, expected={2: "sequences", 4: "database:"} 1380 ) 1381 self._dr.num_sequences_in_database.append(_safe_int(sequences)) 1382 1383 def ka_params(self, line): 1384 self._dr.ka_params = [_safe_float(x) for x in line.split()] 1385 1386 def gapped(self, line): 1387 self._dr.gapped = 1 1388 1389 def ka_params_gap(self, line): 1390 self._dr.ka_params_gap = [_safe_float(x) for x in line.split()] 1391 1392 def end_database_report(self): 1393 pass 1394 1395 1396class _ParametersConsumer: 1397 def start_parameters(self): 1398 self._params = Record.Parameters() 1399 1400 def matrix(self, line): 1401 self._params.matrix = line[8:].rstrip() 1402 1403 def gap_penalties(self, line): 1404 self._params.gap_penalties = [ 1405 _safe_float(x) 1406 for x in _get_cols( 1407 line, (3, 5), ncols=6, expected={2: "Existence:", 4: "Extension:"} 1408 ) 1409 ] 1410 1411 def num_hits(self, line): 1412 if "1st pass" in line: 1413 (x,) = _get_cols(line, (-4,), ncols=11, expected={2: "Hits"}) 1414 self._params.num_hits = _safe_int(x) 1415 else: 1416 (x,) = _get_cols(line, (-1,), ncols=6, expected={2: "Hits"}) 1417 self._params.num_hits = _safe_int(x) 1418 1419 def num_sequences(self, line): 1420 if "1st pass" in line: 1421 (x,) = _get_cols(line, (-4,), ncols=9, expected={2: "Sequences:"}) 1422 self._params.num_sequences = _safe_int(x) 1423 else: 1424 (x,) = _get_cols(line, (-1,), ncols=4, expected={2: "Sequences:"}) 1425 self._params.num_sequences = _safe_int(x) 1426 1427 def num_extends(self, line): 1428 if "1st pass" in line: 1429 (x,) = _get_cols(line, (-4,), ncols=9, expected={2: "extensions:"}) 1430 self._params.num_extends = _safe_int(x) 1431 else: 1432 (x,) = _get_cols(line, (-1,), ncols=4, expected={2: "extensions:"}) 1433 self._params.num_extends = _safe_int(x) 1434 1435 def num_good_extends(self, line): 1436 if "1st pass" in line: 1437 (x,) = _get_cols(line, (-4,), ncols=10, expected={3: "extensions:"}) 1438 self._params.num_good_extends = _safe_int(x) 1439 else: 1440 (x,) = _get_cols(line, (-1,), ncols=5, expected={3: "extensions:"}) 1441 self._params.num_good_extends = _safe_int(x) 1442 1443 def num_seqs_better_e(self, line): 1444 (self._params.num_seqs_better_e,) = _get_cols( 1445 line, (-1,), ncols=7, expected={2: "sequences"} 1446 ) 1447 self._params.num_seqs_better_e = _safe_int(self._params.num_seqs_better_e) 1448 1449 def hsps_no_gap(self, line): 1450 (self._params.hsps_no_gap,) = _get_cols( 1451 line, (-1,), ncols=9, expected={3: "better", 7: "gapping:"} 1452 ) 1453 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap) 1454 1455 def hsps_prelim_gapped(self, line): 1456 (self._params.hsps_prelim_gapped,) = _get_cols( 1457 line, (-1,), ncols=9, expected={4: "gapped", 6: "prelim"} 1458 ) 1459 self._params.hsps_prelim_gapped = _safe_int(self._params.hsps_prelim_gapped) 1460 1461 def hsps_prelim_gapped_attempted(self, line): 1462 (self._params.hsps_prelim_gapped_attempted,) = _get_cols( 1463 line, (-1,), ncols=10, expected={4: "attempted", 7: "prelim"} 1464 ) 1465 self._params.hsps_prelim_gapped_attempted = _safe_int( 1466 self._params.hsps_prelim_gapped_attempted 1467 ) 1468 1469 def hsps_gapped(self, line): 1470 (self._params.hsps_gapped,) = _get_cols( 1471 line, (-1,), ncols=6, expected={3: "gapped"} 1472 ) 1473 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped) 1474 1475 def query_length(self, line): 1476 (self._params.query_length,) = _get_cols( 1477 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "query:"} 1478 ) 1479 self._params.query_length = _safe_int(self._params.query_length) 1480 1481 def database_length(self, line): 1482 (self._params.database_length,) = _get_cols( 1483 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "database:"} 1484 ) 1485 self._params.database_length = _safe_int(self._params.database_length) 1486 1487 def effective_hsp_length(self, line): 1488 (self._params.effective_hsp_length,) = _get_cols( 1489 line, (-1,), ncols=4, expected={1: "HSP", 2: "length:"} 1490 ) 1491 self._params.effective_hsp_length = _safe_int(self._params.effective_hsp_length) 1492 1493 def effective_query_length(self, line): 1494 (self._params.effective_query_length,) = _get_cols( 1495 line, (-1,), ncols=5, expected={1: "length", 3: "query:"} 1496 ) 1497 self._params.effective_query_length = _safe_int( 1498 self._params.effective_query_length 1499 ) 1500 1501 def effective_database_length(self, line): 1502 (self._params.effective_database_length,) = _get_cols( 1503 line.lower(), (-1,), ncols=5, expected={1: "length", 3: "database:"} 1504 ) 1505 self._params.effective_database_length = _safe_int( 1506 self._params.effective_database_length 1507 ) 1508 1509 def effective_search_space(self, line): 1510 (self._params.effective_search_space,) = _get_cols( 1511 line, (-1,), ncols=4, expected={1: "search"} 1512 ) 1513 self._params.effective_search_space = _safe_int( 1514 self._params.effective_search_space 1515 ) 1516 1517 def effective_search_space_used(self, line): 1518 (self._params.effective_search_space_used,) = _get_cols( 1519 line, (-1,), ncols=5, expected={1: "search", 3: "used:"} 1520 ) 1521 self._params.effective_search_space_used = _safe_int( 1522 self._params.effective_search_space_used 1523 ) 1524 1525 def frameshift(self, line): 1526 self._params.frameshift = _get_cols( 1527 line, (4, 5), ncols=6, expected={0: "frameshift", 2: "decay"} 1528 ) 1529 1530 def threshold(self, line): 1531 if line[:2] == "T:": 1532 # Assume its an old style line like "T: 123" 1533 (self._params.threshold,) = _get_cols( 1534 line, (1,), ncols=2, expected={0: "T:"} 1535 ) 1536 elif line[:28] == "Neighboring words threshold:": 1537 (self._params.threshold,) = _get_cols( 1538 line, 1539 (3,), 1540 ncols=4, 1541 expected={0: "Neighboring", 1: "words", 2: "threshold:"}, 1542 ) 1543 else: 1544 raise ValueError("Unrecognised threshold line:\n%s" % line) 1545 self._params.threshold = _safe_int(self._params.threshold) 1546 1547 def window_size(self, line): 1548 if line[:2] == "A:": 1549 (self._params.window_size,) = _get_cols( 1550 line, (1,), ncols=2, expected={0: "A:"} 1551 ) 1552 elif line[:25] == "Window for multiple hits:": 1553 (self._params.window_size,) = _get_cols( 1554 line, (4,), ncols=5, expected={0: "Window", 2: "multiple", 3: "hits:"} 1555 ) 1556 else: 1557 raise ValueError("Unrecognised window size line:\n%s" % line) 1558 self._params.window_size = _safe_int(self._params.window_size) 1559 1560 def dropoff_1st_pass(self, line): 1561 score, bits = _re_search( 1562 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", 1563 line, 1564 "I could not find the dropoff in line\n%s" % line, 1565 ) 1566 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits) 1567 1568 def gap_x_dropoff(self, line): 1569 score, bits = _re_search( 1570 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", 1571 line, 1572 "I could not find the gap dropoff in line\n%s" % line, 1573 ) 1574 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits) 1575 1576 def gap_x_dropoff_final(self, line): 1577 score, bits = _re_search( 1578 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", 1579 line, 1580 "I could not find the gap dropoff final in line\n%s" % line, 1581 ) 1582 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits) 1583 1584 def gap_trigger(self, line): 1585 score, bits = _re_search( 1586 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", 1587 line, 1588 "I could not find the gap trigger in line\n%s" % line, 1589 ) 1590 self._params.gap_trigger = _safe_int(score), _safe_float(bits) 1591 1592 def blast_cutoff(self, line): 1593 score, bits = _re_search( 1594 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", 1595 line, 1596 "I could not find the blast cutoff in line\n%s" % line, 1597 ) 1598 self._params.blast_cutoff = _safe_int(score), _safe_float(bits) 1599 1600 def end_parameters(self): 1601 pass 1602 1603 1604class _BlastConsumer( 1605 AbstractConsumer, 1606 _HeaderConsumer, 1607 _DescriptionConsumer, 1608 _AlignmentConsumer, 1609 _HSPConsumer, 1610 _DatabaseReportConsumer, 1611 _ParametersConsumer, 1612): 1613 # This Consumer is inherits from many other consumer classes that handle 1614 # the actual dirty work. An alternate way to do it is to create objects 1615 # of those classes and then delegate the parsing tasks to them in a 1616 # decorator-type pattern. The disadvantage of that is that the method 1617 # names will need to be resolved in this classes. However, using 1618 # a decorator will retain more control in this class (which may or 1619 # may not be a bad thing). In addition, having each sub-consumer as 1620 # its own object prevents this object's dictionary from being cluttered 1621 # with members and reduces the chance of member collisions. 1622 def __init__(self): 1623 self.data = None 1624 1625 def round(self, line): 1626 # Make sure nobody's trying to pass me PSI-BLAST data! 1627 raise ValueError("This consumer doesn't handle PSI-BLAST data") 1628 1629 def start_header(self): 1630 self.data = Record.Blast() 1631 _HeaderConsumer.start_header(self) 1632 1633 def end_header(self): 1634 _HeaderConsumer.end_header(self) 1635 self.data.__dict__.update(self._header.__dict__) 1636 1637 def end_descriptions(self): 1638 self.data.descriptions = self._descriptions 1639 1640 def end_alignment(self): 1641 _AlignmentConsumer.end_alignment(self) 1642 if self._alignment.hsps: 1643 self.data.alignments.append(self._alignment) 1644 if self._multiple_alignment.alignment: 1645 self.data.multiple_alignment = self._multiple_alignment 1646 1647 def end_hsp(self): 1648 _HSPConsumer.end_hsp(self) 1649 try: 1650 self._alignment.hsps.append(self._hsp) 1651 except AttributeError: 1652 raise ValueError("Found an HSP before an alignment") from None 1653 1654 def end_database_report(self): 1655 _DatabaseReportConsumer.end_database_report(self) 1656 self.data.__dict__.update(self._dr.__dict__) 1657 1658 def end_parameters(self): 1659 _ParametersConsumer.end_parameters(self) 1660 self.data.__dict__.update(self._params.__dict__) 1661 1662 1663class _PSIBlastConsumer( 1664 AbstractConsumer, 1665 _HeaderConsumer, 1666 _DescriptionConsumer, 1667 _AlignmentConsumer, 1668 _HSPConsumer, 1669 _DatabaseReportConsumer, 1670 _ParametersConsumer, 1671): 1672 def __init__(self): 1673 self.data = None 1674 1675 def start_header(self): 1676 self.data = Record.PSIBlast() 1677 _HeaderConsumer.start_header(self) 1678 1679 def end_header(self): 1680 _HeaderConsumer.end_header(self) 1681 self.data.__dict__.update(self._header.__dict__) 1682 1683 def start_descriptions(self): 1684 self._round = Record.Round() 1685 self.data.rounds.append(self._round) 1686 _DescriptionConsumer.start_descriptions(self) 1687 1688 def end_descriptions(self): 1689 _DescriptionConsumer.end_descriptions(self) 1690 self._round.number = self._roundnum 1691 if self._descriptions: 1692 self._round.new_seqs.extend(self._descriptions) 1693 self._round.reused_seqs.extend(self._model_sequences) 1694 self._round.new_seqs.extend(self._nonmodel_sequences) 1695 if self._converged: 1696 self.data.converged = 1 1697 1698 def end_alignment(self): 1699 _AlignmentConsumer.end_alignment(self) 1700 if self._alignment.hsps: 1701 self._round.alignments.append(self._alignment) 1702 if self._multiple_alignment: 1703 self._round.multiple_alignment = self._multiple_alignment 1704 1705 def end_hsp(self): 1706 _HSPConsumer.end_hsp(self) 1707 try: 1708 self._alignment.hsps.append(self._hsp) 1709 except AttributeError: 1710 raise ValueError("Found an HSP before an alignment") from None 1711 1712 def end_database_report(self): 1713 _DatabaseReportConsumer.end_database_report(self) 1714 self.data.__dict__.update(self._dr.__dict__) 1715 1716 def end_parameters(self): 1717 _ParametersConsumer.end_parameters(self) 1718 self.data.__dict__.update(self._params.__dict__) 1719 1720 1721class Iterator: 1722 """Iterates over a file of multiple BLAST results. 1723 1724 Methods: 1725 next Return the next record from the stream, or None. 1726 1727 """ 1728 1729 def __init__(self, handle, parser=None): 1730 """Initialize a new iterator. 1731 1732 Arguments: 1733 - handle is a file-like object. 1734 - parser is an optional Parser object to change the results 1735 into another form. If set to None, then the raw contents 1736 of the file will be returned. 1737 """ 1738 try: 1739 handle.readline 1740 except AttributeError: 1741 raise ValueError( 1742 "I expected a file handle or file-like object, got %s" % type(handle) 1743 ) from None 1744 self._uhandle = UndoHandle(handle) 1745 self._parser = parser 1746 self._header = [] 1747 1748 def __next__(self): 1749 """Return the next Blast record from the file. 1750 1751 If no more records, return None. 1752 """ 1753 lines = [] 1754 query = False 1755 while True: 1756 line = self._uhandle.readline() 1757 if not line: 1758 break 1759 # If I've reached the next one, then put the line back and stop. 1760 if lines and ( 1761 line.startswith("BLAST") 1762 or line.startswith("BLAST", 1) 1763 or line.startswith("<?xml ") 1764 ): 1765 self._uhandle.saveline(line) 1766 break 1767 # New style files omit the BLAST line to mark a new query: 1768 if line.startswith("Query="): 1769 if not query: 1770 if not self._header: 1771 self._header = lines[:] 1772 query = True 1773 else: 1774 # Start of another record 1775 self._uhandle.saveline(line) 1776 break 1777 lines.append(line) 1778 1779 if query and "BLAST" not in lines[0]: 1780 # Cheat and re-insert the header 1781 # print("-"*50) 1782 # print("".join(self._header)) 1783 # print("-"*50) 1784 # print("".join(lines)) 1785 # print("-"*50) 1786 lines = self._header + lines 1787 1788 if not lines: 1789 return None 1790 1791 data = "".join(lines) 1792 if self._parser is not None: 1793 return self._parser.parse(StringIO(data)) 1794 return data 1795 1796 def __iter__(self): 1797 return iter(self.__next__, None) 1798 1799 1800def _re_search(regex, line, error_msg): 1801 m = re.search(regex, line) 1802 if not m: 1803 raise ValueError(error_msg) 1804 return m.groups() 1805 1806 1807def _get_cols(line, cols_to_get, ncols=None, expected=None): 1808 if expected is None: 1809 expected = {} 1810 cols = line.split() 1811 1812 # Check to make sure number of columns is correct 1813 if ncols is not None and len(cols) != ncols: 1814 raise ValueError( 1815 "I expected %d columns (got %d) in line\n%s" % (ncols, len(cols), line) 1816 ) 1817 1818 # Check to make sure columns contain the correct data 1819 for k in expected: 1820 if cols[k] != expected[k]: 1821 raise ValueError( 1822 "I expected '%s' in column %d in line\n%s" % (expected[k], k, line) 1823 ) 1824 1825 # Construct the answer tuple 1826 results = [] 1827 for c in cols_to_get: 1828 results.append(cols[c]) 1829 return tuple(results) 1830 1831 1832def _safe_int(str): 1833 try: 1834 return int(str) 1835 except ValueError: 1836 # Something went wrong. Try to clean up the string. 1837 # Remove all commas from the string 1838 str = str.replace(",", "") 1839 # try again after removing commas. 1840 # Note int() will return a long rather than overflow 1841 try: 1842 return int(str) 1843 except ValueError: 1844 pass 1845 # Call float to handle things like "54.3", note could lose precision, e.g. 1846 # >>> int("5399354557888517312") 1847 # 5399354557888517312 1848 # >>> int(float("5399354557888517312")) 1849 # 5399354557888517120 1850 return int(float(str)) 1851 1852 1853def _safe_float(str): 1854 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1855 # float('e-172') does not produce an error on his platform. Thus, 1856 # we need to check the string for this condition. 1857 1858 # Sometimes BLAST leaves of the '1' in front of an exponent. 1859 if str and str[0] in ["E", "e"]: 1860 str = "1" + str 1861 try: 1862 return float(str) 1863 except ValueError: 1864 # Remove all commas from the string 1865 str = str.replace(",", "") 1866 # try again. 1867 return float(str) 1868 1869 1870class _BlastErrorConsumer(_BlastConsumer): 1871 def __init__(self): 1872 _BlastConsumer.__init__(self) 1873 1874 def noevent(self, line): 1875 if "Query must be at least wordsize" in line: 1876 raise ShortQueryBlastError("Query must be at least wordsize") 1877 # Now pass the line back up to the superclass. 1878 method = getattr( 1879 _BlastConsumer, "noevent", _BlastConsumer.__getattr__(self, "noevent") 1880 ) 1881 method(line) 1882 1883 1884class BlastErrorParser(AbstractParser): 1885 """Attempt to catch and diagnose BLAST errors while parsing. 1886 1887 This utilizes the BlastParser module but adds an additional layer 1888 of complexity on top of it by attempting to diagnose ValueErrors 1889 that may actually indicate problems during BLAST parsing. 1890 1891 Current BLAST problems this detects are: 1892 - LowQualityBlastError - When BLASTing really low quality sequences 1893 (ie. some GenBank entries which are just short stretches of a single 1894 nucleotide), BLAST will report an error with the sequence and be 1895 unable to search with this. This will lead to a badly formatted 1896 BLAST report that the parsers choke on. The parser will convert the 1897 ValueError to a LowQualityBlastError and attempt to provide useful 1898 information. 1899 """ 1900 1901 def __init__(self, bad_report_handle=None): 1902 """Initialize a parser that tries to catch BlastErrors. 1903 1904 Arguments: 1905 - bad_report_handle - An optional argument specifying a handle 1906 where bad reports should be sent. This would allow you to save 1907 all of the bad reports to a file, for instance. If no handle 1908 is specified, the bad reports will not be saved. 1909 """ 1910 self._bad_report_handle = bad_report_handle 1911 1912 # self._b_parser = BlastParser() 1913 self._scanner = _Scanner() 1914 self._consumer = _BlastErrorConsumer() 1915 1916 def parse(self, handle): 1917 """Parse a handle, attempting to diagnose errors.""" 1918 results = handle.read() 1919 1920 try: 1921 self._scanner.feed(StringIO(results), self._consumer) 1922 except ValueError: 1923 # if we have a bad_report_file, save the info to it first 1924 if self._bad_report_handle: 1925 # send the info to the error handle 1926 self._bad_report_handle.write(results) 1927 1928 # now we want to try and diagnose the error 1929 self._diagnose_error(StringIO(results), self._consumer.data) 1930 1931 # if we got here we can't figure out the problem 1932 # so we should pass along the syntax error we got 1933 raise 1934 return self._consumer.data 1935 1936 def _diagnose_error(self, handle, data_record): 1937 """Attempt to diagnose an error in the passed handle (PRIVATE). 1938 1939 Arguments: 1940 - handle - The handle potentially containing the error 1941 - data_record - The data record partially created by the consumer. 1942 """ 1943 line = handle.readline() 1944 1945 while line: 1946 # 'Searchingdone' instead of 'Searching......done' seems 1947 # to indicate a failure to perform the BLAST due to 1948 # low quality sequence 1949 if line.startswith("Searchingdone"): 1950 raise LowQualityBlastError( 1951 "Blast failure occurred on query: ", data_record.query 1952 ) 1953 line = handle.readline() 1954