1# Copyright 1999-2000 by Jeffrey Chang.  All rights reserved.
2# This code is part of the Biopython distribution and governed by its
3# license.  Please see the LICENSE file that should have been included
4# as part of this package.
5# Patches by Mike Poidinger to support multiple databases.
6# Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15
7
8"""Code for calling standalone BLAST and parsing plain text output (DEPRECATED).
9
10Rather than parsing the human readable plain text BLAST output (which seems to
11change with every update to BLAST), we and the NBCI recommend you parse the
12XML output instead. The plain text parser in this module still works at the
13time of writing, but is considered obsolete and updating it to cope with the
14latest versions of BLAST is not a priority for us.
15
16This module also provides code to work with the "legacy" standalone version of
17NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of
18the same name. These functions are very limited for dealing with the output as
19files rather than handles, for which the wrappers in Bio.Blast.Applications are
20preferred. Furthermore, the NCBI themselves regard these command line tools as
21"legacy", and encourage using the new BLAST+ tools instead. Biopython has
22wrappers for these under Bio.Blast.Applications (see the tutorial).
23"""
24
25import re
26
27from io import StringIO
28from Bio.SearchIO._legacy.ParserSupport import (
29    UndoHandle,
30    AbstractParser,
31    AbstractConsumer,
32    read_and_call,
33    read_and_call_until,
34    read_and_call_while,
35    attempt_read_and_call,
36    is_blank_line,
37    safe_peekline,
38    safe_readline,
39)
40from Bio.Blast import Record
41
42from Bio import BiopythonWarning
43import warnings
44
45_score_e_re = re.compile(r"Score +E")
46
47
48class LowQualityBlastError(Exception):
49    """Error caused by running a low quality sequence through BLAST.
50
51    When low quality sequences (like GenBank entries containing only
52    stretches of a single nucleotide) are BLASTed, they will result in
53    BLAST generating an error and not being able to perform the BLAST.
54    search. This error should be raised for the BLAST reports produced
55    in this case.
56    """
57
58    pass
59
60
61class ShortQueryBlastError(Exception):
62    """Error caused by running a short query sequence through BLAST.
63
64    If the query sequence is too short, BLAST outputs warnings and errors::
65
66        Searching[blastall] WARNING:  [000.000]  AT1G08320: SetUpBlastSearch failed.
67        [blastall] ERROR:  [000.000]  AT1G08320: Blast:
68        [blastall] ERROR:  [000.000]  AT1G08320: Blast: Query must be at least wordsize
69        done
70
71    This exception is raised when that condition is detected.
72    """
73
74    pass
75
76
77class _Scanner:
78    """Scan BLAST output from blastall or blastpgp.
79
80    Tested with blastall and blastpgp v2.0.10, v2.0.11
81
82    Methods:
83     - feed     Feed data into the scanner.
84
85    """
86
87    def __init__(self):
88        """Raise warning that this module is outdated."""
89        warnings.warn(
90            "Parsing BLAST plain text output file is not a well supported"
91            " functionality anymore. Consider generating your BLAST output for parsing"
92            " as XML or tabular format instead.",
93            BiopythonWarning,
94        )
95
96    def feed(self, handle, consumer):
97        """Feed in a BLAST report for scanning.
98
99        Arguments:
100         - handle is a file-like object that contains the BLAST report.
101         - consumer is a Consumer object that will receive events as the
102           report is scanned.
103        """
104        if isinstance(handle, UndoHandle):
105            uhandle = handle
106        else:
107            uhandle = UndoHandle(handle)
108
109        # Try to fast-forward to the beginning of the blast report.
110        read_and_call_until(uhandle, consumer.noevent, contains="BLAST")
111        # Now scan the BLAST report.
112        self._scan_header(uhandle, consumer)
113        self._scan_rounds(uhandle, consumer)
114        self._scan_database_report(uhandle, consumer)
115        self._scan_parameters(uhandle, consumer)
116
117    def _scan_header(self, uhandle, consumer):
118        # BLASTP 2.0.10 [Aug-26-1999]
119        #
120        #
121        # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf
122        # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997),
123        # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea
124        # programs",  Nucleic Acids Res. 25:3389-3402.
125        #
126        # Query= test
127        #          (140 letters)
128        #
129        # Database: sdqib40-1.35.seg.fa
130        #            1323 sequences; 223,339 total letters
131        #
132        # ========================================================
133        # This next example is from the online version of Blast,
134        # note there are TWO references, an RID line, and also
135        # the database is BEFORE the query line.
136        # Note there possibleuse of non-ASCII in the author names.
137        # ========================================================
138        #
139        # BLASTP 2.2.15 [Oct-15-2006]
140        # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer,
141        # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman
142        # (1997), "Gapped BLAST and PSI-BLAST: a new generation of
143        # protein database search programs", Nucleic Acids Res. 25:3389-3402.
144        #
145        # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei
146        # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and
147        # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST
148        # protein database searches with composition-based statistics
149        # and other refinements", Nucleic Acids Res. 29:2994-3005.
150        #
151        # RID: 1166022616-19998-65316425856.BLASTQ1
152        #
153        #
154        # Database: All non-redundant GenBank CDS
155        # translations+PDB+SwissProt+PIR+PRF excluding environmental samples
156        #            4,254,166 sequences; 1,462,033,012 total letters
157        # Query=  gi:16127998
158        # Length=428
159        #
160
161        consumer.start_header()
162
163        read_and_call(uhandle, consumer.version, contains="BLAST")
164        read_and_call_while(uhandle, consumer.noevent, blank=1)
165
166        # There might be a <pre> line, for qblast output.
167        attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
168
169        # Read the reference(s)
170        while attempt_read_and_call(uhandle, consumer.reference, start="Reference"):
171            # References are normally multiline terminated by a blank line
172            # (or, based on the old code, the RID line)
173            while True:
174                line = uhandle.readline()
175                if is_blank_line(line):
176                    consumer.noevent(line)
177                    break
178                elif line.startswith("RID"):
179                    break
180                else:
181                    # More of the reference
182                    consumer.reference(line)
183
184        # Deal with the optional RID: ...
185        read_and_call_while(uhandle, consumer.noevent, blank=1)
186        attempt_read_and_call(uhandle, consumer.reference, start="RID:")
187        read_and_call_while(uhandle, consumer.noevent, blank=1)
188
189        # blastpgp may have a reference for compositional score matrix
190        # adjustment (see Bug 2502):
191        if attempt_read_and_call(uhandle, consumer.reference, start="Reference"):
192            read_and_call_until(uhandle, consumer.reference, blank=1)
193            read_and_call_while(uhandle, consumer.noevent, blank=1)
194
195        # blastpgp has a Reference for composition-based statistics.
196        if attempt_read_and_call(uhandle, consumer.reference, start="Reference"):
197            read_and_call_until(uhandle, consumer.reference, blank=1)
198            read_and_call_while(uhandle, consumer.noevent, blank=1)
199
200        line = uhandle.peekline()
201        assert line.strip() != ""
202        assert not line.startswith("RID:")
203        if line.startswith("Query="):
204            # This is an old style query then database...
205
206            # Read the Query lines and the following blank line.
207            read_and_call(uhandle, consumer.query_info, start="Query=")
208            read_and_call_until(uhandle, consumer.query_info, blank=1)
209            read_and_call_while(uhandle, consumer.noevent, blank=1)
210
211            # Read the database lines and the following blank line.
212            read_and_call_until(uhandle, consumer.database_info, end="total letters")
213            read_and_call(uhandle, consumer.database_info, contains="sequences")
214            read_and_call_while(uhandle, consumer.noevent, blank=1)
215        elif line.startswith("Database:"):
216            # This is a new style database then query...
217            read_and_call_until(uhandle, consumer.database_info, end="total letters")
218            read_and_call(uhandle, consumer.database_info, contains="sequences")
219            read_and_call_while(uhandle, consumer.noevent, blank=1)
220
221            # Read the Query lines and the following blank line.
222            # Or, on BLAST 2.2.22+ there is no blank link - need to spot
223            # the "... Score     E" line instead.
224            read_and_call(uhandle, consumer.query_info, start="Query=")
225            # BLAST 2.2.25+ has a blank line before Length=
226            read_and_call_until(uhandle, consumer.query_info, start="Length=")
227            while True:
228                line = uhandle.peekline()
229                if not line.strip() or _score_e_re.search(line) is not None:
230                    break
231                # It is more of the query (and its length)
232                read_and_call(uhandle, consumer.query_info)
233            read_and_call_while(uhandle, consumer.noevent, blank=1)
234        else:
235            raise ValueError("Invalid header?")
236
237        consumer.end_header()
238
239    def _scan_rounds(self, uhandle, consumer):
240        # Scan a bunch of rounds.
241        # Each round begins with either a "Searching......" line
242        # or a 'Score     E' line followed by descriptions and alignments.
243        # The email server doesn't give the "Searching....." line.
244        # If there is no 'Searching.....' line then you'll first see a
245        # 'Results from round' line
246
247        while not self._eof(uhandle):
248            line = safe_peekline(uhandle)
249            if (
250                not line.startswith("Searching")
251                and not line.startswith("Results from round")
252                and _score_e_re.search(line) is None
253                and "No hits found" not in line
254            ):
255                break
256            self._scan_descriptions(uhandle, consumer)
257            self._scan_alignments(uhandle, consumer)
258
259    def _scan_descriptions(self, uhandle, consumer):
260        # Searching..................................................done
261        # Results from round 2
262        #
263        #
264        #                                                                    Sc
265        # Sequences producing significant alignments:                        (b
266        # Sequences used in model and found again:
267        #
268        # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ...
269        # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B...
270        # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)]
271        #
272        # Sequences not found previously or not previously below threshold:
273        #
274        # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia]
275        # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb...
276        #
277
278        # If PSI-BLAST, may also have:
279        #
280        # CONVERGED!
281
282        consumer.start_descriptions()
283
284        # Read 'Searching'
285        # This line seems to be missing in BLASTN 2.1.2 (others?)
286        attempt_read_and_call(uhandle, consumer.noevent, start="Searching")
287
288        # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here.
289        # If this happens, the handle will yield no more information.
290        if not uhandle.peekline():
291            raise ValueError(
292                "Unexpected end of blast report. Looks suspiciously like a PSI-BLAST crash."
293            )
294
295        # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here:
296        # Searching[blastall] WARNING:  [000.000]  AT1G08320: SetUpBlastSearch
297        # [blastall] ERROR:  [000.000]  AT1G08320: Blast:
298        # [blastall] ERROR:  [000.000]  AT1G08320: Blast: Query must be at leas
299        # done
300        # Reported by David Weisman.
301        # Check for these error lines and ignore them for now.  Let
302        # the BlastErrorParser deal with them.
303        line = uhandle.peekline()
304        if "ERROR:" in line or line.startswith("done"):
305            read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
306            read_and_call(uhandle, consumer.noevent, start="done")
307
308        # Check to see if this is PSI-BLAST.
309        # If it is, the 'Searching' line will be followed by:
310        # (version 2.0.10)
311        #     Searching.............................
312        #     Results from round 2
313        # or (version 2.0.11)
314        #     Searching.............................
315        #
316        #
317        #     Results from round 2
318
319        # Skip a bunch of blank lines.
320        read_and_call_while(uhandle, consumer.noevent, blank=1)
321        # Check for the results line if it's there.
322        if attempt_read_and_call(uhandle, consumer.round, start="Results"):
323            read_and_call_while(uhandle, consumer.noevent, blank=1)
324
325        # Three things can happen here:
326        # 1.  line contains 'Score     E'
327        # 2.  line contains "No hits found"
328        # 3.  no descriptions
329        # The first one begins a bunch of descriptions.  The last two
330        # indicates that no descriptions follow, and we should go straight
331        # to the alignments.
332        if not attempt_read_and_call(
333            uhandle, consumer.description_header, has_re=_score_e_re
334        ):
335            # Either case 2 or 3.  Look for "No hits found".
336            attempt_read_and_call(uhandle, consumer.no_hits, contains="No hits found")
337            try:
338                read_and_call_while(uhandle, consumer.noevent, blank=1)
339            except ValueError as err:
340                if str(err) != "Unexpected end of stream.":
341                    raise
342
343            consumer.end_descriptions()
344            # Stop processing.
345            return
346
347        # Read the score header lines
348        read_and_call(uhandle, consumer.description_header, start="Sequences producing")
349
350        # If PSI-BLAST, read the 'Sequences used in model' line.
351        attempt_read_and_call(
352            uhandle, consumer.model_sequences, start="Sequences used in model"
353        )
354        read_and_call_while(uhandle, consumer.noevent, blank=1)
355
356        # In BLAT, rather than a "No hits found" line, we just
357        # get no descriptions (and no alignments). This can be
358        # spotted because the next line is the database block:
359        if safe_peekline(uhandle).startswith("  Database:"):
360            consumer.end_descriptions()
361            # Stop processing.
362            return
363
364        # Read the descriptions and the following blank lines, making
365        # sure that there are descriptions.
366        if not uhandle.peekline().startswith("Sequences not found"):
367            read_and_call_until(uhandle, consumer.description, blank=1)
368            read_and_call_while(uhandle, consumer.noevent, blank=1)
369
370        # If PSI-BLAST, read the 'Sequences not found' line followed
371        # by more descriptions.  However, I need to watch out for the
372        # case where there were no sequences not found previously, in
373        # which case there will be no more descriptions.
374        if attempt_read_and_call(
375            uhandle, consumer.nonmodel_sequences, start="Sequences not found"
376        ):
377            # Read the descriptions and the following blank lines.
378            read_and_call_while(uhandle, consumer.noevent, blank=1)
379            line = safe_peekline(uhandle)
380            # Brad -- added check for QUERY. On some PSI-BLAST outputs
381            # there will be a 'Sequences not found' line followed by no
382            # descriptions. Check for this case since the first thing you'll
383            # get is a blank line and then 'QUERY'
384            if (
385                not line.startswith("CONVERGED")
386                and line[0] != ">"
387                and not line.startswith("QUERY")
388            ):
389                read_and_call_until(uhandle, consumer.description, blank=1)
390                read_and_call_while(uhandle, consumer.noevent, blank=1)
391
392        attempt_read_and_call(uhandle, consumer.converged, start="CONVERGED")
393        read_and_call_while(uhandle, consumer.noevent, blank=1)
394
395        consumer.end_descriptions()
396
397    def _scan_alignments(self, uhandle, consumer):
398        if self._eof(uhandle):
399            return
400
401        # qblast inserts a helpful line here.
402        attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS")
403
404        # First, check to see if I'm at the database report.
405        line = safe_peekline(uhandle)
406        if not line:
407            # EOF
408            return
409        elif line.startswith("  Database") or line.startswith("Lambda"):
410            return
411        elif line[0] == ">":
412            # XXX make a better check here between pairwise and masterslave
413            self._scan_pairwise_alignments(uhandle, consumer)
414        elif line.startswith("Effective"):
415            return
416        else:
417            # XXX put in a check to make sure I'm in a masterslave alignment
418            self._scan_masterslave_alignment(uhandle, consumer)
419
420    def _scan_pairwise_alignments(self, uhandle, consumer):
421        while not self._eof(uhandle):
422            line = safe_peekline(uhandle)
423            if line[0] != ">":
424                break
425            self._scan_one_pairwise_alignment(uhandle, consumer)
426
427    def _scan_one_pairwise_alignment(self, uhandle, consumer):
428        if self._eof(uhandle):
429            return
430        consumer.start_alignment()
431
432        self._scan_alignment_header(uhandle, consumer)
433
434        # Scan a bunch of score/alignment pairs.
435        while True:
436            if self._eof(uhandle):
437                # Shouldn't have issued that _scan_alignment_header event...
438                break
439            line = safe_peekline(uhandle)
440            if not line.startswith(" Score"):
441                break
442            self._scan_hsp(uhandle, consumer)
443        consumer.end_alignment()
444
445    def _scan_alignment_header(self, uhandle, consumer):
446        # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus
447        #           stearothermophilus]
448        #           Length = 81
449        #
450        # Or, more recently with different white space:
451        #
452        # >gi|15799684|ref|NP_285696.1| threonine synthase ...
453        #  gi|15829258|ref|NP_308031.1| threonine synthase
454        #  ...
455        # Length=428
456        read_and_call(uhandle, consumer.title, start=">")
457        while True:
458            line = safe_readline(uhandle)
459            if line.lstrip().startswith(("Length =", "Length=")):
460                consumer.length(line)
461                break
462            elif is_blank_line(line):
463                # Check to make sure I haven't missed the Length line
464                raise ValueError("I missed the Length in an alignment header")
465            consumer.title(line)
466
467        # Older versions of BLAST will have a line with some spaces.
468        # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line.
469        if not attempt_read_and_call(uhandle, consumer.noevent, start="          "):
470            read_and_call(uhandle, consumer.noevent, blank=1)
471
472    def _scan_hsp(self, uhandle, consumer):
473        consumer.start_hsp()
474        self._scan_hsp_header(uhandle, consumer)
475        self._scan_hsp_alignment(uhandle, consumer)
476        consumer.end_hsp()
477
478    def _scan_hsp_header(self, uhandle, consumer):
479        #  Score = 22.7 bits (47), Expect = 2.5
480        #  Identities = 10/36 (27%), Positives = 18/36 (49%)
481        #  Strand = Plus / Plus
482        #  Frame = +3
483        #
484
485        read_and_call(uhandle, consumer.score, start=" Score")
486        read_and_call(uhandle, consumer.identities, start=" Identities")
487        # BLASTN
488        attempt_read_and_call(uhandle, consumer.strand, start=" Strand")
489        # BLASTX, TBLASTN, TBLASTX
490        attempt_read_and_call(uhandle, consumer.frame, start=" Frame")
491        read_and_call(uhandle, consumer.noevent, blank=1)
492
493    def _scan_hsp_alignment(self, uhandle, consumer):
494        # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF
495        #           GRGVS+         TC    Y  + + V GGG+ + EE   L     +   I R+
496        # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG
497        #
498        # Query: 64 AEKILIKR 71
499        #              I +K
500        # Sbjct: 70 PNIIQLKD 77
501        #
502
503        while True:
504            # Blastn adds an extra line filled with spaces before Query
505            attempt_read_and_call(uhandle, consumer.noevent, start="     ")
506            read_and_call(uhandle, consumer.query, start="Query")
507            read_and_call(uhandle, consumer.align, start="     ")
508            read_and_call(uhandle, consumer.sbjct, start="Sbjct")
509            try:
510                read_and_call_while(uhandle, consumer.noevent, blank=1)
511            except ValueError as err:
512                if str(err) != "Unexpected end of stream.":
513                    raise
514                # End of File (well, it looks like it with recent versions
515                # of BLAST for multiple queries after the Iterator class
516                # has broken up the whole file into chunks).
517                break
518            line = safe_peekline(uhandle)
519            # Alignment continues if I see a 'Query' or the spaces for Blastn.
520            if not (line.startswith("Query") or line.startswith("     ")):
521                break
522
523    def _scan_masterslave_alignment(self, uhandle, consumer):
524        consumer.start_alignment()
525        while True:
526            line = safe_readline(uhandle)
527            # Check to see whether I'm finished reading the alignment.
528            # This is indicated by 1) database section, 2) next psi-blast
529            # round, which can also be a 'Results from round' if no
530            # searching line is present
531            # patch by chapmanb
532            if line.startswith("Searching") or line.startswith("Results from round"):
533                uhandle.saveline(line)
534                break
535            elif line.startswith("  Database"):
536                uhandle.saveline(line)
537                break
538            elif is_blank_line(line):
539                consumer.noevent(line)
540            else:
541                consumer.multalign(line)
542        read_and_call_while(uhandle, consumer.noevent, blank=1)
543        consumer.end_alignment()
544
545    def _eof(self, uhandle):
546        try:
547            line = safe_peekline(uhandle)
548        except ValueError as err:
549            if str(err) != "Unexpected end of stream.":
550                raise
551            line = ""
552        return not line
553
554    def _scan_database_report(self, uhandle, consumer):
555        #   Database: sdqib40-1.35.seg.fa
556        #     Posted date:  Nov 1, 1999  4:25 PM
557        #   Number of letters in database: 223,339
558        #   Number of sequences in database:  1323
559        #
560        # Lambda     K      H
561        #    0.322    0.133    0.369
562        #
563        # Gapped
564        # Lambda     K      H
565        #    0.270   0.0470    0.230
566        #
567        ##########################################
568        # Or, more recently Blast 2.2.15 gives less blank lines
569        ##########################################
570        #   Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding
571        # environmental samples
572        #     Posted date:  Dec 12, 2006  5:51 PM
573        #   Number of letters in database: 667,088,753
574        #   Number of sequences in database:  2,094,974
575        # Lambda     K      H
576        #    0.319    0.136    0.395
577        # Gapped
578        # Lambda     K      H
579        #    0.267   0.0410    0.140
580
581        if self._eof(uhandle):
582            return
583
584        consumer.start_database_report()
585
586        # Subset of the database(s) listed below
587        #    Number of letters searched: 562,618,960
588        #    Number of sequences searched:  228,924
589        if attempt_read_and_call(uhandle, consumer.noevent, start="  Subset"):
590            read_and_call(uhandle, consumer.noevent, contains="letters")
591            read_and_call(uhandle, consumer.noevent, contains="sequences")
592            read_and_call(uhandle, consumer.noevent, start="  ")
593
594        # Sameet Mehta reported seeing output from BLASTN 2.2.9 that
595        # was missing the "Database" stanza completely.
596        while attempt_read_and_call(uhandle, consumer.database, start="  Database"):
597            # BLAT output ends abruptly here, without any of the other
598            # information.  Check to see if this is the case.  If so,
599            # then end the database report here gracefully.
600            if not uhandle.peekline().strip() or uhandle.peekline().startswith("BLAST"):
601                consumer.end_database_report()
602                return
603
604            # Database can span multiple lines.
605            read_and_call_until(uhandle, consumer.database, start="    Posted")
606            read_and_call(uhandle, consumer.posted_date, start="    Posted")
607            read_and_call(
608                uhandle, consumer.num_letters_in_database, start="  Number of letters"
609            )
610            read_and_call(
611                uhandle,
612                consumer.num_sequences_in_database,
613                start="  Number of sequences",
614            )
615            # There may not be a line starting with spaces...
616            attempt_read_and_call(uhandle, consumer.noevent, start="  ")
617
618            line = safe_readline(uhandle)
619            uhandle.saveline(line)
620            if "Lambda" in line:
621                break
622
623        try:
624            read_and_call(uhandle, consumer.noevent, start="Lambda")
625            read_and_call(uhandle, consumer.ka_params)
626        except Exception:  # TODO: ValueError, AttributeError?
627            pass
628
629        # This blank line is optional:
630        attempt_read_and_call(uhandle, consumer.noevent, blank=1)
631
632        # not BLASTP
633        attempt_read_and_call(uhandle, consumer.gapped, start="Gapped")
634        # not TBLASTX
635        if attempt_read_and_call(uhandle, consumer.noevent, start="Lambda"):
636            read_and_call(uhandle, consumer.ka_params_gap)
637
638        # Blast 2.2.4 can sometimes skip the whole parameter section.
639        # Thus, I need to be careful not to read past the end of the
640        # file.
641        try:
642            read_and_call_while(uhandle, consumer.noevent, blank=1)
643        except ValueError as x:
644            if str(x) != "Unexpected end of stream.":
645                raise
646        consumer.end_database_report()
647
648    def _scan_parameters(self, uhandle, consumer):
649        # Matrix: BLOSUM62
650        # Gap Penalties: Existence: 11, Extension: 1
651        # Number of Hits to DB: 50604
652        # Number of Sequences: 1323
653        # Number of extensions: 1526
654        # Number of successful extensions: 6
655        # Number of sequences better than 10.0: 5
656        # Number of HSP's better than 10.0 without gapping: 5
657        # Number of HSP's successfully gapped in prelim test: 0
658        # Number of HSP's that attempted gapping in prelim test: 1
659        # Number of HSP's gapped (non-prelim): 5
660        # length of query: 140
661        # length of database: 223,339
662        # effective HSP length: 39
663        # effective length of query: 101
664        # effective length of database: 171,742
665        # effective search space: 17345942
666        # effective search space used: 17345942
667        # T: 11
668        # A: 40
669        # X1: 16 ( 7.4 bits)
670        # X2: 38 (14.8 bits)
671        # X3: 64 (24.9 bits)
672        # S1: 41 (21.9 bits)
673        # S2: 42 (20.8 bits)
674        ##########################################
675        # Or, more recently Blast(x) 2.2.15 gives
676        ##########################################
677        # Matrix: BLOSUM62
678        # Gap Penalties: Existence: 11, Extension: 1
679        # Number of Sequences: 4535438
680        # Number of Hits to DB: 2,588,844,100
681        # Number of extensions: 60427286
682        # Number of successful extensions: 126433
683        # Number of sequences better than  2.0: 30
684        # Number of HSP's gapped: 126387
685        # Number of HSP's successfully gapped: 35
686        # Length of query: 291
687        # Length of database: 1,573,298,872
688        # Length adjustment: 130
689        # Effective length of query: 161
690        # Effective length of database: 983,691,932
691        # Effective search space: 158374401052
692        # Effective search space used: 158374401052
693        # Neighboring words threshold: 12
694        # Window for multiple hits: 40
695        # X1: 16 ( 7.3 bits)
696        # X2: 38 (14.6 bits)
697        # X3: 64 (24.7 bits)
698        # S1: 41 (21.7 bits)
699        # S2: 32 (16.9 bits)
700
701        # Blast 2.2.4 can sometimes skip the whole parameter section.
702        # BLAT also skips the whole parameter section.
703        # Thus, check to make sure that the parameter section really
704        # exists.
705        if not uhandle.peekline().strip():
706            return
707
708        # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and
709        # "Number of Sequences" lines.
710        consumer.start_parameters()
711
712        # Matrix line may be missing in BLASTN 2.2.9
713        attempt_read_and_call(uhandle, consumer.matrix, start="Matrix")
714        # not TBLASTX
715        attempt_read_and_call(uhandle, consumer.gap_penalties, start="Gap")
716        attempt_read_and_call(
717            uhandle, consumer.num_sequences, start="Number of Sequences"
718        )
719        attempt_read_and_call(uhandle, consumer.num_hits, start="Number of Hits")
720        attempt_read_and_call(
721            uhandle, consumer.num_sequences, start="Number of Sequences"
722        )
723        attempt_read_and_call(
724            uhandle, consumer.num_extends, start="Number of extensions"
725        )
726        attempt_read_and_call(
727            uhandle, consumer.num_good_extends, start="Number of successful"
728        )
729        attempt_read_and_call(
730            uhandle, consumer.num_seqs_better_e, start="Number of sequences"
731        )
732
733        # not BLASTN, TBLASTX
734        if attempt_read_and_call(
735            uhandle, consumer.hsps_no_gap, start="Number of HSP's better"
736        ):
737            # BLASTN 2.2.9
738            if attempt_read_and_call(
739                uhandle, consumer.noevent, start="Number of HSP's gapped:"
740            ):
741                read_and_call(
742                    uhandle, consumer.noevent, start="Number of HSP's successfully"
743                )
744                # This is omitted in 2.2.15
745                attempt_read_and_call(
746                    uhandle, consumer.noevent, start="Number of extra gapped extensions"
747                )
748            else:
749                read_and_call(
750                    uhandle,
751                    consumer.hsps_prelim_gapped,
752                    start="Number of HSP's successfully",
753                )
754                read_and_call(
755                    uhandle,
756                    consumer.hsps_prelim_gap_attempted,
757                    start="Number of HSP's that",
758                )
759                read_and_call(
760                    uhandle, consumer.hsps_gapped, start="Number of HSP's gapped"
761                )
762        # e.g. BLASTX 2.2.15 where the "better" line is missing
763        elif attempt_read_and_call(
764            uhandle, consumer.noevent, start="Number of HSP's gapped"
765        ):
766            read_and_call(
767                uhandle, consumer.noevent, start="Number of HSP's successfully"
768            )
769
770        # not in blastx 2.2.1
771        attempt_read_and_call(
772            uhandle, consumer.query_length, has_re=re.compile(r"[Ll]ength of query")
773        )
774        # Not in BLASTX 2.2.22+
775        attempt_read_and_call(
776            uhandle,
777            consumer.database_length,
778            has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"),
779        )
780
781        # BLASTN 2.2.9
782        attempt_read_and_call(uhandle, consumer.noevent, start="Length adjustment")
783        attempt_read_and_call(
784            uhandle, consumer.effective_hsp_length, start="effective HSP"
785        )
786        # Not in blastx 2.2.1
787        attempt_read_and_call(
788            uhandle,
789            consumer.effective_query_length,
790            has_re=re.compile(r"[Ee]ffective length of query"),
791        )
792
793        # This is not in BLASTP 2.2.15
794        attempt_read_and_call(
795            uhandle,
796            consumer.effective_database_length,
797            has_re=re.compile(r"[Ee]ffective length of \s*[Dd]atabase"),
798        )
799        # Not in blastx 2.2.1, added a ':' to distinguish between
800        # this and the 'effective search space used' line
801        attempt_read_and_call(
802            uhandle,
803            consumer.effective_search_space,
804            has_re=re.compile(r"[Ee]ffective search space:"),
805        )
806        # Does not appear in BLASTP 2.0.5
807        attempt_read_and_call(
808            uhandle,
809            consumer.effective_search_space_used,
810            has_re=re.compile(r"[Ee]ffective search space used"),
811        )
812
813        # BLASTX, TBLASTN, TBLASTX
814        attempt_read_and_call(uhandle, consumer.frameshift, start="frameshift")
815
816        # not in BLASTN 2.2.9
817        attempt_read_and_call(uhandle, consumer.threshold, start="T")
818        # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12"
819        attempt_read_and_call(
820            uhandle, consumer.threshold, start="Neighboring words threshold"
821        )
822
823        # not in BLASTX 2.2.15
824        attempt_read_and_call(uhandle, consumer.window_size, start="A")
825        # get this instead: "Window for multiple hits: 40"
826        attempt_read_and_call(
827            uhandle, consumer.window_size, start="Window for multiple hits"
828        )
829
830        # not in BLASTX 2.2.22+
831        attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start="X1")
832        # not TBLASTN
833        attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start="X2")
834
835        # not BLASTN, TBLASTX
836        attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, start="X3")
837
838        # not TBLASTN
839        attempt_read_and_call(uhandle, consumer.gap_trigger, start="S1")
840        # not in blastx 2.2.1
841        # first we make sure we have additional lines to work with, if
842        # not then the file is done and we don't have a final S2
843        if not is_blank_line(uhandle.peekline(), allow_spaces=1):
844            read_and_call(uhandle, consumer.blast_cutoff, start="S2")
845
846        consumer.end_parameters()
847
848
849class BlastParser(AbstractParser):
850    """Parses BLAST data into a Record.Blast object."""
851
852    def __init__(self):
853        """Initialize the class."""
854        self._scanner = _Scanner()
855        self._consumer = _BlastConsumer()
856
857    def parse(self, handle):
858        """Parse BLAST handle into a Record.Blast object."""
859        self._scanner.feed(handle, self._consumer)
860        return self._consumer.data
861
862
863class PSIBlastParser(AbstractParser):
864    """Parses BLAST data into a Record.PSIBlast object."""
865
866    def __init__(self):
867        """Initialize the class."""
868        self._scanner = _Scanner()
869        self._consumer = _PSIBlastConsumer()
870
871    def parse(self, handle):
872        """Parse BLAST handle into a Record.PSIBlast object."""
873        self._scanner.feed(handle, self._consumer)
874        return self._consumer.data
875
876
877class _HeaderConsumer:
878    def start_header(self):
879        self._header = Record.Header()
880
881    def version(self, line):
882        c = line.split()
883        self._header.application = c[0]
884        self._header.version = c[1]
885        if len(c) > 2:
886            # The date is missing in the new C++ output from blastx 2.2.22+
887            # Just get "BLASTX 2.2.22+\n" and that's all.
888            self._header.date = c[2][1:-1]
889
890    def reference(self, line):
891        if line.startswith("Reference: "):
892            self._header.reference = line[11:]
893        else:
894            self._header.reference += line
895
896    def query_info(self, line):
897        if line.startswith("Query= "):
898            self._header.query = line[7:].lstrip()
899        elif line.startswith("Length="):
900            # New style way to give the query length in BLAST 2.2.22+ (the C++ code)
901            self._header.query_letters = _safe_int(line[7:].strip())
902        elif not line.startswith("       "):  # continuation of query_info
903            self._header.query = "%s%s" % (self._header.query, line)
904        else:
905            # Hope it is the old style way to give the query length:
906            (letters,) = _re_search(
907                r"([0-9,]+) letters",
908                line,
909                "I could not find the number of letters in line\n%s" % line,
910            )
911            self._header.query_letters = _safe_int(letters)
912
913    def database_info(self, line):
914        line = line.rstrip()
915        if line.startswith("Database: "):
916            self._header.database = line[10:]
917        elif not line.endswith("total letters"):
918            if self._header.database:
919                # Need to include a space when merging multi line datase descr
920                self._header.database += " " + line.strip()
921            else:
922                self._header.database = line.strip()
923        else:
924            sequences, letters = _re_search(
925                r"([0-9,]+) sequences; ([0-9,-]+) total letters",
926                line,
927                "I could not find the sequences and letters in line\n%s" % line,
928            )
929            self._header.database_sequences = _safe_int(sequences)
930            self._header.database_letters = _safe_int(letters)
931
932    def end_header(self):
933        # Get rid of the trailing newlines
934        self._header.reference = self._header.reference.rstrip()
935        self._header.query = self._header.query.rstrip()
936
937
938class _DescriptionConsumer:
939    def start_descriptions(self):
940        self._descriptions = []
941        self._model_sequences = []
942        self._nonmodel_sequences = []
943        self._converged = 0
944        self._type = None
945        self._roundnum = None
946
947        self.__has_n = 0  # Does the description line contain an N value?
948
949    def description_header(self, line):
950        if line.startswith("Sequences producing"):
951            cols = line.split()
952            if cols[-1] == "N":
953                self.__has_n = 1
954
955    def description(self, line):
956        dh = self._parse(line)
957        if self._type == "model":
958            self._model_sequences.append(dh)
959        elif self._type == "nonmodel":
960            self._nonmodel_sequences.append(dh)
961        else:
962            self._descriptions.append(dh)
963
964    def model_sequences(self, line):
965        self._type = "model"
966
967    def nonmodel_sequences(self, line):
968        self._type = "nonmodel"
969
970    def converged(self, line):
971        self._converged = 1
972
973    def no_hits(self, line):
974        pass
975
976    def round(self, line):
977        if not line.startswith("Results from round"):
978            raise ValueError("I didn't understand the round line\n%s" % line)
979        self._roundnum = _safe_int(line[18:].strip())
980
981    def end_descriptions(self):
982        pass
983
984    def _parse(self, description_line):
985        line = description_line  # for convenience
986        dh = Record.Description()
987
988        # I need to separate the score and p-value from the title.
989        # sp|P21297|FLBT_CAUCR FLBT PROTEIN     [snip]         284  7e-77
990        # sp|P21297|FLBT_CAUCR FLBT PROTEIN     [snip]         284  7e-77  1
991        # special cases to handle:
992        #   - title must be preserved exactly (including whitespaces)
993        #   - score could be equal to e-value (not likely, but what if??)
994        #   - sometimes there's an "N" score of '1'.
995        cols = line.split()
996        if len(cols) < 3:
997            raise ValueError("Line does not appear to contain description:\n%s" % line)
998        if self.__has_n:
999            i = line.rfind(cols[-1])  # find start of N
1000            i = line.rfind(cols[-2], 0, i)  # find start of p-value
1001            i = line.rfind(cols[-3], 0, i)  # find start of score
1002        else:
1003            i = line.rfind(cols[-1])  # find start of p-value
1004            i = line.rfind(cols[-2], 0, i)  # find start of score
1005        if self.__has_n:
1006            dh.title, dh.score, dh.e, dh.num_alignments = (
1007                line[:i].rstrip(),
1008                cols[-3],
1009                cols[-2],
1010                cols[-1],
1011            )
1012        else:
1013            dh.title, dh.score, dh.e, dh.num_alignments = (
1014                line[:i].rstrip(),
1015                cols[-2],
1016                cols[-1],
1017                1,
1018            )
1019        dh.num_alignments = _safe_int(dh.num_alignments)
1020        dh.score = _safe_int(dh.score)
1021        dh.e = _safe_float(dh.e)
1022        return dh
1023
1024
1025class _AlignmentConsumer:
1026    # This is a little bit tricky.  An alignment can either be a
1027    # pairwise alignment or a multiple alignment.  Since it's difficult
1028    # to know a-priori which one the blast record will contain, I'm going
1029    # to make one class that can parse both of them.
1030    def start_alignment(self):
1031        self._alignment = Record.Alignment()
1032        self._multiple_alignment = Record.MultipleAlignment()
1033
1034    def title(self, line):
1035        if self._alignment.title:
1036            self._alignment.title += " "
1037        self._alignment.title += line.strip()
1038
1039    def length(self, line):
1040        # e.g. "Length = 81" or more recently, "Length=428"
1041        parts = line.replace(" ", "").split("=")
1042        if len(parts) != 2:
1043            raise ValueError("Unrecognised format length line: %r" % line)
1044        self._alignment.length = parts[1]
1045        self._alignment.length = _safe_int(self._alignment.length)
1046
1047    def multalign(self, line):
1048        # Standalone version uses 'QUERY', while WWW version uses blast_tmp.
1049        if line.startswith("QUERY") or line.startswith("blast_tmp"):
1050            # If this is the first line of the multiple alignment,
1051            # then I need to figure out how the line is formatted.
1052
1053            # Format of line is:
1054            # QUERY 1   acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60
1055            try:
1056                name, start, seq, end = line.split()
1057            except ValueError:
1058                raise ValueError("I do not understand the line\n%s" % line) from None
1059            self._start_index = line.index(start, len(name))
1060            self._seq_index = line.index(seq, self._start_index + len(start))
1061            # subtract 1 for the space
1062            self._name_length = self._start_index - 1
1063            self._start_length = self._seq_index - self._start_index - 1
1064            self._seq_length = line.rfind(end) - self._seq_index - 1
1065
1066            # self._seq_index = line.index(seq)
1067            # # subtract 1 for the space
1068            # self._seq_length = line.rfind(end) - self._seq_index - 1
1069            # self._start_index = line.index(start)
1070            # self._start_length = self._seq_index - self._start_index - 1
1071            # self._name_length = self._start_index
1072
1073        # Extract the information from the line
1074        name = line[: self._name_length]
1075        name = name.rstrip()
1076        start = line[self._start_index : self._start_index + self._start_length]
1077        start = start.rstrip()
1078        if start:
1079            start = _safe_int(start)
1080        end = line[self._seq_index + self._seq_length :].rstrip()
1081        if end:
1082            end = _safe_int(end)
1083        seq = line[self._seq_index : self._seq_index + self._seq_length].rstrip()
1084        # right pad the sequence with spaces if necessary
1085        if len(seq) < self._seq_length:
1086            seq += " " * (self._seq_length - len(seq))
1087
1088        # I need to make sure the sequence is aligned correctly with the query.
1089        # First, I will find the length of the query.  Then, if necessary,
1090        # I will pad my current sequence with spaces so that they will line
1091        # up correctly.
1092
1093        # Two possible things can happen:
1094        # QUERY
1095        # 504
1096        #
1097        # QUERY
1098        # 403
1099        #
1100        # Sequence 504 will need padding at the end.  Since I won't know
1101        # this until the end of the alignment, this will be handled in
1102        # end_alignment.
1103        # Sequence 403 will need padding before being added to the alignment.
1104
1105        align = self._multiple_alignment.alignment  # for convenience
1106        align.append((name, start, seq, end))
1107
1108        # This is old code that tried to line up all the sequences
1109        # in a multiple alignment by using the sequence title's as
1110        # identifiers.  The problem with this is that BLAST assigns
1111        # different HSP's from the same sequence the same id.  Thus,
1112        # in one alignment block, there may be multiple sequences with
1113        # the same id.  I'm not sure how to handle this, so I'm not
1114        # going to.
1115
1116        # # If the sequence is the query, then just add it.
1117        # if name == 'QUERY':
1118        #     if len(align) == 0:
1119        #         align.append((name, start, seq))
1120        #     else:
1121        #         aname, astart, aseq = align[0]
1122        #         if name != aname:
1123        #             raise ValueError, "Query is not the first sequence"
1124        #         aseq = aseq + seq
1125        #         align[0] = aname, astart, aseq
1126        # else:
1127        #     if len(align) == 0:
1128        #         raise ValueError, "I could not find the query sequence"
1129        #     qname, qstart, qseq = align[0]
1130        #
1131        #     # Now find my sequence in the multiple alignment.
1132        #     for i in range(1, len(align)):
1133        #         aname, astart, aseq = align[i]
1134        #         if name == aname:
1135        #             index = i
1136        #             break
1137        #     else:
1138        #         # If I couldn't find it, then add a new one.
1139        #         align.append((None, None, None))
1140        #         index = len(align)-1
1141        #         # Make sure to left-pad it.
1142        #         aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq))
1143        #
1144        #     if len(qseq) != len(aseq) + len(seq):
1145        #         # If my sequences are shorter than the query sequence,
1146        #         # then I will need to pad some spaces to make them line up.
1147        #         # Since I've already right padded seq, that means aseq
1148        #         # must be too short.
1149        #         aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq))
1150        #     aseq = aseq + seq
1151        #     if astart is None:
1152        #         astart = start
1153        #     align[index] = aname, astart, aseq
1154
1155    def end_alignment(self):
1156        # Remove trailing newlines
1157        if self._alignment:
1158            self._alignment.title = self._alignment.title.rstrip()
1159
1160        # This code is also obsolete.  See note above.
1161        # If there's a multiple alignment, I will need to make sure
1162        # all the sequences are aligned.  That is, I may need to
1163        # right-pad the sequences.
1164        # if self._multiple_alignment is not None:
1165        #     align = self._multiple_alignment.alignment
1166        #     seqlen = None
1167        #     for i in range(len(align)):
1168        #         name, start, seq = align[i]
1169        #         if seqlen is None:
1170        #             seqlen = len(seq)
1171        #         else:
1172        #             if len(seq) < seqlen:
1173        #                 seq = seq + ' '*(seqlen - len(seq))
1174        #                 align[i] = name, start, seq
1175        #             elif len(seq) > seqlen:
1176        #                 raise ValueError, \
1177        #                       "Sequence %s is longer than the query" % name
1178
1179        # Clean up some variables, if they exist.
1180        try:
1181            del self._seq_index
1182            del self._seq_length
1183            del self._start_index
1184            del self._start_length
1185            del self._name_length
1186        except AttributeError:
1187            pass
1188
1189
1190class _HSPConsumer:
1191    def start_hsp(self):
1192        self._hsp = Record.HSP()
1193
1194    def score(self, line):
1195        self._hsp.bits, self._hsp.score = _re_search(
1196            r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)",
1197            line,
1198            "I could not find the score in line\n%s" % line,
1199        )
1200        self._hsp.score = _safe_float(self._hsp.score)
1201        self._hsp.bits = _safe_float(self._hsp.bits)
1202
1203        x, y = _re_search(
1204            r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)",
1205            line,
1206            "I could not find the expect in line\n%s" % line,
1207        )
1208        if x:
1209            self._hsp.num_alignments = _safe_int(x)
1210        else:
1211            self._hsp.num_alignments = 1
1212        self._hsp.expect = _safe_float(y)
1213
1214    def identities(self, line):
1215        x, y = _re_search(
1216            r"Identities = (\d+)\/(\d+)",
1217            line,
1218            "I could not find the identities in line\n%s" % line,
1219        )
1220        self._hsp.identities = _safe_int(x), _safe_int(y)
1221        self._hsp.align_length = _safe_int(y)
1222
1223        if "Positives" in line:
1224            x, y = _re_search(
1225                r"Positives = (\d+)\/(\d+)",
1226                line,
1227                "I could not find the positives in line\n%s" % line,
1228            )
1229            self._hsp.positives = _safe_int(x), _safe_int(y)
1230            assert self._hsp.align_length == _safe_int(y)
1231
1232        if "Gaps" in line:
1233            x, y = _re_search(
1234                r"Gaps = (\d+)\/(\d+)",
1235                line,
1236                "I could not find the gaps in line\n%s" % line,
1237            )
1238            self._hsp.gaps = _safe_int(x), _safe_int(y)
1239            assert self._hsp.align_length == _safe_int(y)
1240
1241    def strand(self, line):
1242        self._hsp.strand = _re_search(
1243            r"Strand\s?=\s?(\w+)\s?/\s?(\w+)",
1244            line,
1245            "I could not find the strand in line\n%s" % line,
1246        )
1247
1248    def frame(self, line):
1249        # Frame can be in formats:
1250        # Frame = +1
1251        # Frame = +2 / +2
1252        if "/" in line:
1253            self._hsp.frame = _re_search(
1254                r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])",
1255                line,
1256                "I could not find the frame in line\n%s" % line,
1257            )
1258        else:
1259            self._hsp.frame = _re_search(
1260                r"Frame = ([-+][123])",
1261                line,
1262                "I could not find the frame in line\n%s" % line,
1263            )
1264
1265    # Match a space, if one is available.  Masahir Ishikawa found a
1266    # case where there's no space between the start and the sequence:
1267    # Query: 100tt 101
1268    # line below modified by Yair Benita, Sep 2004
1269    # Note that the colon is not always present. 2006
1270    _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1271
1272    def query(self, line):
1273        m = self._query_re.search(line)
1274        if m is None:
1275            if (
1276                line.strip()
1277                == "Query        ------------------------------------------------------------"
1278            ):
1279                # Special case - long gap relative to the subject,
1280                # note there is no start/end present, cannot update those
1281                self._hsp.query += "-" * 60
1282                self._query_len = 60  # number of dashes
1283                self._query_start_index = 13  # offset of first dash
1284                return
1285            raise ValueError("I could not find the query in line\n%s" % line)
1286
1287        # line below modified by Yair Benita, Sep 2004.
1288        # added the end attribute for the query
1289        colon, start, seq, end = m.groups()
1290        seq = seq.strip()
1291        self._hsp.query += seq
1292        if self._hsp.query_start is None:
1293            self._hsp.query_start = _safe_int(start)
1294
1295        # line below added by Yair Benita, Sep 2004.
1296        # added the end attribute for the query
1297        self._hsp.query_end = _safe_int(end)
1298
1299        # Get index for sequence start (regular expression element 3)
1300        self._query_start_index = m.start(3)
1301        self._query_len = len(seq)
1302
1303    def align(self, line):
1304        seq = line[self._query_start_index :].rstrip()
1305        if len(seq) < self._query_len:
1306            # Make sure the alignment is the same length as the query
1307            seq += " " * (self._query_len - len(seq))
1308            if len(seq) < self._query_len:
1309                raise ValueError("Match is longer than the query in line\n%s" % line)
1310        self._hsp.match += seq
1311
1312    # To match how we do the query, cache the regular expression.
1313    # Note that the colon is not always present.
1314    _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1315
1316    def sbjct(self, line):
1317        m = self._sbjct_re.search(line)
1318        if m is None:
1319            raise ValueError("I could not find the sbjct in line\n%s" % line)
1320        colon, start, seq, end = m.groups()
1321        # mikep 26/9/00
1322        # On occasion, there is a blast hit with no subject match
1323        # so far, it only occurs with 1-line short "matches"
1324        # I have decided to let these pass as they appear
1325        if not seq.strip():
1326            seq = " " * self._query_len
1327        else:
1328            seq = seq.strip()
1329        self._hsp.sbjct += seq
1330        if self._hsp.sbjct_start is None:
1331            self._hsp.sbjct_start = _safe_int(start)
1332
1333        self._hsp.sbjct_end = _safe_int(end)
1334        if len(seq) != self._query_len:
1335            raise ValueError(
1336                "QUERY and SBJCT sequence lengths don't match (%i %r vs %i) in line\n%s"
1337                % (self._query_len, self._hsp.query, len(seq), line)
1338            )
1339
1340        del self._query_start_index  # clean up unused variables
1341        del self._query_len
1342
1343    def end_hsp(self):
1344        pass
1345
1346
1347class _DatabaseReportConsumer:
1348    def start_database_report(self):
1349        self._dr = Record.DatabaseReport()
1350
1351    def database(self, line):
1352        m = re.search(r"Database: (.+)$", line)
1353        if m:
1354            self._dr.database_name.append(m.group(1))
1355        elif self._dr.database_name:
1356            # This must be a continuation of the previous name.
1357            self._dr.database_name[-1] = "%s%s" % (
1358                self._dr.database_name[-1],
1359                line.strip(),
1360            )
1361
1362    def posted_date(self, line):
1363        self._dr.posted_date.append(
1364            _re_search(
1365                r"Posted date:\s*(.+)$",
1366                line,
1367                "I could not find the posted date in line\n%s" % line,
1368            )
1369        )
1370
1371    def num_letters_in_database(self, line):
1372        (letters,) = _get_cols(
1373            line, (-1,), ncols=6, expected={2: "letters", 4: "database:"}
1374        )
1375        self._dr.num_letters_in_database.append(_safe_int(letters))
1376
1377    def num_sequences_in_database(self, line):
1378        (sequences,) = _get_cols(
1379            line, (-1,), ncols=6, expected={2: "sequences", 4: "database:"}
1380        )
1381        self._dr.num_sequences_in_database.append(_safe_int(sequences))
1382
1383    def ka_params(self, line):
1384        self._dr.ka_params = [_safe_float(x) for x in line.split()]
1385
1386    def gapped(self, line):
1387        self._dr.gapped = 1
1388
1389    def ka_params_gap(self, line):
1390        self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
1391
1392    def end_database_report(self):
1393        pass
1394
1395
1396class _ParametersConsumer:
1397    def start_parameters(self):
1398        self._params = Record.Parameters()
1399
1400    def matrix(self, line):
1401        self._params.matrix = line[8:].rstrip()
1402
1403    def gap_penalties(self, line):
1404        self._params.gap_penalties = [
1405            _safe_float(x)
1406            for x in _get_cols(
1407                line, (3, 5), ncols=6, expected={2: "Existence:", 4: "Extension:"}
1408            )
1409        ]
1410
1411    def num_hits(self, line):
1412        if "1st pass" in line:
1413            (x,) = _get_cols(line, (-4,), ncols=11, expected={2: "Hits"})
1414            self._params.num_hits = _safe_int(x)
1415        else:
1416            (x,) = _get_cols(line, (-1,), ncols=6, expected={2: "Hits"})
1417            self._params.num_hits = _safe_int(x)
1418
1419    def num_sequences(self, line):
1420        if "1st pass" in line:
1421            (x,) = _get_cols(line, (-4,), ncols=9, expected={2: "Sequences:"})
1422            self._params.num_sequences = _safe_int(x)
1423        else:
1424            (x,) = _get_cols(line, (-1,), ncols=4, expected={2: "Sequences:"})
1425            self._params.num_sequences = _safe_int(x)
1426
1427    def num_extends(self, line):
1428        if "1st pass" in line:
1429            (x,) = _get_cols(line, (-4,), ncols=9, expected={2: "extensions:"})
1430            self._params.num_extends = _safe_int(x)
1431        else:
1432            (x,) = _get_cols(line, (-1,), ncols=4, expected={2: "extensions:"})
1433            self._params.num_extends = _safe_int(x)
1434
1435    def num_good_extends(self, line):
1436        if "1st pass" in line:
1437            (x,) = _get_cols(line, (-4,), ncols=10, expected={3: "extensions:"})
1438            self._params.num_good_extends = _safe_int(x)
1439        else:
1440            (x,) = _get_cols(line, (-1,), ncols=5, expected={3: "extensions:"})
1441            self._params.num_good_extends = _safe_int(x)
1442
1443    def num_seqs_better_e(self, line):
1444        (self._params.num_seqs_better_e,) = _get_cols(
1445            line, (-1,), ncols=7, expected={2: "sequences"}
1446        )
1447        self._params.num_seqs_better_e = _safe_int(self._params.num_seqs_better_e)
1448
1449    def hsps_no_gap(self, line):
1450        (self._params.hsps_no_gap,) = _get_cols(
1451            line, (-1,), ncols=9, expected={3: "better", 7: "gapping:"}
1452        )
1453        self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1454
1455    def hsps_prelim_gapped(self, line):
1456        (self._params.hsps_prelim_gapped,) = _get_cols(
1457            line, (-1,), ncols=9, expected={4: "gapped", 6: "prelim"}
1458        )
1459        self._params.hsps_prelim_gapped = _safe_int(self._params.hsps_prelim_gapped)
1460
1461    def hsps_prelim_gapped_attempted(self, line):
1462        (self._params.hsps_prelim_gapped_attempted,) = _get_cols(
1463            line, (-1,), ncols=10, expected={4: "attempted", 7: "prelim"}
1464        )
1465        self._params.hsps_prelim_gapped_attempted = _safe_int(
1466            self._params.hsps_prelim_gapped_attempted
1467        )
1468
1469    def hsps_gapped(self, line):
1470        (self._params.hsps_gapped,) = _get_cols(
1471            line, (-1,), ncols=6, expected={3: "gapped"}
1472        )
1473        self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1474
1475    def query_length(self, line):
1476        (self._params.query_length,) = _get_cols(
1477            line.lower(), (-1,), ncols=4, expected={0: "length", 2: "query:"}
1478        )
1479        self._params.query_length = _safe_int(self._params.query_length)
1480
1481    def database_length(self, line):
1482        (self._params.database_length,) = _get_cols(
1483            line.lower(), (-1,), ncols=4, expected={0: "length", 2: "database:"}
1484        )
1485        self._params.database_length = _safe_int(self._params.database_length)
1486
1487    def effective_hsp_length(self, line):
1488        (self._params.effective_hsp_length,) = _get_cols(
1489            line, (-1,), ncols=4, expected={1: "HSP", 2: "length:"}
1490        )
1491        self._params.effective_hsp_length = _safe_int(self._params.effective_hsp_length)
1492
1493    def effective_query_length(self, line):
1494        (self._params.effective_query_length,) = _get_cols(
1495            line, (-1,), ncols=5, expected={1: "length", 3: "query:"}
1496        )
1497        self._params.effective_query_length = _safe_int(
1498            self._params.effective_query_length
1499        )
1500
1501    def effective_database_length(self, line):
1502        (self._params.effective_database_length,) = _get_cols(
1503            line.lower(), (-1,), ncols=5, expected={1: "length", 3: "database:"}
1504        )
1505        self._params.effective_database_length = _safe_int(
1506            self._params.effective_database_length
1507        )
1508
1509    def effective_search_space(self, line):
1510        (self._params.effective_search_space,) = _get_cols(
1511            line, (-1,), ncols=4, expected={1: "search"}
1512        )
1513        self._params.effective_search_space = _safe_int(
1514            self._params.effective_search_space
1515        )
1516
1517    def effective_search_space_used(self, line):
1518        (self._params.effective_search_space_used,) = _get_cols(
1519            line, (-1,), ncols=5, expected={1: "search", 3: "used:"}
1520        )
1521        self._params.effective_search_space_used = _safe_int(
1522            self._params.effective_search_space_used
1523        )
1524
1525    def frameshift(self, line):
1526        self._params.frameshift = _get_cols(
1527            line, (4, 5), ncols=6, expected={0: "frameshift", 2: "decay"}
1528        )
1529
1530    def threshold(self, line):
1531        if line[:2] == "T:":
1532            # Assume its an old style line like "T: 123"
1533            (self._params.threshold,) = _get_cols(
1534                line, (1,), ncols=2, expected={0: "T:"}
1535            )
1536        elif line[:28] == "Neighboring words threshold:":
1537            (self._params.threshold,) = _get_cols(
1538                line,
1539                (3,),
1540                ncols=4,
1541                expected={0: "Neighboring", 1: "words", 2: "threshold:"},
1542            )
1543        else:
1544            raise ValueError("Unrecognised threshold line:\n%s" % line)
1545        self._params.threshold = _safe_int(self._params.threshold)
1546
1547    def window_size(self, line):
1548        if line[:2] == "A:":
1549            (self._params.window_size,) = _get_cols(
1550                line, (1,), ncols=2, expected={0: "A:"}
1551            )
1552        elif line[:25] == "Window for multiple hits:":
1553            (self._params.window_size,) = _get_cols(
1554                line, (4,), ncols=5, expected={0: "Window", 2: "multiple", 3: "hits:"}
1555            )
1556        else:
1557            raise ValueError("Unrecognised window size line:\n%s" % line)
1558        self._params.window_size = _safe_int(self._params.window_size)
1559
1560    def dropoff_1st_pass(self, line):
1561        score, bits = _re_search(
1562            r"X1: (\d+) \(\s*([0-9,.]+) bits\)",
1563            line,
1564            "I could not find the dropoff in line\n%s" % line,
1565        )
1566        self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1567
1568    def gap_x_dropoff(self, line):
1569        score, bits = _re_search(
1570            r"X2: (\d+) \(\s*([0-9,.]+) bits\)",
1571            line,
1572            "I could not find the gap dropoff in line\n%s" % line,
1573        )
1574        self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1575
1576    def gap_x_dropoff_final(self, line):
1577        score, bits = _re_search(
1578            r"X3: (\d+) \(\s*([0-9,.]+) bits\)",
1579            line,
1580            "I could not find the gap dropoff final in line\n%s" % line,
1581        )
1582        self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1583
1584    def gap_trigger(self, line):
1585        score, bits = _re_search(
1586            r"S1: (\d+) \(\s*([0-9,.]+) bits\)",
1587            line,
1588            "I could not find the gap trigger in line\n%s" % line,
1589        )
1590        self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1591
1592    def blast_cutoff(self, line):
1593        score, bits = _re_search(
1594            r"S2: (\d+) \(\s*([0-9,.]+) bits\)",
1595            line,
1596            "I could not find the blast cutoff in line\n%s" % line,
1597        )
1598        self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1599
1600    def end_parameters(self):
1601        pass
1602
1603
1604class _BlastConsumer(
1605    AbstractConsumer,
1606    _HeaderConsumer,
1607    _DescriptionConsumer,
1608    _AlignmentConsumer,
1609    _HSPConsumer,
1610    _DatabaseReportConsumer,
1611    _ParametersConsumer,
1612):
1613    # This Consumer is inherits from many other consumer classes that handle
1614    # the actual dirty work.  An alternate way to do it is to create objects
1615    # of those classes and then delegate the parsing tasks to them in a
1616    # decorator-type pattern.  The disadvantage of that is that the method
1617    # names will need to be resolved in this classes.  However, using
1618    # a decorator will retain more control in this class (which may or
1619    # may not be a bad thing).  In addition, having each sub-consumer as
1620    # its own object prevents this object's dictionary from being cluttered
1621    # with members and reduces the chance of member collisions.
1622    def __init__(self):
1623        self.data = None
1624
1625    def round(self, line):
1626        # Make sure nobody's trying to pass me PSI-BLAST data!
1627        raise ValueError("This consumer doesn't handle PSI-BLAST data")
1628
1629    def start_header(self):
1630        self.data = Record.Blast()
1631        _HeaderConsumer.start_header(self)
1632
1633    def end_header(self):
1634        _HeaderConsumer.end_header(self)
1635        self.data.__dict__.update(self._header.__dict__)
1636
1637    def end_descriptions(self):
1638        self.data.descriptions = self._descriptions
1639
1640    def end_alignment(self):
1641        _AlignmentConsumer.end_alignment(self)
1642        if self._alignment.hsps:
1643            self.data.alignments.append(self._alignment)
1644        if self._multiple_alignment.alignment:
1645            self.data.multiple_alignment = self._multiple_alignment
1646
1647    def end_hsp(self):
1648        _HSPConsumer.end_hsp(self)
1649        try:
1650            self._alignment.hsps.append(self._hsp)
1651        except AttributeError:
1652            raise ValueError("Found an HSP before an alignment") from None
1653
1654    def end_database_report(self):
1655        _DatabaseReportConsumer.end_database_report(self)
1656        self.data.__dict__.update(self._dr.__dict__)
1657
1658    def end_parameters(self):
1659        _ParametersConsumer.end_parameters(self)
1660        self.data.__dict__.update(self._params.__dict__)
1661
1662
1663class _PSIBlastConsumer(
1664    AbstractConsumer,
1665    _HeaderConsumer,
1666    _DescriptionConsumer,
1667    _AlignmentConsumer,
1668    _HSPConsumer,
1669    _DatabaseReportConsumer,
1670    _ParametersConsumer,
1671):
1672    def __init__(self):
1673        self.data = None
1674
1675    def start_header(self):
1676        self.data = Record.PSIBlast()
1677        _HeaderConsumer.start_header(self)
1678
1679    def end_header(self):
1680        _HeaderConsumer.end_header(self)
1681        self.data.__dict__.update(self._header.__dict__)
1682
1683    def start_descriptions(self):
1684        self._round = Record.Round()
1685        self.data.rounds.append(self._round)
1686        _DescriptionConsumer.start_descriptions(self)
1687
1688    def end_descriptions(self):
1689        _DescriptionConsumer.end_descriptions(self)
1690        self._round.number = self._roundnum
1691        if self._descriptions:
1692            self._round.new_seqs.extend(self._descriptions)
1693        self._round.reused_seqs.extend(self._model_sequences)
1694        self._round.new_seqs.extend(self._nonmodel_sequences)
1695        if self._converged:
1696            self.data.converged = 1
1697
1698    def end_alignment(self):
1699        _AlignmentConsumer.end_alignment(self)
1700        if self._alignment.hsps:
1701            self._round.alignments.append(self._alignment)
1702        if self._multiple_alignment:
1703            self._round.multiple_alignment = self._multiple_alignment
1704
1705    def end_hsp(self):
1706        _HSPConsumer.end_hsp(self)
1707        try:
1708            self._alignment.hsps.append(self._hsp)
1709        except AttributeError:
1710            raise ValueError("Found an HSP before an alignment") from None
1711
1712    def end_database_report(self):
1713        _DatabaseReportConsumer.end_database_report(self)
1714        self.data.__dict__.update(self._dr.__dict__)
1715
1716    def end_parameters(self):
1717        _ParametersConsumer.end_parameters(self)
1718        self.data.__dict__.update(self._params.__dict__)
1719
1720
1721class Iterator:
1722    """Iterates over a file of multiple BLAST results.
1723
1724    Methods:
1725    next   Return the next record from the stream, or None.
1726
1727    """
1728
1729    def __init__(self, handle, parser=None):
1730        """Initialize a new iterator.
1731
1732        Arguments:
1733         - handle is a file-like object.
1734         - parser is an optional Parser object to change the results
1735           into another form.  If set to None, then the raw contents
1736           of the file will be returned.
1737        """
1738        try:
1739            handle.readline
1740        except AttributeError:
1741            raise ValueError(
1742                "I expected a file handle or file-like object, got %s" % type(handle)
1743            ) from None
1744        self._uhandle = UndoHandle(handle)
1745        self._parser = parser
1746        self._header = []
1747
1748    def __next__(self):
1749        """Return the next Blast record from the file.
1750
1751        If no more records, return None.
1752        """
1753        lines = []
1754        query = False
1755        while True:
1756            line = self._uhandle.readline()
1757            if not line:
1758                break
1759            # If I've reached the next one, then put the line back and stop.
1760            if lines and (
1761                line.startswith("BLAST")
1762                or line.startswith("BLAST", 1)
1763                or line.startswith("<?xml ")
1764            ):
1765                self._uhandle.saveline(line)
1766                break
1767            # New style files omit the BLAST line to mark a new query:
1768            if line.startswith("Query="):
1769                if not query:
1770                    if not self._header:
1771                        self._header = lines[:]
1772                    query = True
1773                else:
1774                    # Start of another record
1775                    self._uhandle.saveline(line)
1776                    break
1777            lines.append(line)
1778
1779        if query and "BLAST" not in lines[0]:
1780            # Cheat and re-insert the header
1781            # print("-"*50)
1782            # print("".join(self._header))
1783            # print("-"*50)
1784            # print("".join(lines))
1785            # print("-"*50)
1786            lines = self._header + lines
1787
1788        if not lines:
1789            return None
1790
1791        data = "".join(lines)
1792        if self._parser is not None:
1793            return self._parser.parse(StringIO(data))
1794        return data
1795
1796    def __iter__(self):
1797        return iter(self.__next__, None)
1798
1799
1800def _re_search(regex, line, error_msg):
1801    m = re.search(regex, line)
1802    if not m:
1803        raise ValueError(error_msg)
1804    return m.groups()
1805
1806
1807def _get_cols(line, cols_to_get, ncols=None, expected=None):
1808    if expected is None:
1809        expected = {}
1810    cols = line.split()
1811
1812    # Check to make sure number of columns is correct
1813    if ncols is not None and len(cols) != ncols:
1814        raise ValueError(
1815            "I expected %d columns (got %d) in line\n%s" % (ncols, len(cols), line)
1816        )
1817
1818    # Check to make sure columns contain the correct data
1819    for k in expected:
1820        if cols[k] != expected[k]:
1821            raise ValueError(
1822                "I expected '%s' in column %d in line\n%s" % (expected[k], k, line)
1823            )
1824
1825    # Construct the answer tuple
1826    results = []
1827    for c in cols_to_get:
1828        results.append(cols[c])
1829    return tuple(results)
1830
1831
1832def _safe_int(str):
1833    try:
1834        return int(str)
1835    except ValueError:
1836        # Something went wrong.  Try to clean up the string.
1837        # Remove all commas from the string
1838        str = str.replace(",", "")
1839    # try again after removing commas.
1840    # Note int() will return a long rather than overflow
1841    try:
1842        return int(str)
1843    except ValueError:
1844        pass
1845    # Call float to handle things like "54.3", note could lose precision, e.g.
1846    # >>> int("5399354557888517312")
1847    # 5399354557888517312
1848    # >>> int(float("5399354557888517312"))
1849    # 5399354557888517120
1850    return int(float(str))
1851
1852
1853def _safe_float(str):
1854    # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that
1855    # float('e-172') does not produce an error on his platform.  Thus,
1856    # we need to check the string for this condition.
1857
1858    # Sometimes BLAST leaves of the '1' in front of an exponent.
1859    if str and str[0] in ["E", "e"]:
1860        str = "1" + str
1861    try:
1862        return float(str)
1863    except ValueError:
1864        # Remove all commas from the string
1865        str = str.replace(",", "")
1866    # try again.
1867    return float(str)
1868
1869
1870class _BlastErrorConsumer(_BlastConsumer):
1871    def __init__(self):
1872        _BlastConsumer.__init__(self)
1873
1874    def noevent(self, line):
1875        if "Query must be at least wordsize" in line:
1876            raise ShortQueryBlastError("Query must be at least wordsize")
1877        # Now pass the line back up to the superclass.
1878        method = getattr(
1879            _BlastConsumer, "noevent", _BlastConsumer.__getattr__(self, "noevent")
1880        )
1881        method(line)
1882
1883
1884class BlastErrorParser(AbstractParser):
1885    """Attempt to catch and diagnose BLAST errors while parsing.
1886
1887    This utilizes the BlastParser module but adds an additional layer
1888    of complexity on top of it by attempting to diagnose ValueErrors
1889    that may actually indicate problems during BLAST parsing.
1890
1891    Current BLAST problems this detects are:
1892    - LowQualityBlastError - When BLASTing really low quality sequences
1893    (ie. some GenBank entries which are just short stretches of a single
1894    nucleotide), BLAST will report an error with the sequence and be
1895    unable to search with this. This will lead to a badly formatted
1896    BLAST report that the parsers choke on. The parser will convert the
1897    ValueError to a LowQualityBlastError and attempt to provide useful
1898    information.
1899    """
1900
1901    def __init__(self, bad_report_handle=None):
1902        """Initialize a parser that tries to catch BlastErrors.
1903
1904        Arguments:
1905        - bad_report_handle - An optional argument specifying a handle
1906        where bad reports should be sent. This would allow you to save
1907        all of the bad reports to a file, for instance. If no handle
1908        is specified, the bad reports will not be saved.
1909        """
1910        self._bad_report_handle = bad_report_handle
1911
1912        # self._b_parser = BlastParser()
1913        self._scanner = _Scanner()
1914        self._consumer = _BlastErrorConsumer()
1915
1916    def parse(self, handle):
1917        """Parse a handle, attempting to diagnose errors."""
1918        results = handle.read()
1919
1920        try:
1921            self._scanner.feed(StringIO(results), self._consumer)
1922        except ValueError:
1923            # if we have a bad_report_file, save the info to it first
1924            if self._bad_report_handle:
1925                # send the info to the error handle
1926                self._bad_report_handle.write(results)
1927
1928            # now we want to try and diagnose the error
1929            self._diagnose_error(StringIO(results), self._consumer.data)
1930
1931            # if we got here we can't figure out the problem
1932            # so we should pass along the syntax error we got
1933            raise
1934        return self._consumer.data
1935
1936    def _diagnose_error(self, handle, data_record):
1937        """Attempt to diagnose an error in the passed handle (PRIVATE).
1938
1939        Arguments:
1940        - handle - The handle potentially containing the error
1941        - data_record - The data record partially created by the consumer.
1942        """
1943        line = handle.readline()
1944
1945        while line:
1946            # 'Searchingdone' instead of 'Searching......done' seems
1947            # to indicate a failure to perform the BLAST due to
1948            # low quality sequence
1949            if line.startswith("Searchingdone"):
1950                raise LowQualityBlastError(
1951                    "Blast failure occurred on query: ", data_record.query
1952                )
1953            line = handle.readline()
1954