1# Adapted from Bio.AlignIO.FastaIO copyright 2008-2011 by Peter Cock.
2# Copyright 2012 by Wibowo Arindrarto.
3# All rights reserved.
4# This file is part of the Biopython distribution and governed by your
5# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
6# Please see the LICENSE file that should have been included as part of this
7# package.
8r"""Bio.SearchIO support for Bill Pearson's FASTA tools.
9
10This module adds support for parsing FASTA outputs. FASTA is a suite of
11programs that finds regions of local or global similarity between protein
12or nucleotide sequences, either by searching databases or identifying
13local duplications.
14
15Bio.SearchIO.FastaIO was tested on the following FASTA flavors and versions:
16
17    - flavors: fasta, ssearch, tfastx
18    - versions: 35, 36
19
20Other flavors and/or versions may introduce some bugs. Please file a bug report
21if you see such problems to Biopython's bug tracker.
22
23More information on FASTA are available through these links:
24
25    - Website: http://fasta.bioch.virginia.edu/fasta_www2/fasta_list2.shtml
26    - User guide: http://fasta.bioch.virginia.edu/fasta_www2/fasta_guide.pdf
27
28
29Supported Formats
30=================
31
32Bio.SearchIO.FastaIO supports parsing and indexing FASTA outputs triggered by
33the -m 10 flag. Other formats that mimic other programs (e.g. the BLAST tabular
34format using the -m 8 flag) may be parseable but using SearchIO's other parsers
35(in this case, using the 'blast-tab' parser).
36
37
38fasta-m10
39=========
40
41Note that in FASTA -m 10 outputs, HSPs from different strands are considered to
42be from different hits. They are listed as two separate entries in the hit
43table. FastaIO recognizes this and will group HSPs with the same hit ID into a
44single Hit object, regardless of strand.
45
46FASTA also sometimes output extra sequences adjacent to the HSP match. These
47extra sequences are discarded by FastaIO. Only regions containing the actual
48sequence match are extracted.
49
50The following object attributes are provided:
51
52+-----------------+-------------------------+----------------------------------+
53| Object          | Attribute               | Value                            |
54+=================+=========================+==================================+
55| QueryResult     | description             | query sequence description       |
56|                 +-------------------------+----------------------------------+
57|                 | id                      | query sequence ID                |
58|                 +-------------------------+----------------------------------+
59|                 | program                 | FASTA flavor                     |
60|                 +-------------------------+----------------------------------+
61|                 | seq_len                 | full length of query sequence    |
62|                 +-------------------------+----------------------------------+
63|                 | target                  | target search database           |
64|                 +-------------------------+----------------------------------+
65|                 | version                 | FASTA version                    |
66+-----------------+-------------------------+----------------------------------+
67| Hit             | seq_len                 | full length of the hit sequence  |
68+-----------------+-------------------------+----------------------------------+
69| HSP             | bitscore                | \*_bits line                     |
70|                 +-------------------------+----------------------------------+
71|                 | evalue                  | \*_expect line                   |
72|                 +-------------------------+----------------------------------+
73|                 | ident_pct               | \*_ident line                    |
74|                 +-------------------------+----------------------------------+
75|                 | init1_score             | \*_init1 line                    |
76|                 +-------------------------+----------------------------------+
77|                 | initn_score             | \*_initn line                    |
78|                 +-------------------------+----------------------------------+
79|                 | opt_score               | \*_opt line, \*_s-w opt line     |
80|                 +-------------------------+----------------------------------+
81|                 | pos_pct                 | \*_sim line                      |
82|                 +-------------------------+----------------------------------+
83|                 | sw_score                | \*_score line                    |
84|                 +-------------------------+----------------------------------+
85|                 | z_score                 | \*_z-score line                  |
86+-----------------+-------------------------+----------------------------------+
87| HSPFragment     | aln_annotation          | al_cons block, if present        |
88| (also via HSP)  +-------------------------+----------------------------------+
89|                 | hit                     | hit sequence                     |
90|                 +-------------------------+----------------------------------+
91|                 | hit_end                 | hit sequence end coordinate      |
92|                 +-------------------------+----------------------------------+
93|                 | hit_start               | hit sequence start coordinate    |
94|                 +-------------------------+----------------------------------+
95|                 | hit_strand              | hit sequence strand              |
96|                 +-------------------------+----------------------------------+
97|                 | query                   | query sequence                   |
98|                 +-------------------------+----------------------------------+
99|                 | query_end               | query sequence end coordinate    |
100|                 +-------------------------+----------------------------------+
101|                 | query_start             | query sequence start coordinate  |
102|                 +-------------------------+----------------------------------+
103|                 | query_strand            | query sequence strand            |
104+-----------------+-------------------------+----------------------------------+
105
106"""
107
108import re
109
110from Bio.SearchIO._index import SearchIndexer
111from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
112
113
114__all__ = ("FastaM10Parser", "FastaM10Indexer")
115
116
117# precompile regex patterns
118# regex for program name
119_RE_FLAVS = re.compile(r"t?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search")
120# regex for sequence ID and length ~ deals with both \n and \r\n
121_PTR_ID_DESC_SEQLEN = r">>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)\s*$"
122_RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN)
123_RE_ID_DESC_SEQLEN_IDX = re.compile(_PTR_ID_DESC_SEQLEN.encode())
124# regex for qresult, hit, or hsp attribute value
125_RE_ATTR = re.compile(r"^; [a-z]+(_[ \w-]+):\s+(.*)$")
126# regex for capturing excess start and end sequences in alignments
127_RE_START_EXC = re.compile(r"^-*")
128_RE_END_EXC = re.compile(r"-*$")
129
130# attribute name mappings
131_HSP_ATTR_MAP = {
132    "_initn": ("initn_score", int),
133    "_init1": ("init1_score", int),
134    "_opt": ("opt_score", int),
135    "_s-w opt": ("opt_score", int),
136    "_z-score": ("z_score", float),
137    "_bits": ("bitscore", float),
138    "_expect": ("evalue", float),
139    "_score": ("sw_score", int),
140    "_ident": ("ident_pct", float),
141    "_sim": ("pos_pct", float),
142}
143
144# state flags
145_STATE_NONE = 0
146_STATE_QUERY_BLOCK = 1
147_STATE_HIT_BLOCK = 2
148_STATE_CONS_BLOCK = 3
149
150
151def _set_qresult_hits(qresult, hit_rows=()):
152    """Append Hits without alignments into QueryResults (PRIVATE)."""
153    for hit_row in hit_rows:
154        hit_id, remainder = hit_row.split(" ", 1)
155        # TODO: parse hit and hsp properties properly; by dealing with:
156        #   - any character in the description (brackets, spaces, etc.)
157        #   - possible [f] or [r] presence (for frame info)
158        #   - possible presence of E2() column
159        #   - possible incomplete hit_id due to column length limit
160        # The current method only looks at the Hit ID, none of the things above
161        if hit_id not in qresult:
162            frag = HSPFragment(hit_id, qresult.id)
163            hsp = HSP([frag])
164            hit = Hit([hsp])
165            qresult.append(hit)
166
167    return qresult
168
169
170def _set_hsp_seqs(hsp, parsed, program):
171    """Set HSPs sequences (PRIVATE).
172
173    :param hsp: HSP whose properties will be set
174    :type hsp: HSP
175    :param parsed: parsed values of the HSP attributes
176    :type parsed: dictionary {string: object}
177    :param program: program name
178    :type program: string
179
180    """
181    # get aligned sequences and check if they have equal lengths
182    start = 0
183    for seq_type in ("hit", "query"):
184        if "tfast" not in program:
185            pseq = parsed[seq_type]
186            # adjust start and end coordinates based on the amount of
187            # filler characters
188            start, stop = _get_aln_slice_coords(pseq)
189            start_adj = len(re.search(_RE_START_EXC, pseq["seq"]).group(0))
190            stop_adj = len(re.search(_RE_END_EXC, pseq["seq"]).group(0))
191            start = start + start_adj
192            stop = stop + start_adj - stop_adj
193            parsed[seq_type]["seq"] = pseq["seq"][start:stop]
194    if len(parsed["query"]["seq"]) != len(parsed["hit"]["seq"]):
195        raise ValueError(
196            "Length mismatch: %r %r"
197            % (len(parsed["query"]["seq"]), len(parsed["hit"]["seq"]))
198        )
199    if "similarity" in hsp.aln_annotation:
200        # only using 'start' since FASTA seems to have trimmed the 'excess'
201        # end part
202        hsp.aln_annotation["similarity"] = hsp.aln_annotation["similarity"][start:]
203        # hit or query works equally well here
204        assert len(hsp.aln_annotation["similarity"]) == len(parsed["hit"]["seq"])
205
206    # query and hit sequence types must be the same
207    assert parsed["query"]["_type"] == parsed["hit"]["_type"]
208    type_val = parsed["query"]["_type"]  # hit works fine too
209    molecule_type = "DNA" if type_val == "D" else "protein"
210    setattr(hsp.fragment, "molecule_type", molecule_type)
211
212    for seq_type in ("hit", "query"):
213        # get and set start and end coordinates
214        start = int(parsed[seq_type]["_start"])
215        end = int(parsed[seq_type]["_stop"])
216
217        setattr(hsp.fragment, seq_type + "_start", min(start, end) - 1)
218        setattr(hsp.fragment, seq_type + "_end", max(start, end))
219        # set seq and molecule type
220        setattr(hsp.fragment, seq_type, parsed[seq_type]["seq"])
221
222        if molecule_type != "protein":
223            # get strand from coordinate; start <= end is plus
224            # start > end is minus
225            if start <= end:
226                setattr(hsp.fragment, seq_type + "_strand", 1)
227            else:
228                setattr(hsp.fragment, seq_type + "_strand", -1)
229        else:
230            setattr(hsp.fragment, seq_type + "_strand", 0)
231
232
233def _get_aln_slice_coords(parsed_hsp):
234    """Get HSPs sequences (PRIVATE).
235
236    To get the actual pairwise alignment sequences, we must first
237    translate the un-gapped sequence based coordinates into positions
238    in the gapped sequence (which may have a flanking region shown
239    using leading - characters).  To date, I have never seen any
240    trailing flanking region shown in the m10 file, but the
241    following code should also cope with that.
242
243    Note that this code seems to work fine even when the "sq_offset"
244    entries are present as a result of using the -X command line option.
245    """
246    seq = parsed_hsp["seq"]
247    seq_stripped = seq.strip("-")
248    disp_start = int(parsed_hsp["_display_start"])
249    start = int(parsed_hsp["_start"])
250    stop = int(parsed_hsp["_stop"])
251
252    if start <= stop:
253        start = start - disp_start
254        stop = stop - disp_start + 1
255    else:
256        start = disp_start - start
257        stop = disp_start - stop + 1
258    stop += seq_stripped.count("-")
259    if not (0 <= start and start < stop and stop <= len(seq_stripped)):
260        raise ValueError(
261            "Problem with sequence start/stop,\n%s[%i:%i]\n%s"
262            % (seq, start, stop, parsed_hsp)
263        )
264    return start, stop
265
266
267class FastaM10Parser:
268    """Parser for Bill Pearson's FASTA suite's -m 10 output."""
269
270    def __init__(self, handle, __parse_hit_table=False):
271        """Initialize the class."""
272        self.handle = handle
273        self._preamble = self._parse_preamble()
274
275    def __iter__(self):
276        """Iterate over FastaM10Parser object yields query results."""
277        for qresult in self._parse_qresult():
278            # re-set desc, for hsp query description
279            qresult.description = qresult.description
280            yield qresult
281
282    def _parse_preamble(self):
283        """Parse the Fasta preamble for Fasta flavor and version (PRIVATE)."""
284        preamble = {}
285        while True:
286            line = self.handle.readline()
287            # this should be the line just before the first qresult
288            if line.startswith("Query"):
289                break
290            # try to match for version line
291            elif line.startswith(" version"):
292                preamble["version"] = line.split(" ")[2]
293            else:
294                # try to match for flavor line
295                flav_match = re.match(_RE_FLAVS, line.lower())
296                if flav_match:
297                    preamble["program"] = flav_match.group(0)
298        self.line = line
299
300        return preamble
301
302    def __parse_hit_table(self):
303        """Parse hit table rows."""
304        # parse hit table until we see an empty line
305        hit_rows = []
306        while True:
307            line = self.handle.readline()
308            if (not line) or line.strip():
309                break
310            hit_rows.append("")
311        self.line = line
312        return hit_rows
313
314    def _parse_qresult(self):
315        """Parse query result (PRIVATE)."""
316        # initial qresult value
317        qresult = None
318        hit_rows = []
319        # state values
320        state_QRES_NEW = 1
321        state_QRES_HITTAB = 3
322        state_QRES_CONTENT = 5
323        state_QRES_END = 7
324
325        line = self.line
326
327        while True:
328
329            # one line before the hit table
330            if line.startswith("The best scores are:"):
331                qres_state = state_QRES_HITTAB
332            # the end of a query or the file altogether
333            elif line.strip() == ">>>///" or not line:
334                qres_state = state_QRES_END
335            # the beginning of a new query
336            elif not line.startswith(">>>") and ">>>" in line:
337                qres_state = state_QRES_NEW
338            # the beginning of the query info and its hits + hsps
339            elif line.startswith(">>>") and not line.strip() == ">>><<<":
340                qres_state = state_QRES_CONTENT
341            # default qres mark
342            else:
343                qres_state = None
344
345            if qres_state is not None:
346                if qres_state == state_QRES_HITTAB:
347                    # parse hit table if flag is set
348                    hit_rows = self.__parse_hit_table()
349                    line = self.handle.readline()
350
351                elif qres_state == state_QRES_END:
352                    yield _set_qresult_hits(qresult, hit_rows)
353                    break
354
355                elif qres_state == state_QRES_NEW:
356                    # if qresult is filled, yield it first
357                    if qresult is not None:
358                        yield _set_qresult_hits(qresult, hit_rows)
359                    regx = re.search(_RE_ID_DESC_SEQLEN, line)
360                    query_id = regx.group(1)
361                    seq_len = regx.group(3)
362                    desc = regx.group(2)
363                    qresult = QueryResult(id=query_id)
364                    qresult.seq_len = int(seq_len)
365                    # get target from the next line
366                    line = self.handle.readline()
367                    qresult.target = [x for x in line.split(" ") if x][1].strip()
368                    if desc is not None:
369                        qresult.description = desc
370                    # set values from preamble
371                    for key, value in self._preamble.items():
372                        setattr(qresult, key, value)
373                    line = self.handle.readline()
374
375                elif qres_state == state_QRES_CONTENT:
376                    assert line[3:].startswith(qresult.id), line
377                    for hit, strand in self._parse_hit(query_id):
378                        # HACK: re-set desc, for hsp hit and query description
379                        hit.description = hit.description
380                        hit.query_description = qresult.description
381                        # if hit is not in qresult, append it
382                        if hit.id not in qresult:
383                            qresult.append(hit)
384                        # otherwise, it might be the same hit with a different strand
385                        else:
386                            # make sure strand is different and then append hsp to
387                            # existing hit
388                            for hsp in hit.hsps:
389                                assert strand != hsp.query_strand
390                                qresult[hit.id].append(hsp)
391                    line = self.line
392
393            else:
394                line = self.handle.readline()
395
396        self.line = line
397
398    def _parse_hit(self, query_id):
399        """Parse hit on query identifier (PRIVATE)."""
400        while True:
401            line = self.handle.readline()
402            if line.startswith(">>"):
403                break
404
405        state = _STATE_NONE
406        strand = None
407        hsp_list = []
408        hsp = None
409        parsed_hsp = None
410        hit_desc = None
411        seq_len = None
412        while True:
413            # yield hit if we've reached the start of a new query or
414            # the end of the search
415            self.line = self.handle.readline()
416            if self.line.strip() in [">>><<<", ">>>///"] or (
417                not self.line.startswith(">>>") and ">>>" in self.line
418            ):
419                # append last parsed_hsp['hit']['seq'] line
420                if state == _STATE_HIT_BLOCK:
421                    parsed_hsp["hit"]["seq"] += line.strip()
422                elif state == _STATE_CONS_BLOCK:
423                    hsp.aln_annotation["similarity"] += line.strip("\r\n")
424                # process HSP alignment and coordinates
425                _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
426                hit = Hit(hsp_list)
427                hit.description = hit_desc
428                hit.seq_len = seq_len
429                yield hit, strand
430                hsp_list = []
431                break
432            # yield hit and create a new one if we're still in the same query
433            elif line.startswith(">>"):
434                # try yielding,  if we have hsps
435                if hsp_list:
436                    _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
437                    hit = Hit(hsp_list)
438                    hit.description = hit_desc
439                    hit.seq_len = seq_len
440                    yield hit, strand
441                    hsp_list = []
442                # try to get the hit id and desc, and handle cases without descs
443                try:
444                    hit_id, hit_desc = line[2:].strip().split(" ", 1)
445                except ValueError:
446                    hit_id = line[2:].strip().split(" ", 1)[0]
447                    hit_desc = ""
448                # create the HSP object for Hit
449                frag = HSPFragment(hit_id, query_id)
450                hsp = HSP([frag])
451                hsp_list.append(hsp)
452                # set or reset the state to none
453                state = _STATE_NONE
454                parsed_hsp = {"query": {}, "hit": {}}
455            # create and append a new HSP if line starts with '>--'
456            elif line.startswith(">--"):
457                # set seq attributes of previous hsp
458                _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
459                # and create a new one
460                frag = HSPFragment(hit_id, query_id)
461                hsp = HSP([frag])
462                hsp_list.append(hsp)
463                # set the state ~ none yet
464                state = _STATE_NONE
465                parsed_hsp = {"query": {}, "hit": {}}
466            # this is either query or hit data in the HSP, depending on the state
467            elif line.startswith(">"):
468                if state == _STATE_NONE:
469                    # make sure it's the correct query
470                    if not query_id.startswith(line[1:].split(" ")[0]):
471                        raise ValueError("%r vs %r" % (query_id, line))
472                    state = _STATE_QUERY_BLOCK
473                    parsed_hsp["query"]["seq"] = ""
474                elif state == _STATE_QUERY_BLOCK:
475                    # make sure it's the correct hit
476                    assert hit_id.startswith(line[1:].split(" ")[0])
477                    state = _STATE_HIT_BLOCK
478                    parsed_hsp["hit"]["seq"] = ""
479            # check for conservation block
480            elif line.startswith("; al_cons"):
481                state = _STATE_CONS_BLOCK
482                hsp.fragment.aln_annotation["similarity"] = ""
483            elif line.startswith(";"):
484                # Fasta outputs do not make a clear distinction between Hit
485                # and HSPs, so we check the attribute names to determine
486                # whether it belongs to a Hit or HSP
487                regx = re.search(_RE_ATTR, line.strip())
488                name = regx.group(1)
489                value = regx.group(2)
490
491                # for values before the '>...' query block
492                if state == _STATE_NONE:
493                    if name in _HSP_ATTR_MAP:
494                        attr_name, caster = _HSP_ATTR_MAP[name]
495                        if caster is not str:
496                            value = caster(value)
497                        if name in ["_ident", "_sim"]:
498                            value *= 100
499                        setattr(hsp, attr_name, value)
500                # otherwise, pool the values for processing later
501                elif state == _STATE_QUERY_BLOCK:
502                    parsed_hsp["query"][name] = value
503                elif state == _STATE_HIT_BLOCK:
504                    if name == "_len":
505                        seq_len = int(value)
506                    else:
507                        parsed_hsp["hit"][name] = value
508                # for values in the hit block
509                else:
510                    raise ValueError("Unexpected line: %r" % line)
511            # otherwise, it must be lines containing the sequences
512            else:
513                assert ">" not in line
514                # if we're in hit, parse into hsp.hit
515                if state == _STATE_HIT_BLOCK:
516                    parsed_hsp["hit"]["seq"] += line.strip()
517                elif state == _STATE_QUERY_BLOCK:
518                    parsed_hsp["query"]["seq"] += line.strip()
519                elif state == _STATE_CONS_BLOCK:
520                    hsp.fragment.aln_annotation["similarity"] += line.strip("\r\n")
521                # we should not get here!
522                else:
523                    raise ValueError("Unexpected line: %r" % line)
524            line = self.line
525
526
527class FastaM10Indexer(SearchIndexer):
528    """Indexer class for Bill Pearson's FASTA suite's -m 10 output."""
529
530    _parser = FastaM10Parser
531
532    def __init__(self, filename):
533        """Initialize the class."""
534        SearchIndexer.__init__(self, filename)
535
536    def __iter__(self):
537        """Iterate over FastaM10Indexer; yields query results' keys, start offsets, offset lengths."""
538        handle = self._handle
539        handle.seek(0)
540        start_offset = handle.tell()
541        qresult_key = None
542        query_mark = b">>>"
543
544        line = handle.readline()
545        while True:
546            end_offset = handle.tell()
547
548            if not line.startswith(query_mark) and query_mark in line:
549                regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line)
550                qresult_key = regx.group(1).decode()
551                start_offset = end_offset - len(line)
552            # yield whenever we encounter a new query or at the end of the file
553            if qresult_key is not None:
554                if not line:
555                    yield qresult_key, start_offset, end_offset - start_offset
556                    break
557                line = handle.readline()
558                if not line.startswith(query_mark) and query_mark in line:
559                    yield qresult_key, start_offset, end_offset - start_offset
560                    start_offset = end_offset
561            else:
562                line = handle.readline()
563
564    def get_raw(self, offset):
565        """Return the raw record from the file as a bytes string."""
566        handle = self._handle
567        qresult_raw = b""
568        query_mark = b">>>"
569
570        # read header first
571        handle.seek(0)
572        line = handle.readline()
573        while True:
574            qresult_raw += line
575            line = handle.readline()
576            if not line.startswith(query_mark) and query_mark in line:
577                break
578
579        # and read the qresult raw string
580        handle.seek(offset)
581        line = handle.readline()
582        while True:
583            # preserve whitespace, don't use read_forward
584            if not line:
585                break
586            qresult_raw += line
587
588            line = handle.readline()
589            # break when we've reached qresult end
590            if not line.startswith(query_mark) and query_mark in line:
591                break
592
593        # append mock end marker to qresult_raw, since it's not always present
594        return qresult_raw + b">>><<<\n"
595
596
597# if not used as a module, run the doctest
598if __name__ == "__main__":
599    from Bio._utils import run_doctest
600
601    run_doctest()
602