1# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
2# This file is part of the Biopython distribution and governed by your
3# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
4# Please see the LICENSE file that should have been included as part of this
5# package.
6"""Bio.SearchIO parser for BLAT output formats.
7
8This module adds support for parsing BLAT outputs. BLAT (BLAST-Like Alignment
9Tool) is a sequence similarity search program initially built for annotating
10the human genome.
11
12Bio.SearchIO.BlastIO was tested using standalone BLAT version 34, psLayout
13version 3. It should be able to parse psLayout version 4 without problems.
14
15More information on BLAT is available from these sites:
16
17    - Publication: http://genome.cshlp.org/content/12/4/656
18    - User guide: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
19    - Source download: http://www.soe.ucsc.edu/~kent/src
20    - Executable download: http://hgdownload.cse.ucsc.edu/admin/exe/
21    - Blat score calculation: http://genome.ucsc.edu/FAQ/FAQblat.html#blat4
22
23
24Supported Formats
25=================
26
27BlatIO supports parsing, indexing, and writing for both PSL and PSLX output
28formats, with or without header. To parse, index, or write PSLX files, use the
29'pslx' keyword argument and set it to True.
30
31    # blat-psl defaults to PSL files
32    >>> from Bio import SearchIO
33    >>> psl = 'Blat/psl_34_004.psl'
34    >>> qresult = SearchIO.read(psl, 'blat-psl')
35    >>> qresult
36    QueryResult(id='hg19_dna', 10 hits)
37
38    # set the pslx flag to parse PSLX files
39    >>> pslx = 'Blat/pslx_34_004.pslx'
40    >>> qresult = SearchIO.read(pslx, 'blat-psl', pslx=True)
41    >>> qresult
42    QueryResult(id='hg19_dna', 10 hits)
43
44For parsing and indexing, you do not need to specify whether the file has a
45header or not. For writing, if you want to write a header, you can set the
46'header' keyword argument to True. This will write a 'psLayout version 3' header
47to your output file.
48
49    from Bio import SearchIO
50    qresult = SearchIO.read(psl, 'blat-psl')
51    SearchIO.write(qresult, 'header.psl', header=True)
52    <stdout> (1, 10, 19, 23)
53
54Note that the number of HSPFragments written may exceed the number of HSP
55objects. This is because in PSL files, it is possible to have single matches
56consisting of noncontiguous sequence fragments. This is where the HSPFragment
57object comes into play. These fragments are grouped into a single HSP because
58they share the same statistics (e.g. match numbers, BLAT score, etc.). However,
59they do not share the same sequence attributes, such as the start and end
60coordinates, making them distinct objects.
61
62In addition to parsing PSL(X) files, BlatIO also computes the percent identities
63and scores of your search results. This is done using the calculation formula
64posted here: http://genome.ucsc.edu/FAQ/FAQblat.html#blat4. It mimics the score
65and percent identity calculation done by UCSC's web BLAT service.
66
67Since BlatIO parses the file in a single pass, it expects all results from
68the same query to be in consecutive rows. If the results from one query are
69spread in nonconsecutive rows, BlatIO will consider them to be separate
70QueryResult objects.
71
72In most cases, the PSL(X) format uses the same coordinate system as Python
73(zero-based, half open). These coordinates are anchored on the plus strand.
74However, if the query aligns on the minus strand, BLAT will anchor the qStarts
75coordinates on the minus strand instead. BlatIO is aware of this, and will
76re-anchor the qStarts coordinates to the plus strand whenever it sees a minus
77strand query match. Conversely, when you write out to a PSL(X) file, BlatIO will
78reanchor qStarts to the minus strand again.
79
80BlatIO provides the following attribute-column mapping:
81
82+----------------+-------------------------+-----------------------------------+
83| Object         | Attribute               | Column Name, Value                |
84+================+=========================+===================================+
85| QueryResutl    | id                      | Q name, query sequence ID         |
86|                +-------------------------+-----------------------------------+
87|                | seq_len                 | Q size, query sequence full       |
88|                |                         | length                            |
89+----------------+-------------------------+-----------------------------------+
90| Hit            | id                      | T name, hit sequence ID           |
91|                +-------------------------+-----------------------------------+
92|                | seq_len                 | T size, hit sequence full length  |
93+----------------+-------------------------+-----------------------------------+
94| HSP            | hit_end                 | T end, end coordinate of the last |
95|                |                         | hit fragment                      |
96|                +-------------------------+-----------------------------------+
97|                | hit_gap_num             | T gap bases, number of bases      |
98|                |                         | inserted in hit                   |
99|                +-------------------------+-----------------------------------+
100|                | hit_gapopen_num         | T gap count, number of hit gap    |
101|                |                         | inserts                           |
102|                +-------------------------+-----------------------------------+
103|                | hit_span_all            | blockSizes, sizes of each         |
104|                |                         | fragment                          |
105|                +-------------------------+-----------------------------------+
106|                | hit_start               | T start, start coordinate of the  |
107|                |                         | first hit fragment                |
108|                +-------------------------+-----------------------------------+
109|                | hit_start_all           | tStarts, start coordinate of each |
110|                |                         | hit fragment                      |
111|                +-------------------------+-----------------------------------+
112|                | match_num               | match, number of non-repeat       |
113|                |                         | matches                           |
114|                +-------------------------+-----------------------------------+
115|                | mismatch_num            | mismatch, number of mismatches    |
116|                +-------------------------+-----------------------------------+
117|                | match_rep_num           | rep. match, number of matches     |
118|                |                         | that are part of repeats          |
119|                +-------------------------+-----------------------------------+
120|                | n_num                   | N's, number of N bases            |
121|                +-------------------------+-----------------------------------+
122|                | query_end               | Q end, end coordinate of the last |
123|                +-------------------------+-----------------------------------+
124|                |                         | query fragment                    |
125|                | query_gap_num           | Q gap bases, number of bases      |
126|                |                         | inserted in query                 |
127|                +-------------------------+-----------------------------------+
128|                | query_gapopen_num       | Q gap count, number of query gap  |
129|                |                         | inserts                           |
130|                +-------------------------+-----------------------------------+
131|                | query_span_all          | blockSizes, sizes of each         |
132|                |                         | fragment                          |
133|                +-------------------------+-----------------------------------+
134|                | query_start             | Q start, start coordinate of the  |
135|                |                         | first query block                 |
136|                +-------------------------+-----------------------------------+
137|                | query_start_all         | qStarts, start coordinate of each |
138|                |                         | query fragment                    |
139|                +-------------------------+-----------------------------------+
140|                | len [*]_                | block count, the number of blocks |
141|                |                         | in the alignment                  |
142+----------------+-------------------------+-----------------------------------+
143| HSPFragment    | hit                     | hit sequence, if present          |
144|                +-------------------------+-----------------------------------+
145|                | hit_strand              | strand, hit sequence strand       |
146|                +-------------------------+-----------------------------------+
147|                | query                   | query sequence, if present        |
148|                +-------------------------+-----------------------------------+
149|                | query_strand            | strand, query sequence strand     |
150+----------------+-------------------------+-----------------------------------+
151
152In addition to the column mappings above, BlatIO also provides the following
153object attributes:
154
155+----------------+-------------------------+-----------------------------------+
156| Object         | Attribute               | Value                             |
157+================+=========================+===================================+
158| HSP            | gapopen_num             | Q gap count + T gap count, total  |
159|                |                         |  number of gap openings           |
160|                +-------------------------+-----------------------------------+
161|                | ident_num               | matches + repmatches, total       |
162|                |                         | number of identical residues      |
163|                +-------------------------+-----------------------------------+
164|                | ident_pct               | percent identity, calculated      |
165|                |                         | using UCSC's formula              |
166|                +-------------------------+-----------------------------------+
167|                | query_is_protein        | boolean, whether the query        |
168|                |                         | sequence is a protein             |
169|                +-------------------------+-----------------------------------+
170|                | score                   | HSP score, calculated using       |
171|                |                         | UCSC's formula                    |
172+----------------+-------------------------+-----------------------------------+
173
174Finally, the default HSP and HSPFragment properties are also provided. See the
175HSP and HSPFragment documentation for more details on these properties.
176
177
178.. [*] You can obtain the number of blocks / fragments in the HSP by invoking
179   ``len`` on the HSP
180
181"""
182import re
183from math import log
184
185from Bio.SearchIO._index import SearchIndexer
186from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
187
188
189__all__ = ("BlatPslParser", "BlatPslIndexer", "BlatPslWriter")
190
191
192# precompile regex patterns
193_PTR_ROW_CHECK = r"^\d+\s+\d+\s+\d+\s+\d+"
194_RE_ROW_CHECK = re.compile(_PTR_ROW_CHECK)
195_RE_ROW_CHECK_IDX = re.compile(_PTR_ROW_CHECK.encode())
196
197
198def _list_from_csv(csv_string, caster=None):
199    """Transform the given comma-separated string into a list (PRIVATE).
200
201    :param csv_string: comma-separated input string
202    :type csv_string: string
203    :param caster: function used to cast each item in the input string
204                   to its intended type
205    :type caster: callable, accepts string, returns object
206
207    """
208    if caster is None:
209        return [x for x in csv_string.split(",") if x]
210    else:
211        return [caster(x) for x in csv_string.split(",") if x]
212
213
214def _reorient_starts(starts, blksizes, seqlen, strand):
215    """Reorients block starts into the opposite strand's coordinates (PRIVATE).
216
217    :param starts: start coordinates
218    :type starts: list [int]
219    :param blksizes: block sizes
220    :type blksizes: list [int]
221    :param seqlen: sequence length
222    :type seqlen: int
223    :param strand: sequence strand
224    :type strand: int, choice of -1, 0, or 1
225
226    """
227    if len(starts) != len(blksizes):
228        raise RuntimeError(
229            "Unequal start coordinates and block sizes list (%r vs %r)"
230            % (len(starts), len(blksizes))
231        )
232    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
233    # no need to reorient if it's already the positive strand
234    if strand >= 0:
235        return starts
236    else:
237        # the plus-oriented coordinate is calculated by this:
238        # plus_coord = length - minus_coord - block_size
239        return [seqlen - start - blksize for start, blksize in zip(starts, blksizes)]
240
241
242def _is_protein(psl):
243    """Validate if psl is protein (PRIVATE)."""
244    # check if query is protein or not
245    # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4
246    if len(psl["strand"]) == 2:
247        if psl["strand"][1] == "+":
248            return psl["tend"] == psl["tstarts"][-1] + 3 * psl["blocksizes"][-1]
249        elif psl["strand"][1] == "-":
250            return psl["tstart"] == psl["tsize"] - (
251                psl["tstarts"][-1] + 3 * psl["blocksizes"][-1]
252            )
253
254    return False
255
256
257def _calc_millibad(psl, is_protein):
258    """Calculate millibad (PRIVATE)."""
259    # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4
260    size_mul = 3 if is_protein else 1
261    millibad = 0
262
263    qali_size = size_mul * (psl["qend"] - psl["qstart"])
264    tali_size = psl["tend"] - psl["tstart"]
265    ali_size = min(qali_size, tali_size)
266    if ali_size <= 0:
267        return 0
268
269    size_dif = qali_size - tali_size
270    size_dif = 0 if size_dif < 0 else size_dif
271
272    total = size_mul * (psl["matches"] + psl["repmatches"] + psl["mismatches"])
273    if total != 0:
274        millibad = (
275            1000
276            * (
277                psl["mismatches"] * size_mul
278                + psl["qnuminsert"]
279                + round(3 * log(1 + size_dif))
280            )
281        ) / total
282
283    return millibad
284
285
286def _calc_score(psl, is_protein):
287    """Calculate score (PRIVATE)."""
288    # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4
289    size_mul = 3 if is_protein else 1
290    return (
291        size_mul * (psl["matches"] + (psl["repmatches"] >> 1))
292        - size_mul * psl["mismatches"]
293        - psl["qnuminsert"]
294        - psl["tnuminsert"]
295    )
296
297
298def _create_hsp(hid, qid, psl):
299    """Create high scoring pair object (PRIVATE)."""
300    # protein flag
301    is_protein = _is_protein(psl)
302    # strand
303    # if query is protein, strand is 0
304    if is_protein:
305        qstrand = 0
306    else:
307        qstrand = 1 if psl["strand"][0] == "+" else -1
308    # try to get hit strand, if it exists
309    try:
310        hstrand = 1 if psl["strand"][1] == "+" else -1
311    except IndexError:
312        hstrand = 1  # hit strand defaults to plus
313
314    blocksize_multiplier = 3 if is_protein else 1
315    # query block starts
316    qstarts = _reorient_starts(psl["qstarts"], psl["blocksizes"], psl["qsize"], qstrand)
317    # hit block starts
318    if len(psl["strand"]) == 2:
319        hstarts = _reorient_starts(
320            psl["tstarts"],
321            [blocksize_multiplier * i for i in psl["blocksizes"]],
322            psl["tsize"],
323            hstrand,
324        )
325    else:
326        hstarts = psl["tstarts"]
327    # set query and hit coords
328    # this assumes each block has no gaps (which seems to be the case)
329    assert len(qstarts) == len(hstarts) == len(psl["blocksizes"])
330    query_range_all = list(
331        zip(qstarts, [x + y for x, y in zip(qstarts, psl["blocksizes"])])
332    )
333    hit_range_all = list(
334        zip(
335            hstarts,
336            [x + y * blocksize_multiplier for x, y in zip(hstarts, psl["blocksizes"])],
337        )
338    )
339    # check length of sequences and coordinates, all must match
340    if "tseqs" in psl and "qseqs" in psl:
341        assert (
342            len(psl["tseqs"])
343            == len(psl["qseqs"])
344            == len(query_range_all)
345            == len(hit_range_all)
346        )
347    else:
348        assert len(query_range_all) == len(hit_range_all)
349
350    frags = []
351    # iterating over query_range_all, but hit_range_all works just as well
352    for idx, qcoords in enumerate(query_range_all):
353        hseqlist = psl.get("tseqs")
354        hseq = "" if not hseqlist else hseqlist[idx]
355        qseqlist = psl.get("qseqs")
356        qseq = "" if not qseqlist else qseqlist[idx]
357        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
358        # set molecule type
359        frag.molecule_type = "DNA"
360        # set coordinates
361        frag.query_start = qcoords[0]
362        frag.query_end = qcoords[1]
363        frag.hit_start = hit_range_all[idx][0]
364        frag.hit_end = hit_range_all[idx][1]
365        # and strands
366        frag.query_strand = qstrand
367        frag.hit_strand = hstrand
368        frags.append(frag)
369
370    # create hsp object
371    hsp = HSP(frags)
372    # check if start and end are set correctly
373    assert hsp.query_start == psl["qstart"]
374    assert hsp.query_end == psl["qend"]
375    assert hsp.hit_start == psl["tstart"]
376    assert hsp.hit_end == psl["tend"]
377    # and check block spans as well
378    hit_spans = [span / blocksize_multiplier for span in hsp.hit_span_all]
379    assert hit_spans == hsp.query_span_all == psl["blocksizes"]
380    # set its attributes
381    hsp.match_num = psl["matches"]
382    hsp.mismatch_num = psl["mismatches"]
383    hsp.match_rep_num = psl["repmatches"]
384    hsp.n_num = psl["ncount"]
385    hsp.query_gapopen_num = psl["qnuminsert"]
386    hsp.query_gap_num = psl["qbaseinsert"]
387    hsp.hit_gapopen_num = psl["tnuminsert"]
388    hsp.hit_gap_num = psl["tbaseinsert"]
389
390    hsp.ident_num = psl["matches"] + psl["repmatches"]
391    hsp.gapopen_num = psl["qnuminsert"] + psl["tnuminsert"]
392    hsp.gap_num = psl["qbaseinsert"] + psl["tbaseinsert"]
393    hsp.query_is_protein = is_protein
394    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
395    hsp.score = _calc_score(psl, is_protein)
396    # helper flag, for writing
397    hsp._has_hit_strand = len(psl["strand"]) == 2
398
399    return hsp
400
401
402class BlatPslParser:
403    """Parser for the BLAT PSL format."""
404
405    def __init__(self, handle, pslx=False):
406        """Initialize the class."""
407        self.handle = handle
408        self.line = self.handle.readline()
409        self.pslx = pslx
410
411    def __iter__(self):
412        """Iterate over BlatPslParser, yields query results."""
413        # break out if it's an empty file
414        if not self.line:
415            return
416
417        # read through header
418        # this assumes that the result row match the regex
419        while not re.search(_RE_ROW_CHECK, self.line.strip()):
420            self.line = self.handle.readline()
421            if not self.line:
422                return
423
424        # parse into query results
425        for qresult in self._parse_qresult():
426            qresult.program = "blat"
427            yield qresult
428
429    def _parse_row(self):
430        """Return a dictionary of parsed column values (PRIVATE)."""
431        assert self.line
432        cols = [x for x in self.line.strip().split("\t") if x]
433        self._validate_cols(cols)
434
435        psl = {}
436        psl["qname"] = cols[9]  # qName
437        psl["qsize"] = int(cols[10])  # qSize
438        psl["tname"] = cols[13]  # tName
439        psl["tsize"] = int(cols[14])  # tSize
440        psl["matches"] = int(cols[0])  # matches
441        psl["mismatches"] = int(cols[1])  # misMatches
442        psl["repmatches"] = int(cols[2])  # repMatches
443        psl["ncount"] = int(cols[3])  # nCount
444        psl["qnuminsert"] = int(cols[4])  # qNumInsert
445        psl["qbaseinsert"] = int(cols[5])  # qBaseInsert
446        psl["tnuminsert"] = int(cols[6])  # tNumInsert
447        psl["tbaseinsert"] = int(cols[7])  # tBaseInsert
448        psl["strand"] = cols[8]  # strand
449        psl["qstart"] = int(cols[11])  # qStart
450        psl["qend"] = int(cols[12])  # qEnd
451        psl["tstart"] = int(cols[15])  # tStart
452        psl["tend"] = int(cols[16])  # tEnd
453        psl["blockcount"] = int(cols[17])  # blockCount
454        psl["blocksizes"] = _list_from_csv(cols[18], int)  # blockSizes
455        psl["qstarts"] = _list_from_csv(cols[19], int)  # qStarts
456        psl["tstarts"] = _list_from_csv(cols[20], int)  # tStarts
457        if self.pslx:
458            psl["qseqs"] = _list_from_csv(cols[21])  # query sequence
459            psl["tseqs"] = _list_from_csv(cols[22])  # hit sequence
460
461        return psl
462
463    def _validate_cols(self, cols):
464        """Validate column's length of PSL or PSLX (PRIVATE)."""
465        if not self.pslx:
466            if len(cols) != 21:
467                raise ValueError(
468                    "Invalid PSL line: %r. Expected 21 tab-separated columns, found %i"
469                    % (self.line, len(cols))
470                )
471        else:
472            if len(cols) != 23:
473                raise ValueError(
474                    "Invalid PSLX line: %r. Expected 23 tab-separated columns, found %i"
475                    % (self.line, len(cols))
476                )
477
478    def _parse_qresult(self):
479        """Yield QueryResult objects (PRIVATE)."""
480        # state values, determines what to do for each line
481        state_EOF = 0
482        state_QRES_NEW = 1
483        state_QRES_SAME = 3
484        state_HIT_NEW = 2
485        state_HIT_SAME = 4
486        # initial dummy values
487        qres_state = None
488        file_state = None
489        cur_qid, cur_hid = None, None
490        prev_qid, prev_hid = None, None
491        cur, prev = None, None
492        hit_list, hsp_list = [], []
493
494        while True:
495            # store previous line's parsed values for all lines after the first
496            if cur is not None:
497                prev = cur
498                prev_qid = cur_qid
499                prev_hid = cur_hid
500            # only parse the result row if it's not EOF
501            if self.line:
502                cur = self._parse_row()
503                cur_qid = cur["qname"]
504                cur_hid = cur["tname"]
505            else:
506                file_state = state_EOF
507                # mock values, since we have nothing to parse
508                cur_qid, cur_hid = None, None
509
510            # get the state of hit and qresult
511            if prev_qid != cur_qid:
512                qres_state = state_QRES_NEW
513            else:
514                qres_state = state_QRES_SAME
515            # new hits are hits with different ids or hits in a new qresult
516            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
517                hit_state = state_HIT_NEW
518            else:
519                hit_state = state_HIT_SAME
520
521            if prev is not None:
522                # create fragment and HSP and set their attributes
523                hsp = _create_hsp(prev_hid, prev_qid, prev)
524                hsp_list.append(hsp)
525
526                if hit_state == state_HIT_NEW:
527                    # create Hit and set its attributes
528                    hit = Hit(hsp_list)
529                    hit.seq_len = prev["tsize"]
530                    hit_list.append(hit)
531                    hsp_list = []
532
533                # create qresult and yield if we're at a new qresult or at EOF
534                if qres_state == state_QRES_NEW or file_state == state_EOF:
535                    qresult = QueryResult(id=prev_qid)
536                    for hit in hit_list:
537                        qresult.absorb(hit)
538                    qresult.seq_len = prev["qsize"]
539                    yield qresult
540                    # if we're at EOF, break
541                    if file_state == state_EOF:
542                        break
543                    hit_list = []
544
545            self.line = self.handle.readline()
546
547
548class BlatPslIndexer(SearchIndexer):
549    """Indexer class for BLAT PSL output."""
550
551    _parser = BlatPslParser
552
553    def __init__(self, filename, pslx=False):
554        """Initialize the class."""
555        SearchIndexer.__init__(self, filename, pslx=pslx)
556
557    def __iter__(self):
558        """Iterate over the file handle; yields key, start offset, and length."""
559        handle = self._handle
560        handle.seek(0)
561        # denotes column location for query identifier
562        query_id_idx = 9
563        qresult_key = None
564        tab_char = b"\t"
565
566        start_offset = handle.tell()
567        line = handle.readline()
568        # read through header
569        # this assumes that the result row match the regex
570        while not re.search(_RE_ROW_CHECK_IDX, line.strip()):
571            start_offset = handle.tell()
572            line = handle.readline()
573            if not line:
574                return
575
576        # and index the qresults
577        while True:
578            end_offset = handle.tell()
579
580            cols = [x for x in line.strip().split(tab_char) if x]
581            if qresult_key is None:
582                qresult_key = cols[query_id_idx]
583            else:
584                curr_key = cols[query_id_idx]
585
586                if curr_key != qresult_key:
587                    yield qresult_key.decode(), start_offset, end_offset - start_offset
588                    qresult_key = curr_key
589                    start_offset = end_offset - len(line)
590
591            line = handle.readline()
592            if not line:
593                yield qresult_key.decode(), start_offset, end_offset - start_offset
594                break
595
596    def get_raw(self, offset):
597        """Return raw bytes string of a QueryResult object from the given offset."""
598        handle = self._handle
599        handle.seek(offset)
600        query_id_idx = 9
601        qresult_key = None
602        qresult_raw = b""
603        tab_char = b"\t"
604
605        while True:
606            line = handle.readline()
607            if not line:
608                break
609            cols = [x for x in line.strip().split(tab_char) if x]
610            if qresult_key is None:
611                qresult_key = cols[query_id_idx]
612            else:
613                curr_key = cols[query_id_idx]
614                if curr_key != qresult_key:
615                    break
616            qresult_raw += line
617
618        return qresult_raw
619
620
621class BlatPslWriter:
622    """Writer for the blat-psl format."""
623
624    def __init__(self, handle, header=False, pslx=False):
625        """Initialize the class."""
626        self.handle = handle
627        # flag for writing header or not
628        self.header = header
629        self.pslx = pslx
630
631    def write_file(self, qresults):
632        """Write query results to file."""
633        handle = self.handle
634        qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0
635
636        if self.header:
637            handle.write(self._build_header())
638
639        for qresult in qresults:
640            if qresult:
641                handle.write(self._build_row(qresult))
642                qresult_counter += 1
643                hit_counter += len(qresult)
644                hsp_counter += sum(len(hit) for hit in qresult)
645                frag_counter += sum(len(hit.fragments) for hit in qresult)
646
647        return qresult_counter, hit_counter, hsp_counter, frag_counter
648
649    def _build_header(self):
650        """Build header, tab-separated string (PRIVATE)."""
651        # for now, always use the psLayout version 3
652        header = "psLayout version 3\n"
653
654        # adapted from BLAT's source: lib/psl.c#L496
655        header += (
656            "\nmatch\tmis- \trep. \tN's\tQ gap\tQ gap\tT gap\tT "
657            "gap\tstrand\tQ        \tQ   \tQ    \tQ  \tT        \tT   "
658            "\tT    \tT  \tblock\tblockSizes \tqStarts\t tStarts"
659            "\n     \tmatch\tmatch\t   \tcount\tbases\tcount\tbases"
660            "\t      \tname     \tsize\tstart\tend\tname     \tsize"
661            "\tstart\tend\tcount\n%s\n" % ("-" * 159)
662        )
663
664        return header
665
666    def _build_row(self, qresult):
667        """Return a string or one row or more of the QueryResult object (PRIVATE)."""
668        # For now, our writer writes the row according to the order in
669        # the QueryResult and Hit objects.
670        # This is different from BLAT's native output, where the rows are
671        # grouped by strand.
672        # Should we tweak the behavior to better mimic the native output?
673        qresult_lines = []
674
675        for hit in qresult:
676            for hsp in hit.hsps:
677
678                query_is_protein = getattr(hsp, "query_is_protein", False)
679                blocksize_multiplier = 3 if query_is_protein else 1
680
681                line = []
682                line.append(hsp.match_num)
683                line.append(hsp.mismatch_num)
684                line.append(hsp.match_rep_num)
685                line.append(hsp.n_num)
686                line.append(hsp.query_gapopen_num)
687                line.append(hsp.query_gap_num)
688                line.append(hsp.hit_gapopen_num)
689                line.append(hsp.hit_gap_num)
690
691                # check spans
692                eff_query_spans = [blocksize_multiplier * s for s in hsp.query_span_all]
693                if hsp.hit_span_all != eff_query_spans:
694                    raise ValueError("HSP hit span and query span values do not match.")
695                block_sizes = hsp.query_span_all
696
697                # set strand and starts
698                if hsp[0].query_strand >= 0:  # since it may be a protein seq
699                    strand = "+"
700                else:
701                    strand = "-"
702                qstarts = _reorient_starts(
703                    [x[0] for x in hsp.query_range_all],
704                    hsp.query_span_all,
705                    qresult.seq_len,
706                    hsp[0].query_strand,
707                )
708
709                if hsp[0].hit_strand == 1:
710                    hstrand = 1
711                    # only write hit strand if it was present in the source file
712                    if hsp._has_hit_strand:
713                        strand += "+"
714                else:
715                    hstrand = -1
716                    strand += "-"
717                hstarts = _reorient_starts(
718                    [x[0] for x in hsp.hit_range_all],
719                    hsp.hit_span_all,
720                    hit.seq_len,
721                    hstrand,
722                )
723
724                line.append(strand)
725                line.append(qresult.id)
726                line.append(qresult.seq_len)
727                line.append(hsp.query_start)
728                line.append(hsp.query_end)
729                line.append(hit.id)
730                line.append(hit.seq_len)
731                line.append(hsp.hit_start)
732                line.append(hsp.hit_end)
733                line.append(len(hsp))
734                line.append(",".join(str(x) for x in block_sizes) + ",")
735                line.append(",".join(str(x) for x in qstarts) + ",")
736                line.append(",".join(str(x) for x in hstarts) + ",")
737
738                if self.pslx:
739                    line.append(",".join(str(x.seq) for x in hsp.query_all) + ",")
740                    line.append(",".join(str(x.seq) for x in hsp.hit_all) + ",")
741
742                qresult_lines.append("\t".join(str(x) for x in line))
743
744        return "\n".join(qresult_lines) + "\n"
745
746
747# if not used as a module, run the doctest
748if __name__ == "__main__":
749    from Bio._utils import run_doctest
750
751    run_doctest()
752