1# Copyright 2012 by Wibowo Arindrarto. All rights reserved. 2# This file is part of the Biopython distribution and governed by your 3# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 4# Please see the LICENSE file that should have been included as part of this 5# package. 6"""Bio.SearchIO parser for BLAT output formats. 7 8This module adds support for parsing BLAT outputs. BLAT (BLAST-Like Alignment 9Tool) is a sequence similarity search program initially built for annotating 10the human genome. 11 12Bio.SearchIO.BlastIO was tested using standalone BLAT version 34, psLayout 13version 3. It should be able to parse psLayout version 4 without problems. 14 15More information on BLAT is available from these sites: 16 17 - Publication: http://genome.cshlp.org/content/12/4/656 18 - User guide: http://genome.ucsc.edu/goldenPath/help/blatSpec.html 19 - Source download: http://www.soe.ucsc.edu/~kent/src 20 - Executable download: http://hgdownload.cse.ucsc.edu/admin/exe/ 21 - Blat score calculation: http://genome.ucsc.edu/FAQ/FAQblat.html#blat4 22 23 24Supported Formats 25================= 26 27BlatIO supports parsing, indexing, and writing for both PSL and PSLX output 28formats, with or without header. To parse, index, or write PSLX files, use the 29'pslx' keyword argument and set it to True. 30 31 # blat-psl defaults to PSL files 32 >>> from Bio import SearchIO 33 >>> psl = 'Blat/psl_34_004.psl' 34 >>> qresult = SearchIO.read(psl, 'blat-psl') 35 >>> qresult 36 QueryResult(id='hg19_dna', 10 hits) 37 38 # set the pslx flag to parse PSLX files 39 >>> pslx = 'Blat/pslx_34_004.pslx' 40 >>> qresult = SearchIO.read(pslx, 'blat-psl', pslx=True) 41 >>> qresult 42 QueryResult(id='hg19_dna', 10 hits) 43 44For parsing and indexing, you do not need to specify whether the file has a 45header or not. For writing, if you want to write a header, you can set the 46'header' keyword argument to True. This will write a 'psLayout version 3' header 47to your output file. 48 49 from Bio import SearchIO 50 qresult = SearchIO.read(psl, 'blat-psl') 51 SearchIO.write(qresult, 'header.psl', header=True) 52 <stdout> (1, 10, 19, 23) 53 54Note that the number of HSPFragments written may exceed the number of HSP 55objects. This is because in PSL files, it is possible to have single matches 56consisting of noncontiguous sequence fragments. This is where the HSPFragment 57object comes into play. These fragments are grouped into a single HSP because 58they share the same statistics (e.g. match numbers, BLAT score, etc.). However, 59they do not share the same sequence attributes, such as the start and end 60coordinates, making them distinct objects. 61 62In addition to parsing PSL(X) files, BlatIO also computes the percent identities 63and scores of your search results. This is done using the calculation formula 64posted here: http://genome.ucsc.edu/FAQ/FAQblat.html#blat4. It mimics the score 65and percent identity calculation done by UCSC's web BLAT service. 66 67Since BlatIO parses the file in a single pass, it expects all results from 68the same query to be in consecutive rows. If the results from one query are 69spread in nonconsecutive rows, BlatIO will consider them to be separate 70QueryResult objects. 71 72In most cases, the PSL(X) format uses the same coordinate system as Python 73(zero-based, half open). These coordinates are anchored on the plus strand. 74However, if the query aligns on the minus strand, BLAT will anchor the qStarts 75coordinates on the minus strand instead. BlatIO is aware of this, and will 76re-anchor the qStarts coordinates to the plus strand whenever it sees a minus 77strand query match. Conversely, when you write out to a PSL(X) file, BlatIO will 78reanchor qStarts to the minus strand again. 79 80BlatIO provides the following attribute-column mapping: 81 82+----------------+-------------------------+-----------------------------------+ 83| Object | Attribute | Column Name, Value | 84+================+=========================+===================================+ 85| QueryResutl | id | Q name, query sequence ID | 86| +-------------------------+-----------------------------------+ 87| | seq_len | Q size, query sequence full | 88| | | length | 89+----------------+-------------------------+-----------------------------------+ 90| Hit | id | T name, hit sequence ID | 91| +-------------------------+-----------------------------------+ 92| | seq_len | T size, hit sequence full length | 93+----------------+-------------------------+-----------------------------------+ 94| HSP | hit_end | T end, end coordinate of the last | 95| | | hit fragment | 96| +-------------------------+-----------------------------------+ 97| | hit_gap_num | T gap bases, number of bases | 98| | | inserted in hit | 99| +-------------------------+-----------------------------------+ 100| | hit_gapopen_num | T gap count, number of hit gap | 101| | | inserts | 102| +-------------------------+-----------------------------------+ 103| | hit_span_all | blockSizes, sizes of each | 104| | | fragment | 105| +-------------------------+-----------------------------------+ 106| | hit_start | T start, start coordinate of the | 107| | | first hit fragment | 108| +-------------------------+-----------------------------------+ 109| | hit_start_all | tStarts, start coordinate of each | 110| | | hit fragment | 111| +-------------------------+-----------------------------------+ 112| | match_num | match, number of non-repeat | 113| | | matches | 114| +-------------------------+-----------------------------------+ 115| | mismatch_num | mismatch, number of mismatches | 116| +-------------------------+-----------------------------------+ 117| | match_rep_num | rep. match, number of matches | 118| | | that are part of repeats | 119| +-------------------------+-----------------------------------+ 120| | n_num | N's, number of N bases | 121| +-------------------------+-----------------------------------+ 122| | query_end | Q end, end coordinate of the last | 123| +-------------------------+-----------------------------------+ 124| | | query fragment | 125| | query_gap_num | Q gap bases, number of bases | 126| | | inserted in query | 127| +-------------------------+-----------------------------------+ 128| | query_gapopen_num | Q gap count, number of query gap | 129| | | inserts | 130| +-------------------------+-----------------------------------+ 131| | query_span_all | blockSizes, sizes of each | 132| | | fragment | 133| +-------------------------+-----------------------------------+ 134| | query_start | Q start, start coordinate of the | 135| | | first query block | 136| +-------------------------+-----------------------------------+ 137| | query_start_all | qStarts, start coordinate of each | 138| | | query fragment | 139| +-------------------------+-----------------------------------+ 140| | len [*]_ | block count, the number of blocks | 141| | | in the alignment | 142+----------------+-------------------------+-----------------------------------+ 143| HSPFragment | hit | hit sequence, if present | 144| +-------------------------+-----------------------------------+ 145| | hit_strand | strand, hit sequence strand | 146| +-------------------------+-----------------------------------+ 147| | query | query sequence, if present | 148| +-------------------------+-----------------------------------+ 149| | query_strand | strand, query sequence strand | 150+----------------+-------------------------+-----------------------------------+ 151 152In addition to the column mappings above, BlatIO also provides the following 153object attributes: 154 155+----------------+-------------------------+-----------------------------------+ 156| Object | Attribute | Value | 157+================+=========================+===================================+ 158| HSP | gapopen_num | Q gap count + T gap count, total | 159| | | number of gap openings | 160| +-------------------------+-----------------------------------+ 161| | ident_num | matches + repmatches, total | 162| | | number of identical residues | 163| +-------------------------+-----------------------------------+ 164| | ident_pct | percent identity, calculated | 165| | | using UCSC's formula | 166| +-------------------------+-----------------------------------+ 167| | query_is_protein | boolean, whether the query | 168| | | sequence is a protein | 169| +-------------------------+-----------------------------------+ 170| | score | HSP score, calculated using | 171| | | UCSC's formula | 172+----------------+-------------------------+-----------------------------------+ 173 174Finally, the default HSP and HSPFragment properties are also provided. See the 175HSP and HSPFragment documentation for more details on these properties. 176 177 178.. [*] You can obtain the number of blocks / fragments in the HSP by invoking 179 ``len`` on the HSP 180 181""" 182import re 183from math import log 184 185from Bio.SearchIO._index import SearchIndexer 186from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 187 188 189__all__ = ("BlatPslParser", "BlatPslIndexer", "BlatPslWriter") 190 191 192# precompile regex patterns 193_PTR_ROW_CHECK = r"^\d+\s+\d+\s+\d+\s+\d+" 194_RE_ROW_CHECK = re.compile(_PTR_ROW_CHECK) 195_RE_ROW_CHECK_IDX = re.compile(_PTR_ROW_CHECK.encode()) 196 197 198def _list_from_csv(csv_string, caster=None): 199 """Transform the given comma-separated string into a list (PRIVATE). 200 201 :param csv_string: comma-separated input string 202 :type csv_string: string 203 :param caster: function used to cast each item in the input string 204 to its intended type 205 :type caster: callable, accepts string, returns object 206 207 """ 208 if caster is None: 209 return [x for x in csv_string.split(",") if x] 210 else: 211 return [caster(x) for x in csv_string.split(",") if x] 212 213 214def _reorient_starts(starts, blksizes, seqlen, strand): 215 """Reorients block starts into the opposite strand's coordinates (PRIVATE). 216 217 :param starts: start coordinates 218 :type starts: list [int] 219 :param blksizes: block sizes 220 :type blksizes: list [int] 221 :param seqlen: sequence length 222 :type seqlen: int 223 :param strand: sequence strand 224 :type strand: int, choice of -1, 0, or 1 225 226 """ 227 if len(starts) != len(blksizes): 228 raise RuntimeError( 229 "Unequal start coordinates and block sizes list (%r vs %r)" 230 % (len(starts), len(blksizes)) 231 ) 232 # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html 233 # no need to reorient if it's already the positive strand 234 if strand >= 0: 235 return starts 236 else: 237 # the plus-oriented coordinate is calculated by this: 238 # plus_coord = length - minus_coord - block_size 239 return [seqlen - start - blksize for start, blksize in zip(starts, blksizes)] 240 241 242def _is_protein(psl): 243 """Validate if psl is protein (PRIVATE).""" 244 # check if query is protein or not 245 # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4 246 if len(psl["strand"]) == 2: 247 if psl["strand"][1] == "+": 248 return psl["tend"] == psl["tstarts"][-1] + 3 * psl["blocksizes"][-1] 249 elif psl["strand"][1] == "-": 250 return psl["tstart"] == psl["tsize"] - ( 251 psl["tstarts"][-1] + 3 * psl["blocksizes"][-1] 252 ) 253 254 return False 255 256 257def _calc_millibad(psl, is_protein): 258 """Calculate millibad (PRIVATE).""" 259 # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4 260 size_mul = 3 if is_protein else 1 261 millibad = 0 262 263 qali_size = size_mul * (psl["qend"] - psl["qstart"]) 264 tali_size = psl["tend"] - psl["tstart"] 265 ali_size = min(qali_size, tali_size) 266 if ali_size <= 0: 267 return 0 268 269 size_dif = qali_size - tali_size 270 size_dif = 0 if size_dif < 0 else size_dif 271 272 total = size_mul * (psl["matches"] + psl["repmatches"] + psl["mismatches"]) 273 if total != 0: 274 millibad = ( 275 1000 276 * ( 277 psl["mismatches"] * size_mul 278 + psl["qnuminsert"] 279 + round(3 * log(1 + size_dif)) 280 ) 281 ) / total 282 283 return millibad 284 285 286def _calc_score(psl, is_protein): 287 """Calculate score (PRIVATE).""" 288 # adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4 289 size_mul = 3 if is_protein else 1 290 return ( 291 size_mul * (psl["matches"] + (psl["repmatches"] >> 1)) 292 - size_mul * psl["mismatches"] 293 - psl["qnuminsert"] 294 - psl["tnuminsert"] 295 ) 296 297 298def _create_hsp(hid, qid, psl): 299 """Create high scoring pair object (PRIVATE).""" 300 # protein flag 301 is_protein = _is_protein(psl) 302 # strand 303 # if query is protein, strand is 0 304 if is_protein: 305 qstrand = 0 306 else: 307 qstrand = 1 if psl["strand"][0] == "+" else -1 308 # try to get hit strand, if it exists 309 try: 310 hstrand = 1 if psl["strand"][1] == "+" else -1 311 except IndexError: 312 hstrand = 1 # hit strand defaults to plus 313 314 blocksize_multiplier = 3 if is_protein else 1 315 # query block starts 316 qstarts = _reorient_starts(psl["qstarts"], psl["blocksizes"], psl["qsize"], qstrand) 317 # hit block starts 318 if len(psl["strand"]) == 2: 319 hstarts = _reorient_starts( 320 psl["tstarts"], 321 [blocksize_multiplier * i for i in psl["blocksizes"]], 322 psl["tsize"], 323 hstrand, 324 ) 325 else: 326 hstarts = psl["tstarts"] 327 # set query and hit coords 328 # this assumes each block has no gaps (which seems to be the case) 329 assert len(qstarts) == len(hstarts) == len(psl["blocksizes"]) 330 query_range_all = list( 331 zip(qstarts, [x + y for x, y in zip(qstarts, psl["blocksizes"])]) 332 ) 333 hit_range_all = list( 334 zip( 335 hstarts, 336 [x + y * blocksize_multiplier for x, y in zip(hstarts, psl["blocksizes"])], 337 ) 338 ) 339 # check length of sequences and coordinates, all must match 340 if "tseqs" in psl and "qseqs" in psl: 341 assert ( 342 len(psl["tseqs"]) 343 == len(psl["qseqs"]) 344 == len(query_range_all) 345 == len(hit_range_all) 346 ) 347 else: 348 assert len(query_range_all) == len(hit_range_all) 349 350 frags = [] 351 # iterating over query_range_all, but hit_range_all works just as well 352 for idx, qcoords in enumerate(query_range_all): 353 hseqlist = psl.get("tseqs") 354 hseq = "" if not hseqlist else hseqlist[idx] 355 qseqlist = psl.get("qseqs") 356 qseq = "" if not qseqlist else qseqlist[idx] 357 frag = HSPFragment(hid, qid, hit=hseq, query=qseq) 358 # set molecule type 359 frag.molecule_type = "DNA" 360 # set coordinates 361 frag.query_start = qcoords[0] 362 frag.query_end = qcoords[1] 363 frag.hit_start = hit_range_all[idx][0] 364 frag.hit_end = hit_range_all[idx][1] 365 # and strands 366 frag.query_strand = qstrand 367 frag.hit_strand = hstrand 368 frags.append(frag) 369 370 # create hsp object 371 hsp = HSP(frags) 372 # check if start and end are set correctly 373 assert hsp.query_start == psl["qstart"] 374 assert hsp.query_end == psl["qend"] 375 assert hsp.hit_start == psl["tstart"] 376 assert hsp.hit_end == psl["tend"] 377 # and check block spans as well 378 hit_spans = [span / blocksize_multiplier for span in hsp.hit_span_all] 379 assert hit_spans == hsp.query_span_all == psl["blocksizes"] 380 # set its attributes 381 hsp.match_num = psl["matches"] 382 hsp.mismatch_num = psl["mismatches"] 383 hsp.match_rep_num = psl["repmatches"] 384 hsp.n_num = psl["ncount"] 385 hsp.query_gapopen_num = psl["qnuminsert"] 386 hsp.query_gap_num = psl["qbaseinsert"] 387 hsp.hit_gapopen_num = psl["tnuminsert"] 388 hsp.hit_gap_num = psl["tbaseinsert"] 389 390 hsp.ident_num = psl["matches"] + psl["repmatches"] 391 hsp.gapopen_num = psl["qnuminsert"] + psl["tnuminsert"] 392 hsp.gap_num = psl["qbaseinsert"] + psl["tbaseinsert"] 393 hsp.query_is_protein = is_protein 394 hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1 395 hsp.score = _calc_score(psl, is_protein) 396 # helper flag, for writing 397 hsp._has_hit_strand = len(psl["strand"]) == 2 398 399 return hsp 400 401 402class BlatPslParser: 403 """Parser for the BLAT PSL format.""" 404 405 def __init__(self, handle, pslx=False): 406 """Initialize the class.""" 407 self.handle = handle 408 self.line = self.handle.readline() 409 self.pslx = pslx 410 411 def __iter__(self): 412 """Iterate over BlatPslParser, yields query results.""" 413 # break out if it's an empty file 414 if not self.line: 415 return 416 417 # read through header 418 # this assumes that the result row match the regex 419 while not re.search(_RE_ROW_CHECK, self.line.strip()): 420 self.line = self.handle.readline() 421 if not self.line: 422 return 423 424 # parse into query results 425 for qresult in self._parse_qresult(): 426 qresult.program = "blat" 427 yield qresult 428 429 def _parse_row(self): 430 """Return a dictionary of parsed column values (PRIVATE).""" 431 assert self.line 432 cols = [x for x in self.line.strip().split("\t") if x] 433 self._validate_cols(cols) 434 435 psl = {} 436 psl["qname"] = cols[9] # qName 437 psl["qsize"] = int(cols[10]) # qSize 438 psl["tname"] = cols[13] # tName 439 psl["tsize"] = int(cols[14]) # tSize 440 psl["matches"] = int(cols[0]) # matches 441 psl["mismatches"] = int(cols[1]) # misMatches 442 psl["repmatches"] = int(cols[2]) # repMatches 443 psl["ncount"] = int(cols[3]) # nCount 444 psl["qnuminsert"] = int(cols[4]) # qNumInsert 445 psl["qbaseinsert"] = int(cols[5]) # qBaseInsert 446 psl["tnuminsert"] = int(cols[6]) # tNumInsert 447 psl["tbaseinsert"] = int(cols[7]) # tBaseInsert 448 psl["strand"] = cols[8] # strand 449 psl["qstart"] = int(cols[11]) # qStart 450 psl["qend"] = int(cols[12]) # qEnd 451 psl["tstart"] = int(cols[15]) # tStart 452 psl["tend"] = int(cols[16]) # tEnd 453 psl["blockcount"] = int(cols[17]) # blockCount 454 psl["blocksizes"] = _list_from_csv(cols[18], int) # blockSizes 455 psl["qstarts"] = _list_from_csv(cols[19], int) # qStarts 456 psl["tstarts"] = _list_from_csv(cols[20], int) # tStarts 457 if self.pslx: 458 psl["qseqs"] = _list_from_csv(cols[21]) # query sequence 459 psl["tseqs"] = _list_from_csv(cols[22]) # hit sequence 460 461 return psl 462 463 def _validate_cols(self, cols): 464 """Validate column's length of PSL or PSLX (PRIVATE).""" 465 if not self.pslx: 466 if len(cols) != 21: 467 raise ValueError( 468 "Invalid PSL line: %r. Expected 21 tab-separated columns, found %i" 469 % (self.line, len(cols)) 470 ) 471 else: 472 if len(cols) != 23: 473 raise ValueError( 474 "Invalid PSLX line: %r. Expected 23 tab-separated columns, found %i" 475 % (self.line, len(cols)) 476 ) 477 478 def _parse_qresult(self): 479 """Yield QueryResult objects (PRIVATE).""" 480 # state values, determines what to do for each line 481 state_EOF = 0 482 state_QRES_NEW = 1 483 state_QRES_SAME = 3 484 state_HIT_NEW = 2 485 state_HIT_SAME = 4 486 # initial dummy values 487 qres_state = None 488 file_state = None 489 cur_qid, cur_hid = None, None 490 prev_qid, prev_hid = None, None 491 cur, prev = None, None 492 hit_list, hsp_list = [], [] 493 494 while True: 495 # store previous line's parsed values for all lines after the first 496 if cur is not None: 497 prev = cur 498 prev_qid = cur_qid 499 prev_hid = cur_hid 500 # only parse the result row if it's not EOF 501 if self.line: 502 cur = self._parse_row() 503 cur_qid = cur["qname"] 504 cur_hid = cur["tname"] 505 else: 506 file_state = state_EOF 507 # mock values, since we have nothing to parse 508 cur_qid, cur_hid = None, None 509 510 # get the state of hit and qresult 511 if prev_qid != cur_qid: 512 qres_state = state_QRES_NEW 513 else: 514 qres_state = state_QRES_SAME 515 # new hits are hits with different ids or hits in a new qresult 516 if prev_hid != cur_hid or qres_state == state_QRES_NEW: 517 hit_state = state_HIT_NEW 518 else: 519 hit_state = state_HIT_SAME 520 521 if prev is not None: 522 # create fragment and HSP and set their attributes 523 hsp = _create_hsp(prev_hid, prev_qid, prev) 524 hsp_list.append(hsp) 525 526 if hit_state == state_HIT_NEW: 527 # create Hit and set its attributes 528 hit = Hit(hsp_list) 529 hit.seq_len = prev["tsize"] 530 hit_list.append(hit) 531 hsp_list = [] 532 533 # create qresult and yield if we're at a new qresult or at EOF 534 if qres_state == state_QRES_NEW or file_state == state_EOF: 535 qresult = QueryResult(id=prev_qid) 536 for hit in hit_list: 537 qresult.absorb(hit) 538 qresult.seq_len = prev["qsize"] 539 yield qresult 540 # if we're at EOF, break 541 if file_state == state_EOF: 542 break 543 hit_list = [] 544 545 self.line = self.handle.readline() 546 547 548class BlatPslIndexer(SearchIndexer): 549 """Indexer class for BLAT PSL output.""" 550 551 _parser = BlatPslParser 552 553 def __init__(self, filename, pslx=False): 554 """Initialize the class.""" 555 SearchIndexer.__init__(self, filename, pslx=pslx) 556 557 def __iter__(self): 558 """Iterate over the file handle; yields key, start offset, and length.""" 559 handle = self._handle 560 handle.seek(0) 561 # denotes column location for query identifier 562 query_id_idx = 9 563 qresult_key = None 564 tab_char = b"\t" 565 566 start_offset = handle.tell() 567 line = handle.readline() 568 # read through header 569 # this assumes that the result row match the regex 570 while not re.search(_RE_ROW_CHECK_IDX, line.strip()): 571 start_offset = handle.tell() 572 line = handle.readline() 573 if not line: 574 return 575 576 # and index the qresults 577 while True: 578 end_offset = handle.tell() 579 580 cols = [x for x in line.strip().split(tab_char) if x] 581 if qresult_key is None: 582 qresult_key = cols[query_id_idx] 583 else: 584 curr_key = cols[query_id_idx] 585 586 if curr_key != qresult_key: 587 yield qresult_key.decode(), start_offset, end_offset - start_offset 588 qresult_key = curr_key 589 start_offset = end_offset - len(line) 590 591 line = handle.readline() 592 if not line: 593 yield qresult_key.decode(), start_offset, end_offset - start_offset 594 break 595 596 def get_raw(self, offset): 597 """Return raw bytes string of a QueryResult object from the given offset.""" 598 handle = self._handle 599 handle.seek(offset) 600 query_id_idx = 9 601 qresult_key = None 602 qresult_raw = b"" 603 tab_char = b"\t" 604 605 while True: 606 line = handle.readline() 607 if not line: 608 break 609 cols = [x for x in line.strip().split(tab_char) if x] 610 if qresult_key is None: 611 qresult_key = cols[query_id_idx] 612 else: 613 curr_key = cols[query_id_idx] 614 if curr_key != qresult_key: 615 break 616 qresult_raw += line 617 618 return qresult_raw 619 620 621class BlatPslWriter: 622 """Writer for the blat-psl format.""" 623 624 def __init__(self, handle, header=False, pslx=False): 625 """Initialize the class.""" 626 self.handle = handle 627 # flag for writing header or not 628 self.header = header 629 self.pslx = pslx 630 631 def write_file(self, qresults): 632 """Write query results to file.""" 633 handle = self.handle 634 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 635 636 if self.header: 637 handle.write(self._build_header()) 638 639 for qresult in qresults: 640 if qresult: 641 handle.write(self._build_row(qresult)) 642 qresult_counter += 1 643 hit_counter += len(qresult) 644 hsp_counter += sum(len(hit) for hit in qresult) 645 frag_counter += sum(len(hit.fragments) for hit in qresult) 646 647 return qresult_counter, hit_counter, hsp_counter, frag_counter 648 649 def _build_header(self): 650 """Build header, tab-separated string (PRIVATE).""" 651 # for now, always use the psLayout version 3 652 header = "psLayout version 3\n" 653 654 # adapted from BLAT's source: lib/psl.c#L496 655 header += ( 656 "\nmatch\tmis- \trep. \tN's\tQ gap\tQ gap\tT gap\tT " 657 "gap\tstrand\tQ \tQ \tQ \tQ \tT \tT " 658 "\tT \tT \tblock\tblockSizes \tqStarts\t tStarts" 659 "\n \tmatch\tmatch\t \tcount\tbases\tcount\tbases" 660 "\t \tname \tsize\tstart\tend\tname \tsize" 661 "\tstart\tend\tcount\n%s\n" % ("-" * 159) 662 ) 663 664 return header 665 666 def _build_row(self, qresult): 667 """Return a string or one row or more of the QueryResult object (PRIVATE).""" 668 # For now, our writer writes the row according to the order in 669 # the QueryResult and Hit objects. 670 # This is different from BLAT's native output, where the rows are 671 # grouped by strand. 672 # Should we tweak the behavior to better mimic the native output? 673 qresult_lines = [] 674 675 for hit in qresult: 676 for hsp in hit.hsps: 677 678 query_is_protein = getattr(hsp, "query_is_protein", False) 679 blocksize_multiplier = 3 if query_is_protein else 1 680 681 line = [] 682 line.append(hsp.match_num) 683 line.append(hsp.mismatch_num) 684 line.append(hsp.match_rep_num) 685 line.append(hsp.n_num) 686 line.append(hsp.query_gapopen_num) 687 line.append(hsp.query_gap_num) 688 line.append(hsp.hit_gapopen_num) 689 line.append(hsp.hit_gap_num) 690 691 # check spans 692 eff_query_spans = [blocksize_multiplier * s for s in hsp.query_span_all] 693 if hsp.hit_span_all != eff_query_spans: 694 raise ValueError("HSP hit span and query span values do not match.") 695 block_sizes = hsp.query_span_all 696 697 # set strand and starts 698 if hsp[0].query_strand >= 0: # since it may be a protein seq 699 strand = "+" 700 else: 701 strand = "-" 702 qstarts = _reorient_starts( 703 [x[0] for x in hsp.query_range_all], 704 hsp.query_span_all, 705 qresult.seq_len, 706 hsp[0].query_strand, 707 ) 708 709 if hsp[0].hit_strand == 1: 710 hstrand = 1 711 # only write hit strand if it was present in the source file 712 if hsp._has_hit_strand: 713 strand += "+" 714 else: 715 hstrand = -1 716 strand += "-" 717 hstarts = _reorient_starts( 718 [x[0] for x in hsp.hit_range_all], 719 hsp.hit_span_all, 720 hit.seq_len, 721 hstrand, 722 ) 723 724 line.append(strand) 725 line.append(qresult.id) 726 line.append(qresult.seq_len) 727 line.append(hsp.query_start) 728 line.append(hsp.query_end) 729 line.append(hit.id) 730 line.append(hit.seq_len) 731 line.append(hsp.hit_start) 732 line.append(hsp.hit_end) 733 line.append(len(hsp)) 734 line.append(",".join(str(x) for x in block_sizes) + ",") 735 line.append(",".join(str(x) for x in qstarts) + ",") 736 line.append(",".join(str(x) for x in hstarts) + ",") 737 738 if self.pslx: 739 line.append(",".join(str(x.seq) for x in hsp.query_all) + ",") 740 line.append(",".join(str(x.seq) for x in hsp.hit_all) + ",") 741 742 qresult_lines.append("\t".join(str(x) for x in line)) 743 744 return "\n".join(qresult_lines) + "\n" 745 746 747# if not used as a module, run the doctest 748if __name__ == "__main__": 749 from Bio._utils import run_doctest 750 751 run_doctest() 752