1# Adapted from Bio.AlignIO.FastaIO copyright 2008-2011 by Peter Cock. 2# Copyright 2012 by Wibowo Arindrarto. 3# All rights reserved. 4# This file is part of the Biopython distribution and governed by your 5# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 6# Please see the LICENSE file that should have been included as part of this 7# package. 8r"""Bio.SearchIO support for Bill Pearson's FASTA tools. 9 10This module adds support for parsing FASTA outputs. FASTA is a suite of 11programs that finds regions of local or global similarity between protein 12or nucleotide sequences, either by searching databases or identifying 13local duplications. 14 15Bio.SearchIO.FastaIO was tested on the following FASTA flavors and versions: 16 17 - flavors: fasta, ssearch, tfastx 18 - versions: 35, 36 19 20Other flavors and/or versions may introduce some bugs. Please file a bug report 21if you see such problems to Biopython's bug tracker. 22 23More information on FASTA are available through these links: 24 25 - Website: http://fasta.bioch.virginia.edu/fasta_www2/fasta_list2.shtml 26 - User guide: http://fasta.bioch.virginia.edu/fasta_www2/fasta_guide.pdf 27 28 29Supported Formats 30================= 31 32Bio.SearchIO.FastaIO supports parsing and indexing FASTA outputs triggered by 33the -m 10 flag. Other formats that mimic other programs (e.g. the BLAST tabular 34format using the -m 8 flag) may be parseable but using SearchIO's other parsers 35(in this case, using the 'blast-tab' parser). 36 37 38fasta-m10 39========= 40 41Note that in FASTA -m 10 outputs, HSPs from different strands are considered to 42be from different hits. They are listed as two separate entries in the hit 43table. FastaIO recognizes this and will group HSPs with the same hit ID into a 44single Hit object, regardless of strand. 45 46FASTA also sometimes output extra sequences adjacent to the HSP match. These 47extra sequences are discarded by FastaIO. Only regions containing the actual 48sequence match are extracted. 49 50The following object attributes are provided: 51 52+-----------------+-------------------------+----------------------------------+ 53| Object | Attribute | Value | 54+=================+=========================+==================================+ 55| QueryResult | description | query sequence description | 56| +-------------------------+----------------------------------+ 57| | id | query sequence ID | 58| +-------------------------+----------------------------------+ 59| | program | FASTA flavor | 60| +-------------------------+----------------------------------+ 61| | seq_len | full length of query sequence | 62| +-------------------------+----------------------------------+ 63| | target | target search database | 64| +-------------------------+----------------------------------+ 65| | version | FASTA version | 66+-----------------+-------------------------+----------------------------------+ 67| Hit | seq_len | full length of the hit sequence | 68+-----------------+-------------------------+----------------------------------+ 69| HSP | bitscore | \*_bits line | 70| +-------------------------+----------------------------------+ 71| | evalue | \*_expect line | 72| +-------------------------+----------------------------------+ 73| | ident_pct | \*_ident line | 74| +-------------------------+----------------------------------+ 75| | init1_score | \*_init1 line | 76| +-------------------------+----------------------------------+ 77| | initn_score | \*_initn line | 78| +-------------------------+----------------------------------+ 79| | opt_score | \*_opt line, \*_s-w opt line | 80| +-------------------------+----------------------------------+ 81| | pos_pct | \*_sim line | 82| +-------------------------+----------------------------------+ 83| | sw_score | \*_score line | 84| +-------------------------+----------------------------------+ 85| | z_score | \*_z-score line | 86+-----------------+-------------------------+----------------------------------+ 87| HSPFragment | aln_annotation | al_cons block, if present | 88| (also via HSP) +-------------------------+----------------------------------+ 89| | hit | hit sequence | 90| +-------------------------+----------------------------------+ 91| | hit_end | hit sequence end coordinate | 92| +-------------------------+----------------------------------+ 93| | hit_start | hit sequence start coordinate | 94| +-------------------------+----------------------------------+ 95| | hit_strand | hit sequence strand | 96| +-------------------------+----------------------------------+ 97| | query | query sequence | 98| +-------------------------+----------------------------------+ 99| | query_end | query sequence end coordinate | 100| +-------------------------+----------------------------------+ 101| | query_start | query sequence start coordinate | 102| +-------------------------+----------------------------------+ 103| | query_strand | query sequence strand | 104+-----------------+-------------------------+----------------------------------+ 105 106""" 107 108import re 109 110from Bio.SearchIO._index import SearchIndexer 111from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 112 113 114__all__ = ("FastaM10Parser", "FastaM10Indexer") 115 116 117# precompile regex patterns 118# regex for program name 119_RE_FLAVS = re.compile(r"t?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]search") 120# regex for sequence ID and length ~ deals with both \n and \r\n 121_PTR_ID_DESC_SEQLEN = r">>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)\s*$" 122_RE_ID_DESC_SEQLEN = re.compile(_PTR_ID_DESC_SEQLEN) 123_RE_ID_DESC_SEQLEN_IDX = re.compile(_PTR_ID_DESC_SEQLEN.encode()) 124# regex for qresult, hit, or hsp attribute value 125_RE_ATTR = re.compile(r"^; [a-z]+(_[ \w-]+):\s+(.*)$") 126# regex for capturing excess start and end sequences in alignments 127_RE_START_EXC = re.compile(r"^-*") 128_RE_END_EXC = re.compile(r"-*$") 129 130# attribute name mappings 131_HSP_ATTR_MAP = { 132 "_initn": ("initn_score", int), 133 "_init1": ("init1_score", int), 134 "_opt": ("opt_score", int), 135 "_s-w opt": ("opt_score", int), 136 "_z-score": ("z_score", float), 137 "_bits": ("bitscore", float), 138 "_expect": ("evalue", float), 139 "_score": ("sw_score", int), 140 "_ident": ("ident_pct", float), 141 "_sim": ("pos_pct", float), 142} 143 144# state flags 145_STATE_NONE = 0 146_STATE_QUERY_BLOCK = 1 147_STATE_HIT_BLOCK = 2 148_STATE_CONS_BLOCK = 3 149 150 151def _set_qresult_hits(qresult, hit_rows=()): 152 """Append Hits without alignments into QueryResults (PRIVATE).""" 153 for hit_row in hit_rows: 154 hit_id, remainder = hit_row.split(" ", 1) 155 # TODO: parse hit and hsp properties properly; by dealing with: 156 # - any character in the description (brackets, spaces, etc.) 157 # - possible [f] or [r] presence (for frame info) 158 # - possible presence of E2() column 159 # - possible incomplete hit_id due to column length limit 160 # The current method only looks at the Hit ID, none of the things above 161 if hit_id not in qresult: 162 frag = HSPFragment(hit_id, qresult.id) 163 hsp = HSP([frag]) 164 hit = Hit([hsp]) 165 qresult.append(hit) 166 167 return qresult 168 169 170def _set_hsp_seqs(hsp, parsed, program): 171 """Set HSPs sequences (PRIVATE). 172 173 :param hsp: HSP whose properties will be set 174 :type hsp: HSP 175 :param parsed: parsed values of the HSP attributes 176 :type parsed: dictionary {string: object} 177 :param program: program name 178 :type program: string 179 180 """ 181 # get aligned sequences and check if they have equal lengths 182 start = 0 183 for seq_type in ("hit", "query"): 184 if "tfast" not in program: 185 pseq = parsed[seq_type] 186 # adjust start and end coordinates based on the amount of 187 # filler characters 188 start, stop = _get_aln_slice_coords(pseq) 189 start_adj = len(re.search(_RE_START_EXC, pseq["seq"]).group(0)) 190 stop_adj = len(re.search(_RE_END_EXC, pseq["seq"]).group(0)) 191 start = start + start_adj 192 stop = stop + start_adj - stop_adj 193 parsed[seq_type]["seq"] = pseq["seq"][start:stop] 194 if len(parsed["query"]["seq"]) != len(parsed["hit"]["seq"]): 195 raise ValueError( 196 "Length mismatch: %r %r" 197 % (len(parsed["query"]["seq"]), len(parsed["hit"]["seq"])) 198 ) 199 if "similarity" in hsp.aln_annotation: 200 # only using 'start' since FASTA seems to have trimmed the 'excess' 201 # end part 202 hsp.aln_annotation["similarity"] = hsp.aln_annotation["similarity"][start:] 203 # hit or query works equally well here 204 assert len(hsp.aln_annotation["similarity"]) == len(parsed["hit"]["seq"]) 205 206 # query and hit sequence types must be the same 207 assert parsed["query"]["_type"] == parsed["hit"]["_type"] 208 type_val = parsed["query"]["_type"] # hit works fine too 209 molecule_type = "DNA" if type_val == "D" else "protein" 210 setattr(hsp.fragment, "molecule_type", molecule_type) 211 212 for seq_type in ("hit", "query"): 213 # get and set start and end coordinates 214 start = int(parsed[seq_type]["_start"]) 215 end = int(parsed[seq_type]["_stop"]) 216 217 setattr(hsp.fragment, seq_type + "_start", min(start, end) - 1) 218 setattr(hsp.fragment, seq_type + "_end", max(start, end)) 219 # set seq and molecule type 220 setattr(hsp.fragment, seq_type, parsed[seq_type]["seq"]) 221 222 if molecule_type != "protein": 223 # get strand from coordinate; start <= end is plus 224 # start > end is minus 225 if start <= end: 226 setattr(hsp.fragment, seq_type + "_strand", 1) 227 else: 228 setattr(hsp.fragment, seq_type + "_strand", -1) 229 else: 230 setattr(hsp.fragment, seq_type + "_strand", 0) 231 232 233def _get_aln_slice_coords(parsed_hsp): 234 """Get HSPs sequences (PRIVATE). 235 236 To get the actual pairwise alignment sequences, we must first 237 translate the un-gapped sequence based coordinates into positions 238 in the gapped sequence (which may have a flanking region shown 239 using leading - characters). To date, I have never seen any 240 trailing flanking region shown in the m10 file, but the 241 following code should also cope with that. 242 243 Note that this code seems to work fine even when the "sq_offset" 244 entries are present as a result of using the -X command line option. 245 """ 246 seq = parsed_hsp["seq"] 247 seq_stripped = seq.strip("-") 248 disp_start = int(parsed_hsp["_display_start"]) 249 start = int(parsed_hsp["_start"]) 250 stop = int(parsed_hsp["_stop"]) 251 252 if start <= stop: 253 start = start - disp_start 254 stop = stop - disp_start + 1 255 else: 256 start = disp_start - start 257 stop = disp_start - stop + 1 258 stop += seq_stripped.count("-") 259 if not (0 <= start and start < stop and stop <= len(seq_stripped)): 260 raise ValueError( 261 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" 262 % (seq, start, stop, parsed_hsp) 263 ) 264 return start, stop 265 266 267class FastaM10Parser: 268 """Parser for Bill Pearson's FASTA suite's -m 10 output.""" 269 270 def __init__(self, handle, __parse_hit_table=False): 271 """Initialize the class.""" 272 self.handle = handle 273 self._preamble = self._parse_preamble() 274 275 def __iter__(self): 276 """Iterate over FastaM10Parser object yields query results.""" 277 for qresult in self._parse_qresult(): 278 # re-set desc, for hsp query description 279 qresult.description = qresult.description 280 yield qresult 281 282 def _parse_preamble(self): 283 """Parse the Fasta preamble for Fasta flavor and version (PRIVATE).""" 284 preamble = {} 285 while True: 286 line = self.handle.readline() 287 # this should be the line just before the first qresult 288 if line.startswith("Query"): 289 break 290 # try to match for version line 291 elif line.startswith(" version"): 292 preamble["version"] = line.split(" ")[2] 293 else: 294 # try to match for flavor line 295 flav_match = re.match(_RE_FLAVS, line.lower()) 296 if flav_match: 297 preamble["program"] = flav_match.group(0) 298 self.line = line 299 300 return preamble 301 302 def __parse_hit_table(self): 303 """Parse hit table rows.""" 304 # parse hit table until we see an empty line 305 hit_rows = [] 306 while True: 307 line = self.handle.readline() 308 if (not line) or line.strip(): 309 break 310 hit_rows.append("") 311 self.line = line 312 return hit_rows 313 314 def _parse_qresult(self): 315 """Parse query result (PRIVATE).""" 316 # initial qresult value 317 qresult = None 318 hit_rows = [] 319 # state values 320 state_QRES_NEW = 1 321 state_QRES_HITTAB = 3 322 state_QRES_CONTENT = 5 323 state_QRES_END = 7 324 325 line = self.line 326 327 while True: 328 329 # one line before the hit table 330 if line.startswith("The best scores are:"): 331 qres_state = state_QRES_HITTAB 332 # the end of a query or the file altogether 333 elif line.strip() == ">>>///" or not line: 334 qres_state = state_QRES_END 335 # the beginning of a new query 336 elif not line.startswith(">>>") and ">>>" in line: 337 qres_state = state_QRES_NEW 338 # the beginning of the query info and its hits + hsps 339 elif line.startswith(">>>") and not line.strip() == ">>><<<": 340 qres_state = state_QRES_CONTENT 341 # default qres mark 342 else: 343 qres_state = None 344 345 if qres_state is not None: 346 if qres_state == state_QRES_HITTAB: 347 # parse hit table if flag is set 348 hit_rows = self.__parse_hit_table() 349 line = self.handle.readline() 350 351 elif qres_state == state_QRES_END: 352 yield _set_qresult_hits(qresult, hit_rows) 353 break 354 355 elif qres_state == state_QRES_NEW: 356 # if qresult is filled, yield it first 357 if qresult is not None: 358 yield _set_qresult_hits(qresult, hit_rows) 359 regx = re.search(_RE_ID_DESC_SEQLEN, line) 360 query_id = regx.group(1) 361 seq_len = regx.group(3) 362 desc = regx.group(2) 363 qresult = QueryResult(id=query_id) 364 qresult.seq_len = int(seq_len) 365 # get target from the next line 366 line = self.handle.readline() 367 qresult.target = [x for x in line.split(" ") if x][1].strip() 368 if desc is not None: 369 qresult.description = desc 370 # set values from preamble 371 for key, value in self._preamble.items(): 372 setattr(qresult, key, value) 373 line = self.handle.readline() 374 375 elif qres_state == state_QRES_CONTENT: 376 assert line[3:].startswith(qresult.id), line 377 for hit, strand in self._parse_hit(query_id): 378 # HACK: re-set desc, for hsp hit and query description 379 hit.description = hit.description 380 hit.query_description = qresult.description 381 # if hit is not in qresult, append it 382 if hit.id not in qresult: 383 qresult.append(hit) 384 # otherwise, it might be the same hit with a different strand 385 else: 386 # make sure strand is different and then append hsp to 387 # existing hit 388 for hsp in hit.hsps: 389 assert strand != hsp.query_strand 390 qresult[hit.id].append(hsp) 391 line = self.line 392 393 else: 394 line = self.handle.readline() 395 396 self.line = line 397 398 def _parse_hit(self, query_id): 399 """Parse hit on query identifier (PRIVATE).""" 400 while True: 401 line = self.handle.readline() 402 if line.startswith(">>"): 403 break 404 405 state = _STATE_NONE 406 strand = None 407 hsp_list = [] 408 hsp = None 409 parsed_hsp = None 410 hit_desc = None 411 seq_len = None 412 while True: 413 # yield hit if we've reached the start of a new query or 414 # the end of the search 415 self.line = self.handle.readline() 416 if self.line.strip() in [">>><<<", ">>>///"] or ( 417 not self.line.startswith(">>>") and ">>>" in self.line 418 ): 419 # append last parsed_hsp['hit']['seq'] line 420 if state == _STATE_HIT_BLOCK: 421 parsed_hsp["hit"]["seq"] += line.strip() 422 elif state == _STATE_CONS_BLOCK: 423 hsp.aln_annotation["similarity"] += line.strip("\r\n") 424 # process HSP alignment and coordinates 425 _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"]) 426 hit = Hit(hsp_list) 427 hit.description = hit_desc 428 hit.seq_len = seq_len 429 yield hit, strand 430 hsp_list = [] 431 break 432 # yield hit and create a new one if we're still in the same query 433 elif line.startswith(">>"): 434 # try yielding, if we have hsps 435 if hsp_list: 436 _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"]) 437 hit = Hit(hsp_list) 438 hit.description = hit_desc 439 hit.seq_len = seq_len 440 yield hit, strand 441 hsp_list = [] 442 # try to get the hit id and desc, and handle cases without descs 443 try: 444 hit_id, hit_desc = line[2:].strip().split(" ", 1) 445 except ValueError: 446 hit_id = line[2:].strip().split(" ", 1)[0] 447 hit_desc = "" 448 # create the HSP object for Hit 449 frag = HSPFragment(hit_id, query_id) 450 hsp = HSP([frag]) 451 hsp_list.append(hsp) 452 # set or reset the state to none 453 state = _STATE_NONE 454 parsed_hsp = {"query": {}, "hit": {}} 455 # create and append a new HSP if line starts with '>--' 456 elif line.startswith(">--"): 457 # set seq attributes of previous hsp 458 _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"]) 459 # and create a new one 460 frag = HSPFragment(hit_id, query_id) 461 hsp = HSP([frag]) 462 hsp_list.append(hsp) 463 # set the state ~ none yet 464 state = _STATE_NONE 465 parsed_hsp = {"query": {}, "hit": {}} 466 # this is either query or hit data in the HSP, depending on the state 467 elif line.startswith(">"): 468 if state == _STATE_NONE: 469 # make sure it's the correct query 470 if not query_id.startswith(line[1:].split(" ")[0]): 471 raise ValueError("%r vs %r" % (query_id, line)) 472 state = _STATE_QUERY_BLOCK 473 parsed_hsp["query"]["seq"] = "" 474 elif state == _STATE_QUERY_BLOCK: 475 # make sure it's the correct hit 476 assert hit_id.startswith(line[1:].split(" ")[0]) 477 state = _STATE_HIT_BLOCK 478 parsed_hsp["hit"]["seq"] = "" 479 # check for conservation block 480 elif line.startswith("; al_cons"): 481 state = _STATE_CONS_BLOCK 482 hsp.fragment.aln_annotation["similarity"] = "" 483 elif line.startswith(";"): 484 # Fasta outputs do not make a clear distinction between Hit 485 # and HSPs, so we check the attribute names to determine 486 # whether it belongs to a Hit or HSP 487 regx = re.search(_RE_ATTR, line.strip()) 488 name = regx.group(1) 489 value = regx.group(2) 490 491 # for values before the '>...' query block 492 if state == _STATE_NONE: 493 if name in _HSP_ATTR_MAP: 494 attr_name, caster = _HSP_ATTR_MAP[name] 495 if caster is not str: 496 value = caster(value) 497 if name in ["_ident", "_sim"]: 498 value *= 100 499 setattr(hsp, attr_name, value) 500 # otherwise, pool the values for processing later 501 elif state == _STATE_QUERY_BLOCK: 502 parsed_hsp["query"][name] = value 503 elif state == _STATE_HIT_BLOCK: 504 if name == "_len": 505 seq_len = int(value) 506 else: 507 parsed_hsp["hit"][name] = value 508 # for values in the hit block 509 else: 510 raise ValueError("Unexpected line: %r" % line) 511 # otherwise, it must be lines containing the sequences 512 else: 513 assert ">" not in line 514 # if we're in hit, parse into hsp.hit 515 if state == _STATE_HIT_BLOCK: 516 parsed_hsp["hit"]["seq"] += line.strip() 517 elif state == _STATE_QUERY_BLOCK: 518 parsed_hsp["query"]["seq"] += line.strip() 519 elif state == _STATE_CONS_BLOCK: 520 hsp.fragment.aln_annotation["similarity"] += line.strip("\r\n") 521 # we should not get here! 522 else: 523 raise ValueError("Unexpected line: %r" % line) 524 line = self.line 525 526 527class FastaM10Indexer(SearchIndexer): 528 """Indexer class for Bill Pearson's FASTA suite's -m 10 output.""" 529 530 _parser = FastaM10Parser 531 532 def __init__(self, filename): 533 """Initialize the class.""" 534 SearchIndexer.__init__(self, filename) 535 536 def __iter__(self): 537 """Iterate over FastaM10Indexer; yields query results' keys, start offsets, offset lengths.""" 538 handle = self._handle 539 handle.seek(0) 540 start_offset = handle.tell() 541 qresult_key = None 542 query_mark = b">>>" 543 544 line = handle.readline() 545 while True: 546 end_offset = handle.tell() 547 548 if not line.startswith(query_mark) and query_mark in line: 549 regx = re.search(_RE_ID_DESC_SEQLEN_IDX, line) 550 qresult_key = regx.group(1).decode() 551 start_offset = end_offset - len(line) 552 # yield whenever we encounter a new query or at the end of the file 553 if qresult_key is not None: 554 if not line: 555 yield qresult_key, start_offset, end_offset - start_offset 556 break 557 line = handle.readline() 558 if not line.startswith(query_mark) and query_mark in line: 559 yield qresult_key, start_offset, end_offset - start_offset 560 start_offset = end_offset 561 else: 562 line = handle.readline() 563 564 def get_raw(self, offset): 565 """Return the raw record from the file as a bytes string.""" 566 handle = self._handle 567 qresult_raw = b"" 568 query_mark = b">>>" 569 570 # read header first 571 handle.seek(0) 572 line = handle.readline() 573 while True: 574 qresult_raw += line 575 line = handle.readline() 576 if not line.startswith(query_mark) and query_mark in line: 577 break 578 579 # and read the qresult raw string 580 handle.seek(offset) 581 line = handle.readline() 582 while True: 583 # preserve whitespace, don't use read_forward 584 if not line: 585 break 586 qresult_raw += line 587 588 line = handle.readline() 589 # break when we've reached qresult end 590 if not line.startswith(query_mark) and query_mark in line: 591 break 592 593 # append mock end marker to qresult_raw, since it's not always present 594 return qresult_raw + b">>><<<\n" 595 596 597# if not used as a module, run the doctest 598if __name__ == "__main__": 599 from Bio._utils import run_doctest 600 601 run_doctest() 602