1# Copyright 2012 by Wibowo Arindrarto. All rights reserved. 2# 3# This file is part of the Biopython distribution and governed by your 4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 5# Please see the LICENSE file that should have been included as part of this 6# package. 7 8"""Bio.SearchIO parser for BLAST+ plain text output formats. 9 10At the moment this is a wrapper around Biopython's NCBIStandalone text 11parser (which is now deprecated). 12 13""" 14 15from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 16from Bio.SearchIO._legacy import NCBIStandalone 17 18 19__all__ = ("BlastTextParser",) 20 21 22class BlastTextParser: 23 """Parser for the BLAST text format.""" 24 25 def __init__(self, handle): 26 """Initialize the class.""" 27 self.handle = handle 28 blast_parser = NCBIStandalone.BlastParser() 29 self.blast_iter = NCBIStandalone.Iterator(handle, blast_parser) 30 31 def __iter__(self): 32 """Iterate over BlastTextParser, yields query results.""" 33 for rec in self.blast_iter: 34 # set attributes to SearchIO's 35 # get id and desc 36 if rec.query.startswith(">"): 37 rec.query = rec.query[1:] 38 try: 39 qid, qdesc = rec.query.split(" ", 1) 40 except ValueError: 41 qid, qdesc = rec.query, "" 42 qdesc = qdesc.replace("\n", "").replace("\r", "") 43 44 qresult = QueryResult(id=qid) 45 qresult.program = rec.application.lower() 46 qresult.target = rec.database 47 qresult.seq_len = rec.query_letters 48 qresult.version = rec.version 49 50 # determine molecule_type based on program 51 if qresult.program == "blastn": 52 molecule_type = "DNA" 53 elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]: 54 molecule_type = "protein" 55 56 # iterate over the 'alignments' (hits) and the hit table 57 for idx, aln in enumerate(rec.alignments): 58 # get id and desc 59 if aln.title.startswith("> "): 60 aln.title = aln.title[2:] 61 elif aln.title.startswith(">"): 62 aln.title = aln.title[1:] 63 try: 64 hid, hdesc = aln.title.split(" ", 1) 65 except ValueError: 66 hid, hdesc = aln.title, "" 67 hdesc = hdesc.replace("\n", "").replace("\r", "") 68 69 # iterate over the hsps and group them in a list 70 hsp_list = [] 71 for bhsp in aln.hsps: 72 frag = HSPFragment(hid, qid) 73 frag.molecule_type = molecule_type 74 # set alignment length 75 frag.aln_span = bhsp.identities[1] 76 # set frames 77 try: 78 frag.query_frame = int(bhsp.frame[0]) 79 except IndexError: 80 if qresult.program in ("blastp", "tblastn"): 81 frag.query_frame = 0 82 else: 83 frag.query_frame = 1 84 try: 85 frag.hit_frame = int(bhsp.frame[1]) 86 except IndexError: 87 if qresult.program in ("blastp", "tblastn"): 88 frag.hit_frame = 0 89 else: 90 frag.hit_frame = 1 91 # set query coordinates 92 frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1 93 frag.query_end = max(bhsp.query_start, bhsp.query_end) 94 # set hit coordinates 95 frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1 96 frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end) 97 # set query, hit sequences and its annotation 98 qseq = "" 99 hseq = "" 100 midline = "" 101 for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match): 102 qchar, hchar, mchar = seqtrio 103 if qchar == " " or hchar == " ": 104 assert all(" " == x for x in seqtrio) 105 else: 106 qseq += qchar 107 hseq += hchar 108 midline += mchar 109 frag.query, frag.hit = qseq, hseq 110 frag.aln_annotation["similarity"] = midline 111 112 # create HSP object with the fragment 113 hsp = HSP([frag]) 114 hsp.evalue = bhsp.expect 115 hsp.bitscore = bhsp.bits 116 hsp.bitscore_raw = bhsp.score 117 # set gap 118 try: 119 hsp.gap_num = bhsp.gaps[0] 120 except IndexError: 121 hsp.gap_num = 0 122 # set identity 123 hsp.ident_num = bhsp.identities[0] 124 hsp.pos_num = bhsp.positives[0] 125 if hsp.pos_num is None: 126 hsp.pos_num = hsp[0].aln_span 127 128 hsp_list.append(hsp) 129 130 hit = Hit(hsp_list) 131 hit.seq_len = aln.length 132 hit.description = hdesc 133 qresult.append(hit) 134 135 qresult.description = qdesc 136 yield qresult 137 138 139# if not used as a module, run the doctest 140if __name__ == "__main__": 141 from Bio._utils import run_doctest 142 143 run_doctest() 144