1# Copyright 1999 by Jeffrey Chang. All rights reserved. 2# 3# This file is part of the Biopython distribution and governed by your 4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 5# Please see the LICENSE file that should have been included as part of this 6# package. 7# 8# Patched by Brad Chapman. 9# Chris Wroe added modifications for work in myGrid 10 11"""Code to invoke the NCBI BLAST server over the internet. 12 13This module provides code to work with the WWW version of BLAST 14provided by the NCBI. https://blast.ncbi.nlm.nih.gov/ 15""" 16 17 18import warnings 19 20from io import StringIO 21import time 22 23from urllib.request import urlopen 24from urllib.parse import urlencode 25from urllib.request import Request 26 27from Bio import BiopythonWarning 28 29 30NCBI_BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" 31 32 33def qblast( 34 program, 35 database, 36 sequence, 37 url_base=NCBI_BLAST_URL, 38 auto_format=None, 39 composition_based_statistics=None, 40 db_genetic_code=None, 41 endpoints=None, 42 entrez_query="(none)", 43 expect=10.0, 44 filter=None, 45 gapcosts=None, 46 genetic_code=None, 47 hitlist_size=50, 48 i_thresh=None, 49 layout=None, 50 lcase_mask=None, 51 matrix_name=None, 52 nucl_penalty=None, 53 nucl_reward=None, 54 other_advanced=None, 55 perc_ident=None, 56 phi_pattern=None, 57 query_file=None, 58 query_believe_defline=None, 59 query_from=None, 60 query_to=None, 61 searchsp_eff=None, 62 service=None, 63 threshold=None, 64 ungapped_alignment=None, 65 word_size=None, 66 short_query=None, 67 alignments=500, 68 alignment_view=None, 69 descriptions=500, 70 entrez_links_new_window=None, 71 expect_low=None, 72 expect_high=None, 73 format_entrez_query=None, 74 format_object=None, 75 format_type="XML", 76 ncbi_gi=None, 77 results_file=None, 78 show_overview=None, 79 megablast=None, 80 template_type=None, 81 template_length=None, 82): 83 """BLAST search using NCBI's QBLAST server or a cloud service provider. 84 85 Supports all parameters of the old qblast API for Put and Get. 86 87 Please note that NCBI uses the new Common URL API for BLAST searches 88 on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus, 89 some of the parameters used by this function are not (or are no longer) 90 officially supported by NCBI. Although they are still functioning, this 91 may change in the future. 92 93 The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows 94 doing BLAST searches on cloud servers. To use this feature, please set 95 ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'`` 96 and ``format_object='Alignment'``. For more details, please see 97 https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast 98 99 Some useful parameters: 100 101 - program blastn, blastp, blastx, tblastn, or tblastx (lower case) 102 - database Which database to search against (e.g. "nr"). 103 - sequence The sequence to search. 104 - ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 105 - descriptions Number of descriptions to show. Def 500. 106 - alignments Number of alignments to show. Def 500. 107 - expect An expect value cutoff. Def 10.0. 108 - matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). 109 - filter "none" turns off filtering. Default no filtering 110 - format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". 111 - entrez_query Entrez query to limit Blast search 112 - hitlist_size Number of hits to return. Default 50 113 - megablast TRUE/FALSE whether to use MEga BLAST algorithm (blastn only) 114 - short_query TRUE/FALSE whether to adjust the search parameters for a 115 short query sequence. Note that this will override 116 manually set parameters like word size and e value. Turns 117 off when sequence length is > 30 residues. Default: None. 118 - service plain, psi, phi, rpsblast, megablast (lower case) 119 120 This function does no checking of the validity of the parameters 121 and passes the values to the server as is. More help is available at: 122 https://ncbi.github.io/blast-cloud/dev/api.html 123 124 """ 125 programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"] 126 if program not in programs: 127 raise ValueError( 128 "Program specified is %s. Expected one of %s" 129 % (program, ", ".join(programs)) 130 ) 131 132 # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter 133 # assignment from NCBIs side). 134 # Thus we set the (known) parameters directly: 135 if short_query and program == "blastn": 136 short_query = None 137 # We only use the 'short-query' parameters for short sequences: 138 if len(sequence) < 31: 139 expect = 1000 140 word_size = 7 141 nucl_reward = 1 142 filter = None 143 lcase_mask = None 144 warnings.warn( 145 '"SHORT_QUERY_ADJUST" is incorrectly implemented (by NCBI) for blastn.' 146 " We bypass the problem by manually adjusting the search parameters." 147 " Thus, results may slightly differ from web page searches.", 148 BiopythonWarning, 149 ) 150 151 # Format the "Put" command, which sends search requests to qblast. 152 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007 153 # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010 154 # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified 155 # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi")) 156 parameters = [ 157 ("AUTO_FORMAT", auto_format), 158 ("COMPOSITION_BASED_STATISTICS", composition_based_statistics), 159 ("DATABASE", database), 160 ("DB_GENETIC_CODE", db_genetic_code), 161 ("ENDPOINTS", endpoints), 162 ("ENTREZ_QUERY", entrez_query), 163 ("EXPECT", expect), 164 ("FILTER", filter), 165 ("GAPCOSTS", gapcosts), 166 ("GENETIC_CODE", genetic_code), 167 ("HITLIST_SIZE", hitlist_size), 168 ("I_THRESH", i_thresh), 169 ("LAYOUT", layout), 170 ("LCASE_MASK", lcase_mask), 171 ("MEGABLAST", megablast), 172 ("MATRIX_NAME", matrix_name), 173 ("NUCL_PENALTY", nucl_penalty), 174 ("NUCL_REWARD", nucl_reward), 175 ("OTHER_ADVANCED", other_advanced), 176 ("PERC_IDENT", perc_ident), 177 ("PHI_PATTERN", phi_pattern), 178 ("PROGRAM", program), 179 # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API? 180 ("QUERY", sequence), 181 ("QUERY_FILE", query_file), 182 ("QUERY_BELIEVE_DEFLINE", query_believe_defline), 183 ("QUERY_FROM", query_from), 184 ("QUERY_TO", query_to), 185 # ('RESULTS_FILE',...), - Can we use this parameter? 186 ("SEARCHSP_EFF", searchsp_eff), 187 ("SERVICE", service), 188 ("SHORT_QUERY_ADJUST", short_query), 189 ("TEMPLATE_TYPE", template_type), 190 ("TEMPLATE_LENGTH", template_length), 191 ("THRESHOLD", threshold), 192 ("UNGAPPED_ALIGNMENT", ungapped_alignment), 193 ("WORD_SIZE", word_size), 194 ("CMD", "Put"), 195 ] 196 query = [x for x in parameters if x[1] is not None] 197 message = urlencode(query).encode() 198 199 # Send off the initial query to qblast. 200 # Note the NCBI do not currently impose a rate limit here, other 201 # than the request not to make say 50 queries at once using multiple 202 # threads. 203 request = Request(url_base, message, {"User-Agent": "BiopythonClient"}) 204 handle = urlopen(request) 205 206 # Format the "Get" command, which gets the formatted results from qblast 207 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007 208 rid, rtoe = _parse_qblast_ref_page(handle) 209 parameters = [ 210 ("ALIGNMENTS", alignments), 211 ("ALIGNMENT_VIEW", alignment_view), 212 ("DESCRIPTIONS", descriptions), 213 ("ENTREZ_LINKS_NEW_WINDOW", entrez_links_new_window), 214 ("EXPECT_LOW", expect_low), 215 ("EXPECT_HIGH", expect_high), 216 ("FORMAT_ENTREZ_QUERY", format_entrez_query), 217 ("FORMAT_OBJECT", format_object), 218 ("FORMAT_TYPE", format_type), 219 ("NCBI_GI", ncbi_gi), 220 ("RID", rid), 221 ("RESULTS_FILE", results_file), 222 ("SERVICE", service), 223 ("SHOW_OVERVIEW", show_overview), 224 ("CMD", "Get"), 225 ] 226 query = [x for x in parameters if x[1] is not None] 227 message = urlencode(query).encode() 228 229 # Poll NCBI until the results are ready. 230 # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo 231 # 1. Do not contact the server more often than once every 10 seconds. 232 # 2. Do not poll for any single RID more often than once a minute. 233 # 3. Use the URL parameter email and tool, so that the NCBI 234 # can contact you if there is a problem. 235 # 4. Run scripts weekends or between 9 pm and 5 am Eastern time 236 # on weekdays if more than 50 searches will be submitted. 237 # -- 238 # Could start with a 10s delay, but expect most short queries 239 # will take longer thus at least 70s with delay. Therefore, 240 # start with 20s delay, thereafter once a minute. 241 delay = 20 # seconds 242 while True: 243 current = time.time() 244 wait = qblast._previous + delay - current 245 if wait > 0: 246 time.sleep(wait) 247 qblast._previous = current + wait 248 else: 249 qblast._previous = current 250 # delay by at least 60 seconds only if running the request against the public NCBI API 251 if delay < 60 and url_base == NCBI_BLAST_URL: 252 # Wasn't a quick return, must wait at least a minute 253 delay = 60 254 255 request = Request(url_base, message, {"User-Agent": "BiopythonClient"}) 256 handle = urlopen(request) 257 results = handle.read().decode() 258 259 # Can see an "\n\n" page while results are in progress, 260 # if so just wait a bit longer... 261 if results == "\n\n": 262 continue 263 # XML results don't have the Status tag when finished 264 if "Status=" not in results: 265 break 266 i = results.index("Status=") 267 j = results.index("\n", i) 268 status = results[i + len("Status=") : j].strip() 269 if status.upper() == "READY": 270 break 271 return StringIO(results) 272 273 274qblast._previous = 0 275 276 277def _parse_qblast_ref_page(handle): 278 """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE). 279 280 The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably 281 'Request Time of Execution' and RID would be 'Request Identifier'. 282 """ 283 s = handle.read().decode() 284 i = s.find("RID =") 285 if i == -1: 286 rid = None 287 else: 288 j = s.find("\n", i) 289 rid = s[i + len("RID =") : j].strip() 290 291 i = s.find("RTOE =") 292 if i == -1: 293 rtoe = None 294 else: 295 j = s.find("\n", i) 296 rtoe = s[i + len("RTOE =") : j].strip() 297 298 if not rid and not rtoe: 299 # Can we reliably extract the error message from the HTML page? 300 # e.g. "Message ID#24 Error: Failed to read the Blast query: 301 # Nucleotide FASTA provided for protein sequence" 302 # or "Message ID#32 Error: Query contains no data: Query 303 # contains no sequence data" 304 # 305 # This used to occur inside a <div class="error msInf"> entry: 306 i = s.find('<div class="error msInf">') 307 if i != -1: 308 msg = s[i + len('<div class="error msInf">') :].strip() 309 msg = msg.split("</div>", 1)[0].split("\n", 1)[0].strip() 310 if msg: 311 raise ValueError("Error message from NCBI: %s" % msg) 312 # In spring 2010 the markup was like this: 313 i = s.find('<p class="error">') 314 if i != -1: 315 msg = s[i + len('<p class="error">') :].strip() 316 msg = msg.split("</p>", 1)[0].split("\n", 1)[0].strip() 317 if msg: 318 raise ValueError("Error message from NCBI: %s" % msg) 319 # Generic search based on the way the error messages start: 320 i = s.find("Message ID#") 321 if i != -1: 322 # Break the message at the first HTML tag 323 msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip() 324 raise ValueError("Error message from NCBI: %s" % msg) 325 # We didn't recognise the error layout :( 326 # print(s) 327 raise ValueError( 328 "No RID and no RTOE found in the 'please wait' page, " 329 "there was probably an error in your request but we " 330 "could not extract a helpful error message." 331 ) 332 elif not rid: 333 # Can this happen? 334 raise ValueError( 335 "No RID found in the 'please wait' page. (although RTOE = %r)" % rtoe 336 ) 337 elif not rtoe: 338 # Can this happen? 339 raise ValueError( 340 "No RTOE found in the 'please wait' page. (although RID = %r)" % rid 341 ) 342 343 try: 344 return rid, int(rtoe) 345 except ValueError: 346 raise ValueError( 347 "A non-integer RTOE found in the 'please wait' page, %r" % rtoe 348 ) from None 349