1# Copyright 1999 by Jeffrey Chang.  All rights reserved.
2#
3# This file is part of the Biopython distribution and governed by your
4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5# Please see the LICENSE file that should have been included as part of this
6# package.
7#
8# Patched by Brad Chapman.
9# Chris Wroe added modifications for work in myGrid
10
11"""Code to invoke the NCBI BLAST server over the internet.
12
13This module provides code to work with the WWW version of BLAST
14provided by the NCBI. https://blast.ncbi.nlm.nih.gov/
15"""
16
17
18import warnings
19
20from io import StringIO
21import time
22
23from urllib.request import urlopen
24from urllib.parse import urlencode
25from urllib.request import Request
26
27from Bio import BiopythonWarning
28
29
30NCBI_BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
31
32
33def qblast(
34    program,
35    database,
36    sequence,
37    url_base=NCBI_BLAST_URL,
38    auto_format=None,
39    composition_based_statistics=None,
40    db_genetic_code=None,
41    endpoints=None,
42    entrez_query="(none)",
43    expect=10.0,
44    filter=None,
45    gapcosts=None,
46    genetic_code=None,
47    hitlist_size=50,
48    i_thresh=None,
49    layout=None,
50    lcase_mask=None,
51    matrix_name=None,
52    nucl_penalty=None,
53    nucl_reward=None,
54    other_advanced=None,
55    perc_ident=None,
56    phi_pattern=None,
57    query_file=None,
58    query_believe_defline=None,
59    query_from=None,
60    query_to=None,
61    searchsp_eff=None,
62    service=None,
63    threshold=None,
64    ungapped_alignment=None,
65    word_size=None,
66    short_query=None,
67    alignments=500,
68    alignment_view=None,
69    descriptions=500,
70    entrez_links_new_window=None,
71    expect_low=None,
72    expect_high=None,
73    format_entrez_query=None,
74    format_object=None,
75    format_type="XML",
76    ncbi_gi=None,
77    results_file=None,
78    show_overview=None,
79    megablast=None,
80    template_type=None,
81    template_length=None,
82):
83    """BLAST search using NCBI's QBLAST server or a cloud service provider.
84
85    Supports all parameters of the old qblast API for Put and Get.
86
87    Please note that NCBI uses the new Common URL API for BLAST searches
88    on the internet (http://ncbi.github.io/blast-cloud/dev/api.html). Thus,
89    some of the parameters used by this function are not (or are no longer)
90    officially supported by NCBI. Although they are still functioning, this
91    may change in the future.
92
93    The Common URL API (http://ncbi.github.io/blast-cloud/dev/api.html) allows
94    doing BLAST searches on cloud servers. To use this feature, please set
95    ``url_base='http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi'``
96    and ``format_object='Alignment'``. For more details, please see
97    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast
98
99    Some useful parameters:
100
101     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
102     - database       Which database to search against (e.g. "nr").
103     - sequence       The sequence to search.
104     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
105     - descriptions   Number of descriptions to show.  Def 500.
106     - alignments     Number of alignments to show.  Def 500.
107     - expect         An expect value cutoff.  Def 10.0.
108     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
109     - filter         "none" turns off filtering.  Default no filtering
110     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
111     - entrez_query   Entrez query to limit Blast search
112     - hitlist_size   Number of hits to return. Default 50
113     - megablast      TRUE/FALSE whether to use MEga BLAST algorithm (blastn only)
114     - short_query    TRUE/FALSE whether to adjust the search parameters for a
115                      short query sequence. Note that this will override
116                      manually set parameters like word size and e value. Turns
117                      off when sequence length is > 30 residues. Default: None.
118     - service        plain, psi, phi, rpsblast, megablast (lower case)
119
120    This function does no checking of the validity of the parameters
121    and passes the values to the server as is.  More help is available at:
122    https://ncbi.github.io/blast-cloud/dev/api.html
123
124    """
125    programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
126    if program not in programs:
127        raise ValueError(
128            "Program specified is %s. Expected one of %s"
129            % (program, ", ".join(programs))
130        )
131
132    # SHORT_QUERY_ADJUST throws an error when using blastn (wrong parameter
133    # assignment from NCBIs side).
134    # Thus we set the (known) parameters directly:
135    if short_query and program == "blastn":
136        short_query = None
137        # We only use the 'short-query' parameters for short sequences:
138        if len(sequence) < 31:
139            expect = 1000
140            word_size = 7
141            nucl_reward = 1
142            filter = None
143            lcase_mask = None
144            warnings.warn(
145                '"SHORT_QUERY_ADJUST" is incorrectly implemented (by NCBI) for blastn.'
146                " We bypass the problem by manually adjusting the search parameters."
147                " Thus, results may slightly differ from web page searches.",
148                BiopythonWarning,
149            )
150
151    # Format the "Put" command, which sends search requests to qblast.
152    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
153    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
154    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
155    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
156    parameters = [
157        ("AUTO_FORMAT", auto_format),
158        ("COMPOSITION_BASED_STATISTICS", composition_based_statistics),
159        ("DATABASE", database),
160        ("DB_GENETIC_CODE", db_genetic_code),
161        ("ENDPOINTS", endpoints),
162        ("ENTREZ_QUERY", entrez_query),
163        ("EXPECT", expect),
164        ("FILTER", filter),
165        ("GAPCOSTS", gapcosts),
166        ("GENETIC_CODE", genetic_code),
167        ("HITLIST_SIZE", hitlist_size),
168        ("I_THRESH", i_thresh),
169        ("LAYOUT", layout),
170        ("LCASE_MASK", lcase_mask),
171        ("MEGABLAST", megablast),
172        ("MATRIX_NAME", matrix_name),
173        ("NUCL_PENALTY", nucl_penalty),
174        ("NUCL_REWARD", nucl_reward),
175        ("OTHER_ADVANCED", other_advanced),
176        ("PERC_IDENT", perc_ident),
177        ("PHI_PATTERN", phi_pattern),
178        ("PROGRAM", program),
179        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
180        ("QUERY", sequence),
181        ("QUERY_FILE", query_file),
182        ("QUERY_BELIEVE_DEFLINE", query_believe_defline),
183        ("QUERY_FROM", query_from),
184        ("QUERY_TO", query_to),
185        # ('RESULTS_FILE',...), - Can we use this parameter?
186        ("SEARCHSP_EFF", searchsp_eff),
187        ("SERVICE", service),
188        ("SHORT_QUERY_ADJUST", short_query),
189        ("TEMPLATE_TYPE", template_type),
190        ("TEMPLATE_LENGTH", template_length),
191        ("THRESHOLD", threshold),
192        ("UNGAPPED_ALIGNMENT", ungapped_alignment),
193        ("WORD_SIZE", word_size),
194        ("CMD", "Put"),
195    ]
196    query = [x for x in parameters if x[1] is not None]
197    message = urlencode(query).encode()
198
199    # Send off the initial query to qblast.
200    # Note the NCBI do not currently impose a rate limit here, other
201    # than the request not to make say 50 queries at once using multiple
202    # threads.
203    request = Request(url_base, message, {"User-Agent": "BiopythonClient"})
204    handle = urlopen(request)
205
206    # Format the "Get" command, which gets the formatted results from qblast
207    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
208    rid, rtoe = _parse_qblast_ref_page(handle)
209    parameters = [
210        ("ALIGNMENTS", alignments),
211        ("ALIGNMENT_VIEW", alignment_view),
212        ("DESCRIPTIONS", descriptions),
213        ("ENTREZ_LINKS_NEW_WINDOW", entrez_links_new_window),
214        ("EXPECT_LOW", expect_low),
215        ("EXPECT_HIGH", expect_high),
216        ("FORMAT_ENTREZ_QUERY", format_entrez_query),
217        ("FORMAT_OBJECT", format_object),
218        ("FORMAT_TYPE", format_type),
219        ("NCBI_GI", ncbi_gi),
220        ("RID", rid),
221        ("RESULTS_FILE", results_file),
222        ("SERVICE", service),
223        ("SHOW_OVERVIEW", show_overview),
224        ("CMD", "Get"),
225    ]
226    query = [x for x in parameters if x[1] is not None]
227    message = urlencode(query).encode()
228
229    # Poll NCBI until the results are ready.
230    # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
231    # 1. Do not contact the server more often than once every 10 seconds.
232    # 2. Do not poll for any single RID more often than once a minute.
233    # 3. Use the URL parameter email and tool, so that the NCBI
234    #    can contact you if there is a problem.
235    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
236    #    on weekdays if more than 50 searches will be submitted.
237    # --
238    # Could start with a 10s delay, but expect most short queries
239    # will take longer thus at least 70s with delay. Therefore,
240    # start with 20s delay, thereafter once a minute.
241    delay = 20  # seconds
242    while True:
243        current = time.time()
244        wait = qblast._previous + delay - current
245        if wait > 0:
246            time.sleep(wait)
247            qblast._previous = current + wait
248        else:
249            qblast._previous = current
250        # delay by at least 60 seconds only if running the request against the public NCBI API
251        if delay < 60 and url_base == NCBI_BLAST_URL:
252            # Wasn't a quick return, must wait at least a minute
253            delay = 60
254
255        request = Request(url_base, message, {"User-Agent": "BiopythonClient"})
256        handle = urlopen(request)
257        results = handle.read().decode()
258
259        # Can see an "\n\n" page while results are in progress,
260        # if so just wait a bit longer...
261        if results == "\n\n":
262            continue
263        # XML results don't have the Status tag when finished
264        if "Status=" not in results:
265            break
266        i = results.index("Status=")
267        j = results.index("\n", i)
268        status = results[i + len("Status=") : j].strip()
269        if status.upper() == "READY":
270            break
271    return StringIO(results)
272
273
274qblast._previous = 0
275
276
277def _parse_qblast_ref_page(handle):
278    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).
279
280    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
281    'Request Time of Execution' and RID would be 'Request Identifier'.
282    """
283    s = handle.read().decode()
284    i = s.find("RID =")
285    if i == -1:
286        rid = None
287    else:
288        j = s.find("\n", i)
289        rid = s[i + len("RID =") : j].strip()
290
291    i = s.find("RTOE =")
292    if i == -1:
293        rtoe = None
294    else:
295        j = s.find("\n", i)
296        rtoe = s[i + len("RTOE =") : j].strip()
297
298    if not rid and not rtoe:
299        # Can we reliably extract the error message from the HTML page?
300        # e.g.  "Message ID#24 Error: Failed to read the Blast query:
301        #       Nucleotide FASTA provided for protein sequence"
302        # or    "Message ID#32 Error: Query contains no data: Query
303        #       contains no sequence data"
304        #
305        # This used to occur inside a <div class="error msInf"> entry:
306        i = s.find('<div class="error msInf">')
307        if i != -1:
308            msg = s[i + len('<div class="error msInf">') :].strip()
309            msg = msg.split("</div>", 1)[0].split("\n", 1)[0].strip()
310            if msg:
311                raise ValueError("Error message from NCBI: %s" % msg)
312        # In spring 2010 the markup was like this:
313        i = s.find('<p class="error">')
314        if i != -1:
315            msg = s[i + len('<p class="error">') :].strip()
316            msg = msg.split("</p>", 1)[0].split("\n", 1)[0].strip()
317            if msg:
318                raise ValueError("Error message from NCBI: %s" % msg)
319        # Generic search based on the way the error messages start:
320        i = s.find("Message ID#")
321        if i != -1:
322            # Break the message at the first HTML tag
323            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
324            raise ValueError("Error message from NCBI: %s" % msg)
325        # We didn't recognise the error layout :(
326        # print(s)
327        raise ValueError(
328            "No RID and no RTOE found in the 'please wait' page, "
329            "there was probably an error in your request but we "
330            "could not extract a helpful error message."
331        )
332    elif not rid:
333        # Can this happen?
334        raise ValueError(
335            "No RID found in the 'please wait' page. (although RTOE = %r)" % rtoe
336        )
337    elif not rtoe:
338        # Can this happen?
339        raise ValueError(
340            "No RTOE found in the 'please wait' page. (although RID = %r)" % rid
341        )
342
343    try:
344        return rid, int(rtoe)
345    except ValueError:
346        raise ValueError(
347            "A non-integer RTOE found in the 'please wait' page, %r" % rtoe
348        ) from None
349