1 /* $Id: blast_services.cpp 498594 2016-04-18 14:09:11Z camacho $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho, Kevin Bealer
27 *
28 * ===========================================================================
29 */
30
31 /// @file blast_services.cpp
32 /// Implementation of CBlastServices class
33
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbi_system.hpp>
36 #include <serial/iterator.hpp>
37 #include <objects/seq/Seq_data.hpp>
38 #include <objects/seq/Bioseq.hpp>
39 #include <objects/seq/Seq_inst.hpp>
40 #include <objects/seq/Delta_seq.hpp>
41 #include <objects/seq/Delta_ext.hpp>
42 #include <objects/seq/Seq_ext.hpp>
43 #include <objects/seq/Seq_literal.hpp>
44 #include <objects/seqloc/Seq_id.hpp>
45 #include <objtools/blast/services/blast_services.hpp>
46 #include <objects/blast/blastclient.hpp>
47 #include <util/util_exception.hpp>
48
49 /** @addtogroup AlgoBlast
50 *
51 * @{
52 */
53
54 BEGIN_NCBI_SCOPE
55 USING_SCOPE(objects);
56
57 #define NCBI_MODULE NETBLAST
58
59 /// Process error messages from a reply object.
60 ///
61 /// Every reply object from blast4 has a space for error and warning
62 /// messages. This function extracts such messages and returns them
63 /// to the user in two strings. All warnings are returned in one
64 /// string, concatenated together with a newline as the delimiter, and
65 /// all error messages are concatenated together in another string in
66 /// the same way. If there are no warnings or errors, the resulting
67 /// strings will be empty.
68 ///
69 /// @param reply The reply object from blast4.
70 /// @param errors Concatenated error messages (if any).
71 /// @param warnings Concatenated warning messages (if any).
72 static void
s_ProcessErrorsFromReply(CRef<objects::CBlast4_reply> reply,string & errors,string & warnings)73 s_ProcessErrorsFromReply(CRef<objects::CBlast4_reply> reply,
74 string& errors,
75 string& warnings)
76 {
77 static const string no_msg("<no message>");
78
79 if (reply->CanGetErrors() && (! reply->GetErrors().empty())) {
80 ITERATE(list< CRef< CBlast4_error > >, iter, reply->GetErrors()) {
81
82 // Determine the message source and destination.
83
84 const string & message((*iter)->CanGetMessage()
85 ? (*iter)->GetMessage()
86 : no_msg);
87
88 string & dest
89 (((*iter)->GetCode() & eBlast4_error_flags_warning)
90 ? warnings
91 : errors);
92
93 // Attach the message (and possibly delimiter) to dest.
94
95 if (! dest.empty()) {
96 dest += "\n";
97 }
98
99 dest += message;
100 }
101 }
102 }
103
104 /// Get bioseqs from a sequence fetching reply.
105 ///
106 /// This method reads the reply from a sequence fetching request
107 /// and extracts the bioseqs, errors and warnings from it.
108 ///
109 /// @param reply
110 /// The reply from a sequence fetching request.
111 /// @param bioseqs
112 /// The returned list of bioseqs from the request.
113 /// @param errors
114 /// Returned string containing any errors encountered.
115 /// @param warnings
116 /// Returned string containing any warnigns encountered.
117 static void
s_GetSeqsFromReply(CRef<objects::CBlast4_reply> reply,CBlastServices::TBioseqVector & bioseqs,string & errors,string & warnings)118 s_GetSeqsFromReply(CRef<objects::CBlast4_reply> reply,
119 CBlastServices::TBioseqVector & bioseqs, // out
120 string & errors, // out
121 string & warnings) // out
122 {
123 // Read the data from the reply into the output arguments.
124
125 bioseqs.clear();
126
127 s_ProcessErrorsFromReply(reply, errors, warnings);
128
129 if (reply->CanGetBody() && reply->GetBody().IsGet_sequences()) {
130 list< CRef<CBioseq> > & bslist =
131 reply->SetBody().SetGet_sequences().Set();
132
133 bioseqs.reserve(bslist.size());
134
135 ITERATE(list< CRef<CBioseq> >, iter, bslist) {
136 bioseqs.push_back(*iter);
137 }
138 }
139 }
140
141 static EBlast4_residue_type
s_SeqTypeToResidue(char p,string & errors)142 s_SeqTypeToResidue(char p, string & errors)
143 {
144 EBlast4_residue_type retval = eBlast4_residue_type_unknown;
145
146 switch(p) {
147 case 'p':
148 retval = eBlast4_residue_type_protein;
149 break;
150
151 case 'n':
152 retval = eBlast4_residue_type_nucleotide;
153 break;
154
155 default:
156 errors = "Error: invalid residue type specified.";
157 }
158
159 return retval;
160 }
161
162 /// Build Sequence Fetching Request
163 ///
164 /// This method builds a blast4 request designed to fetch a list
165 /// of bioseqs from the blast4 server.
166 ///
167 /// @param seqids
168 /// The seqids of the sequences to fetch.
169 /// @param database
170 /// The database or databases containing the desired sequences.
171 /// @param seqtype
172 /// Either 'p' or 'n' for protein or nucleotide.
173 /// @param errors
174 /// Returned string containing any errors encountered.
175 /// @param skip_seq_data
176 /// If true, the sequence data will NOT be fetched
177 /// @param target_only
178 /// If true, only requested seq_id will be returned
179 /// @return
180 /// The blast4 sequence fetching request object.
181 static CRef<objects::CBlast4_request>
s_BuildGetSeqRequest(CBlastServices::TSeqIdVector & seqids,const string & database,char seqtype,bool skip_seq_data,bool target_only,string & errors)182 s_BuildGetSeqRequest(CBlastServices::TSeqIdVector& seqids, // in
183 const string& database, // in
184 char seqtype, // 'p' or 'n'
185 bool skip_seq_data, // in
186 bool target_only, // in
187 string& errors) // out
188 {
189 // This will be returned in an Empty() state if an error occurs.
190 CRef<CBlast4_request> request;
191
192 EBlast4_residue_type rtype = s_SeqTypeToResidue(seqtype, errors);
193
194 if (database.empty()) {
195 errors = "Error: database name may not be blank.";
196 return request;
197 }
198
199 if (seqids.empty()) {
200 errors = "Error: no sequences requested.";
201 return request;
202 }
203
204 // Build ASN.1 request objects and link them together.
205
206 request.Reset(new CBlast4_request);
207
208 CRef<CBlast4_request_body> body(new CBlast4_request_body);
209 CRef<CBlast4_database> db (new CBlast4_database);
210
211 request->SetBody(*body);
212 body->SetGet_sequences().SetDatabase(*db);
213 body->SetGet_sequences().SetSkip_seq_data(skip_seq_data);
214 body->SetGet_sequences().SetTarget_only(target_only);
215
216 // Fill in db values
217
218 db->SetName(database);
219 db->SetType(rtype);
220
221 // Link in the list of requests.
222
223 list< CRef< CSeq_id > > & seqid_list =
224 body->SetGet_sequences().SetSeq_ids();
225
226 ITERATE(CBlastServices::TSeqIdVector, iter, seqids) {
227 seqid_list.push_back(*iter);
228 }
229
230 return request;
231 }
232
233 /// Main function to issue a Blast4-get-sequences-request and collect its
234 /// results from the remote BLAST server.
235 ///
236 /// @param seqids
237 /// The seqids of the sequences to fetch. [in]
238 /// @param database
239 /// The database or databases containing the desired sequences. [in]
240 /// @param seqtype
241 /// Either 'p' or 'n' for protein or nucleotide. [in]
242 /// @param bioseqs
243 /// The vector used to return the requested Bioseqs. [out]
244 /// @param errors
245 /// Returned string containing any errors encountered. [out]
246 /// @param warnings
247 /// A null-separated list of warning. [out]
248 /// @param skip_seq_data
249 /// If true, the sequence data will NOT be fetched [in]
250 /// @param verbose Produce verbose output. [in]
251 static void
s_GetSequences(CBlastServices::TSeqIdVector & seqids,const string & database,char seqtype,bool skip_seq_data,bool target_only,CBlastServices::TBioseqVector & bioseqs,string & errors,string & warnings,bool verbose)252 s_GetSequences(CBlastServices::TSeqIdVector & seqids,
253 const string & database,
254 char seqtype,
255 bool skip_seq_data,
256 bool target_only,
257 CBlastServices::TBioseqVector& bioseqs,
258 string & errors,
259 string & warnings,
260 bool verbose)
261 {
262 // Build the request
263
264 CRef<CBlast4_request> request =
265 s_BuildGetSeqRequest(seqids, database, seqtype, skip_seq_data, target_only, errors);
266
267 if (request.Empty()) {
268 return;
269 }
270 if (verbose) {
271 NcbiCout << MSerial_AsnText << *request << endl;
272 }
273
274 CRef<CBlast4_reply> reply(new CBlast4_reply);
275
276 try {
277 // Send request
278 CBlast4Client().Ask(*request, *reply);
279 }
280 catch(const CEofException &) {
281 NCBI_THROW(CBlastServicesException, eRequestErr,
282 "No response from server, cannot complete request.");
283 }
284
285 if (verbose) {
286 NcbiCout << MSerial_AsnText << *reply << endl;
287 }
288 s_GetSeqsFromReply(reply, bioseqs, errors, warnings);
289 }
290
291 /// Build Sequence Parts Fetching Request
292 ///
293 /// This method builds a blast4 request designed to fetch sequence
294 /// data
295 ///
296 /// @param seqids
297 /// The seqids and ranges of the sequences to fetch.
298 /// @param database
299 /// The database or databases containing the desired sequences.
300 /// @param seqtype
301 /// Either 'p' or 'n' for protein or nucleotide.
302 /// @param errors
303 /// Returned string containing any errors encountered.
304 /// @return
305 /// The blast4 sequence fetching request object.
306 static CRef<objects::CBlast4_request>
s_BuildGetSeqPartsRequest(const CBlastServices::TSeqIntervalVector & seqids,const string & database,char seqtype,string & errors)307 s_BuildGetSeqPartsRequest(const CBlastServices::TSeqIntervalVector & seqids, // in
308 const string & database, // in
309 char seqtype, // 'p' or 'n'
310 string & errors) // out
311 {
312 errors.erase();
313
314 // This will be returned in an Empty() state if an error occurs.
315 CRef<CBlast4_request> request;
316
317 EBlast4_residue_type rtype = s_SeqTypeToResidue(seqtype, errors);
318
319 if (errors.size()) {
320 return request;
321 }
322
323 if (database.empty()) {
324 errors = "Error: database name may not be blank.";
325 return request;
326 }
327 if (seqids.empty()) {
328 errors = "Error: no sequences requested.";
329 return request;
330 }
331
332 // Build ASN.1 request objects and link them together.
333
334 request.Reset(new CBlast4_request);
335
336 CRef<CBlast4_request_body> body(new CBlast4_request_body);
337 CRef<CBlast4_database> db (new CBlast4_database);
338
339 request->SetBody(*body);
340
341 CBlast4_get_seq_parts_request & req =
342 body->SetGet_sequence_parts();
343 copy(seqids.begin(), seqids.end(), back_inserter(req.SetSeq_locations()));
344
345 req.SetDatabase(*db);
346
347 // Fill in db values
348 db->SetName(database);
349 db->SetType(rtype);
350 return request;
351 }
352
353
354 bool
IsValidBlastDb(const string & dbname,bool is_protein)355 CBlastServices::IsValidBlastDb(const string& dbname, bool is_protein)
356 {
357 if (dbname.empty())
358 return false;
359
360 bool found_all = false;
361 vector< CRef<objects::CBlast4_database_info> > result =
362 GetDatabaseInfo(dbname, is_protein, &found_all);
363
364 if (found_all && !result.empty())
365 return true;
366 else
367 return false;
368 }
369
370 CRef<objects::CBlast4_database_info>
x_FindDbInfoFromAvailableDatabases(CRef<objects::CBlast4_database> blastdb)371 CBlastServices::x_FindDbInfoFromAvailableDatabases
372 (CRef<objects::CBlast4_database> blastdb)
373 {
374 _ASSERT(blastdb.NotEmpty());
375
376 CRef<CBlast4_database_info> retval;
377
378 ITERATE(CBlast4_get_databases_reply::Tdata, dbinfo, m_AvailableDatabases) {
379 if ((*dbinfo)->GetDatabase() == *blastdb) {
380 retval = *dbinfo;
381 break;
382 }
383 }
384
385 return retval;
386 }
387
388 vector< CRef<objects::CBlast4_database_info> >
GetOrganismSpecificRepeatsDatabases()389 CBlastServices::GetOrganismSpecificRepeatsDatabases()
390 {
391 if (m_AvailableDatabases.empty()) {
392 x_GetAvailableDatabases();
393 }
394 vector< CRef<objects::CBlast4_database_info> > retval;
395
396 ITERATE(CBlast4_get_databases_reply::Tdata, dbinfo, m_AvailableDatabases) {
397 if ((*dbinfo)->GetDatabase().GetName().find("repeat_") != NPOS) {
398 retval.push_back(*dbinfo);
399 }
400 }
401
402 return retval;
403 }
404
405 void
x_GetAvailableDatabases()406 CBlastServices::x_GetAvailableDatabases()
407 {
408 CBlast4Client client;
409 CRef<CBlast4_get_databases_reply> databases;
410 try {
411 databases = client.AskGet_databases();
412 m_AvailableDatabases = databases->Set();
413 }
414 catch (const CEofException &) {
415 NCBI_THROW(CBlastServicesException, eRequestErr,
416 "No response from server, cannot complete request.");
417 }
418 }
419
420
421 CRef<objects::CBlast4_database_info>
GetDatabaseInfo(CRef<objects::CBlast4_database> blastdb)422 CBlastServices::GetDatabaseInfo(CRef<objects::CBlast4_database> blastdb)
423 {
424 if (blastdb.Empty()) {
425 NCBI_THROW(CBlastServicesException, eArgErr,
426 "NULL argument specified: blast database description");
427 }
428
429 if (m_AvailableDatabases.empty()) {
430 x_GetAvailableDatabases();
431 }
432
433 return x_FindDbInfoFromAvailableDatabases(blastdb);
434 }
435
436
437 vector< CRef<objects::CBlast4_database_info> >
GetDatabaseInfoLegacy(const string & dbname,bool is_protein,bool * found_all,vector<string> * missing_names)438 CBlastServices::GetDatabaseInfoLegacy(const string& dbname, bool is_protein,
439 bool *found_all,
440 vector<string> *missing_names)
441 {
442 vector<CRef<objects::CBlast4_database_info> > retval;
443 vector<string> dbs;
444 NStr::Split(dbname, " \n\t", dbs);
445
446 if (dbs.empty())
447 *found_all = false; // Loop did not run.
448 else
449 *found_all = true; // Set to false if one missing
450
451 ITERATE(vector<string>, i, dbs) {
452 const string kDbName = NStr::TruncateSpaces(*i);
453 if (kDbName.empty())
454 continue;
455
456 CRef<CBlast4_database> blastdb(new CBlast4_database);
457 blastdb->SetName(kDbName);
458 blastdb->SetType(is_protein
459 ? eBlast4_residue_type_protein
460 : eBlast4_residue_type_nucleotide);
461 CRef<CBlast4_database_info> result = GetDatabaseInfo(blastdb);
462 if (result){
463 retval.push_back(result);
464 }
465 else{
466 *found_all = false;
467 if( missing_names ) missing_names->push_back( blastdb->GetName() );
468 }
469 }
470 return retval;
471 }
472
473 vector< CRef<objects::CBlast4_database_info> >
GetDatabaseInfo(const string & dbname,bool is_protein,bool * found_all,vector<string> * missing_names)474 CBlastServices::GetDatabaseInfo(const string& dbname, bool is_protein,
475 bool *found_all,
476 vector<string> *missing_names)
477 {
478 vector<CRef<objects::CBlast4_database_info> > retval;
479 CRef<CBlast4_request> request;
480 CRef<CBlast4_reply> reply(new CBlast4_reply);
481 vector<string> all_db_names;
482 vector<string>::iterator it_db;
483 bool l_multiple_db = false;
484 string local_db_name = NStr::TruncateSpaces( dbname );
485
486 if( found_all ){
487 *found_all = false;
488 }
489 NStr::Split(local_db_name, " \n\t", all_db_names);
490 l_multiple_db = ( all_db_names.size() > 1 );
491
492 request.Reset(new CBlast4_request);
493 CRef<CBlast4_request_body> body(new CBlast4_request_body);
494 CRef<CBlast4_get_databases_ex_request> db_ex_req(new CBlast4_get_databases_ex_request);
495
496 body->SetGet_databases_ex( *db_ex_req );
497 request->SetBody(*body);
498
499 db_ex_req->SetParams().Add("FILTER_TYPE",string("EXACT"));
500 db_ex_req->SetParams().Add("DBNAME",dbname);
501 if( is_protein )
502 db_ex_req->SetParams().Add("DBTYPE",string("prot"));
503 else
504 db_ex_req->SetParams().Add("DBTYPE",string("nucl"));
505
506
507 try {
508 CBlast4Client().Ask(*request, *reply);
509 }
510 catch(const CEofException &) {
511 NCBI_THROW(CBlastServicesException, eRequestErr,
512 "No response from server, cannot complete request.");
513 }
514 // if no answer, call legacy method
515 if( reply->GetBody().GetGet_databases_ex().Get().empty() ){
516 return GetDatabaseInfoLegacy(dbname,is_protein,found_all,missing_names);
517 }
518
519 if( !reply->CanGetBody() || !reply->GetBody().IsGet_databases_ex() ) {
520 if(found_all ) *found_all = false;
521 NCBI_THROW(CBlastServicesException, eRequestErr,
522 "Unexpected response from server, cannot complete request. (GetDatabaseInfoEx)");
523 }
524
525 list< CRef< CBlast4_database_info > >::const_iterator it;
526 it = reply->GetBody().GetGet_databases_ex().Get().begin();
527 for( ; it != reply->GetBody().GetGet_databases_ex().Get().end(); it++){
528 retval.push_back( *it );
529 if( found_all ) {
530 if( l_multiple_db ) {
531 string current_dbname = (*(*it)).GetDatabase().GetName();
532 it_db = find(all_db_names.begin(),all_db_names.end(),current_dbname);
533 if( it_db != all_db_names.end() ){
534 all_db_names.erase( it_db);
535 }
536 }
537 else{
538 //single db lookup
539 *found_all = true;
540 all_db_names.clear();
541 }
542 }
543 }
544
545 if( found_all ){
546 if( all_db_names.empty() ) {
547 *found_all = true; // all resolved
548 }
549 else{
550 if( missing_names ) missing_names->assign(all_db_names.begin(),all_db_names.end());
551 }
552 }
553 return retval;
554 }
555
556 void
GetSequencesInfo(TSeqIdVector & seqids,const string & database,char seqtype,TBioseqVector & bioseqs,string & errors,string & warnings,bool verbose,bool target_only)557 CBlastServices::GetSequencesInfo(TSeqIdVector & seqids, // in
558 const string & database, // in
559 char seqtype, // 'p' or 'n'
560 TBioseqVector& bioseqs, // out
561 string & errors, // out
562 string & warnings, // out
563 bool verbose, // in
564 bool target_only) // in
565 {
566 s_GetSequences(seqids, database, seqtype, true, target_only, bioseqs,
567 errors, warnings, verbose);
568 }
569
570 void
GetSequences(TSeqIdVector & seqids,const string & database,char seqtype,TBioseqVector & bioseqs,string & errors,string & warnings,bool verbose,bool target_only)571 CBlastServices::GetSequences(TSeqIdVector & seqids, // in
572 const string & database, // in
573 char seqtype, // 'p' or 'n'
574 TBioseqVector& bioseqs, // out
575 string & errors, // out
576 string & warnings, // out
577 bool verbose, // in
578 bool target_only) // in
579 {
580 s_GetSequences(seqids, database, seqtype, false, target_only, bioseqs,
581 errors, warnings, verbose);
582 }
583
584
585 /// Extract information from the get-seq-parts reply object.
586 /// @param reply The reply object from blast4.
587 /// @param ids All Seq-ids for the requested sequences.
588 /// @param seq_data Seq_data for the sequences in question.
589 /// @param errors Any error messages found in the reply.
590 /// @param warnings Any warnings found in the reply.
591 static void
s_GetPartsFromReply(CRef<objects::CBlast4_reply> reply,CBlastServices::TSeqIdVector & ids,CBlastServices::TSeqDataVector & seq_data,string & errors,string & warnings)592 s_GetPartsFromReply(CRef<objects::CBlast4_reply> reply, // in
593 CBlastServices::TSeqIdVector & ids, // out
594 CBlastServices::TSeqDataVector & seq_data, // out
595 string & errors, // out
596 string & warnings) // out
597 {
598 seq_data.clear();
599 ids.clear();
600
601 s_ProcessErrorsFromReply(reply, errors, warnings);
602
603 if (reply->CanGetBody() && reply->GetBody().IsGet_sequence_parts()) {
604 CBlast4_get_seq_parts_reply::Tdata& parts_rep =
605 reply->SetBody().SetGet_sequence_parts().Set();
606 ids.reserve(parts_rep.size());
607 seq_data.reserve(parts_rep.size());
608
609 NON_CONST_ITERATE(CBlast4_get_seq_parts_reply::Tdata, itr, parts_rep) {
610 ids.push_back(CRef<CSeq_id>(&(*itr)->SetId()));
611 seq_data.push_back(CRef<CSeq_data>(&(*itr)->SetData()));
612 }
613 }
614 }
615
616 void CBlastServices::
GetSequenceParts(const TSeqIntervalVector & seqids,const string & database,char seqtype,TSeqIdVector & ids,TSeqDataVector & seq_data,string & errors,string & warnings,bool verbose)617 GetSequenceParts(const TSeqIntervalVector & seqids, // in
618 const string & database, // in
619 char seqtype, // 'p' or 'n'
620 TSeqIdVector & ids, // out
621 TSeqDataVector & seq_data, // out
622 string & errors, // out
623 string & warnings, // out
624 bool verbose) // in
625 {
626 // Build the request
627
628 CRef<CBlast4_request> request =
629 s_BuildGetSeqPartsRequest(seqids, database, seqtype, errors);
630
631 if (request.Empty()) {
632 return;
633 }
634 if (verbose) {
635 NcbiCout << MSerial_AsnText << *request << endl;
636 }
637
638 CRef<CBlast4_reply> reply(new CBlast4_reply);
639
640 try {
641 // Send request.
642 CBlast4Client().Ask(*request, *reply);
643 }
644 catch(const CEofException &) {
645 NCBI_THROW(CBlastServicesException, eRequestErr,
646 "No response from server, cannot complete request.");
647 }
648
649 if (verbose) {
650 NcbiCout << MSerial_AsnText << *reply << endl;
651 }
652 s_GetPartsFromReply(reply, ids, seq_data, errors, warnings);
653 }
654
655 objects::CBlast4_get_windowmasked_taxids_reply::Tdata
GetTaxIdWithWindowMaskerSupport()656 CBlastServices::GetTaxIdWithWindowMaskerSupport()
657 {
658 if (m_WindowMaskedTaxIds.empty()) {
659 CBlast4Client client;
660 CRef<CBlast4_get_windowmasked_taxids_reply> reply;
661 try {
662 reply = client.AskGet_windowmasked_taxids();
663 if (m_Verbose) {
664 NcbiCout << MSerial_AsnText << *reply << endl;
665 }
666 m_WindowMaskedTaxIds = reply->Set();
667 }
668 catch (const CEofException &) {
669 NCBI_THROW(CBlastServicesException, eRequestErr,
670 "No response from server, cannot complete request.");
671 }
672 }
673 return m_WindowMaskedTaxIds;
674 }
675
676 #undef NCBI_MODULE
677
678 END_NCBI_SCOPE
679
680 /* @} */
681
682