1 /* $Id: blast_app_util.cpp 632181 2021-05-27 13:23:25Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 */
29
30 /** @file blast_app_util.cpp
31 * Utility functions for BLAST command line applications
32 */
33
34 #include <ncbi_pch.hpp>
35 #include "blast_app_util.hpp"
36
37 #include <serial/serial.hpp>
38 #include <serial/objostr.hpp>
39
40 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
41 #include <algo/blast/api/remote_blast.hpp>
42 #include <algo/blast/api/objmgr_query_data.hpp> // for CObjMgr_QueryFactory
43 #include <algo/blast/api/blast_options_builder.hpp>
44 #include <algo/blast/api/search_strategy.hpp>
45 #include <algo/blast/blastinput/blast_input.hpp> // for CInputException
46 #include <algo/blast/blastinput/psiblast_args.hpp>
47 #include <algo/blast/blastinput/tblastn_args.hpp>
48 #include <algo/blast/blastinput/blast_scope_src.hpp>
49 #include <objmgr/util/sequence.hpp>
50
51 #include <objects/scoremat/Pssm.hpp>
52 #include <serial/typeinfo.hpp> // for CTypeInfo, needed by SerialClone
53 #include <objtools/data_loaders/blastdb/bdbloader_rmt.hpp>
54 #include <algo/blast/format/blast_format.hpp>
55 #include <objtools/align_format/format_flags.hpp>
56
57 #if defined(NCBI_OS_LINUX) && HAVE_MALLOC_H
58 #include <malloc.h>
59 #endif
60
61 BEGIN_NCBI_SCOPE
62 USING_SCOPE(objects);
63 USING_SCOPE(blast);
64
GetBatchSize(Int4 hits)65 Int4 CBatchSizeMixer::GetBatchSize(Int4 hits)
66 {
67 if (hits >= 0) {
68 double ratio = 1.0 * (hits+1) / m_BatchSize;
69 m_Ratio = (m_Ratio < 0) ? ratio
70 : k_MixIn * ratio + (1.0 - k_MixIn) * m_Ratio;
71 m_BatchSize = (Int4) (1.0 * m_TargetHits / m_Ratio);
72 }
73 if (m_BatchSize > k_MaxBatchSize) {
74 m_BatchSize = k_MaxBatchSize;
75 m_Ratio = -1.0;
76 } else if (m_BatchSize < k_MinBatchSize) {
77 m_BatchSize = k_MinBatchSize;
78 m_Ratio = -1.0;
79 }
80 return m_BatchSize;
81 }
82
83 CRef<blast::CRemoteBlast>
InitializeRemoteBlast(CRef<blast::IQueryFactory> queries,CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,bool verbose_output,const string & client_id,CRef<objects::CPssmWithParameters> pssm)84 InitializeRemoteBlast(CRef<blast::IQueryFactory> queries,
85 CRef<blast::CBlastDatabaseArgs> db_args,
86 CRef<blast::CBlastOptionsHandle> opts_hndl,
87 bool verbose_output,
88 const string& client_id /* = kEmptyStr */,
89 CRef<objects::CPssmWithParameters> pssm
90 /* = CRef<objects::CPssmWithParameters>() */)
91 {
92 _ASSERT(queries || pssm);
93 _ASSERT(db_args);
94 _ASSERT(opts_hndl);
95
96 CRef<CRemoteBlast> retval;
97
98 CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
99 if (search_db.NotEmpty()) {
100 if (pssm.NotEmpty()) {
101 _ASSERT(queries.Empty());
102 retval.Reset(new CRemoteBlast(pssm, opts_hndl, *search_db));
103 } else {
104 retval.Reset(new CRemoteBlast(queries, opts_hndl, *search_db));
105 }
106 } else {
107 if (pssm.NotEmpty()) {
108 NCBI_THROW(CInputException, eInvalidInput,
109 "Remote PSI-BL2SEQ is not supported");
110 } else {
111 // N.B.: there is NO scope needed in the GetSubjects call because
112 // the subjects (if any) should have already been added in
113 // InitializeSubject
114 retval.Reset(new CRemoteBlast(queries, opts_hndl,
115 db_args->GetSubjects()));
116 }
117 }
118 if (verbose_output) {
119 retval->SetVerbose();
120 }
121 if (client_id != kEmptyStr) {
122 retval->SetClientId(client_id);
123 }
124 return retval;
125 }
126
s_FindBlastDbDataLoaderName(const string & dbname,bool is_protein)127 static string s_FindBlastDbDataLoaderName(const string& dbname, bool is_protein)
128 {
129 // This string is built based on the knowledge of how the BLAST DB data
130 // loader names are created, see
131 // {CBlastDbDataLoader,CRemoteBlastDbDataLoader}::GetLoaderNameFromArgs
132 const string str2find(string("_") + dbname + string(is_protein ? "P" : "N"));
133 CObjectManager::TRegisteredNames loader_names;
134 CObjectManager::GetInstance()->GetRegisteredNames(loader_names);
135 ITERATE(CObjectManager::TRegisteredNames, loader_name, loader_names) {
136 if (NStr::Find(*loader_name, str2find) != NPOS) {
137 return *loader_name;
138 }
139 }
140 return kEmptyStr;
141 }
142
143 blast::SDataLoaderConfig
InitializeQueryDataLoaderConfiguration(bool query_is_protein,CRef<blast::CLocalDbAdapter> db_adapter)144 InitializeQueryDataLoaderConfiguration(bool query_is_protein,
145 CRef<blast::CLocalDbAdapter> db_adapter)
146 {
147 SDataLoaderConfig retval(query_is_protein);
148 retval.OptimizeForWholeLargeSequenceRetrieval();
149
150 /* Load the BLAST database into the data loader configuration for the query
151 * so that if the query sequence(s) are specified as seq-ids, these can be
152 * fetched from the BLAST database being searched */
153 if (db_adapter->IsBlastDb() && /* this is a BLAST database search */
154 retval.m_UseBlastDbs && /* the BLAST database data loader is requested */
155 (query_is_protein == db_adapter->IsProtein())) { /* the same database type is used for both queries and subjects */
156 // Make sure we don't add the same database more than once
157 vector<string> default_dbs;
158 NStr::Split(retval.m_BlastDbName, " ", default_dbs);
159 if (default_dbs.size() &&
160 (find(default_dbs.begin(), default_dbs.end(),
161 db_adapter->GetDatabaseName()) == default_dbs.end())) {
162 CNcbiOstrstream oss;
163 oss << db_adapter->GetDatabaseName() << " " << retval.m_BlastDbName;
164 retval.m_BlastDbName = CNcbiOstrstreamToString(oss);
165 }
166 }
167 if (retval.m_UseBlastDbs) {
168 _TRACE("Initializing query data loader to '" << retval.m_BlastDbName
169 << "' (" << (query_is_protein ? "protein" : "nucleotide")
170 << " BLAST database)");
171 }
172 if (retval.m_UseGenbank) {
173 _TRACE("Initializing query data loader to use GenBank data loader");
174 }
175 return retval;
176 }
177
178 void
InitializeSubject(CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,bool is_remote_search,CRef<blast::CLocalDbAdapter> & db_adapter,CRef<objects::CScope> & scope)179 InitializeSubject(CRef<blast::CBlastDatabaseArgs> db_args,
180 CRef<blast::CBlastOptionsHandle> opts_hndl,
181 bool is_remote_search,
182 CRef<blast::CLocalDbAdapter>& db_adapter,
183 CRef<objects::CScope>& scope)
184 {
185 string dl = kEmptyStr;
186 db_adapter.Reset();
187
188 _ASSERT(db_args.NotEmpty());
189 CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
190
191 // Initialize the scope...
192 if (is_remote_search) {
193 const bool is_protein =
194 Blast_SubjectIsProtein(opts_hndl->GetOptions().GetProgramType())
195 ? true : false;
196 SDataLoaderConfig config(is_protein);
197 CBlastScopeSource scope_src(config);
198 // configure scope to fetch sequences remotely for formatting
199 if (scope.NotEmpty()) {
200 scope_src.AddDataLoaders(scope);
201 } else {
202 scope = scope_src.NewScope();
203 }
204 } else {
205 if (scope.Empty()) {
206 scope.Reset(new CScope(*CObjectManager::GetInstance()));
207 }
208 }
209 _ASSERT(scope.NotEmpty());
210
211 // ... and then the subjects
212 CRef<IQueryFactory> subjects;
213 if ( (subjects = db_args->GetSubjects(scope)) ) {
214 _ASSERT(search_db.Empty());
215 char* bl2seq_legacy = getenv("BL2SEQ_LEGACY");
216 if (bl2seq_legacy)
217 db_adapter.Reset(new CLocalDbAdapter(subjects, opts_hndl, false));
218 else
219 db_adapter.Reset(new CLocalDbAdapter(subjects, opts_hndl, true));
220 } else {
221 _ASSERT(search_db.NotEmpty());
222 try {
223 // Try to open the BLAST database even for remote searches, as if
224 // it is available locally, it will be better to fetch the
225 // sequence data for formatting from this (local) source
226 CRef<CSeqDB> seqdb = search_db->GetSeqDb();
227 db_adapter.Reset(new CLocalDbAdapter(*search_db));
228 dl = RegisterOMDataLoader(seqdb);
229 scope->AddDataLoader(dl);
230 } catch (const CSeqDBException&) {
231 // The BLAST database couldn't be found, report this for local
232 // searches, but for remote searches go on.
233 dl = kEmptyStr;
234 if (is_remote_search ) {
235 db_adapter.Reset(new CLocalDbAdapter(*search_db));
236 } else {
237 throw;
238 }
239 }
240 }
241
242 /// Set the BLASTDB data loader as the default data loader (if applicable)
243 if (search_db.NotEmpty()) {
244 if ( dl != kEmptyStr) {
245 // FIXME: will this work with multiple BLAST DBs?
246 scope->AddDataLoader(dl, CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
247 _TRACE("Setting " << dl << " priority to "
248 << (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority
249 << " for subjects");
250 }
251 }
252 return;
253 }
254
RegisterOMDataLoader(CRef<CSeqDB> db_handle)255 string RegisterOMDataLoader(CRef<CSeqDB> db_handle)
256 {
257 // the blast formatter requires that the database coexist in
258 // the same scope with the query sequences
259 CRef<CObjectManager> om = CObjectManager::GetInstance();
260 CBlastDbDataLoader::RegisterInObjectManager(*om, db_handle, true,
261 CObjectManager::eDefault,
262 CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
263 CBlastDbDataLoader::SBlastDbParam param(db_handle);
264 string retval(CBlastDbDataLoader::GetLoaderNameFromArgs(param));
265 _TRACE("Registering " << retval << " at priority " <<
266 (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority
267 << " for subjects");
268 return retval;
269 }
270
271
272 static CRef<blast::CExportStrategy>
s_InitializeExportStrategy(CRef<blast::IQueryFactory> queries,CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,const string & client_id,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)273 s_InitializeExportStrategy(CRef<blast::IQueryFactory> queries,
274 CRef<blast::CBlastDatabaseArgs> db_args,
275 CRef<blast::CBlastOptionsHandle> opts_hndl,
276 const string& client_id /* = kEmptyStr */,
277 CRef<objects::CPssmWithParameters> pssm
278 /* = CRef<objects::CPssmWithParameters>() */,
279 unsigned int num_iters
280 /* = 0 */)
281 {
282 _ASSERT(queries || pssm);
283 _ASSERT(db_args);
284 _ASSERT(opts_hndl);
285
286 CRef<CExportStrategy> retval;
287
288 CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
289 if (search_db.NotEmpty())
290 {
291 if (pssm.NotEmpty())
292 {
293 _ASSERT(queries.Empty());
294 if(num_iters != 0)
295 retval.Reset(new blast::CExportStrategy(pssm, opts_hndl, search_db, client_id, num_iters));
296 else
297 retval.Reset(new blast::CExportStrategy(pssm, opts_hndl, search_db, client_id));
298 }
299 else
300 {
301 if(num_iters != 0)
302 retval.Reset(new blast::CExportStrategy(queries, opts_hndl, search_db, client_id, num_iters));
303 else
304 retval.Reset(new blast::CExportStrategy(queries, opts_hndl, search_db, client_id));
305 }
306 }
307 else
308 {
309 if (pssm.NotEmpty())
310 {
311 NCBI_THROW(CInputException, eInvalidInput,
312 "Remote PSI-BL2SEQ is not supported");
313 }
314 else
315 {
316 retval.Reset(new blast::CExportStrategy(queries, opts_hndl,
317 db_args->GetSubjects(), client_id));
318 }
319 }
320
321 return retval;
322 }
323
324
325 /// Real implementation of search strategy extraction
326 /// @todo refactor this code so that it can be reused in other contexts
327 static void
s_ExportSearchStrategy(CNcbiOstream * out,CRef<blast::IQueryFactory> queries,CRef<blast::CBlastOptionsHandle> options_handle,CRef<blast::CBlastDatabaseArgs> db_args,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)328 s_ExportSearchStrategy(CNcbiOstream* out,
329 CRef<blast::IQueryFactory> queries,
330 CRef<blast::CBlastOptionsHandle> options_handle,
331 CRef<blast::CBlastDatabaseArgs> db_args,
332 CRef<objects::CPssmWithParameters> pssm,
333 /* = CRef<objects::CPssmWithParameters>() */
334 unsigned int num_iters /* = 0 */)
335 {
336 if ( !out )
337 return;
338
339 _ASSERT(db_args);
340 _ASSERT(options_handle);
341
342 try
343 {
344 CRef<CExportStrategy> export_strategy =
345 s_InitializeExportStrategy(queries, db_args, options_handle,
346 kEmptyStr, pssm, num_iters);
347 export_strategy->ExportSearchStrategy_ASN1(out);
348 }
349 catch (const CBlastException& e)
350 {
351 if (e.GetErrCode() == CBlastException::eNotSupported) {
352 NCBI_THROW(CInputException, eInvalidInput,
353 "Saving search strategies with gi lists is currently "
354 "not supported");
355 }
356 throw;
357 }
358 }
359
360 /// Converts a list of Bioseqs into a TSeqLocVector. All Bioseqs are added to
361 /// the same CScope object
362 /// @param subjects Bioseqs to convert
363 static TSeqLocVector
s_ConvertBioseqs2TSeqLocVector(const CBlast4_subject::TSequences & subjects)364 s_ConvertBioseqs2TSeqLocVector(const CBlast4_subject::TSequences& subjects)
365 {
366 TSeqLocVector retval;
367 CRef<CScope> subj_scope(new CScope(*CObjectManager::GetInstance()));
368 ITERATE(CBlast4_subject::TSequences, bioseq, subjects) {
369 subj_scope->AddBioseq(**bioseq);
370 CRef<CSeq_id> seqid = FindBestChoice((*bioseq)->GetId(),
371 CSeq_id::BestRank);
372 const TSeqPos length = (*bioseq)->GetInst().GetLength();
373 CRef<CSeq_loc> sl(new CSeq_loc(*seqid, 0, length-1));
374 retval.push_back(SSeqLoc(sl, subj_scope));
375 }
376 return retval;
377 }
378
379 /// Import PSSM into the command line arguments object
380 static void
s_ImportPssm(const CBlast4_queries & queries,CRef<blast::CBlastOptionsHandle> opts_hndl,blast::CBlastAppArgs * cmdline_args)381 s_ImportPssm(const CBlast4_queries& queries,
382 CRef<blast::CBlastOptionsHandle> opts_hndl,
383 blast::CBlastAppArgs* cmdline_args)
384 {
385 CRef<CPssmWithParameters> pssm
386 (const_cast<CPssmWithParameters*>(&queries.GetPssm()));
387 CPsiBlastAppArgs* psi_args = NULL;
388 CTblastnAppArgs* tbn_args = NULL;
389
390 if ( (psi_args = dynamic_cast<CPsiBlastAppArgs*>(cmdline_args)) ) {
391 psi_args->SetInputPssm(pssm);
392 } else if ( (tbn_args =
393 dynamic_cast<CTblastnAppArgs*>(cmdline_args))) {
394 tbn_args->SetInputPssm(pssm);
395 } else {
396 EBlastProgramType p = opts_hndl->GetOptions().GetProgramType();
397 string msg("PSSM found in saved strategy, but not supported ");
398 msg += "for " + Blast_ProgramNameFromType(p);
399 NCBI_THROW(CBlastException, eNotSupported, msg);
400 }
401 }
402
403 /// Import queries into command line arguments object
404 static void
s_ImportQueries(const CBlast4_queries & queries,CRef<blast::CBlastOptionsHandle> opts_hndl,blast::CBlastAppArgs * cmdline_args)405 s_ImportQueries(const CBlast4_queries& queries,
406 CRef<blast::CBlastOptionsHandle> opts_hndl,
407 blast::CBlastAppArgs* cmdline_args)
408 {
409 CRef<CTmpFile> tmpfile(new CTmpFile(CTmpFile::eNoRemove));
410
411 // Stuff the query bioseq or seqloc list in the input stream of the
412 // cmdline_args
413 if (queries.IsSeq_loc_list()) {
414 const CBlast4_queries::TSeq_loc_list& seqlocs =
415 queries.GetSeq_loc_list();
416 CFastaOstream out(tmpfile->AsOutputFile(CTmpFile::eIfExists_Throw));
417 out.SetFlag(CFastaOstream::eAssembleParts);
418
419 EBlastProgramType prog = opts_hndl->GetOptions().GetProgramType();
420 SDataLoaderConfig dlconfig(!!Blast_QueryIsProtein(prog));
421 dlconfig.OptimizeForWholeLargeSequenceRetrieval();
422 CBlastScopeSource scope_src(dlconfig);
423 CRef<CScope> scope(scope_src.NewScope());
424
425 ITERATE(CBlast4_queries::TSeq_loc_list, itr, seqlocs) {
426 if ((*itr)->GetId()) {
427 CBioseq_Handle bh = scope->GetBioseqHandle(*(*itr)->GetId());
428 out.Write(bh);
429 }
430 }
431 scope.Reset();
432 scope_src.RevokeBlastDbDataLoader();
433
434 } else {
435 _ASSERT(queries.IsBioseq_set());
436 const CBlast4_queries::TBioseq_set& bioseqs =
437 queries.GetBioseq_set();
438 CFastaOstream out(tmpfile->AsOutputFile(CTmpFile::eIfExists_Throw));
439 out.SetFlag(CFastaOstream::eAssembleParts);
440
441 ITERATE(CBioseq_set::TSeq_set, seq_entry, bioseqs.GetSeq_set()){
442 out.Write(**seq_entry);
443 }
444 }
445
446 const string& fname = tmpfile->GetFileName();
447 tmpfile.Reset(new CTmpFile(fname));
448 cmdline_args->SetInputStream(tmpfile);
449 }
450
451 /// Import the database and return it in a CBlastDatabaseArgs object
452 static CRef<blast::CBlastDatabaseArgs>
s_ImportDatabase(const CBlast4_subject & subj,CBlastOptionsBuilder & opts_builder,bool subject_is_protein,bool is_remote_search)453 s_ImportDatabase(const CBlast4_subject& subj,
454 CBlastOptionsBuilder& opts_builder,
455 bool subject_is_protein,
456 bool is_remote_search)
457 {
458 _ASSERT(subj.IsDatabase());
459 CRef<CBlastDatabaseArgs> db_args(new CBlastDatabaseArgs);
460 const CSearchDatabase::EMoleculeType mol = subject_is_protein
461 ? CSearchDatabase::eBlastDbIsProtein
462 : CSearchDatabase::eBlastDbIsNucleotide;
463 const string dbname(subj.GetDatabase());
464 CRef<CSearchDatabase> search_db(new CSearchDatabase(dbname, mol));
465
466 if (opts_builder.HaveEntrezQuery()) {
467 string limit(opts_builder.GetEntrezQuery());
468 search_db->SetEntrezQueryLimitation(limit);
469 if ( !is_remote_search ) {
470 string msg("Entrez query '");
471 msg += limit + string("' will not be processed locally.\n");
472 msg += string("Please use the -remote option.");
473 throw runtime_error(msg);
474 }
475 }
476
477 if (opts_builder.HaveGiList() || opts_builder.HaveTaxidList()) {
478 CSeqDBGiList *gilist = new CSeqDBGiList();
479 if (opts_builder.HaveGiList()) {
480 ITERATE(list<TGi>, gi, opts_builder.GetGiList()) {
481 gilist->AddGi(*gi);
482 }
483 }
484 if (opts_builder.HaveTaxidList()) {
485 list<TTaxId> list = opts_builder.GetTaxidList();
486 set<TTaxId> taxids(list.begin(), list.end());
487 gilist->AddTaxIds(taxids);
488 }
489 search_db->SetGiList(gilist);
490 }
491
492 if (opts_builder.HaveNegativeGiList() || opts_builder.HaveNegativeTaxidList()) {
493 CSeqDBGiList *gilist = new CSeqDBGiList();
494 if (opts_builder.HaveNegativeGiList()) {
495 ITERATE(list<TGi>, gi, opts_builder.GetNegativeGiList()) {
496 gilist->AddGi(*gi);
497 }
498 }
499 if (opts_builder.HaveNegativeTaxidList()) {
500 list<TTaxId> list = opts_builder.GetNegativeTaxidList();
501 set<TTaxId> taxids(list.begin(), list.end());
502 gilist->AddTaxIds(taxids);
503 }
504 search_db->SetNegativeGiList(gilist);
505 }
506
507 if (opts_builder.HasDbFilteringAlgorithmKey()) {
508 string algo_key = opts_builder.GetDbFilteringAlgorithmKey();
509 ESubjectMaskingType mask_type= eSoftSubjMasking;
510 if(opts_builder.HasSubjectMaskingType())
511 mask_type = opts_builder.GetSubjectMaskingType();
512 search_db->SetFilteringAlgorithm(algo_key, mask_type);
513
514 } else if (opts_builder.HasDbFilteringAlgorithmId()) {
515 int algo_id = opts_builder.GetDbFilteringAlgorithmId();
516 ESubjectMaskingType mask_type= eSoftSubjMasking;
517 if(opts_builder.HasSubjectMaskingType())
518 mask_type = opts_builder.GetSubjectMaskingType();
519 search_db->SetFilteringAlgorithm(algo_id, mask_type);
520 }
521
522 db_args->SetSearchDatabase(search_db);
523 return db_args;
524 }
525
526 /// Import the subject sequences into a CBlastDatabaseArgs object
527 static CRef<blast::CBlastDatabaseArgs>
s_ImportSubjects(const CBlast4_subject & subj,bool subject_is_protein)528 s_ImportSubjects(const CBlast4_subject& subj, bool subject_is_protein)
529 {
530 _ASSERT(subj.IsSequences());
531 CRef<CBlastDatabaseArgs> db_args(new CBlastDatabaseArgs);
532 TSeqLocVector subjects =
533 s_ConvertBioseqs2TSeqLocVector(subj.GetSequences());
534 CRef<CScope> subj_scope = subjects.front().scope;
535 CRef<IQueryFactory> subject_factory(new CObjMgr_QueryFactory(subjects));
536 db_args->SetSubjects(subject_factory, subj_scope, subject_is_protein);
537 return db_args;
538 }
539
540 /// Imports search strategy, using CImportStrategy.
541 static void
s_ImportSearchStrategy(CNcbiIstream * in,blast::CBlastAppArgs * cmdline_args,bool is_remote_search,bool override_query,bool override_subject)542 s_ImportSearchStrategy(CNcbiIstream* in,
543 blast::CBlastAppArgs* cmdline_args,
544 bool is_remote_search,
545 bool override_query,
546 bool override_subject)
547 {
548 if ( !in ) {
549 return;
550 }
551
552 CRef<CBlast4_request> b4req;
553 try {
554 b4req = ExtractBlast4Request(*in);
555 } catch (const CSerialException&) {
556 NCBI_THROW(CInputException, eInvalidInput,
557 "Failed to read search strategy");
558 }
559
560 CImportStrategy strategy(b4req);
561
562 CRef<blast::CBlastOptionsHandle> opts_hndl = strategy.GetOptionsHandle();
563 cmdline_args->SetOptionsHandle(opts_hndl);
564 const EBlastProgramType prog = opts_hndl->GetOptions().GetProgramType();
565 cmdline_args->SetTask(strategy.GetTask());
566 #if _DEBUG
567 {
568 char* program_string = 0;
569 BlastNumber2Program(prog, &program_string);
570 _TRACE("EBlastProgramType=" << program_string << " task=" << strategy.GetTask());
571 sfree(program_string);
572 }
573 #endif
574
575 // Get the subject
576 if (override_subject) {
577 ERR_POST(Warning << "Overriding database/subject in saved strategy");
578 } else {
579 CRef<blast::CBlastDatabaseArgs> db_args;
580 CRef<CBlast4_subject> subj = strategy.GetSubject();
581 const bool subject_is_protein = Blast_SubjectIsProtein(prog) ? true : false;
582
583 if (subj->IsDatabase()) {
584 db_args = s_ImportDatabase(*subj, strategy.GetOptionsBuilder(),
585 subject_is_protein, is_remote_search);
586 } else {
587 db_args = s_ImportSubjects(*subj, subject_is_protein);
588 }
589 _ASSERT(db_args.NotEmpty());
590 cmdline_args->SetBlastDatabaseArgs(db_args);
591 }
592
593 // Get the query, queries, or pssm
594 if (override_query) {
595 ERR_POST(Warning << "Overriding query in saved strategy");
596 } else {
597 CRef<CBlast4_queries> queries = strategy.GetQueries();
598 if (queries->IsPssm()) {
599 s_ImportPssm(*queries, opts_hndl, cmdline_args);
600 } else {
601 s_ImportQueries(*queries, opts_hndl, cmdline_args);
602 }
603 // Set the range restriction for the query, if applicable
604 const TSeqRange query_range = strategy.GetQueryRange();
605 if (query_range != TSeqRange::GetEmpty()) {
606 cmdline_args->GetQueryOptionsArgs()->SetRange(query_range);
607 }
608 }
609
610 if ( CPsiBlastAppArgs* psi_args = dynamic_cast<CPsiBlastAppArgs*>(cmdline_args) )
611 {
612 psi_args->SetNumberOfIterations(strategy.GetPsiNumOfIterations());
613 }
614 }
615
616 bool
RecoverSearchStrategy(const CArgs & args,blast::CBlastAppArgs * cmdline_args)617 RecoverSearchStrategy(const CArgs& args, blast::CBlastAppArgs* cmdline_args)
618 {
619 CNcbiIstream* in = cmdline_args->GetImportSearchStrategyStream(args);
620 if ( !in ) {
621 return false;
622 }
623 const bool is_remote_search =
624 (args.Exist(kArgRemote) && args[kArgRemote].HasValue() && args[kArgRemote].AsBoolean());
625 const bool override_query = (args[kArgQuery].HasValue() &&
626 args[kArgQuery].AsString() != kDfltArgQuery);
627 const bool override_subject = CBlastDatabaseArgs::HasBeenSet(args);
628
629 if (CMbIndexArgs::HasBeenSet(args)) {
630 if (args[kArgUseIndex].AsBoolean() != kDfltArgUseIndex)
631 ERR_POST(Warning << "Overriding megablast BLAST DB indexed options in saved strategy");
632 }
633
634 s_ImportSearchStrategy(in, cmdline_args, is_remote_search, override_query,
635 override_subject);
636
637 return true;
638 }
639
640 // Process search strategies
641 // FIXME: save program options,
642 // Save task if provided, no other options (only those in the cmd line) should
643 // be saved
644 void
SaveSearchStrategy(const CArgs & args,blast::CBlastAppArgs * cmdline_args,CRef<blast::IQueryFactory> queries,CRef<blast::CBlastOptionsHandle> opts_hndl,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)645 SaveSearchStrategy(const CArgs& args,
646 blast::CBlastAppArgs* cmdline_args,
647 CRef<blast::IQueryFactory> queries,
648 CRef<blast::CBlastOptionsHandle> opts_hndl,
649 CRef<objects::CPssmWithParameters> pssm
650 /* = CRef<objects::CPssmWithParameters>() */,
651 unsigned int num_iters /* =0 */)
652 {
653 CNcbiOstream* out = cmdline_args->GetExportSearchStrategyStream(args);
654 if ( !out ) {
655 return;
656 }
657
658 s_ExportSearchStrategy(out, queries, opts_hndl,
659 cmdline_args->GetBlastDatabaseArgs(),
660 pssm, num_iters);
661 }
662
663 /// Extracts the subject sequence IDs and ranges from the BLAST results
664 /// @note if this ever needs to be refactored for popular developer
665 /// consumption, this function should operate on CSeq_align_set as opposed to
666 /// blast::CSearchResultSet
667 static void
s_ExtractSeqidsAndRanges(const blast::CSearchResultSet & results,CScope::TIds & ids,vector<TSeqRange> & ranges)668 s_ExtractSeqidsAndRanges(const blast::CSearchResultSet& results,
669 CScope::TIds& ids, vector<TSeqRange>& ranges)
670 {
671 static const CSeq_align::TDim kQueryRow = 0;
672 static const CSeq_align::TDim kSubjRow = 1;
673 ids.clear();
674 ranges.clear();
675
676 typedef map< CConstRef<CSeq_id>,
677 vector<TSeqRange>,
678 CConstRefCSeqId_LessThan
679 > TSeqIdRanges;
680 TSeqIdRanges id_ranges;
681
682 ITERATE(blast::CSearchResultSet, result, results) {
683 if ( !(*result)->HasAlignments() ) {
684 continue;
685 }
686 ITERATE(CSeq_align_set::Tdata, aln, (*result)->GetSeqAlign()->Get()) {
687 CConstRef<CSeq_id> subj(&(*aln)->GetSeq_id(kSubjRow));
688 TSeqRange subj_range((*aln)->GetSeqRange(kSubjRow));
689 if ((*aln)->GetSeqStrand(kQueryRow) == eNa_strand_minus &&
690 (*aln)->GetSeqStrand(kSubjRow) == eNa_strand_plus) {
691 TSeqRange r(subj_range);
692 // flag the range as needed to be flipped once the sequence
693 // length is known
694 subj_range.SetFrom(r.GetToOpen());
695 subj_range.SetToOpen(r.GetFrom());
696 }
697 id_ranges[subj].push_back(subj_range);
698 }
699 }
700
701 ITERATE(TSeqIdRanges, itr, id_ranges) {
702 ITERATE(vector<TSeqRange>, range, itr->second) {
703 ids.push_back(CSeq_id_Handle::GetHandle(*itr->first));
704 ranges.push_back(*range);
705 }
706 }
707 _ASSERT(ids.size() == ranges.size());
708 }
709
710 /// Returns true if the remote BLAST DB data loader is being used
711 static bool
s_IsUsingRemoteBlastDbDataLoader()712 s_IsUsingRemoteBlastDbDataLoader()
713 {
714 CObjectManager::TRegisteredNames data_loaders;
715 CObjectManager::GetInstance()->GetRegisteredNames(data_loaders);
716 ITERATE(CObjectManager::TRegisteredNames, name, data_loaders) {
717 if (NStr::StartsWith(*name, objects::CRemoteBlastDbDataLoader::kNamePrefix)) {
718 return true;
719 }
720 }
721 return false;
722 }
723
724 static bool
s_IsPrefetchFormat(blast::CFormattingArgs::EOutputFormat format_type)725 s_IsPrefetchFormat(blast::CFormattingArgs::EOutputFormat format_type)
726 {
727 if ((format_type == CFormattingArgs::eAsnText) ||
728 (format_type == CFormattingArgs::eAsnBinary) ||
729 (format_type == CFormattingArgs::eArchiveFormat)||
730 (format_type == CFormattingArgs::eJsonSeqalign)) {
731 return false;
732 }
733 return true;
734 }
735
736 static bool
s_PreFetchSeqs(const blast::CSearchResultSet & results,blast::CFormattingArgs::EOutputFormat format_type)737 s_PreFetchSeqs(const blast::CSearchResultSet& results,
738 blast::CFormattingArgs::EOutputFormat format_type)
739 {
740 {
741 char * pre_fetch_limit_str = getenv("PRE_FETCH_SEQS_LIMIT");
742 if (pre_fetch_limit_str) {
743 int pre_fetch_limit = NStr::StringToInt(pre_fetch_limit_str);
744 if(pre_fetch_limit == 0) {
745 return false;
746 }
747 if(pre_fetch_limit == INT_MAX){
748 return true;
749 }
750 int num_of_seqs = 0;
751 for(unsigned int i=0; i < results.GetNumResults(); i++) {
752 if(results[i].HasAlignments()) {
753 num_of_seqs += results[i].GetSeqAlign()->Size();
754 }
755 }
756 if(num_of_seqs > pre_fetch_limit) {
757 return false;
758 }
759 }
760 }
761
762 return s_IsPrefetchFormat(format_type);
763 }
764
BlastFormatter_PreFetchSequenceData(const blast::CSearchResultSet & results,CRef<CScope> scope,blast::CFormattingArgs::EOutputFormat format_type)765 void BlastFormatter_PreFetchSequenceData(const blast::CSearchResultSet& results,
766 CRef<CScope> scope,
767 blast::CFormattingArgs::EOutputFormat format_type)
768 {
769 _ASSERT(scope.NotEmpty());
770 if (results.size() == 0) {
771 return;
772 }
773 if(!s_PreFetchSeqs(results, format_type)){
774 return;
775 }
776 try {
777 CScope::TIds ids;
778 vector<TSeqRange> ranges;
779 s_ExtractSeqidsAndRanges(results, ids, ranges);
780 _TRACE("Prefetching " << ids.size() << " sequence lengths");
781 LoadSequencesToScope(ids, ranges, scope);
782 } catch (CException& e) {
783 if(s_IsUsingRemoteBlastDbDataLoader()) {
784 NCBI_THROW(CBlastSystemException, eNetworkError,
785 "Error fetching sequence data from BLAST databases at NCBI, "
786 "please try again later");
787 }
788 else {
789 NCBI_RETHROW_SAME(e, "Error pre-fetching sequence data ");
790 }
791 }
792
793 }
794
795 /// Auxiliary function to extract the ancillary data from the PSSM.
796 CRef<CBlastAncillaryData>
ExtractPssmAncillaryData(const CPssmWithParameters & pssm)797 ExtractPssmAncillaryData(const CPssmWithParameters& pssm)
798 {
799 _ASSERT(pssm.CanGetPssm());
800 pair<double, double> lambda, k, h;
801 lambda.first = pssm.GetPssm().GetLambdaUngapped();
802 lambda.second = pssm.GetPssm().GetLambda();
803 k.first = pssm.GetPssm().GetKappaUngapped();
804 k.second = pssm.GetPssm().GetKappa();
805 h.first = pssm.GetPssm().GetHUngapped();
806 h.second = pssm.GetPssm().GetH();
807 return CRef<CBlastAncillaryData>(new CBlastAncillaryData(lambda, k, h, 0,
808 true));
809 }
810
811 void
CheckForFreqRatioFile(const string & rps_dbname,CRef<CBlastOptionsHandle> & opt_handle,bool isRpsblast)812 CheckForFreqRatioFile(const string& rps_dbname, CRef<CBlastOptionsHandle> & opt_handle, bool isRpsblast)
813 {
814 bool use_cbs = (opt_handle->GetOptions().GetCompositionBasedStats() == eNoCompositionBasedStats) ? false : true;
815 if(use_cbs) {
816 vector<string> db;
817 NStr::Split(rps_dbname, " ", db);
818 list<string> failed_db;
819 for (unsigned int i=0; i < db.size(); i++) {
820 string path;
821 try {
822 vector<string> dbpath;
823 CSeqDB::FindVolumePaths(db[i], CSeqDB::eProtein, dbpath);
824 path = *dbpath.begin();
825 } catch (const CSeqDBException& e) {
826 NCBI_RETHROW(e, CBlastException, eRpsInit,
827 "Cannot retrieve path to RPS database");
828 }
829
830 CFile f(path + ".freq");
831 if(!f.Exists()) {
832 failed_db.push_back(db[i]);
833 }
834
835 }
836 if(!failed_db.empty()) {
837 opt_handle->SetOptions().SetCompositionBasedStats(eNoCompositionBasedStats);
838 string all_failed = NStr::Join(failed_db, ", ");
839 string prog_str = isRpsblast ? "RPSBLAST": "DELTABLAST";
840 string msg = all_failed + " contain(s) no freq ratios " \
841 + "needed for composition-based statistics.\n" \
842 + prog_str + " will be run without composition-based statistics.";
843 ERR_POST(Warning << msg);
844 }
845
846 }
847 return;
848 }
849
850 bool
IsIStreamEmpty(CNcbiIstream & in)851 IsIStreamEmpty(CNcbiIstream & in)
852 {
853 #ifdef NCBI_OS_MSWIN
854 char c;
855 in.setf(ios::skipws);
856 if (!(in >> c))
857 return true;
858 in.unget();
859 return false;
860 #else
861 char c;
862 CNcbiStreampos orig_p = in.tellg();
863 // Piped input
864 if(orig_p < 0)
865 return false;
866
867 IOS_BASE::iostate orig_state = in.rdstate();
868 IOS_BASE::fmtflags orig_flags = in.setf(ios::skipws);
869
870 if(! (in >> c))
871 return true;
872
873 in.seekg(orig_p);
874 in.flags(orig_flags);
875 in.clear();
876 in.setstate(orig_state);
877
878 return false;
879 #endif
880 }
881
882 string
GetCmdlineArgs(const CNcbiArguments & a)883 GetCmdlineArgs(const CNcbiArguments & a)
884 {
885 string cmd = kEmptyStr;
886 for(unsigned int i=0; i < a.Size(); i++) {
887 cmd += a[i] + " ";
888 }
889 return cmd;
890 }
891
892 bool
UseXInclude(const CFormattingArgs & f,const string & s)893 UseXInclude(const CFormattingArgs & f, const string & s)
894 {
895 CFormattingArgs::EOutputFormat fmt = f.GetFormattedOutputChoice();
896 if((fmt == CFormattingArgs::eXml2) || (fmt == CFormattingArgs::eJson)) {
897 if (s == "-"){
898 string f_str = (fmt == CFormattingArgs::eXml2) ? "14.": "13.";
899 NCBI_THROW(CInputException, eEmptyUserInput,
900 "Please provide a file name for outfmt " + f_str);
901 }
902 return true;
903 }
904 return false;
905 }
906
907 string
GetSubjectFile(const CArgs & args)908 GetSubjectFile(const CArgs& args)
909 {
910 string filename="";
911
912 if (args.Exist(kArgSubject) && args[kArgSubject].HasValue())
913 filename = args[kArgSubject].AsString();
914
915 return filename;
916 }
917
PrintErrorArchive(const CArgs & a,const list<CRef<CBlast4_error>> & msg)918 void PrintErrorArchive(const CArgs & a, const list<CRef<CBlast4_error> > & msg)
919 {
920 try {
921 if(NStr::StringToInt(NStr::TruncateSpaces(a[align_format::kArgOutputFormat].AsString())) == CFormattingArgs::eArchiveFormat) {
922 CRef<CBlast4_archive> archive (new CBlast4_archive);
923
924 CBlast4_request & req = archive->SetRequest();
925 CBlast4_get_request_info_request & info= req.SetBody().SetGet_request_info();
926 info.SetRequest_id("Error");
927 CBlast4_get_search_results_reply & results = archive->SetResults();
928 // Pacify unused varaible warning, the set above is used to populate mandatory field
929 (void) results;
930 archive->SetMessages() = msg;
931 CBlastFormat::PrintArchive(archive, a[kArgOutput].AsOutputFile());
932 }
933 } catch (...) {}
934 }
935
QueryBatchCleanup()936 void QueryBatchCleanup()
937 {
938 #if defined(NCBI_OS_LINUX) && HAVE_MALLOC_H
939 malloc_trim(0);
940 #endif
941 return;
942
943 }
944
LogQueryInfo(CBlastUsageReport & report,const CBlastInput & q_info)945 void LogQueryInfo(CBlastUsageReport & report, const CBlastInput & q_info)
946 {
947 report.AddParam(CBlastUsageReport::eTotalQueryLength, q_info.GetTotalLengthProcessed());
948 report.AddParam(CBlastUsageReport::eNumQueries, q_info.GetNumSeqsProcessed());
949 }
950
951
LogRPSBlastOptions(blast::CBlastUsageReport & report,const CBlastOptions & opt)952 void LogRPSBlastOptions(blast::CBlastUsageReport & report, const CBlastOptions & opt)
953 {
954 report.AddParam(CBlastUsageReport::eProgram, Blast_ProgramNameFromType(opt.GetProgramType()));
955 report.AddParam(CBlastUsageReport::eEvalueThreshold, opt.GetEvalueThreshold());
956 report.AddParam(CBlastUsageReport::eHitListSize, opt.GetHitlistSize());
957 report.AddParam(CBlastUsageReport::eCompBasedStats, opt.GetCompositionBasedStats());
958 }
959
LogRPSCmdOptions(blast::CBlastUsageReport & report,const CBlastAppArgs & args)960 void LogRPSCmdOptions(blast::CBlastUsageReport & report, const CBlastAppArgs & args)
961 {
962 if (args.GetBlastDatabaseArgs().NotEmpty() &&
963 args.GetBlastDatabaseArgs()->GetSearchDatabase().NotEmpty() &&
964 args.GetBlastDatabaseArgs()->GetSearchDatabase()->GetSeqDb().NotEmpty()) {
965
966 CRef<CSeqDB> db = args.GetBlastDatabaseArgs()->GetSearchDatabase()->GetSeqDb();
967 string db_name = db->GetDBNameList();
968 int off = db_name.find_last_of(CFile::GetPathSeparator());
969 if (off != -1) {
970 db_name.erase(0, off+1);
971 }
972 report.AddParam(CBlastUsageReport::eDBName, db_name);
973 report.AddParam(CBlastUsageReport::eDBLength, (Int8) db->GetTotalLength());
974 report.AddParam(CBlastUsageReport::eDBNumSeqs, db->GetNumSeqs());
975 report.AddParam(CBlastUsageReport::eDBDate, db->GetDate());
976 }
977
978 if(args.GetFormattingArgs().NotEmpty()){
979 report.AddParam(CBlastUsageReport::eOutputFmt, args.GetFormattingArgs()->GetFormattedOutputChoice());
980 }
981 }
982
GetMTByQueriesBatchSize(EProgram p,int num_threads)983 int GetMTByQueriesBatchSize(EProgram p, int num_threads)
984 {
985 int batch_size = 0;
986
987 char * mt_query_batch_env = getenv("BLAST_MT_QUERY_BATCH_SIZE");
988 if (mt_query_batch_env) {
989 batch_size = NStr::StringToInt(mt_query_batch_env);
990 }
991 else {
992 int factor = 1;
993 if ( EProgramToEBlastProgramType(p) == eBlastTypeBlastn ) {
994 factor = 2;
995 }
996 batch_size = GetQueryBatchSize(p)/factor;
997 }
998 return batch_size;
999 }
1000
CheckMTByQueries_DBSize(CRef<CLocalDbAdapter> & db_adapter,const CBlastOptions & opt)1001 void CheckMTByQueries_DBSize(CRef<CLocalDbAdapter> & db_adapter, const CBlastOptions & opt)
1002 {
1003 CRef<CSearchDatabase> sdb = db_adapter->GetSearchDatabase();
1004 const CRef<CSeqDBGiList>& gi = sdb->GetGiList();
1005 const CRef<CSeqDBGiList>& n_gi = sdb->GetNegativeGiList();
1006
1007 if (gi.NotEmpty() || n_gi.NotEmpty()) {
1008 return;
1009 }
1010
1011 EProgram prog = opt.GetProgram();
1012 CRef<CSeqDB> seqdb = sdb->GetSeqDb();
1013 Uint8 total_length = seqdb->GetTotalLength();
1014 Uint8 length_limit = 0;
1015
1016 if (db_adapter->IsProtein()) {
1017 static const Uint8 kMaxProtTotalLength = 2000000000;
1018 length_limit = kMaxProtTotalLength;
1019 }
1020 else {
1021 static const Uint8 kMaxNuclTotalLength = 14000000000;
1022 length_limit = kMaxNuclTotalLength;
1023 }
1024
1025 if(total_length > length_limit) {
1026 string warn = "This database is probably too large to benefit from -mt_mode=1. " \
1027 "We suggest using -mt_mode=1 only if the database is less than";
1028 if (db_adapter->IsProtein()) {
1029 warn += " 2 billion residues ";
1030 }
1031 else {
1032 warn += " 14 billion bases ";
1033 }
1034 ERR_POST(Warning << warn + "or if the search is limited by an option such as -taxids, -taxidlist or -gilist.");
1035 }
1036 return;
1037 }
1038
CheckMTByQueries_QuerySize(EProgram prog,int batch_size)1039 void CheckMTByQueries_QuerySize(EProgram prog, int batch_size)
1040 {
1041 string warning = "This set of queries is too small to fully benefit from the -mt_mode=1 option. "\
1042 "The total number of letters should be at least ";
1043 warning += NStr::IntToString(batch_size);
1044 EBlastProgramType p = EProgramToEBlastProgramType(prog);
1045 if (Blast_QueryIsProtein(p)) {
1046 warning += " residues per thread.";
1047 }
1048 else {
1049 warning += " bases per thread.";
1050 }
1051 ERR_POST(Warning << warning);
1052 }
1053
1054 END_NCBI_SCOPE
1055