1 /* $Id: blast_app_util.cpp 637481 2021-09-13 18:27:32Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 */
29
30 /** @file blast_app_util.cpp
31 * Utility functions for BLAST command line applications
32 */
33
34 #include <ncbi_pch.hpp>
35 #include "blast_app_util.hpp"
36
37 #include <serial/serial.hpp>
38 #include <serial/objostr.hpp>
39
40 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
41 #include <algo/blast/api/remote_blast.hpp>
42 #include <algo/blast/api/objmgr_query_data.hpp> // for CObjMgr_QueryFactory
43 #include <algo/blast/api/blast_options_builder.hpp>
44 #include <algo/blast/api/search_strategy.hpp>
45 #include <algo/blast/blastinput/blast_input.hpp> // for CInputException
46 #include <algo/blast/blastinput/psiblast_args.hpp>
47 #include <algo/blast/blastinput/tblastn_args.hpp>
48 #include <algo/blast/blastinput/blast_scope_src.hpp>
49 #include <objmgr/util/sequence.hpp>
50
51 #include <objects/scoremat/Pssm.hpp>
52 #include <serial/typeinfo.hpp> // for CTypeInfo, needed by SerialClone
53 #include <objtools/data_loaders/blastdb/bdbloader_rmt.hpp>
54 #include <algo/blast/format/blast_format.hpp>
55 #include <objtools/align_format/format_flags.hpp>
56
57 #if defined(NCBI_OS_LINUX) && HAVE_MALLOC_H
58 #include <malloc.h>
59 #endif
60
61 BEGIN_NCBI_SCOPE
62 USING_SCOPE(objects);
63 USING_SCOPE(blast);
64
GetBatchSize(Int4 hits)65 Int4 CBatchSizeMixer::GetBatchSize(Int4 hits)
66 {
67 if (hits >= 0) {
68 double ratio = 1.0 * (hits+1) / m_BatchSize;
69 m_Ratio = (m_Ratio < 0) ? ratio
70 : k_MixIn * ratio + (1.0 - k_MixIn) * m_Ratio;
71 m_BatchSize = (Int4) (1.0 * m_TargetHits / m_Ratio);
72 }
73 if (m_BatchSize > k_MaxBatchSize) {
74 m_BatchSize = k_MaxBatchSize;
75 m_Ratio = -1.0;
76 } else if (m_BatchSize < k_MinBatchSize) {
77 m_BatchSize = k_MinBatchSize;
78 m_Ratio = -1.0;
79 }
80 return m_BatchSize;
81 }
82
83 CRef<blast::CRemoteBlast>
InitializeRemoteBlast(CRef<blast::IQueryFactory> queries,CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,bool verbose_output,const string & client_id,CRef<objects::CPssmWithParameters> pssm)84 InitializeRemoteBlast(CRef<blast::IQueryFactory> queries,
85 CRef<blast::CBlastDatabaseArgs> db_args,
86 CRef<blast::CBlastOptionsHandle> opts_hndl,
87 bool verbose_output,
88 const string& client_id /* = kEmptyStr */,
89 CRef<objects::CPssmWithParameters> pssm
90 /* = CRef<objects::CPssmWithParameters>() */)
91 {
92 _ASSERT(queries || pssm);
93 _ASSERT(db_args);
94 _ASSERT(opts_hndl);
95
96 CRef<CRemoteBlast> retval;
97
98 CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
99 if (search_db.NotEmpty()) {
100 if (pssm.NotEmpty()) {
101 _ASSERT(queries.Empty());
102 retval.Reset(new CRemoteBlast(pssm, opts_hndl, *search_db));
103 } else {
104 retval.Reset(new CRemoteBlast(queries, opts_hndl, *search_db));
105 }
106 } else {
107 if (pssm.NotEmpty()) {
108 NCBI_THROW(CInputException, eInvalidInput,
109 "Remote PSI-BL2SEQ is not supported");
110 } else {
111 // N.B.: there is NO scope needed in the GetSubjects call because
112 // the subjects (if any) should have already been added in
113 // InitializeSubject
114 retval.Reset(new CRemoteBlast(queries, opts_hndl,
115 db_args->GetSubjects()));
116 }
117 }
118 if (verbose_output) {
119 retval->SetVerbose();
120 }
121 if (client_id != kEmptyStr) {
122 retval->SetClientId(client_id);
123 }
124 return retval;
125 }
126
127 blast::SDataLoaderConfig
InitializeQueryDataLoaderConfiguration(bool query_is_protein,CRef<blast::CLocalDbAdapter> db_adapter)128 InitializeQueryDataLoaderConfiguration(bool query_is_protein,
129 CRef<blast::CLocalDbAdapter> db_adapter)
130 {
131 SDataLoaderConfig retval(query_is_protein);
132 retval.OptimizeForWholeLargeSequenceRetrieval();
133
134 /* Load the BLAST database into the data loader configuration for the query
135 * so that if the query sequence(s) are specified as seq-ids, these can be
136 * fetched from the BLAST database being searched */
137 if (db_adapter->IsBlastDb() && /* this is a BLAST database search */
138 retval.m_UseBlastDbs && /* the BLAST database data loader is requested */
139 (query_is_protein == db_adapter->IsProtein())) { /* the same database type is used for both queries and subjects */
140 // Make sure we don't add the same database more than once
141 vector<string> default_dbs;
142 NStr::Split(retval.m_BlastDbName, " ", default_dbs);
143 if (default_dbs.size() &&
144 (find(default_dbs.begin(), default_dbs.end(),
145 db_adapter->GetDatabaseName()) == default_dbs.end())) {
146 CNcbiOstrstream oss;
147 oss << db_adapter->GetDatabaseName() << " " << retval.m_BlastDbName;
148 retval.m_BlastDbName = CNcbiOstrstreamToString(oss);
149 }
150 }
151 if (retval.m_UseBlastDbs) {
152 _TRACE("Initializing query data loader to '" << retval.m_BlastDbName
153 << "' (" << (query_is_protein ? "protein" : "nucleotide")
154 << " BLAST database)");
155 }
156 if (retval.m_UseGenbank) {
157 _TRACE("Initializing query data loader to use GenBank data loader");
158 }
159 return retval;
160 }
161
162 void
InitializeSubject(CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,bool is_remote_search,CRef<blast::CLocalDbAdapter> & db_adapter,CRef<objects::CScope> & scope)163 InitializeSubject(CRef<blast::CBlastDatabaseArgs> db_args,
164 CRef<blast::CBlastOptionsHandle> opts_hndl,
165 bool is_remote_search,
166 CRef<blast::CLocalDbAdapter>& db_adapter,
167 CRef<objects::CScope>& scope)
168 {
169 string dl = kEmptyStr;
170 db_adapter.Reset();
171
172 _ASSERT(db_args.NotEmpty());
173 CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
174
175 // Initialize the scope...
176 if (is_remote_search) {
177 const bool is_protein =
178 Blast_SubjectIsProtein(opts_hndl->GetOptions().GetProgramType())
179 ? true : false;
180 SDataLoaderConfig config(is_protein);
181 CBlastScopeSource scope_src(config);
182 // configure scope to fetch sequences remotely for formatting
183 if (scope.NotEmpty()) {
184 scope_src.AddDataLoaders(scope);
185 } else {
186 scope = scope_src.NewScope();
187 }
188 } else {
189 if (scope.Empty()) {
190 scope.Reset(new CScope(*CObjectManager::GetInstance()));
191 }
192 }
193 _ASSERT(scope.NotEmpty());
194
195 // ... and then the subjects
196 CRef<IQueryFactory> subjects;
197 if ( (subjects = db_args->GetSubjects(scope)) ) {
198 _ASSERT(search_db.Empty());
199 char* bl2seq_legacy = getenv("BL2SEQ_LEGACY");
200 if (bl2seq_legacy)
201 db_adapter.Reset(new CLocalDbAdapter(subjects, opts_hndl, false));
202 else
203 db_adapter.Reset(new CLocalDbAdapter(subjects, opts_hndl, true));
204 } else {
205 _ASSERT(search_db.NotEmpty());
206 try {
207 // Try to open the BLAST database even for remote searches, as if
208 // it is available locally, it will be better to fetch the
209 // sequence data for formatting from this (local) source
210 CRef<CSeqDB> seqdb = search_db->GetSeqDb();
211 db_adapter.Reset(new CLocalDbAdapter(*search_db));
212 dl = RegisterOMDataLoader(seqdb);
213 scope->AddDataLoader(dl);
214 } catch (const CSeqDBException&) {
215 // The BLAST database couldn't be found, report this for local
216 // searches, but for remote searches go on.
217 dl = kEmptyStr;
218 if (is_remote_search ) {
219 db_adapter.Reset(new CLocalDbAdapter(*search_db));
220 } else {
221 throw;
222 }
223 }
224 }
225
226 /// Set the BLASTDB data loader as the default data loader (if applicable)
227 if (search_db.NotEmpty()) {
228 if ( dl != kEmptyStr) {
229 // FIXME: will this work with multiple BLAST DBs?
230 scope->AddDataLoader(dl, CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
231 _TRACE("Setting " << dl << " priority to "
232 << (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority
233 << " for subjects");
234 }
235 }
236 return;
237 }
238
RegisterOMDataLoader(CRef<CSeqDB> db_handle)239 string RegisterOMDataLoader(CRef<CSeqDB> db_handle)
240 {
241 // the blast formatter requires that the database coexist in
242 // the same scope with the query sequences
243 CRef<CObjectManager> om = CObjectManager::GetInstance();
244 CBlastDbDataLoader::RegisterInObjectManager(*om, db_handle, true,
245 CObjectManager::eDefault,
246 CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
247 CBlastDbDataLoader::SBlastDbParam param(db_handle);
248 string retval(CBlastDbDataLoader::GetLoaderNameFromArgs(param));
249 _TRACE("Registering " << retval << " at priority " <<
250 (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority
251 << " for subjects");
252 return retval;
253 }
254
255
256 static CRef<blast::CExportStrategy>
s_InitializeExportStrategy(CRef<blast::IQueryFactory> queries,CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,const string & client_id,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)257 s_InitializeExportStrategy(CRef<blast::IQueryFactory> queries,
258 CRef<blast::CBlastDatabaseArgs> db_args,
259 CRef<blast::CBlastOptionsHandle> opts_hndl,
260 const string& client_id /* = kEmptyStr */,
261 CRef<objects::CPssmWithParameters> pssm
262 /* = CRef<objects::CPssmWithParameters>() */,
263 unsigned int num_iters
264 /* = 0 */)
265 {
266 _ASSERT(queries || pssm);
267 _ASSERT(db_args);
268 _ASSERT(opts_hndl);
269
270 CRef<CExportStrategy> retval;
271
272 CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
273 if (search_db.NotEmpty())
274 {
275 if (pssm.NotEmpty())
276 {
277 _ASSERT(queries.Empty());
278 if(num_iters != 0)
279 retval.Reset(new blast::CExportStrategy(pssm, opts_hndl, search_db, client_id, num_iters));
280 else
281 retval.Reset(new blast::CExportStrategy(pssm, opts_hndl, search_db, client_id));
282 }
283 else
284 {
285 if(num_iters != 0)
286 retval.Reset(new blast::CExportStrategy(queries, opts_hndl, search_db, client_id, num_iters));
287 else
288 retval.Reset(new blast::CExportStrategy(queries, opts_hndl, search_db, client_id));
289 }
290 }
291 else
292 {
293 if (pssm.NotEmpty())
294 {
295 NCBI_THROW(CInputException, eInvalidInput,
296 "Remote PSI-BL2SEQ is not supported");
297 }
298 else
299 {
300 retval.Reset(new blast::CExportStrategy(queries, opts_hndl,
301 db_args->GetSubjects(), client_id));
302 }
303 }
304
305 return retval;
306 }
307
308
309 /// Real implementation of search strategy extraction
310 /// @todo refactor this code so that it can be reused in other contexts
311 static void
s_ExportSearchStrategy(CNcbiOstream * out,CRef<blast::IQueryFactory> queries,CRef<blast::CBlastOptionsHandle> options_handle,CRef<blast::CBlastDatabaseArgs> db_args,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)312 s_ExportSearchStrategy(CNcbiOstream* out,
313 CRef<blast::IQueryFactory> queries,
314 CRef<blast::CBlastOptionsHandle> options_handle,
315 CRef<blast::CBlastDatabaseArgs> db_args,
316 CRef<objects::CPssmWithParameters> pssm,
317 /* = CRef<objects::CPssmWithParameters>() */
318 unsigned int num_iters /* = 0 */)
319 {
320 if ( !out )
321 return;
322
323 _ASSERT(db_args);
324 _ASSERT(options_handle);
325
326 try
327 {
328 CRef<CExportStrategy> export_strategy =
329 s_InitializeExportStrategy(queries, db_args, options_handle,
330 kEmptyStr, pssm, num_iters);
331 export_strategy->ExportSearchStrategy_ASN1(out);
332 }
333 catch (const CBlastException& e)
334 {
335 if (e.GetErrCode() == CBlastException::eNotSupported) {
336 NCBI_THROW(CInputException, eInvalidInput,
337 "Saving search strategies with gi lists is currently "
338 "not supported");
339 }
340 throw;
341 }
342 }
343
344 /// Converts a list of Bioseqs into a TSeqLocVector. All Bioseqs are added to
345 /// the same CScope object
346 /// @param subjects Bioseqs to convert
347 static TSeqLocVector
s_ConvertBioseqs2TSeqLocVector(const CBlast4_subject::TSequences & subjects)348 s_ConvertBioseqs2TSeqLocVector(const CBlast4_subject::TSequences& subjects)
349 {
350 TSeqLocVector retval;
351 CRef<CScope> subj_scope(new CScope(*CObjectManager::GetInstance()));
352 ITERATE(CBlast4_subject::TSequences, bioseq, subjects) {
353 subj_scope->AddBioseq(**bioseq);
354 CRef<CSeq_id> seqid = FindBestChoice((*bioseq)->GetId(),
355 CSeq_id::BestRank);
356 const TSeqPos length = (*bioseq)->GetInst().GetLength();
357 CRef<CSeq_loc> sl(new CSeq_loc(*seqid, 0, length-1));
358 retval.push_back(SSeqLoc(sl, subj_scope));
359 }
360 return retval;
361 }
362
363 /// Import PSSM into the command line arguments object
364 static void
s_ImportPssm(const CBlast4_queries & queries,CRef<blast::CBlastOptionsHandle> opts_hndl,blast::CBlastAppArgs * cmdline_args)365 s_ImportPssm(const CBlast4_queries& queries,
366 CRef<blast::CBlastOptionsHandle> opts_hndl,
367 blast::CBlastAppArgs* cmdline_args)
368 {
369 CRef<CPssmWithParameters> pssm
370 (const_cast<CPssmWithParameters*>(&queries.GetPssm()));
371 CPsiBlastAppArgs* psi_args = NULL;
372 CTblastnAppArgs* tbn_args = NULL;
373
374 if ( (psi_args = dynamic_cast<CPsiBlastAppArgs*>(cmdline_args)) ) {
375 psi_args->SetInputPssm(pssm);
376 } else if ( (tbn_args =
377 dynamic_cast<CTblastnAppArgs*>(cmdline_args))) {
378 tbn_args->SetInputPssm(pssm);
379 } else {
380 EBlastProgramType p = opts_hndl->GetOptions().GetProgramType();
381 string msg("PSSM found in saved strategy, but not supported ");
382 msg += "for " + Blast_ProgramNameFromType(p);
383 NCBI_THROW(CBlastException, eNotSupported, msg);
384 }
385 }
386
387 /// Import queries into command line arguments object
388 static void
s_ImportQueries(const CBlast4_queries & queries,CRef<blast::CBlastOptionsHandle> opts_hndl,blast::CBlastAppArgs * cmdline_args)389 s_ImportQueries(const CBlast4_queries& queries,
390 CRef<blast::CBlastOptionsHandle> opts_hndl,
391 blast::CBlastAppArgs* cmdline_args)
392 {
393 CRef<CTmpFile> tmpfile(new CTmpFile(CTmpFile::eNoRemove));
394
395 // Stuff the query bioseq or seqloc list in the input stream of the
396 // cmdline_args
397 if (queries.IsSeq_loc_list()) {
398 const CBlast4_queries::TSeq_loc_list& seqlocs =
399 queries.GetSeq_loc_list();
400 CFastaOstream out(tmpfile->AsOutputFile(CTmpFile::eIfExists_Throw));
401 out.SetFlag(CFastaOstream::eAssembleParts);
402
403 EBlastProgramType prog = opts_hndl->GetOptions().GetProgramType();
404 SDataLoaderConfig dlconfig(!!Blast_QueryIsProtein(prog));
405 dlconfig.OptimizeForWholeLargeSequenceRetrieval();
406 CBlastScopeSource scope_src(dlconfig);
407 CRef<CScope> scope(scope_src.NewScope());
408
409 ITERATE(CBlast4_queries::TSeq_loc_list, itr, seqlocs) {
410 if ((*itr)->GetId()) {
411 CBioseq_Handle bh = scope->GetBioseqHandle(*(*itr)->GetId());
412 out.Write(bh);
413 }
414 }
415 scope.Reset();
416 scope_src.RevokeBlastDbDataLoader();
417
418 } else {
419 _ASSERT(queries.IsBioseq_set());
420 const CBlast4_queries::TBioseq_set& bioseqs =
421 queries.GetBioseq_set();
422 CFastaOstream out(tmpfile->AsOutputFile(CTmpFile::eIfExists_Throw));
423 out.SetFlag(CFastaOstream::eAssembleParts);
424
425 ITERATE(CBioseq_set::TSeq_set, seq_entry, bioseqs.GetSeq_set()){
426 out.Write(**seq_entry);
427 }
428 }
429
430 const string& fname = tmpfile->GetFileName();
431 tmpfile.Reset(new CTmpFile(fname));
432 cmdline_args->SetInputStream(tmpfile);
433 }
434
435 /// Import the database and return it in a CBlastDatabaseArgs object
436 static CRef<blast::CBlastDatabaseArgs>
s_ImportDatabase(const CBlast4_subject & subj,CBlastOptionsBuilder & opts_builder,bool subject_is_protein,bool is_remote_search)437 s_ImportDatabase(const CBlast4_subject& subj,
438 CBlastOptionsBuilder& opts_builder,
439 bool subject_is_protein,
440 bool is_remote_search)
441 {
442 _ASSERT(subj.IsDatabase());
443 CRef<CBlastDatabaseArgs> db_args(new CBlastDatabaseArgs);
444 const CSearchDatabase::EMoleculeType mol = subject_is_protein
445 ? CSearchDatabase::eBlastDbIsProtein
446 : CSearchDatabase::eBlastDbIsNucleotide;
447 const string dbname(subj.GetDatabase());
448 CRef<CSearchDatabase> search_db(new CSearchDatabase(dbname, mol));
449
450 if (opts_builder.HaveEntrezQuery()) {
451 string limit(opts_builder.GetEntrezQuery());
452 search_db->SetEntrezQueryLimitation(limit);
453 if ( !is_remote_search ) {
454 string msg("Entrez query '");
455 msg += limit + string("' will not be processed locally.\n");
456 msg += string("Please use the -remote option.");
457 throw runtime_error(msg);
458 }
459 }
460
461 if (opts_builder.HaveGiList() || opts_builder.HaveTaxidList()) {
462 CSeqDBGiList *gilist = new CSeqDBGiList();
463 if (opts_builder.HaveGiList()) {
464 ITERATE(list<TGi>, gi, opts_builder.GetGiList()) {
465 gilist->AddGi(*gi);
466 }
467 }
468 if (opts_builder.HaveTaxidList()) {
469 list<TTaxId> list = opts_builder.GetTaxidList();
470 set<TTaxId> taxids(list.begin(), list.end());
471 gilist->AddTaxIds(taxids);
472 }
473 search_db->SetGiList(gilist);
474 }
475
476 if (opts_builder.HaveNegativeGiList() || opts_builder.HaveNegativeTaxidList()) {
477 CSeqDBGiList *gilist = new CSeqDBGiList();
478 if (opts_builder.HaveNegativeGiList()) {
479 ITERATE(list<TGi>, gi, opts_builder.GetNegativeGiList()) {
480 gilist->AddGi(*gi);
481 }
482 }
483 if (opts_builder.HaveNegativeTaxidList()) {
484 list<TTaxId> list = opts_builder.GetNegativeTaxidList();
485 set<TTaxId> taxids(list.begin(), list.end());
486 gilist->AddTaxIds(taxids);
487 }
488 search_db->SetNegativeGiList(gilist);
489 }
490
491 if (opts_builder.HasDbFilteringAlgorithmKey()) {
492 string algo_key = opts_builder.GetDbFilteringAlgorithmKey();
493 ESubjectMaskingType mask_type= eSoftSubjMasking;
494 if(opts_builder.HasSubjectMaskingType())
495 mask_type = opts_builder.GetSubjectMaskingType();
496 search_db->SetFilteringAlgorithm(algo_key, mask_type);
497
498 } else if (opts_builder.HasDbFilteringAlgorithmId()) {
499 int algo_id = opts_builder.GetDbFilteringAlgorithmId();
500 ESubjectMaskingType mask_type= eSoftSubjMasking;
501 if(opts_builder.HasSubjectMaskingType())
502 mask_type = opts_builder.GetSubjectMaskingType();
503 search_db->SetFilteringAlgorithm(algo_id, mask_type);
504 }
505
506 db_args->SetSearchDatabase(search_db);
507 return db_args;
508 }
509
510 /// Import the subject sequences into a CBlastDatabaseArgs object
511 static CRef<blast::CBlastDatabaseArgs>
s_ImportSubjects(const CBlast4_subject & subj,bool subject_is_protein)512 s_ImportSubjects(const CBlast4_subject& subj, bool subject_is_protein)
513 {
514 _ASSERT(subj.IsSequences());
515 CRef<CBlastDatabaseArgs> db_args(new CBlastDatabaseArgs);
516 TSeqLocVector subjects =
517 s_ConvertBioseqs2TSeqLocVector(subj.GetSequences());
518 CRef<CScope> subj_scope = subjects.front().scope;
519 CRef<IQueryFactory> subject_factory(new CObjMgr_QueryFactory(subjects));
520 db_args->SetSubjects(subject_factory, subj_scope, subject_is_protein);
521 return db_args;
522 }
523
524 /// Imports search strategy, using CImportStrategy.
525 static void
s_ImportSearchStrategy(CNcbiIstream * in,blast::CBlastAppArgs * cmdline_args,bool is_remote_search,bool override_query,bool override_subject)526 s_ImportSearchStrategy(CNcbiIstream* in,
527 blast::CBlastAppArgs* cmdline_args,
528 bool is_remote_search,
529 bool override_query,
530 bool override_subject)
531 {
532 if ( !in ) {
533 return;
534 }
535
536 CRef<CBlast4_request> b4req;
537 try {
538 b4req = ExtractBlast4Request(*in);
539 } catch (const CSerialException&) {
540 NCBI_THROW(CInputException, eInvalidInput,
541 "Failed to read search strategy");
542 }
543
544 CImportStrategy strategy(b4req);
545
546 CRef<blast::CBlastOptionsHandle> opts_hndl = strategy.GetOptionsHandle();
547 cmdline_args->SetOptionsHandle(opts_hndl);
548 const EBlastProgramType prog = opts_hndl->GetOptions().GetProgramType();
549 cmdline_args->SetTask(strategy.GetTask());
550 #if _DEBUG
551 {
552 char* program_string = 0;
553 BlastNumber2Program(prog, &program_string);
554 _TRACE("EBlastProgramType=" << program_string << " task=" << strategy.GetTask());
555 sfree(program_string);
556 }
557 #endif
558
559 // Get the subject
560 if (override_subject) {
561 ERR_POST(Warning << "Overriding database/subject in saved strategy");
562 } else {
563 CRef<blast::CBlastDatabaseArgs> db_args;
564 CRef<CBlast4_subject> subj = strategy.GetSubject();
565 const bool subject_is_protein = Blast_SubjectIsProtein(prog) ? true : false;
566
567 if (subj->IsDatabase()) {
568 db_args = s_ImportDatabase(*subj, strategy.GetOptionsBuilder(),
569 subject_is_protein, is_remote_search);
570 } else {
571 db_args = s_ImportSubjects(*subj, subject_is_protein);
572 }
573 _ASSERT(db_args.NotEmpty());
574 cmdline_args->SetBlastDatabaseArgs(db_args);
575 }
576
577 // Get the query, queries, or pssm
578 if (override_query) {
579 ERR_POST(Warning << "Overriding query in saved strategy");
580 } else {
581 CRef<CBlast4_queries> queries = strategy.GetQueries();
582 if (queries->IsPssm()) {
583 s_ImportPssm(*queries, opts_hndl, cmdline_args);
584 } else {
585 s_ImportQueries(*queries, opts_hndl, cmdline_args);
586 }
587 // Set the range restriction for the query, if applicable
588 const TSeqRange query_range = strategy.GetQueryRange();
589 if (query_range != TSeqRange::GetEmpty()) {
590 cmdline_args->GetQueryOptionsArgs()->SetRange(query_range);
591 }
592 }
593
594 if ( CPsiBlastAppArgs* psi_args = dynamic_cast<CPsiBlastAppArgs*>(cmdline_args) )
595 {
596 psi_args->SetNumberOfIterations(strategy.GetPsiNumOfIterations());
597 }
598 }
599
600 bool
RecoverSearchStrategy(const CArgs & args,blast::CBlastAppArgs * cmdline_args)601 RecoverSearchStrategy(const CArgs& args, blast::CBlastAppArgs* cmdline_args)
602 {
603 CNcbiIstream* in = cmdline_args->GetImportSearchStrategyStream(args);
604 if ( !in ) {
605 return false;
606 }
607 const bool is_remote_search =
608 (args.Exist(kArgRemote) && args[kArgRemote].HasValue() && args[kArgRemote].AsBoolean());
609 const bool override_query = (args[kArgQuery].HasValue() &&
610 args[kArgQuery].AsString() != kDfltArgQuery);
611 const bool override_subject = CBlastDatabaseArgs::HasBeenSet(args);
612
613 if (CMbIndexArgs::HasBeenSet(args)) {
614 if (args[kArgUseIndex].AsBoolean() != kDfltArgUseIndex)
615 ERR_POST(Warning << "Overriding megablast BLAST DB indexed options in saved strategy");
616 }
617
618 s_ImportSearchStrategy(in, cmdline_args, is_remote_search, override_query,
619 override_subject);
620
621 return true;
622 }
623
624 // Process search strategies
625 // FIXME: save program options,
626 // Save task if provided, no other options (only those in the cmd line) should
627 // be saved
628 void
SaveSearchStrategy(const CArgs & args,blast::CBlastAppArgs * cmdline_args,CRef<blast::IQueryFactory> queries,CRef<blast::CBlastOptionsHandle> opts_hndl,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)629 SaveSearchStrategy(const CArgs& args,
630 blast::CBlastAppArgs* cmdline_args,
631 CRef<blast::IQueryFactory> queries,
632 CRef<blast::CBlastOptionsHandle> opts_hndl,
633 CRef<objects::CPssmWithParameters> pssm
634 /* = CRef<objects::CPssmWithParameters>() */,
635 unsigned int num_iters /* =0 */)
636 {
637 CNcbiOstream* out = cmdline_args->GetExportSearchStrategyStream(args);
638 if ( !out ) {
639 return;
640 }
641
642 s_ExportSearchStrategy(out, queries, opts_hndl,
643 cmdline_args->GetBlastDatabaseArgs(),
644 pssm, num_iters);
645 }
646
647 /// Extracts the subject sequence IDs and ranges from the BLAST results
648 /// @note if this ever needs to be refactored for popular developer
649 /// consumption, this function should operate on CSeq_align_set as opposed to
650 /// blast::CSearchResultSet
651 static void
s_ExtractSeqidsAndRanges(const blast::CSearchResultSet & results,CScope::TIds & ids,vector<TSeqRange> & ranges)652 s_ExtractSeqidsAndRanges(const blast::CSearchResultSet& results,
653 CScope::TIds& ids, vector<TSeqRange>& ranges)
654 {
655 static const CSeq_align::TDim kQueryRow = 0;
656 static const CSeq_align::TDim kSubjRow = 1;
657 ids.clear();
658 ranges.clear();
659
660 typedef map< CConstRef<CSeq_id>,
661 vector<TSeqRange>,
662 CConstRefCSeqId_LessThan
663 > TSeqIdRanges;
664 TSeqIdRanges id_ranges;
665
666 ITERATE(blast::CSearchResultSet, result, results) {
667 if ( !(*result)->HasAlignments() ) {
668 continue;
669 }
670 ITERATE(CSeq_align_set::Tdata, aln, (*result)->GetSeqAlign()->Get()) {
671 CConstRef<CSeq_id> subj(&(*aln)->GetSeq_id(kSubjRow));
672 TSeqRange subj_range((*aln)->GetSeqRange(kSubjRow));
673 if ((*aln)->GetSeqStrand(kQueryRow) == eNa_strand_minus &&
674 (*aln)->GetSeqStrand(kSubjRow) == eNa_strand_plus) {
675 TSeqRange r(subj_range);
676 // flag the range as needed to be flipped once the sequence
677 // length is known
678 subj_range.SetFrom(r.GetToOpen());
679 subj_range.SetToOpen(r.GetFrom());
680 }
681 id_ranges[subj].push_back(subj_range);
682 }
683 }
684
685 ITERATE(TSeqIdRanges, itr, id_ranges) {
686 ITERATE(vector<TSeqRange>, range, itr->second) {
687 ids.push_back(CSeq_id_Handle::GetHandle(*itr->first));
688 ranges.push_back(*range);
689 }
690 }
691 _ASSERT(ids.size() == ranges.size());
692 }
693
694 /// Returns true if the remote BLAST DB data loader is being used
695 static bool
s_IsUsingRemoteBlastDbDataLoader()696 s_IsUsingRemoteBlastDbDataLoader()
697 {
698 CObjectManager::TRegisteredNames data_loaders;
699 CObjectManager::GetInstance()->GetRegisteredNames(data_loaders);
700 ITERATE(CObjectManager::TRegisteredNames, name, data_loaders) {
701 if (NStr::StartsWith(*name, objects::CRemoteBlastDbDataLoader::kNamePrefix)) {
702 return true;
703 }
704 }
705 return false;
706 }
707
708 static bool
s_IsPrefetchFormat(blast::CFormattingArgs::EOutputFormat format_type)709 s_IsPrefetchFormat(blast::CFormattingArgs::EOutputFormat format_type)
710 {
711 if ((format_type == CFormattingArgs::eAsnText) ||
712 (format_type == CFormattingArgs::eAsnBinary) ||
713 (format_type == CFormattingArgs::eArchiveFormat)||
714 (format_type == CFormattingArgs::eJsonSeqalign)) {
715 return false;
716 }
717 return true;
718 }
719
720 static bool
s_PreFetchSeqs(const blast::CSearchResultSet & results,blast::CFormattingArgs::EOutputFormat format_type)721 s_PreFetchSeqs(const blast::CSearchResultSet& results,
722 blast::CFormattingArgs::EOutputFormat format_type)
723 {
724 {
725 char * pre_fetch_limit_str = getenv("PRE_FETCH_SEQS_LIMIT");
726 if (pre_fetch_limit_str) {
727 int pre_fetch_limit = NStr::StringToInt(pre_fetch_limit_str);
728 if(pre_fetch_limit == 0) {
729 return false;
730 }
731 if(pre_fetch_limit == INT_MAX){
732 return true;
733 }
734 int num_of_seqs = 0;
735 for(unsigned int i=0; i < results.GetNumResults(); i++) {
736 if(results[i].HasAlignments()) {
737 num_of_seqs += results[i].GetSeqAlign()->Size();
738 }
739 }
740 if(num_of_seqs > pre_fetch_limit) {
741 return false;
742 }
743 }
744 }
745
746 return s_IsPrefetchFormat(format_type);
747 }
748
BlastFormatter_PreFetchSequenceData(const blast::CSearchResultSet & results,CRef<CScope> scope,blast::CFormattingArgs::EOutputFormat format_type)749 void BlastFormatter_PreFetchSequenceData(const blast::CSearchResultSet& results,
750 CRef<CScope> scope,
751 blast::CFormattingArgs::EOutputFormat format_type)
752 {
753 _ASSERT(scope.NotEmpty());
754 if (results.size() == 0) {
755 return;
756 }
757 if(!s_PreFetchSeqs(results, format_type)){
758 return;
759 }
760 try {
761 CScope::TIds ids;
762 vector<TSeqRange> ranges;
763 s_ExtractSeqidsAndRanges(results, ids, ranges);
764 _TRACE("Prefetching " << ids.size() << " sequence lengths");
765 LoadSequencesToScope(ids, ranges, scope);
766 } catch (CException& e) {
767 if(s_IsUsingRemoteBlastDbDataLoader()) {
768 NCBI_THROW(CBlastSystemException, eNetworkError,
769 "Error fetching sequence data from BLAST databases at NCBI, "
770 "please try again later");
771 }
772 else {
773 NCBI_RETHROW_SAME(e, "Error pre-fetching sequence data ");
774 }
775 }
776
777 }
778
779 /// Auxiliary function to extract the ancillary data from the PSSM.
780 CRef<CBlastAncillaryData>
ExtractPssmAncillaryData(const CPssmWithParameters & pssm)781 ExtractPssmAncillaryData(const CPssmWithParameters& pssm)
782 {
783 _ASSERT(pssm.CanGetPssm());
784 pair<double, double> lambda, k, h;
785 lambda.first = pssm.GetPssm().GetLambdaUngapped();
786 lambda.second = pssm.GetPssm().GetLambda();
787 k.first = pssm.GetPssm().GetKappaUngapped();
788 k.second = pssm.GetPssm().GetKappa();
789 h.first = pssm.GetPssm().GetHUngapped();
790 h.second = pssm.GetPssm().GetH();
791 return CRef<CBlastAncillaryData>(new CBlastAncillaryData(lambda, k, h, 0,
792 true));
793 }
794
795 void
CheckForFreqRatioFile(const string & rps_dbname,CRef<CBlastOptionsHandle> & opt_handle,bool isRpsblast)796 CheckForFreqRatioFile(const string& rps_dbname, CRef<CBlastOptionsHandle> & opt_handle, bool isRpsblast)
797 {
798 bool use_cbs = (opt_handle->GetOptions().GetCompositionBasedStats() == eNoCompositionBasedStats) ? false : true;
799 if(use_cbs) {
800 vector<string> db;
801 NStr::Split(rps_dbname, " ", db);
802 list<string> failed_db;
803 for (unsigned int i=0; i < db.size(); i++) {
804 string path;
805 try {
806 vector<string> dbpath;
807 CSeqDB::FindVolumePaths(db[i], CSeqDB::eProtein, dbpath);
808 path = *dbpath.begin();
809 } catch (const CSeqDBException& e) {
810 NCBI_RETHROW(e, CBlastException, eRpsInit,
811 "Cannot retrieve path to RPS database");
812 }
813
814 CFile f(path + ".freq");
815 if(!f.Exists()) {
816 failed_db.push_back(db[i]);
817 }
818
819 }
820 if(!failed_db.empty()) {
821 opt_handle->SetOptions().SetCompositionBasedStats(eNoCompositionBasedStats);
822 string all_failed = NStr::Join(failed_db, ", ");
823 string prog_str = isRpsblast ? "RPSBLAST": "DELTABLAST";
824 string msg = all_failed + " contain(s) no freq ratios " \
825 + "needed for composition-based statistics.\n" \
826 + prog_str + " will be run without composition-based statistics.";
827 ERR_POST(Warning << msg);
828 }
829
830 }
831 return;
832 }
833
834 bool
IsIStreamEmpty(CNcbiIstream & in)835 IsIStreamEmpty(CNcbiIstream & in)
836 {
837 #ifdef NCBI_OS_MSWIN
838 char c;
839 in.setf(ios::skipws);
840 if (!(in >> c))
841 return true;
842 in.unget();
843 return false;
844 #else
845 char c;
846 CNcbiStreampos orig_p = in.tellg();
847 // Piped input
848 if(orig_p < 0)
849 return false;
850
851 IOS_BASE::iostate orig_state = in.rdstate();
852 IOS_BASE::fmtflags orig_flags = in.setf(ios::skipws);
853
854 if(! (in >> c))
855 return true;
856
857 in.seekg(orig_p);
858 in.flags(orig_flags);
859 in.clear();
860 in.setstate(orig_state);
861
862 return false;
863 #endif
864 }
865
866 string
GetCmdlineArgs(const CNcbiArguments & a)867 GetCmdlineArgs(const CNcbiArguments & a)
868 {
869 string cmd = kEmptyStr;
870 for(unsigned int i=0; i < a.Size(); i++) {
871 cmd += a[i] + " ";
872 }
873 return cmd;
874 }
875
876 bool
UseXInclude(const CFormattingArgs & f,const string & s)877 UseXInclude(const CFormattingArgs & f, const string & s)
878 {
879 CFormattingArgs::EOutputFormat fmt = f.GetFormattedOutputChoice();
880 if((fmt == CFormattingArgs::eXml2) || (fmt == CFormattingArgs::eJson)) {
881 if (s == "-"){
882 string f_str = (fmt == CFormattingArgs::eXml2) ? "14.": "13.";
883 NCBI_THROW(CInputException, eEmptyUserInput,
884 "Please provide a file name for outfmt " + f_str);
885 }
886 return true;
887 }
888 return false;
889 }
890
891 string
GetSubjectFile(const CArgs & args)892 GetSubjectFile(const CArgs& args)
893 {
894 string filename="";
895
896 if (args.Exist(kArgSubject) && args[kArgSubject].HasValue())
897 filename = args[kArgSubject].AsString();
898
899 return filename;
900 }
901
PrintErrorArchive(const CArgs & a,const list<CRef<CBlast4_error>> & msg)902 void PrintErrorArchive(const CArgs & a, const list<CRef<CBlast4_error> > & msg)
903 {
904 try {
905 if(NStr::StringToInt(NStr::TruncateSpaces(a[align_format::kArgOutputFormat].AsString())) == CFormattingArgs::eArchiveFormat) {
906 CRef<CBlast4_archive> archive (new CBlast4_archive);
907
908 CBlast4_request & req = archive->SetRequest();
909 CBlast4_get_request_info_request & info= req.SetBody().SetGet_request_info();
910 info.SetRequest_id("Error");
911 CBlast4_get_search_results_reply & results = archive->SetResults();
912 // Pacify unused varaible warning, the set above is used to populate mandatory field
913 (void) results;
914 archive->SetMessages() = msg;
915 CBlastFormat::PrintArchive(archive, a[kArgOutput].AsOutputFile());
916 }
917 } catch (...) {}
918 }
919
QueryBatchCleanup()920 void QueryBatchCleanup()
921 {
922 #if defined(NCBI_OS_LINUX) && HAVE_MALLOC_H
923 malloc_trim(0);
924 #endif
925 return;
926
927 }
928
LogQueryInfo(CBlastUsageReport & report,const CBlastInput & q_info)929 void LogQueryInfo(CBlastUsageReport & report, const CBlastInput & q_info)
930 {
931 report.AddParam(CBlastUsageReport::eTotalQueryLength, q_info.GetTotalLengthProcessed());
932 report.AddParam(CBlastUsageReport::eNumQueries, q_info.GetNumSeqsProcessed());
933 }
934
935
LogRPSBlastOptions(blast::CBlastUsageReport & report,const CBlastOptions & opt)936 void LogRPSBlastOptions(blast::CBlastUsageReport & report, const CBlastOptions & opt)
937 {
938 report.AddParam(CBlastUsageReport::eProgram, Blast_ProgramNameFromType(opt.GetProgramType()));
939 report.AddParam(CBlastUsageReport::eEvalueThreshold, opt.GetEvalueThreshold());
940 report.AddParam(CBlastUsageReport::eHitListSize, opt.GetHitlistSize());
941 report.AddParam(CBlastUsageReport::eCompBasedStats, opt.GetCompositionBasedStats());
942 }
943
LogRPSCmdOptions(blast::CBlastUsageReport & report,const CBlastAppArgs & args)944 void LogRPSCmdOptions(blast::CBlastUsageReport & report, const CBlastAppArgs & args)
945 {
946 if (args.GetBlastDatabaseArgs().NotEmpty() &&
947 args.GetBlastDatabaseArgs()->GetSearchDatabase().NotEmpty() &&
948 args.GetBlastDatabaseArgs()->GetSearchDatabase()->GetSeqDb().NotEmpty()) {
949
950 CRef<CSeqDB> db = args.GetBlastDatabaseArgs()->GetSearchDatabase()->GetSeqDb();
951 string db_name = db->GetDBNameList();
952 int off = db_name.find_last_of(CFile::GetPathSeparator());
953 if (off != -1) {
954 db_name.erase(0, off+1);
955 }
956 report.AddParam(CBlastUsageReport::eDBName, db_name);
957 report.AddParam(CBlastUsageReport::eDBLength, (Int8) db->GetTotalLength());
958 report.AddParam(CBlastUsageReport::eDBNumSeqs, db->GetNumSeqs());
959 report.AddParam(CBlastUsageReport::eDBDate, db->GetDate());
960 }
961
962 if(args.GetFormattingArgs().NotEmpty()){
963 report.AddParam(CBlastUsageReport::eOutputFmt, args.GetFormattingArgs()->GetFormattedOutputChoice());
964 }
965 }
966
GetMTByQueriesBatchSize(EProgram p,int num_threads)967 int GetMTByQueriesBatchSize(EProgram p, int num_threads)
968 {
969 int batch_size = 0;
970
971 char * mt_query_batch_env = getenv("BLAST_MT_QUERY_BATCH_SIZE");
972 if (mt_query_batch_env) {
973 batch_size = NStr::StringToInt(mt_query_batch_env);
974 }
975 else {
976 int factor = 1;
977 if ( EProgramToEBlastProgramType(p) == eBlastTypeBlastn ) {
978 factor = 2;
979 }
980 batch_size = GetQueryBatchSize(p)/factor;
981 }
982 return batch_size;
983 }
984
CheckMTByQueries_DBSize(CRef<CLocalDbAdapter> & db_adapter,const CBlastOptions & opt)985 void CheckMTByQueries_DBSize(CRef<CLocalDbAdapter> & db_adapter, const CBlastOptions & opt)
986 {
987 CRef<CSearchDatabase> sdb = db_adapter->GetSearchDatabase();
988 const CRef<CSeqDBGiList>& gi = sdb->GetGiList();
989 const CRef<CSeqDBGiList>& n_gi = sdb->GetNegativeGiList();
990
991 if (gi.NotEmpty() || n_gi.NotEmpty()) {
992 return;
993 }
994
995 //EProgram prog = opt.GetProgram();
996 CRef<CSeqDB> seqdb = sdb->GetSeqDb();
997 Uint8 total_length = seqdb->GetTotalLength();
998 Uint8 length_limit = 0;
999
1000 if (db_adapter->IsProtein()) {
1001 static const Uint8 kMaxProtTotalLength = 2000000000;
1002 length_limit = kMaxProtTotalLength;
1003 }
1004 else {
1005 static const Uint8 kMaxNuclTotalLength = 14000000000;
1006 length_limit = kMaxNuclTotalLength;
1007 }
1008
1009 if(total_length > length_limit) {
1010 string warn = "This database is probably too large to benefit from -mt_mode=1. " \
1011 "We suggest using -mt_mode=1 only if the database is less than";
1012 if (db_adapter->IsProtein()) {
1013 warn += " 2 billion residues ";
1014 }
1015 else {
1016 warn += " 14 billion bases ";
1017 }
1018 ERR_POST(Warning << warn + "or if the search is limited by an option such as -taxids, -taxidlist or -gilist.");
1019 }
1020 return;
1021 }
1022
CheckMTByQueries_QuerySize(EProgram prog,int batch_size)1023 void CheckMTByQueries_QuerySize(EProgram prog, int batch_size)
1024 {
1025 string warning = "This set of queries is too small to fully benefit from the -mt_mode=1 option. " \
1026 "The total number of letters should be at least ";
1027 warning += NStr::IntToString(batch_size);
1028 EBlastProgramType p = EProgramToEBlastProgramType(prog);
1029 if (Blast_QueryIsProtein(p)) {
1030 warning += " residues";
1031 }
1032 else {
1033 warning += " bases";
1034 }
1035 warning += " per thread, and there should be at least one query of this length per thread.";
1036 ERR_POST(Warning << warning);
1037 }
1038
1039 END_NCBI_SCOPE
1040