1 /*  $Id: blast_app_util.cpp 637481 2021-09-13 18:27:32Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blast_app_util.cpp
31  *  Utility functions for BLAST command line applications
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include "blast_app_util.hpp"
36 
37 #include <serial/serial.hpp>
38 #include <serial/objostr.hpp>
39 
40 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
41 #include <algo/blast/api/remote_blast.hpp>
42 #include <algo/blast/api/objmgr_query_data.hpp>     // for CObjMgr_QueryFactory
43 #include <algo/blast/api/blast_options_builder.hpp>
44 #include <algo/blast/api/search_strategy.hpp>
45 #include <algo/blast/blastinput/blast_input.hpp>    // for CInputException
46 #include <algo/blast/blastinput/psiblast_args.hpp>
47 #include <algo/blast/blastinput/tblastn_args.hpp>
48 #include <algo/blast/blastinput/blast_scope_src.hpp>
49 #include <objmgr/util/sequence.hpp>
50 
51 #include <objects/scoremat/Pssm.hpp>
52 #include <serial/typeinfo.hpp>      // for CTypeInfo, needed by SerialClone
53 #include <objtools/data_loaders/blastdb/bdbloader_rmt.hpp>
54 #include <algo/blast/format/blast_format.hpp>
55 #include <objtools/align_format/format_flags.hpp>
56 
57 #if defined(NCBI_OS_LINUX) && HAVE_MALLOC_H
58 #include <malloc.h>
59 #endif
60 
61 BEGIN_NCBI_SCOPE
62 USING_SCOPE(objects);
63 USING_SCOPE(blast);
64 
GetBatchSize(Int4 hits)65 Int4 CBatchSizeMixer::GetBatchSize(Int4 hits)
66 {
67      if (hits >= 0) {
68          double ratio = 1.0 * (hits+1) / m_BatchSize;
69          m_Ratio = (m_Ratio < 0) ? ratio
70                  : k_MixIn * ratio + (1.0 - k_MixIn) * m_Ratio;
71          m_BatchSize = (Int4) (1.0 * m_TargetHits / m_Ratio);
72      }
73      if (m_BatchSize > k_MaxBatchSize) {
74          m_BatchSize = k_MaxBatchSize;
75          m_Ratio = -1.0;
76      } else if (m_BatchSize < k_MinBatchSize) {
77          m_BatchSize = k_MinBatchSize;
78          m_Ratio = -1.0;
79      }
80      return m_BatchSize;
81 }
82 
83 CRef<blast::CRemoteBlast>
InitializeRemoteBlast(CRef<blast::IQueryFactory> queries,CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,bool verbose_output,const string & client_id,CRef<objects::CPssmWithParameters> pssm)84 InitializeRemoteBlast(CRef<blast::IQueryFactory> queries,
85                       CRef<blast::CBlastDatabaseArgs> db_args,
86                       CRef<blast::CBlastOptionsHandle> opts_hndl,
87                       bool verbose_output,
88                       const string& client_id /* = kEmptyStr */,
89                       CRef<objects::CPssmWithParameters> pssm
90                         /* = CRef<objects::CPssmWithParameters>() */)
91 {
92     _ASSERT(queries || pssm);
93     _ASSERT(db_args);
94     _ASSERT(opts_hndl);
95 
96     CRef<CRemoteBlast> retval;
97 
98     CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
99     if (search_db.NotEmpty()) {
100         if (pssm.NotEmpty()) {
101             _ASSERT(queries.Empty());
102             retval.Reset(new CRemoteBlast(pssm, opts_hndl, *search_db));
103         } else {
104             retval.Reset(new CRemoteBlast(queries, opts_hndl, *search_db));
105         }
106     } else {
107         if (pssm.NotEmpty()) {
108             NCBI_THROW(CInputException, eInvalidInput,
109                        "Remote PSI-BL2SEQ is not supported");
110         } else {
111             // N.B.: there is NO scope needed in the GetSubjects call because
112             // the subjects (if any) should have already been added in
113             // InitializeSubject
114             retval.Reset(new CRemoteBlast(queries, opts_hndl,
115                                          db_args->GetSubjects()));
116         }
117     }
118     if (verbose_output) {
119         retval->SetVerbose();
120     }
121     if (client_id != kEmptyStr) {
122         retval->SetClientId(client_id);
123     }
124     return retval;
125 }
126 
127 blast::SDataLoaderConfig
InitializeQueryDataLoaderConfiguration(bool query_is_protein,CRef<blast::CLocalDbAdapter> db_adapter)128 InitializeQueryDataLoaderConfiguration(bool query_is_protein,
129                                        CRef<blast::CLocalDbAdapter> db_adapter)
130 {
131     SDataLoaderConfig retval(query_is_protein);
132     retval.OptimizeForWholeLargeSequenceRetrieval();
133 
134     /* Load the BLAST database into the data loader configuration for the query
135      * so that if the query sequence(s) are specified as seq-ids, these can be
136      * fetched from the BLAST database being searched */
137     if (db_adapter->IsBlastDb() &&  /* this is a BLAST database search */
138         retval.m_UseBlastDbs &&   /* the BLAST database data loader is requested */
139         (query_is_protein == db_adapter->IsProtein())) { /* the same database type is used for both queries and subjects */
140         // Make sure we don't add the same database more than once
141         vector<string> default_dbs;
142         NStr::Split(retval.m_BlastDbName, " ", default_dbs);
143         if (default_dbs.size() &&
144             (find(default_dbs.begin(), default_dbs.end(),
145                  db_adapter->GetDatabaseName()) == default_dbs.end())) {
146             CNcbiOstrstream oss;
147             oss << db_adapter->GetDatabaseName() << " " << retval.m_BlastDbName;
148             retval.m_BlastDbName = CNcbiOstrstreamToString(oss);
149         }
150     }
151     if (retval.m_UseBlastDbs) {
152         _TRACE("Initializing query data loader to '" << retval.m_BlastDbName
153                << "' (" << (query_is_protein ? "protein" : "nucleotide")
154                << " BLAST database)");
155     }
156     if (retval.m_UseGenbank) {
157         _TRACE("Initializing query data loader to use GenBank data loader");
158     }
159     return retval;
160 }
161 
162 void
InitializeSubject(CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,bool is_remote_search,CRef<blast::CLocalDbAdapter> & db_adapter,CRef<objects::CScope> & scope)163 InitializeSubject(CRef<blast::CBlastDatabaseArgs> db_args,
164                   CRef<blast::CBlastOptionsHandle> opts_hndl,
165                   bool is_remote_search,
166                   CRef<blast::CLocalDbAdapter>& db_adapter,
167                   CRef<objects::CScope>& scope)
168 {
169 	string dl = kEmptyStr;
170     db_adapter.Reset();
171 
172     _ASSERT(db_args.NotEmpty());
173     CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
174 
175     // Initialize the scope...
176     if (is_remote_search) {
177         const bool is_protein =
178             Blast_SubjectIsProtein(opts_hndl->GetOptions().GetProgramType())
179 			? true : false;
180         SDataLoaderConfig config(is_protein);
181         CBlastScopeSource scope_src(config);
182         // configure scope to fetch sequences remotely for formatting
183         if (scope.NotEmpty()) {
184             scope_src.AddDataLoaders(scope);
185         } else {
186             scope = scope_src.NewScope();
187         }
188     } else {
189         if (scope.Empty()) {
190             scope.Reset(new CScope(*CObjectManager::GetInstance()));
191         }
192     }
193     _ASSERT(scope.NotEmpty());
194 
195     // ... and then the subjects
196     CRef<IQueryFactory> subjects;
197     if ( (subjects = db_args->GetSubjects(scope)) ) {
198         _ASSERT(search_db.Empty());
199 	char* bl2seq_legacy = getenv("BL2SEQ_LEGACY");
200 	if (bl2seq_legacy)
201         	db_adapter.Reset(new CLocalDbAdapter(subjects, opts_hndl, false));
202 	else
203         	db_adapter.Reset(new CLocalDbAdapter(subjects, opts_hndl, true));
204     } else {
205         _ASSERT(search_db.NotEmpty());
206         try {
207             // Try to open the BLAST database even for remote searches, as if
208             // it is available locally, it will be better to fetch the
209             // sequence data for formatting from this (local) source
210             CRef<CSeqDB> seqdb = search_db->GetSeqDb();
211             db_adapter.Reset(new CLocalDbAdapter(*search_db));
212             dl = RegisterOMDataLoader(seqdb);
213             scope->AddDataLoader(dl);
214         } catch (const CSeqDBException&) {
215             // The BLAST database couldn't be found, report this for local
216             // searches, but for remote searches go on.
217         	dl = kEmptyStr;
218             if (is_remote_search ) {
219                 db_adapter.Reset(new CLocalDbAdapter(*search_db));
220             } else {
221                 throw;
222             }
223         }
224     }
225 
226     /// Set the BLASTDB data loader as the default data loader (if applicable)
227     if (search_db.NotEmpty()) {
228         if ( dl != kEmptyStr) {
229             // FIXME: will this work with multiple BLAST DBs?
230             scope->AddDataLoader(dl,  CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
231             _TRACE("Setting " << dl << " priority to "
232                    << (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority
233                    << " for subjects");
234         }
235     }
236     return;
237 }
238 
RegisterOMDataLoader(CRef<CSeqDB> db_handle)239 string RegisterOMDataLoader(CRef<CSeqDB> db_handle)
240 {
241     // the blast formatter requires that the database coexist in
242     // the same scope with the query sequences
243     CRef<CObjectManager> om = CObjectManager::GetInstance();
244     CBlastDbDataLoader::RegisterInObjectManager(*om, db_handle, true,
245                         CObjectManager::eDefault,
246                         CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
247     CBlastDbDataLoader::SBlastDbParam param(db_handle);
248     string retval(CBlastDbDataLoader::GetLoaderNameFromArgs(param));
249     _TRACE("Registering " << retval << " at priority " <<
250            (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority
251            << " for subjects");
252     return retval;
253 }
254 
255 
256 static CRef<blast::CExportStrategy>
s_InitializeExportStrategy(CRef<blast::IQueryFactory> queries,CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,const string & client_id,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)257 s_InitializeExportStrategy(CRef<blast::IQueryFactory> queries,
258                       	 CRef<blast::CBlastDatabaseArgs> db_args,
259                       	 CRef<blast::CBlastOptionsHandle> opts_hndl,
260                       	 const string& client_id /* = kEmptyStr */,
261                       	 CRef<objects::CPssmWithParameters> pssm
262                          /* = CRef<objects::CPssmWithParameters>() */,
263                          unsigned int num_iters
264                          /* = 0 */)
265 {
266     _ASSERT(queries || pssm);
267     _ASSERT(db_args);
268     _ASSERT(opts_hndl);
269 
270     CRef<CExportStrategy> retval;
271 
272     CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
273     if (search_db.NotEmpty())
274     {
275         if (pssm.NotEmpty())
276         {
277             _ASSERT(queries.Empty());
278             if(num_iters != 0)
279             	retval.Reset(new blast::CExportStrategy(pssm, opts_hndl, search_db, client_id, num_iters));
280             else
281             	retval.Reset(new blast::CExportStrategy(pssm, opts_hndl, search_db, client_id));
282         }
283         else
284         {
285             if(num_iters != 0)
286             	retval.Reset(new blast::CExportStrategy(queries, opts_hndl, search_db, client_id, num_iters));
287             else
288             	retval.Reset(new blast::CExportStrategy(queries, opts_hndl, search_db, client_id));
289         }
290     }
291     else
292     {
293         if (pssm.NotEmpty())
294         {
295             NCBI_THROW(CInputException, eInvalidInput,
296                        "Remote PSI-BL2SEQ is not supported");
297         }
298         else
299         {
300             retval.Reset(new blast::CExportStrategy(queries, opts_hndl,
301             								 db_args->GetSubjects(), client_id));
302         }
303     }
304 
305     return retval;
306 }
307 
308 
309 /// Real implementation of search strategy extraction
310 /// @todo refactor this code so that it can be reused in other contexts
311 static void
s_ExportSearchStrategy(CNcbiOstream * out,CRef<blast::IQueryFactory> queries,CRef<blast::CBlastOptionsHandle> options_handle,CRef<blast::CBlastDatabaseArgs> db_args,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)312 s_ExportSearchStrategy(CNcbiOstream* out,
313                      CRef<blast::IQueryFactory> queries,
314                      CRef<blast::CBlastOptionsHandle> options_handle,
315                      CRef<blast::CBlastDatabaseArgs> db_args,
316                      CRef<objects::CPssmWithParameters> pssm,
317                        /* = CRef<objects::CPssmWithParameters>() */
318                      unsigned int num_iters /* = 0 */)
319 {
320     if ( !out )
321         return;
322 
323     _ASSERT(db_args);
324     _ASSERT(options_handle);
325 
326     try
327     {
328         CRef<CExportStrategy> export_strategy =
329         			s_InitializeExportStrategy(queries, db_args, options_handle,
330                                   	 	 	   kEmptyStr, pssm, num_iters);
331         export_strategy->ExportSearchStrategy_ASN1(out);
332     }
333     catch (const CBlastException& e)
334     {
335         if (e.GetErrCode() == CBlastException::eNotSupported) {
336             NCBI_THROW(CInputException, eInvalidInput,
337                        "Saving search strategies with gi lists is currently "
338                        "not supported");
339         }
340         throw;
341     }
342 }
343 
344 /// Converts a list of Bioseqs into a TSeqLocVector. All Bioseqs are added to
345 /// the same CScope object
346 /// @param subjects Bioseqs to convert
347 static TSeqLocVector
s_ConvertBioseqs2TSeqLocVector(const CBlast4_subject::TSequences & subjects)348 s_ConvertBioseqs2TSeqLocVector(const CBlast4_subject::TSequences& subjects)
349 {
350     TSeqLocVector retval;
351     CRef<CScope> subj_scope(new CScope(*CObjectManager::GetInstance()));
352     ITERATE(CBlast4_subject::TSequences, bioseq, subjects) {
353         subj_scope->AddBioseq(**bioseq);
354         CRef<CSeq_id> seqid = FindBestChoice((*bioseq)->GetId(),
355                                              CSeq_id::BestRank);
356         const TSeqPos length = (*bioseq)->GetInst().GetLength();
357         CRef<CSeq_loc> sl(new CSeq_loc(*seqid, 0, length-1));
358         retval.push_back(SSeqLoc(sl, subj_scope));
359     }
360     return retval;
361 }
362 
363 /// Import PSSM into the command line arguments object
364 static void
s_ImportPssm(const CBlast4_queries & queries,CRef<blast::CBlastOptionsHandle> opts_hndl,blast::CBlastAppArgs * cmdline_args)365 s_ImportPssm(const CBlast4_queries& queries,
366              CRef<blast::CBlastOptionsHandle> opts_hndl,
367              blast::CBlastAppArgs* cmdline_args)
368 {
369     CRef<CPssmWithParameters> pssm
370         (const_cast<CPssmWithParameters*>(&queries.GetPssm()));
371     CPsiBlastAppArgs* psi_args = NULL;
372     CTblastnAppArgs* tbn_args = NULL;
373 
374     if ( (psi_args = dynamic_cast<CPsiBlastAppArgs*>(cmdline_args)) ) {
375         psi_args->SetInputPssm(pssm);
376     } else if ( (tbn_args =
377                  dynamic_cast<CTblastnAppArgs*>(cmdline_args))) {
378         tbn_args->SetInputPssm(pssm);
379     } else {
380         EBlastProgramType p = opts_hndl->GetOptions().GetProgramType();
381         string msg("PSSM found in saved strategy, but not supported ");
382         msg += "for " + Blast_ProgramNameFromType(p);
383         NCBI_THROW(CBlastException, eNotSupported, msg);
384     }
385 }
386 
387 /// Import queries into command line arguments object
388 static void
s_ImportQueries(const CBlast4_queries & queries,CRef<blast::CBlastOptionsHandle> opts_hndl,blast::CBlastAppArgs * cmdline_args)389 s_ImportQueries(const CBlast4_queries& queries,
390                 CRef<blast::CBlastOptionsHandle> opts_hndl,
391                 blast::CBlastAppArgs* cmdline_args)
392 {
393     CRef<CTmpFile> tmpfile(new CTmpFile(CTmpFile::eNoRemove));
394 
395     // Stuff the query bioseq or seqloc list in the input stream of the
396     // cmdline_args
397     if (queries.IsSeq_loc_list()) {
398         const CBlast4_queries::TSeq_loc_list& seqlocs =
399             queries.GetSeq_loc_list();
400         CFastaOstream out(tmpfile->AsOutputFile(CTmpFile::eIfExists_Throw));
401         out.SetFlag(CFastaOstream::eAssembleParts);
402 
403         EBlastProgramType prog = opts_hndl->GetOptions().GetProgramType();
404         SDataLoaderConfig dlconfig(!!Blast_QueryIsProtein(prog));
405         dlconfig.OptimizeForWholeLargeSequenceRetrieval();
406         CBlastScopeSource scope_src(dlconfig);
407         CRef<CScope> scope(scope_src.NewScope());
408 
409         ITERATE(CBlast4_queries::TSeq_loc_list, itr, seqlocs) {
410             if ((*itr)->GetId()) {
411                 CBioseq_Handle bh = scope->GetBioseqHandle(*(*itr)->GetId());
412                 out.Write(bh);
413             }
414         }
415         scope.Reset();
416         scope_src.RevokeBlastDbDataLoader();
417 
418     } else {
419         _ASSERT(queries.IsBioseq_set());
420         const CBlast4_queries::TBioseq_set& bioseqs =
421             queries.GetBioseq_set();
422         CFastaOstream out(tmpfile->AsOutputFile(CTmpFile::eIfExists_Throw));
423         out.SetFlag(CFastaOstream::eAssembleParts);
424 
425         ITERATE(CBioseq_set::TSeq_set, seq_entry, bioseqs.GetSeq_set()){
426             out.Write(**seq_entry);
427         }
428     }
429 
430     const string& fname = tmpfile->GetFileName();
431     tmpfile.Reset(new CTmpFile(fname));
432     cmdline_args->SetInputStream(tmpfile);
433 }
434 
435 /// Import the database and return it in a CBlastDatabaseArgs object
436 static CRef<blast::CBlastDatabaseArgs>
s_ImportDatabase(const CBlast4_subject & subj,CBlastOptionsBuilder & opts_builder,bool subject_is_protein,bool is_remote_search)437 s_ImportDatabase(const CBlast4_subject& subj,
438                  CBlastOptionsBuilder& opts_builder,
439                  bool subject_is_protein,
440                  bool is_remote_search)
441 {
442     _ASSERT(subj.IsDatabase());
443     CRef<CBlastDatabaseArgs> db_args(new CBlastDatabaseArgs);
444     const CSearchDatabase::EMoleculeType mol = subject_is_protein
445         ? CSearchDatabase::eBlastDbIsProtein
446         : CSearchDatabase::eBlastDbIsNucleotide;
447     const string dbname(subj.GetDatabase());
448     CRef<CSearchDatabase> search_db(new CSearchDatabase(dbname, mol));
449 
450     if (opts_builder.HaveEntrezQuery()) {
451         string limit(opts_builder.GetEntrezQuery());
452         search_db->SetEntrezQueryLimitation(limit);
453         if ( !is_remote_search ) {
454             string msg("Entrez query '");
455             msg += limit + string("' will not be processed locally.\n");
456             msg += string("Please use the -remote option.");
457             throw runtime_error(msg);
458         }
459     }
460 
461     if (opts_builder.HaveGiList() || opts_builder.HaveTaxidList()) {
462         CSeqDBGiList *gilist = new CSeqDBGiList();
463         if (opts_builder.HaveGiList()) {
464         	ITERATE(list<TGi>, gi, opts_builder.GetGiList()) {
465         		gilist->AddGi(*gi);
466         	}
467         }
468         if (opts_builder.HaveTaxidList()) {
469         	list<TTaxId>  list = opts_builder.GetTaxidList();
470            	set<TTaxId> taxids(list.begin(), list.end());
471         	gilist->AddTaxIds(taxids);
472         }
473         search_db->SetGiList(gilist);
474     }
475 
476     if (opts_builder.HaveNegativeGiList() || opts_builder.HaveNegativeTaxidList()) {
477     	CSeqDBGiList *gilist = new CSeqDBGiList();
478         if (opts_builder.HaveNegativeGiList()) {
479           	ITERATE(list<TGi>, gi, opts_builder.GetNegativeGiList()) {
480       		gilist->AddGi(*gi);
481        	    }
482         }
483         if (opts_builder.HaveNegativeTaxidList()) {
484         	list<TTaxId>  list = opts_builder.GetNegativeTaxidList();
485            	set<TTaxId> taxids(list.begin(), list.end());
486            	gilist->AddTaxIds(taxids);
487         }
488         search_db->SetNegativeGiList(gilist);
489     }
490 
491     if (opts_builder.HasDbFilteringAlgorithmKey()) {
492         string algo_key = opts_builder.GetDbFilteringAlgorithmKey();
493         ESubjectMaskingType mask_type= eSoftSubjMasking;
494         if(opts_builder.HasSubjectMaskingType())
495         	mask_type = opts_builder.GetSubjectMaskingType();
496         search_db->SetFilteringAlgorithm(algo_key, mask_type);
497 
498     } else if (opts_builder.HasDbFilteringAlgorithmId()) {
499         int algo_id = opts_builder.GetDbFilteringAlgorithmId();
500         ESubjectMaskingType mask_type= eSoftSubjMasking;
501         if(opts_builder.HasSubjectMaskingType())
502         	mask_type = opts_builder.GetSubjectMaskingType();
503         search_db->SetFilteringAlgorithm(algo_id, mask_type);
504     }
505 
506     db_args->SetSearchDatabase(search_db);
507     return db_args;
508 }
509 
510 /// Import the subject sequences into a CBlastDatabaseArgs object
511 static CRef<blast::CBlastDatabaseArgs>
s_ImportSubjects(const CBlast4_subject & subj,bool subject_is_protein)512 s_ImportSubjects(const CBlast4_subject& subj, bool subject_is_protein)
513 {
514     _ASSERT(subj.IsSequences());
515     CRef<CBlastDatabaseArgs> db_args(new CBlastDatabaseArgs);
516     TSeqLocVector subjects =
517         s_ConvertBioseqs2TSeqLocVector(subj.GetSequences());
518     CRef<CScope> subj_scope = subjects.front().scope;
519     CRef<IQueryFactory> subject_factory(new CObjMgr_QueryFactory(subjects));
520     db_args->SetSubjects(subject_factory, subj_scope, subject_is_protein);
521     return db_args;
522 }
523 
524 /// Imports search strategy, using CImportStrategy.
525 static void
s_ImportSearchStrategy(CNcbiIstream * in,blast::CBlastAppArgs * cmdline_args,bool is_remote_search,bool override_query,bool override_subject)526 s_ImportSearchStrategy(CNcbiIstream* in,
527                        blast::CBlastAppArgs* cmdline_args,
528                        bool is_remote_search,
529                        bool override_query,
530                        bool override_subject)
531 {
532     if ( !in ) {
533         return;
534     }
535 
536     CRef<CBlast4_request> b4req;
537     try {
538         b4req = ExtractBlast4Request(*in);
539     } catch (const CSerialException&) {
540         NCBI_THROW(CInputException, eInvalidInput,
541                    "Failed to read search strategy");
542     }
543 
544     CImportStrategy strategy(b4req);
545 
546     CRef<blast::CBlastOptionsHandle> opts_hndl = strategy.GetOptionsHandle();
547     cmdline_args->SetOptionsHandle(opts_hndl);
548     const EBlastProgramType prog = opts_hndl->GetOptions().GetProgramType();
549     cmdline_args->SetTask(strategy.GetTask());
550 #if _DEBUG
551     {
552         char* program_string = 0;
553         BlastNumber2Program(prog, &program_string);
554         _TRACE("EBlastProgramType=" << program_string << " task=" << strategy.GetTask());
555         sfree(program_string);
556     }
557 #endif
558 
559     // Get the subject
560     if (override_subject) {
561         ERR_POST(Warning << "Overriding database/subject in saved strategy");
562     } else {
563         CRef<blast::CBlastDatabaseArgs> db_args;
564         CRef<CBlast4_subject> subj = strategy.GetSubject();
565         const bool subject_is_protein = Blast_SubjectIsProtein(prog) ? true : false;
566 
567         if (subj->IsDatabase()) {
568             db_args = s_ImportDatabase(*subj, strategy.GetOptionsBuilder(),
569                                        subject_is_protein, is_remote_search);
570         } else {
571             db_args = s_ImportSubjects(*subj, subject_is_protein);
572         }
573         _ASSERT(db_args.NotEmpty());
574         cmdline_args->SetBlastDatabaseArgs(db_args);
575     }
576 
577     // Get the query, queries, or pssm
578     if (override_query) {
579         ERR_POST(Warning << "Overriding query in saved strategy");
580     } else {
581         CRef<CBlast4_queries> queries = strategy.GetQueries();
582         if (queries->IsPssm()) {
583             s_ImportPssm(*queries, opts_hndl, cmdline_args);
584         } else {
585             s_ImportQueries(*queries, opts_hndl, cmdline_args);
586         }
587         // Set the range restriction for the query, if applicable
588         const TSeqRange query_range = strategy.GetQueryRange();
589         if (query_range != TSeqRange::GetEmpty()) {
590             cmdline_args->GetQueryOptionsArgs()->SetRange(query_range);
591         }
592     }
593 
594     if ( CPsiBlastAppArgs* psi_args = dynamic_cast<CPsiBlastAppArgs*>(cmdline_args) )
595     {
596             psi_args->SetNumberOfIterations(strategy.GetPsiNumOfIterations());
597     }
598 }
599 
600 bool
RecoverSearchStrategy(const CArgs & args,blast::CBlastAppArgs * cmdline_args)601 RecoverSearchStrategy(const CArgs& args, blast::CBlastAppArgs* cmdline_args)
602 {
603     CNcbiIstream* in = cmdline_args->GetImportSearchStrategyStream(args);
604     if ( !in ) {
605         return false;
606     }
607     const bool is_remote_search =
608         (args.Exist(kArgRemote) && args[kArgRemote].HasValue() && args[kArgRemote].AsBoolean());
609     const bool override_query = (args[kArgQuery].HasValue() &&
610                                  args[kArgQuery].AsString() != kDfltArgQuery);
611     const bool override_subject = CBlastDatabaseArgs::HasBeenSet(args);
612 
613     if (CMbIndexArgs::HasBeenSet(args)) {
614     	if (args[kArgUseIndex].AsBoolean() != kDfltArgUseIndex)
615     		ERR_POST(Warning << "Overriding megablast BLAST DB indexed options in saved strategy");
616     }
617 
618     s_ImportSearchStrategy(in, cmdline_args, is_remote_search, override_query,
619                            override_subject);
620 
621     return true;
622 }
623 
624 // Process search strategies
625 // FIXME: save program options,
626 // Save task if provided, no other options (only those in the cmd line) should
627 // be saved
628 void
SaveSearchStrategy(const CArgs & args,blast::CBlastAppArgs * cmdline_args,CRef<blast::IQueryFactory> queries,CRef<blast::CBlastOptionsHandle> opts_hndl,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)629 SaveSearchStrategy(const CArgs& args,
630                    blast::CBlastAppArgs* cmdline_args,
631                    CRef<blast::IQueryFactory> queries,
632                    CRef<blast::CBlastOptionsHandle> opts_hndl,
633                    CRef<objects::CPssmWithParameters> pssm
634                      /* = CRef<objects::CPssmWithParameters>() */,
635                     unsigned int num_iters /* =0 */)
636 {
637     CNcbiOstream* out = cmdline_args->GetExportSearchStrategyStream(args);
638     if ( !out ) {
639         return;
640     }
641 
642     s_ExportSearchStrategy(out, queries, opts_hndl,
643                            cmdline_args->GetBlastDatabaseArgs(),
644                            pssm, num_iters);
645 }
646 
647 /// Extracts the subject sequence IDs and ranges from the BLAST results
648 /// @note if this ever needs to be refactored for popular developer
649 /// consumption, this function should operate on CSeq_align_set as opposed to
650 /// blast::CSearchResultSet
651 static void
s_ExtractSeqidsAndRanges(const blast::CSearchResultSet & results,CScope::TIds & ids,vector<TSeqRange> & ranges)652 s_ExtractSeqidsAndRanges(const blast::CSearchResultSet& results,
653                          CScope::TIds& ids, vector<TSeqRange>& ranges)
654 {
655     static const CSeq_align::TDim kQueryRow = 0;
656     static const CSeq_align::TDim kSubjRow = 1;
657     ids.clear();
658     ranges.clear();
659 
660     typedef map< CConstRef<CSeq_id>,
661                  vector<TSeqRange>,
662                  CConstRefCSeqId_LessThan
663                > TSeqIdRanges;
664     TSeqIdRanges id_ranges;
665 
666     ITERATE(blast::CSearchResultSet, result, results) {
667         if ( !(*result)->HasAlignments() ) {
668             continue;
669         }
670         ITERATE(CSeq_align_set::Tdata, aln, (*result)->GetSeqAlign()->Get()) {
671             CConstRef<CSeq_id> subj(&(*aln)->GetSeq_id(kSubjRow));
672             TSeqRange subj_range((*aln)->GetSeqRange(kSubjRow));
673             if ((*aln)->GetSeqStrand(kQueryRow) == eNa_strand_minus &&
674                 (*aln)->GetSeqStrand(kSubjRow) == eNa_strand_plus) {
675                 TSeqRange r(subj_range);
676                 // flag the range as needed to be flipped once the sequence
677                 // length is known
678                 subj_range.SetFrom(r.GetToOpen());
679                 subj_range.SetToOpen(r.GetFrom());
680             }
681             id_ranges[subj].push_back(subj_range);
682         }
683     }
684 
685     ITERATE(TSeqIdRanges, itr, id_ranges) {
686         ITERATE(vector<TSeqRange>, range, itr->second) {
687             ids.push_back(CSeq_id_Handle::GetHandle(*itr->first));
688             ranges.push_back(*range);
689         }
690     }
691     _ASSERT(ids.size() == ranges.size());
692 }
693 
694 /// Returns true if the remote BLAST DB data loader is being used
695 static bool
s_IsUsingRemoteBlastDbDataLoader()696 s_IsUsingRemoteBlastDbDataLoader()
697 {
698     CObjectManager::TRegisteredNames data_loaders;
699     CObjectManager::GetInstance()->GetRegisteredNames(data_loaders);
700     ITERATE(CObjectManager::TRegisteredNames, name, data_loaders) {
701         if (NStr::StartsWith(*name, objects::CRemoteBlastDbDataLoader::kNamePrefix)) {
702             return true;
703         }
704     }
705     return false;
706 }
707 
708 static bool
s_IsPrefetchFormat(blast::CFormattingArgs::EOutputFormat format_type)709 s_IsPrefetchFormat(blast::CFormattingArgs::EOutputFormat format_type)
710 {
711 	if ((format_type == CFormattingArgs::eAsnText) ||
712 	    (format_type == CFormattingArgs::eAsnBinary) ||
713 	    (format_type == CFormattingArgs::eArchiveFormat)||
714 	    (format_type == CFormattingArgs::eJsonSeqalign)) {
715 		return false;
716 	}
717 	return true;
718 }
719 
720 static bool
s_PreFetchSeqs(const blast::CSearchResultSet & results,blast::CFormattingArgs::EOutputFormat format_type)721 s_PreFetchSeqs(const blast::CSearchResultSet& results,
722 		       blast::CFormattingArgs::EOutputFormat format_type)
723 {
724 	{
725 		char * pre_fetch_limit_str = getenv("PRE_FETCH_SEQS_LIMIT");
726 		if (pre_fetch_limit_str) {
727 			int pre_fetch_limit = NStr::StringToInt(pre_fetch_limit_str);
728 			if(pre_fetch_limit == 0) {
729 				return false;
730 			}
731 			if(pre_fetch_limit == INT_MAX){
732 				return true;
733 			}
734 			int num_of_seqs = 0;
735 			for(unsigned int i=0; i < results.GetNumResults(); i++) {
736 				if(results[i].HasAlignments()) {
737 					num_of_seqs += results[i].GetSeqAlign()->Size();
738 				}
739 			}
740 			if(num_of_seqs > pre_fetch_limit) {
741 				return false;
742 			}
743 		}
744 	}
745 
746 	return s_IsPrefetchFormat(format_type);
747 }
748 
BlastFormatter_PreFetchSequenceData(const blast::CSearchResultSet & results,CRef<CScope> scope,blast::CFormattingArgs::EOutputFormat format_type)749 void BlastFormatter_PreFetchSequenceData(const blast::CSearchResultSet& results,
750 		                                 CRef<CScope> scope,
751 		                                 blast::CFormattingArgs::EOutputFormat format_type)
752 {
753     _ASSERT(scope.NotEmpty());
754     if (results.size() == 0) {
755         return;
756     }
757     if(!s_PreFetchSeqs(results, format_type)){
758     	return;
759     }
760     try {
761        CScope::TIds ids;
762        vector<TSeqRange> ranges;
763        s_ExtractSeqidsAndRanges(results, ids, ranges);
764        _TRACE("Prefetching " << ids.size() << " sequence lengths");
765        LoadSequencesToScope(ids, ranges, scope);
766 	} catch (CException& e) {
767 	   if(s_IsUsingRemoteBlastDbDataLoader()) {
768           NCBI_THROW(CBlastSystemException, eNetworkError,
769                      "Error fetching sequence data from BLAST databases at NCBI, "
770                      "please try again later");
771 	   }
772 	   else {
773           NCBI_RETHROW_SAME(e, "Error pre-fetching sequence data ");
774 	   }
775    }
776 
777 }
778 
779 /// Auxiliary function to extract the ancillary data from the PSSM.
780 CRef<CBlastAncillaryData>
ExtractPssmAncillaryData(const CPssmWithParameters & pssm)781 ExtractPssmAncillaryData(const CPssmWithParameters& pssm)
782 {
783     _ASSERT(pssm.CanGetPssm());
784     pair<double, double> lambda, k, h;
785     lambda.first = pssm.GetPssm().GetLambdaUngapped();
786     lambda.second = pssm.GetPssm().GetLambda();
787     k.first = pssm.GetPssm().GetKappaUngapped();
788     k.second = pssm.GetPssm().GetKappa();
789     h.first = pssm.GetPssm().GetHUngapped();
790     h.second = pssm.GetPssm().GetH();
791     return CRef<CBlastAncillaryData>(new CBlastAncillaryData(lambda, k, h, 0,
792                                                              true));
793 }
794 
795 void
CheckForFreqRatioFile(const string & rps_dbname,CRef<CBlastOptionsHandle> & opt_handle,bool isRpsblast)796 CheckForFreqRatioFile(const string& rps_dbname, CRef<CBlastOptionsHandle>  & opt_handle, bool isRpsblast)
797 {
798     bool use_cbs = (opt_handle->GetOptions().GetCompositionBasedStats() == eNoCompositionBasedStats) ? false : true;
799     if(use_cbs) {
800         vector<string> db;
801         NStr::Split(rps_dbname, " ", db);
802         list<string> failed_db;
803         for (unsigned int i=0; i < db.size(); i++) {
804     	    string path;
805     	    try {
806                 vector<string> dbpath;
807        	        CSeqDB::FindVolumePaths(db[i], CSeqDB::eProtein, dbpath);
808                 path = *dbpath.begin();
809             } catch (const CSeqDBException& e) {
810                  NCBI_RETHROW(e, CBlastException, eRpsInit,
811                               "Cannot retrieve path to RPS database");
812             }
813 
814     	    CFile f(path + ".freq");
815             if(!f.Exists()) {
816             	failed_db.push_back(db[i]);
817             }
818 
819         }
820         if(!failed_db.empty()) {
821         	opt_handle->SetOptions().SetCompositionBasedStats(eNoCompositionBasedStats);
822         	string all_failed = NStr::Join(failed_db, ", ");
823         	string prog_str = isRpsblast ? "RPSBLAST": "DELTABLAST";
824         	string msg = all_failed + " contain(s) no freq ratios " \
825                      	 + "needed for composition-based statistics.\n" \
826                      	 + prog_str + " will be run without composition-based statistics.";
827         	ERR_POST(Warning << msg);
828         }
829 
830     }
831     return;
832 }
833 
834 bool
IsIStreamEmpty(CNcbiIstream & in)835 IsIStreamEmpty(CNcbiIstream & in)
836 {
837 #ifdef NCBI_OS_MSWIN
838 	char c;
839 	in.setf(ios::skipws);
840 	if (!(in >> c))
841 		return true;
842 	in.unget();
843 	return false;
844 #else
845 	char c;
846 	CNcbiStreampos orig_p = in.tellg();
847 	// Piped input
848 	if(orig_p < 0)
849 		return false;
850 
851 	IOS_BASE::iostate orig_state = in.rdstate();
852 	IOS_BASE::fmtflags orig_flags = in.setf(ios::skipws);
853 
854 	if(! (in >> c))
855 		return true;
856 
857 	in.seekg(orig_p);
858 	in.flags(orig_flags);
859 	in.clear();
860 	in.setstate(orig_state);
861 
862 	return false;
863 #endif
864 }
865 
866 string
GetCmdlineArgs(const CNcbiArguments & a)867 GetCmdlineArgs(const CNcbiArguments & a)
868 {
869 	string cmd = kEmptyStr;
870 	for(unsigned int i=0; i < a.Size(); i++) {
871 		cmd += a[i] + " ";
872 	}
873 	return cmd;
874 }
875 
876 bool
UseXInclude(const CFormattingArgs & f,const string & s)877 UseXInclude(const CFormattingArgs & f, const string & s)
878 {
879 	CFormattingArgs::EOutputFormat fmt = f.GetFormattedOutputChoice();
880 	if((fmt ==  CFormattingArgs::eXml2) || (fmt ==  CFormattingArgs::eJson)) {
881 	   if (s == "-"){
882 		   string f_str = (fmt == CFormattingArgs::eXml2) ? "14.": "13.";
883 		   NCBI_THROW(CInputException, eEmptyUserInput,
884 		              "Please provide a file name for outfmt " + f_str);
885 	   }
886 	   return true;
887 	}
888 	return false;
889 }
890 
891 string
GetSubjectFile(const CArgs & args)892 GetSubjectFile(const CArgs& args)
893 {
894 	string filename="";
895 
896 	if (args.Exist(kArgSubject) && args[kArgSubject].HasValue())
897 		filename = args[kArgSubject].AsString();
898 
899 	return filename;
900 }
901 
PrintErrorArchive(const CArgs & a,const list<CRef<CBlast4_error>> & msg)902 void PrintErrorArchive(const CArgs & a, const list<CRef<CBlast4_error> > & msg)
903 {
904 	try {
905 		if(NStr::StringToInt(NStr::TruncateSpaces(a[align_format::kArgOutputFormat].AsString())) == CFormattingArgs::eArchiveFormat) {
906 			CRef<CBlast4_archive> archive (new CBlast4_archive);
907 
908 			CBlast4_request & req = archive->SetRequest();
909 			CBlast4_get_request_info_request & info= req.SetBody().SetGet_request_info();
910 			info.SetRequest_id("Error");
911 			CBlast4_get_search_results_reply & results = archive->SetResults();
912                         // Pacify unused varaible warning, the set above is used to populate mandatory field
913 			(void) results;
914 			archive->SetMessages() = msg;
915 			CBlastFormat::PrintArchive(archive, a[kArgOutput].AsOutputFile());
916 		}
917 	} catch (...) {}
918 }
919 
QueryBatchCleanup()920 void QueryBatchCleanup()
921 {
922 #if defined(NCBI_OS_LINUX) && HAVE_MALLOC_H
923 	malloc_trim(0);
924 #endif
925     return;
926 
927 }
928 
LogQueryInfo(CBlastUsageReport & report,const CBlastInput & q_info)929 void LogQueryInfo(CBlastUsageReport & report, const CBlastInput & q_info)
930 {
931 	report.AddParam(CBlastUsageReport::eTotalQueryLength, q_info.GetTotalLengthProcessed());
932 	report.AddParam(CBlastUsageReport::eNumQueries, q_info.GetNumSeqsProcessed());
933 }
934 
935 
LogRPSBlastOptions(blast::CBlastUsageReport & report,const CBlastOptions & opt)936 void LogRPSBlastOptions(blast::CBlastUsageReport & report, const CBlastOptions & opt)
937 {
938 	report.AddParam(CBlastUsageReport::eProgram, Blast_ProgramNameFromType(opt.GetProgramType()));
939 	report.AddParam(CBlastUsageReport::eEvalueThreshold, opt.GetEvalueThreshold());
940 	report.AddParam(CBlastUsageReport::eHitListSize, opt.GetHitlistSize());
941     report.AddParam(CBlastUsageReport::eCompBasedStats, opt.GetCompositionBasedStats());
942 }
943 
LogRPSCmdOptions(blast::CBlastUsageReport & report,const CBlastAppArgs & args)944 void LogRPSCmdOptions(blast::CBlastUsageReport & report, const CBlastAppArgs & args)
945 {
946 	if (args.GetBlastDatabaseArgs().NotEmpty() &&
947 		args.GetBlastDatabaseArgs()->GetSearchDatabase().NotEmpty() &&
948 		args.GetBlastDatabaseArgs()->GetSearchDatabase()->GetSeqDb().NotEmpty()) {
949 
950 		CRef<CSeqDB> db = args.GetBlastDatabaseArgs()->GetSearchDatabase()->GetSeqDb();
951 		string db_name = db->GetDBNameList();
952 		int off = db_name.find_last_of(CFile::GetPathSeparator());
953 	    if (off != -1) {
954 	    	db_name.erase(0, off+1);
955 		}
956 		report.AddParam(CBlastUsageReport::eDBName, db_name);
957 		report.AddParam(CBlastUsageReport::eDBLength, (Int8) db->GetTotalLength());
958 		report.AddParam(CBlastUsageReport::eDBNumSeqs, db->GetNumSeqs());
959 		report.AddParam(CBlastUsageReport::eDBDate, db->GetDate());
960 	}
961 
962 	if(args.GetFormattingArgs().NotEmpty()){
963 		report.AddParam(CBlastUsageReport::eOutputFmt, args.GetFormattingArgs()->GetFormattedOutputChoice());
964 	}
965 }
966 
GetMTByQueriesBatchSize(EProgram p,int num_threads)967 int GetMTByQueriesBatchSize(EProgram p, int num_threads)
968 {
969 	    int batch_size = 0;
970 
971 		char * mt_query_batch_env = getenv("BLAST_MT_QUERY_BATCH_SIZE");
972 		if (mt_query_batch_env) {
973 			batch_size = NStr::StringToInt(mt_query_batch_env);
974 		}
975 		else {
976 			int factor = 1;
977 			if ( EProgramToEBlastProgramType(p) == eBlastTypeBlastn ) {
978 				factor = 2;
979 			}
980 			batch_size = GetQueryBatchSize(p)/factor;
981 		}
982 		return batch_size;
983 }
984 
CheckMTByQueries_DBSize(CRef<CLocalDbAdapter> & db_adapter,const CBlastOptions & opt)985 void CheckMTByQueries_DBSize(CRef<CLocalDbAdapter> & db_adapter, const CBlastOptions & opt)
986 {
987 	CRef<CSearchDatabase> sdb = db_adapter->GetSearchDatabase();
988 	const CRef<CSeqDBGiList>&  gi = sdb->GetGiList();
989 	const CRef<CSeqDBGiList>& n_gi = sdb->GetNegativeGiList();
990 
991 	if (gi.NotEmpty() || n_gi.NotEmpty()) {
992 		return;
993 	}
994 
995 	//EProgram prog = opt.GetProgram();
996 	CRef<CSeqDB> seqdb = sdb->GetSeqDb();
997 	Uint8 total_length = seqdb->GetTotalLength();
998 	Uint8 length_limit = 0;
999 
1000 	if (db_adapter->IsProtein()) {
1001 		static const Uint8 kMaxProtTotalLength = 2000000000;
1002 		length_limit = kMaxProtTotalLength;
1003 	}
1004 	else {
1005 		static const Uint8 kMaxNuclTotalLength = 14000000000;
1006 		length_limit = kMaxNuclTotalLength;
1007 	}
1008 
1009 	if(total_length > length_limit) {
1010 		string warn = "This database is probably too large to benefit from -mt_mode=1. " \
1011 					  "We suggest using -mt_mode=1 only if the database is less than";
1012 		if (db_adapter->IsProtein()) {
1013 			warn += " 2 billion residues ";
1014 		}
1015 		else {
1016 			warn += " 14 billion bases ";
1017 		}
1018        	ERR_POST(Warning <<   warn + "or if the search is limited by an option such as -taxids, -taxidlist or -gilist.");
1019 	}
1020 	return;
1021 }
1022 
CheckMTByQueries_QuerySize(EProgram prog,int batch_size)1023 void CheckMTByQueries_QuerySize(EProgram prog, int batch_size)
1024 {
1025 	string warning = "This set of queries is too small to fully benefit from the -mt_mode=1 option. " \
1026 			         "The total number of letters should be at least ";
1027 	warning += NStr::IntToString(batch_size);
1028 	EBlastProgramType p = EProgramToEBlastProgramType(prog);
1029 	if (Blast_QueryIsProtein(p)) {
1030 		warning += " residues";
1031 	}
1032 	else {
1033 		warning += " bases";
1034 	}
1035 	warning += " per thread, and there should be at least one query of this length per thread.";
1036     ERR_POST(Warning << warning);
1037 }
1038 
1039 END_NCBI_SCOPE
1040