1 /*  $Id: blast_app_util.cpp 632181 2021-05-27 13:23:25Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blast_app_util.cpp
31  *  Utility functions for BLAST command line applications
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include "blast_app_util.hpp"
36 
37 #include <serial/serial.hpp>
38 #include <serial/objostr.hpp>
39 
40 #include <objtools/data_loaders/blastdb/bdbloader.hpp>
41 #include <algo/blast/api/remote_blast.hpp>
42 #include <algo/blast/api/objmgr_query_data.hpp>     // for CObjMgr_QueryFactory
43 #include <algo/blast/api/blast_options_builder.hpp>
44 #include <algo/blast/api/search_strategy.hpp>
45 #include <algo/blast/blastinput/blast_input.hpp>    // for CInputException
46 #include <algo/blast/blastinput/psiblast_args.hpp>
47 #include <algo/blast/blastinput/tblastn_args.hpp>
48 #include <algo/blast/blastinput/blast_scope_src.hpp>
49 #include <objmgr/util/sequence.hpp>
50 
51 #include <objects/scoremat/Pssm.hpp>
52 #include <serial/typeinfo.hpp>      // for CTypeInfo, needed by SerialClone
53 #include <objtools/data_loaders/blastdb/bdbloader_rmt.hpp>
54 #include <algo/blast/format/blast_format.hpp>
55 #include <objtools/align_format/format_flags.hpp>
56 
57 #if defined(NCBI_OS_LINUX) && HAVE_MALLOC_H
58 #include <malloc.h>
59 #endif
60 
61 BEGIN_NCBI_SCOPE
62 USING_SCOPE(objects);
63 USING_SCOPE(blast);
64 
GetBatchSize(Int4 hits)65 Int4 CBatchSizeMixer::GetBatchSize(Int4 hits)
66 {
67      if (hits >= 0) {
68          double ratio = 1.0 * (hits+1) / m_BatchSize;
69          m_Ratio = (m_Ratio < 0) ? ratio
70                  : k_MixIn * ratio + (1.0 - k_MixIn) * m_Ratio;
71          m_BatchSize = (Int4) (1.0 * m_TargetHits / m_Ratio);
72      }
73      if (m_BatchSize > k_MaxBatchSize) {
74          m_BatchSize = k_MaxBatchSize;
75          m_Ratio = -1.0;
76      } else if (m_BatchSize < k_MinBatchSize) {
77          m_BatchSize = k_MinBatchSize;
78          m_Ratio = -1.0;
79      }
80      return m_BatchSize;
81 }
82 
83 CRef<blast::CRemoteBlast>
InitializeRemoteBlast(CRef<blast::IQueryFactory> queries,CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,bool verbose_output,const string & client_id,CRef<objects::CPssmWithParameters> pssm)84 InitializeRemoteBlast(CRef<blast::IQueryFactory> queries,
85                       CRef<blast::CBlastDatabaseArgs> db_args,
86                       CRef<blast::CBlastOptionsHandle> opts_hndl,
87                       bool verbose_output,
88                       const string& client_id /* = kEmptyStr */,
89                       CRef<objects::CPssmWithParameters> pssm
90                         /* = CRef<objects::CPssmWithParameters>() */)
91 {
92     _ASSERT(queries || pssm);
93     _ASSERT(db_args);
94     _ASSERT(opts_hndl);
95 
96     CRef<CRemoteBlast> retval;
97 
98     CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
99     if (search_db.NotEmpty()) {
100         if (pssm.NotEmpty()) {
101             _ASSERT(queries.Empty());
102             retval.Reset(new CRemoteBlast(pssm, opts_hndl, *search_db));
103         } else {
104             retval.Reset(new CRemoteBlast(queries, opts_hndl, *search_db));
105         }
106     } else {
107         if (pssm.NotEmpty()) {
108             NCBI_THROW(CInputException, eInvalidInput,
109                        "Remote PSI-BL2SEQ is not supported");
110         } else {
111             // N.B.: there is NO scope needed in the GetSubjects call because
112             // the subjects (if any) should have already been added in
113             // InitializeSubject
114             retval.Reset(new CRemoteBlast(queries, opts_hndl,
115                                          db_args->GetSubjects()));
116         }
117     }
118     if (verbose_output) {
119         retval->SetVerbose();
120     }
121     if (client_id != kEmptyStr) {
122         retval->SetClientId(client_id);
123     }
124     return retval;
125 }
126 
s_FindBlastDbDataLoaderName(const string & dbname,bool is_protein)127 static string s_FindBlastDbDataLoaderName(const string& dbname, bool is_protein)
128 {
129     // This string is built based on the knowledge of how the BLAST DB data
130     // loader names are created, see
131     // {CBlastDbDataLoader,CRemoteBlastDbDataLoader}::GetLoaderNameFromArgs
132     const string str2find(string("_") + dbname + string(is_protein ? "P" : "N"));
133     CObjectManager::TRegisteredNames loader_names;
134     CObjectManager::GetInstance()->GetRegisteredNames(loader_names);
135     ITERATE(CObjectManager::TRegisteredNames, loader_name, loader_names) {
136         if (NStr::Find(*loader_name, str2find) != NPOS) {
137             return *loader_name;
138         }
139     }
140     return kEmptyStr;
141 }
142 
143 blast::SDataLoaderConfig
InitializeQueryDataLoaderConfiguration(bool query_is_protein,CRef<blast::CLocalDbAdapter> db_adapter)144 InitializeQueryDataLoaderConfiguration(bool query_is_protein,
145                                        CRef<blast::CLocalDbAdapter> db_adapter)
146 {
147     SDataLoaderConfig retval(query_is_protein);
148     retval.OptimizeForWholeLargeSequenceRetrieval();
149 
150     /* Load the BLAST database into the data loader configuration for the query
151      * so that if the query sequence(s) are specified as seq-ids, these can be
152      * fetched from the BLAST database being searched */
153     if (db_adapter->IsBlastDb() &&  /* this is a BLAST database search */
154         retval.m_UseBlastDbs &&   /* the BLAST database data loader is requested */
155         (query_is_protein == db_adapter->IsProtein())) { /* the same database type is used for both queries and subjects */
156         // Make sure we don't add the same database more than once
157         vector<string> default_dbs;
158         NStr::Split(retval.m_BlastDbName, " ", default_dbs);
159         if (default_dbs.size() &&
160             (find(default_dbs.begin(), default_dbs.end(),
161                  db_adapter->GetDatabaseName()) == default_dbs.end())) {
162             CNcbiOstrstream oss;
163             oss << db_adapter->GetDatabaseName() << " " << retval.m_BlastDbName;
164             retval.m_BlastDbName = CNcbiOstrstreamToString(oss);
165         }
166     }
167     if (retval.m_UseBlastDbs) {
168         _TRACE("Initializing query data loader to '" << retval.m_BlastDbName
169                << "' (" << (query_is_protein ? "protein" : "nucleotide")
170                << " BLAST database)");
171     }
172     if (retval.m_UseGenbank) {
173         _TRACE("Initializing query data loader to use GenBank data loader");
174     }
175     return retval;
176 }
177 
178 void
InitializeSubject(CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,bool is_remote_search,CRef<blast::CLocalDbAdapter> & db_adapter,CRef<objects::CScope> & scope)179 InitializeSubject(CRef<blast::CBlastDatabaseArgs> db_args,
180                   CRef<blast::CBlastOptionsHandle> opts_hndl,
181                   bool is_remote_search,
182                   CRef<blast::CLocalDbAdapter>& db_adapter,
183                   CRef<objects::CScope>& scope)
184 {
185 	string dl = kEmptyStr;
186     db_adapter.Reset();
187 
188     _ASSERT(db_args.NotEmpty());
189     CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
190 
191     // Initialize the scope...
192     if (is_remote_search) {
193         const bool is_protein =
194             Blast_SubjectIsProtein(opts_hndl->GetOptions().GetProgramType())
195 			? true : false;
196         SDataLoaderConfig config(is_protein);
197         CBlastScopeSource scope_src(config);
198         // configure scope to fetch sequences remotely for formatting
199         if (scope.NotEmpty()) {
200             scope_src.AddDataLoaders(scope);
201         } else {
202             scope = scope_src.NewScope();
203         }
204     } else {
205         if (scope.Empty()) {
206             scope.Reset(new CScope(*CObjectManager::GetInstance()));
207         }
208     }
209     _ASSERT(scope.NotEmpty());
210 
211     // ... and then the subjects
212     CRef<IQueryFactory> subjects;
213     if ( (subjects = db_args->GetSubjects(scope)) ) {
214         _ASSERT(search_db.Empty());
215 	char* bl2seq_legacy = getenv("BL2SEQ_LEGACY");
216 	if (bl2seq_legacy)
217         	db_adapter.Reset(new CLocalDbAdapter(subjects, opts_hndl, false));
218 	else
219         	db_adapter.Reset(new CLocalDbAdapter(subjects, opts_hndl, true));
220     } else {
221         _ASSERT(search_db.NotEmpty());
222         try {
223             // Try to open the BLAST database even for remote searches, as if
224             // it is available locally, it will be better to fetch the
225             // sequence data for formatting from this (local) source
226             CRef<CSeqDB> seqdb = search_db->GetSeqDb();
227             db_adapter.Reset(new CLocalDbAdapter(*search_db));
228             dl = RegisterOMDataLoader(seqdb);
229             scope->AddDataLoader(dl);
230         } catch (const CSeqDBException&) {
231             // The BLAST database couldn't be found, report this for local
232             // searches, but for remote searches go on.
233         	dl = kEmptyStr;
234             if (is_remote_search ) {
235                 db_adapter.Reset(new CLocalDbAdapter(*search_db));
236             } else {
237                 throw;
238             }
239         }
240     }
241 
242     /// Set the BLASTDB data loader as the default data loader (if applicable)
243     if (search_db.NotEmpty()) {
244         if ( dl != kEmptyStr) {
245             // FIXME: will this work with multiple BLAST DBs?
246             scope->AddDataLoader(dl,  CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
247             _TRACE("Setting " << dl << " priority to "
248                    << (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority
249                    << " for subjects");
250         }
251     }
252     return;
253 }
254 
RegisterOMDataLoader(CRef<CSeqDB> db_handle)255 string RegisterOMDataLoader(CRef<CSeqDB> db_handle)
256 {
257     // the blast formatter requires that the database coexist in
258     // the same scope with the query sequences
259     CRef<CObjectManager> om = CObjectManager::GetInstance();
260     CBlastDbDataLoader::RegisterInObjectManager(*om, db_handle, true,
261                         CObjectManager::eDefault,
262                         CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
263     CBlastDbDataLoader::SBlastDbParam param(db_handle);
264     string retval(CBlastDbDataLoader::GetLoaderNameFromArgs(param));
265     _TRACE("Registering " << retval << " at priority " <<
266            (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority
267            << " for subjects");
268     return retval;
269 }
270 
271 
272 static CRef<blast::CExportStrategy>
s_InitializeExportStrategy(CRef<blast::IQueryFactory> queries,CRef<blast::CBlastDatabaseArgs> db_args,CRef<blast::CBlastOptionsHandle> opts_hndl,const string & client_id,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)273 s_InitializeExportStrategy(CRef<blast::IQueryFactory> queries,
274                       	 CRef<blast::CBlastDatabaseArgs> db_args,
275                       	 CRef<blast::CBlastOptionsHandle> opts_hndl,
276                       	 const string& client_id /* = kEmptyStr */,
277                       	 CRef<objects::CPssmWithParameters> pssm
278                          /* = CRef<objects::CPssmWithParameters>() */,
279                          unsigned int num_iters
280                          /* = 0 */)
281 {
282     _ASSERT(queries || pssm);
283     _ASSERT(db_args);
284     _ASSERT(opts_hndl);
285 
286     CRef<CExportStrategy> retval;
287 
288     CRef<CSearchDatabase> search_db = db_args->GetSearchDatabase();
289     if (search_db.NotEmpty())
290     {
291         if (pssm.NotEmpty())
292         {
293             _ASSERT(queries.Empty());
294             if(num_iters != 0)
295             	retval.Reset(new blast::CExportStrategy(pssm, opts_hndl, search_db, client_id, num_iters));
296             else
297             	retval.Reset(new blast::CExportStrategy(pssm, opts_hndl, search_db, client_id));
298         }
299         else
300         {
301             if(num_iters != 0)
302             	retval.Reset(new blast::CExportStrategy(queries, opts_hndl, search_db, client_id, num_iters));
303             else
304             	retval.Reset(new blast::CExportStrategy(queries, opts_hndl, search_db, client_id));
305         }
306     }
307     else
308     {
309         if (pssm.NotEmpty())
310         {
311             NCBI_THROW(CInputException, eInvalidInput,
312                        "Remote PSI-BL2SEQ is not supported");
313         }
314         else
315         {
316             retval.Reset(new blast::CExportStrategy(queries, opts_hndl,
317             								 db_args->GetSubjects(), client_id));
318         }
319     }
320 
321     return retval;
322 }
323 
324 
325 /// Real implementation of search strategy extraction
326 /// @todo refactor this code so that it can be reused in other contexts
327 static void
s_ExportSearchStrategy(CNcbiOstream * out,CRef<blast::IQueryFactory> queries,CRef<blast::CBlastOptionsHandle> options_handle,CRef<blast::CBlastDatabaseArgs> db_args,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)328 s_ExportSearchStrategy(CNcbiOstream* out,
329                      CRef<blast::IQueryFactory> queries,
330                      CRef<blast::CBlastOptionsHandle> options_handle,
331                      CRef<blast::CBlastDatabaseArgs> db_args,
332                      CRef<objects::CPssmWithParameters> pssm,
333                        /* = CRef<objects::CPssmWithParameters>() */
334                      unsigned int num_iters /* = 0 */)
335 {
336     if ( !out )
337         return;
338 
339     _ASSERT(db_args);
340     _ASSERT(options_handle);
341 
342     try
343     {
344         CRef<CExportStrategy> export_strategy =
345         			s_InitializeExportStrategy(queries, db_args, options_handle,
346                                   	 	 	   kEmptyStr, pssm, num_iters);
347         export_strategy->ExportSearchStrategy_ASN1(out);
348     }
349     catch (const CBlastException& e)
350     {
351         if (e.GetErrCode() == CBlastException::eNotSupported) {
352             NCBI_THROW(CInputException, eInvalidInput,
353                        "Saving search strategies with gi lists is currently "
354                        "not supported");
355         }
356         throw;
357     }
358 }
359 
360 /// Converts a list of Bioseqs into a TSeqLocVector. All Bioseqs are added to
361 /// the same CScope object
362 /// @param subjects Bioseqs to convert
363 static TSeqLocVector
s_ConvertBioseqs2TSeqLocVector(const CBlast4_subject::TSequences & subjects)364 s_ConvertBioseqs2TSeqLocVector(const CBlast4_subject::TSequences& subjects)
365 {
366     TSeqLocVector retval;
367     CRef<CScope> subj_scope(new CScope(*CObjectManager::GetInstance()));
368     ITERATE(CBlast4_subject::TSequences, bioseq, subjects) {
369         subj_scope->AddBioseq(**bioseq);
370         CRef<CSeq_id> seqid = FindBestChoice((*bioseq)->GetId(),
371                                              CSeq_id::BestRank);
372         const TSeqPos length = (*bioseq)->GetInst().GetLength();
373         CRef<CSeq_loc> sl(new CSeq_loc(*seqid, 0, length-1));
374         retval.push_back(SSeqLoc(sl, subj_scope));
375     }
376     return retval;
377 }
378 
379 /// Import PSSM into the command line arguments object
380 static void
s_ImportPssm(const CBlast4_queries & queries,CRef<blast::CBlastOptionsHandle> opts_hndl,blast::CBlastAppArgs * cmdline_args)381 s_ImportPssm(const CBlast4_queries& queries,
382              CRef<blast::CBlastOptionsHandle> opts_hndl,
383              blast::CBlastAppArgs* cmdline_args)
384 {
385     CRef<CPssmWithParameters> pssm
386         (const_cast<CPssmWithParameters*>(&queries.GetPssm()));
387     CPsiBlastAppArgs* psi_args = NULL;
388     CTblastnAppArgs* tbn_args = NULL;
389 
390     if ( (psi_args = dynamic_cast<CPsiBlastAppArgs*>(cmdline_args)) ) {
391         psi_args->SetInputPssm(pssm);
392     } else if ( (tbn_args =
393                  dynamic_cast<CTblastnAppArgs*>(cmdline_args))) {
394         tbn_args->SetInputPssm(pssm);
395     } else {
396         EBlastProgramType p = opts_hndl->GetOptions().GetProgramType();
397         string msg("PSSM found in saved strategy, but not supported ");
398         msg += "for " + Blast_ProgramNameFromType(p);
399         NCBI_THROW(CBlastException, eNotSupported, msg);
400     }
401 }
402 
403 /// Import queries into command line arguments object
404 static void
s_ImportQueries(const CBlast4_queries & queries,CRef<blast::CBlastOptionsHandle> opts_hndl,blast::CBlastAppArgs * cmdline_args)405 s_ImportQueries(const CBlast4_queries& queries,
406                 CRef<blast::CBlastOptionsHandle> opts_hndl,
407                 blast::CBlastAppArgs* cmdline_args)
408 {
409     CRef<CTmpFile> tmpfile(new CTmpFile(CTmpFile::eNoRemove));
410 
411     // Stuff the query bioseq or seqloc list in the input stream of the
412     // cmdline_args
413     if (queries.IsSeq_loc_list()) {
414         const CBlast4_queries::TSeq_loc_list& seqlocs =
415             queries.GetSeq_loc_list();
416         CFastaOstream out(tmpfile->AsOutputFile(CTmpFile::eIfExists_Throw));
417         out.SetFlag(CFastaOstream::eAssembleParts);
418 
419         EBlastProgramType prog = opts_hndl->GetOptions().GetProgramType();
420         SDataLoaderConfig dlconfig(!!Blast_QueryIsProtein(prog));
421         dlconfig.OptimizeForWholeLargeSequenceRetrieval();
422         CBlastScopeSource scope_src(dlconfig);
423         CRef<CScope> scope(scope_src.NewScope());
424 
425         ITERATE(CBlast4_queries::TSeq_loc_list, itr, seqlocs) {
426             if ((*itr)->GetId()) {
427                 CBioseq_Handle bh = scope->GetBioseqHandle(*(*itr)->GetId());
428                 out.Write(bh);
429             }
430         }
431         scope.Reset();
432         scope_src.RevokeBlastDbDataLoader();
433 
434     } else {
435         _ASSERT(queries.IsBioseq_set());
436         const CBlast4_queries::TBioseq_set& bioseqs =
437             queries.GetBioseq_set();
438         CFastaOstream out(tmpfile->AsOutputFile(CTmpFile::eIfExists_Throw));
439         out.SetFlag(CFastaOstream::eAssembleParts);
440 
441         ITERATE(CBioseq_set::TSeq_set, seq_entry, bioseqs.GetSeq_set()){
442             out.Write(**seq_entry);
443         }
444     }
445 
446     const string& fname = tmpfile->GetFileName();
447     tmpfile.Reset(new CTmpFile(fname));
448     cmdline_args->SetInputStream(tmpfile);
449 }
450 
451 /// Import the database and return it in a CBlastDatabaseArgs object
452 static CRef<blast::CBlastDatabaseArgs>
s_ImportDatabase(const CBlast4_subject & subj,CBlastOptionsBuilder & opts_builder,bool subject_is_protein,bool is_remote_search)453 s_ImportDatabase(const CBlast4_subject& subj,
454                  CBlastOptionsBuilder& opts_builder,
455                  bool subject_is_protein,
456                  bool is_remote_search)
457 {
458     _ASSERT(subj.IsDatabase());
459     CRef<CBlastDatabaseArgs> db_args(new CBlastDatabaseArgs);
460     const CSearchDatabase::EMoleculeType mol = subject_is_protein
461         ? CSearchDatabase::eBlastDbIsProtein
462         : CSearchDatabase::eBlastDbIsNucleotide;
463     const string dbname(subj.GetDatabase());
464     CRef<CSearchDatabase> search_db(new CSearchDatabase(dbname, mol));
465 
466     if (opts_builder.HaveEntrezQuery()) {
467         string limit(opts_builder.GetEntrezQuery());
468         search_db->SetEntrezQueryLimitation(limit);
469         if ( !is_remote_search ) {
470             string msg("Entrez query '");
471             msg += limit + string("' will not be processed locally.\n");
472             msg += string("Please use the -remote option.");
473             throw runtime_error(msg);
474         }
475     }
476 
477     if (opts_builder.HaveGiList() || opts_builder.HaveTaxidList()) {
478         CSeqDBGiList *gilist = new CSeqDBGiList();
479         if (opts_builder.HaveGiList()) {
480         	ITERATE(list<TGi>, gi, opts_builder.GetGiList()) {
481         		gilist->AddGi(*gi);
482         	}
483         }
484         if (opts_builder.HaveTaxidList()) {
485         	list<TTaxId>  list = opts_builder.GetTaxidList();
486            	set<TTaxId> taxids(list.begin(), list.end());
487         	gilist->AddTaxIds(taxids);
488         }
489         search_db->SetGiList(gilist);
490     }
491 
492     if (opts_builder.HaveNegativeGiList() || opts_builder.HaveNegativeTaxidList()) {
493     	CSeqDBGiList *gilist = new CSeqDBGiList();
494         if (opts_builder.HaveNegativeGiList()) {
495           	ITERATE(list<TGi>, gi, opts_builder.GetNegativeGiList()) {
496       		gilist->AddGi(*gi);
497        	    }
498         }
499         if (opts_builder.HaveNegativeTaxidList()) {
500         	list<TTaxId>  list = opts_builder.GetNegativeTaxidList();
501            	set<TTaxId> taxids(list.begin(), list.end());
502            	gilist->AddTaxIds(taxids);
503         }
504         search_db->SetNegativeGiList(gilist);
505     }
506 
507     if (opts_builder.HasDbFilteringAlgorithmKey()) {
508         string algo_key = opts_builder.GetDbFilteringAlgorithmKey();
509         ESubjectMaskingType mask_type= eSoftSubjMasking;
510         if(opts_builder.HasSubjectMaskingType())
511         	mask_type = opts_builder.GetSubjectMaskingType();
512         search_db->SetFilteringAlgorithm(algo_key, mask_type);
513 
514     } else if (opts_builder.HasDbFilteringAlgorithmId()) {
515         int algo_id = opts_builder.GetDbFilteringAlgorithmId();
516         ESubjectMaskingType mask_type= eSoftSubjMasking;
517         if(opts_builder.HasSubjectMaskingType())
518         	mask_type = opts_builder.GetSubjectMaskingType();
519         search_db->SetFilteringAlgorithm(algo_id, mask_type);
520     }
521 
522     db_args->SetSearchDatabase(search_db);
523     return db_args;
524 }
525 
526 /// Import the subject sequences into a CBlastDatabaseArgs object
527 static CRef<blast::CBlastDatabaseArgs>
s_ImportSubjects(const CBlast4_subject & subj,bool subject_is_protein)528 s_ImportSubjects(const CBlast4_subject& subj, bool subject_is_protein)
529 {
530     _ASSERT(subj.IsSequences());
531     CRef<CBlastDatabaseArgs> db_args(new CBlastDatabaseArgs);
532     TSeqLocVector subjects =
533         s_ConvertBioseqs2TSeqLocVector(subj.GetSequences());
534     CRef<CScope> subj_scope = subjects.front().scope;
535     CRef<IQueryFactory> subject_factory(new CObjMgr_QueryFactory(subjects));
536     db_args->SetSubjects(subject_factory, subj_scope, subject_is_protein);
537     return db_args;
538 }
539 
540 /// Imports search strategy, using CImportStrategy.
541 static void
s_ImportSearchStrategy(CNcbiIstream * in,blast::CBlastAppArgs * cmdline_args,bool is_remote_search,bool override_query,bool override_subject)542 s_ImportSearchStrategy(CNcbiIstream* in,
543                        blast::CBlastAppArgs* cmdline_args,
544                        bool is_remote_search,
545                        bool override_query,
546                        bool override_subject)
547 {
548     if ( !in ) {
549         return;
550     }
551 
552     CRef<CBlast4_request> b4req;
553     try {
554         b4req = ExtractBlast4Request(*in);
555     } catch (const CSerialException&) {
556         NCBI_THROW(CInputException, eInvalidInput,
557                    "Failed to read search strategy");
558     }
559 
560     CImportStrategy strategy(b4req);
561 
562     CRef<blast::CBlastOptionsHandle> opts_hndl = strategy.GetOptionsHandle();
563     cmdline_args->SetOptionsHandle(opts_hndl);
564     const EBlastProgramType prog = opts_hndl->GetOptions().GetProgramType();
565     cmdline_args->SetTask(strategy.GetTask());
566 #if _DEBUG
567     {
568         char* program_string = 0;
569         BlastNumber2Program(prog, &program_string);
570         _TRACE("EBlastProgramType=" << program_string << " task=" << strategy.GetTask());
571         sfree(program_string);
572     }
573 #endif
574 
575     // Get the subject
576     if (override_subject) {
577         ERR_POST(Warning << "Overriding database/subject in saved strategy");
578     } else {
579         CRef<blast::CBlastDatabaseArgs> db_args;
580         CRef<CBlast4_subject> subj = strategy.GetSubject();
581         const bool subject_is_protein = Blast_SubjectIsProtein(prog) ? true : false;
582 
583         if (subj->IsDatabase()) {
584             db_args = s_ImportDatabase(*subj, strategy.GetOptionsBuilder(),
585                                        subject_is_protein, is_remote_search);
586         } else {
587             db_args = s_ImportSubjects(*subj, subject_is_protein);
588         }
589         _ASSERT(db_args.NotEmpty());
590         cmdline_args->SetBlastDatabaseArgs(db_args);
591     }
592 
593     // Get the query, queries, or pssm
594     if (override_query) {
595         ERR_POST(Warning << "Overriding query in saved strategy");
596     } else {
597         CRef<CBlast4_queries> queries = strategy.GetQueries();
598         if (queries->IsPssm()) {
599             s_ImportPssm(*queries, opts_hndl, cmdline_args);
600         } else {
601             s_ImportQueries(*queries, opts_hndl, cmdline_args);
602         }
603         // Set the range restriction for the query, if applicable
604         const TSeqRange query_range = strategy.GetQueryRange();
605         if (query_range != TSeqRange::GetEmpty()) {
606             cmdline_args->GetQueryOptionsArgs()->SetRange(query_range);
607         }
608     }
609 
610     if ( CPsiBlastAppArgs* psi_args = dynamic_cast<CPsiBlastAppArgs*>(cmdline_args) )
611     {
612             psi_args->SetNumberOfIterations(strategy.GetPsiNumOfIterations());
613     }
614 }
615 
616 bool
RecoverSearchStrategy(const CArgs & args,blast::CBlastAppArgs * cmdline_args)617 RecoverSearchStrategy(const CArgs& args, blast::CBlastAppArgs* cmdline_args)
618 {
619     CNcbiIstream* in = cmdline_args->GetImportSearchStrategyStream(args);
620     if ( !in ) {
621         return false;
622     }
623     const bool is_remote_search =
624         (args.Exist(kArgRemote) && args[kArgRemote].HasValue() && args[kArgRemote].AsBoolean());
625     const bool override_query = (args[kArgQuery].HasValue() &&
626                                  args[kArgQuery].AsString() != kDfltArgQuery);
627     const bool override_subject = CBlastDatabaseArgs::HasBeenSet(args);
628 
629     if (CMbIndexArgs::HasBeenSet(args)) {
630     	if (args[kArgUseIndex].AsBoolean() != kDfltArgUseIndex)
631     		ERR_POST(Warning << "Overriding megablast BLAST DB indexed options in saved strategy");
632     }
633 
634     s_ImportSearchStrategy(in, cmdline_args, is_remote_search, override_query,
635                            override_subject);
636 
637     return true;
638 }
639 
640 // Process search strategies
641 // FIXME: save program options,
642 // Save task if provided, no other options (only those in the cmd line) should
643 // be saved
644 void
SaveSearchStrategy(const CArgs & args,blast::CBlastAppArgs * cmdline_args,CRef<blast::IQueryFactory> queries,CRef<blast::CBlastOptionsHandle> opts_hndl,CRef<objects::CPssmWithParameters> pssm,unsigned int num_iters)645 SaveSearchStrategy(const CArgs& args,
646                    blast::CBlastAppArgs* cmdline_args,
647                    CRef<blast::IQueryFactory> queries,
648                    CRef<blast::CBlastOptionsHandle> opts_hndl,
649                    CRef<objects::CPssmWithParameters> pssm
650                      /* = CRef<objects::CPssmWithParameters>() */,
651                     unsigned int num_iters /* =0 */)
652 {
653     CNcbiOstream* out = cmdline_args->GetExportSearchStrategyStream(args);
654     if ( !out ) {
655         return;
656     }
657 
658     s_ExportSearchStrategy(out, queries, opts_hndl,
659                            cmdline_args->GetBlastDatabaseArgs(),
660                            pssm, num_iters);
661 }
662 
663 /// Extracts the subject sequence IDs and ranges from the BLAST results
664 /// @note if this ever needs to be refactored for popular developer
665 /// consumption, this function should operate on CSeq_align_set as opposed to
666 /// blast::CSearchResultSet
667 static void
s_ExtractSeqidsAndRanges(const blast::CSearchResultSet & results,CScope::TIds & ids,vector<TSeqRange> & ranges)668 s_ExtractSeqidsAndRanges(const blast::CSearchResultSet& results,
669                          CScope::TIds& ids, vector<TSeqRange>& ranges)
670 {
671     static const CSeq_align::TDim kQueryRow = 0;
672     static const CSeq_align::TDim kSubjRow = 1;
673     ids.clear();
674     ranges.clear();
675 
676     typedef map< CConstRef<CSeq_id>,
677                  vector<TSeqRange>,
678                  CConstRefCSeqId_LessThan
679                > TSeqIdRanges;
680     TSeqIdRanges id_ranges;
681 
682     ITERATE(blast::CSearchResultSet, result, results) {
683         if ( !(*result)->HasAlignments() ) {
684             continue;
685         }
686         ITERATE(CSeq_align_set::Tdata, aln, (*result)->GetSeqAlign()->Get()) {
687             CConstRef<CSeq_id> subj(&(*aln)->GetSeq_id(kSubjRow));
688             TSeqRange subj_range((*aln)->GetSeqRange(kSubjRow));
689             if ((*aln)->GetSeqStrand(kQueryRow) == eNa_strand_minus &&
690                 (*aln)->GetSeqStrand(kSubjRow) == eNa_strand_plus) {
691                 TSeqRange r(subj_range);
692                 // flag the range as needed to be flipped once the sequence
693                 // length is known
694                 subj_range.SetFrom(r.GetToOpen());
695                 subj_range.SetToOpen(r.GetFrom());
696             }
697             id_ranges[subj].push_back(subj_range);
698         }
699     }
700 
701     ITERATE(TSeqIdRanges, itr, id_ranges) {
702         ITERATE(vector<TSeqRange>, range, itr->second) {
703             ids.push_back(CSeq_id_Handle::GetHandle(*itr->first));
704             ranges.push_back(*range);
705         }
706     }
707     _ASSERT(ids.size() == ranges.size());
708 }
709 
710 /// Returns true if the remote BLAST DB data loader is being used
711 static bool
s_IsUsingRemoteBlastDbDataLoader()712 s_IsUsingRemoteBlastDbDataLoader()
713 {
714     CObjectManager::TRegisteredNames data_loaders;
715     CObjectManager::GetInstance()->GetRegisteredNames(data_loaders);
716     ITERATE(CObjectManager::TRegisteredNames, name, data_loaders) {
717         if (NStr::StartsWith(*name, objects::CRemoteBlastDbDataLoader::kNamePrefix)) {
718             return true;
719         }
720     }
721     return false;
722 }
723 
724 static bool
s_IsPrefetchFormat(blast::CFormattingArgs::EOutputFormat format_type)725 s_IsPrefetchFormat(blast::CFormattingArgs::EOutputFormat format_type)
726 {
727 	if ((format_type == CFormattingArgs::eAsnText) ||
728 	    (format_type == CFormattingArgs::eAsnBinary) ||
729 	    (format_type == CFormattingArgs::eArchiveFormat)||
730 	    (format_type == CFormattingArgs::eJsonSeqalign)) {
731 		return false;
732 	}
733 	return true;
734 }
735 
736 static bool
s_PreFetchSeqs(const blast::CSearchResultSet & results,blast::CFormattingArgs::EOutputFormat format_type)737 s_PreFetchSeqs(const blast::CSearchResultSet& results,
738 		       blast::CFormattingArgs::EOutputFormat format_type)
739 {
740 	{
741 		char * pre_fetch_limit_str = getenv("PRE_FETCH_SEQS_LIMIT");
742 		if (pre_fetch_limit_str) {
743 			int pre_fetch_limit = NStr::StringToInt(pre_fetch_limit_str);
744 			if(pre_fetch_limit == 0) {
745 				return false;
746 			}
747 			if(pre_fetch_limit == INT_MAX){
748 				return true;
749 			}
750 			int num_of_seqs = 0;
751 			for(unsigned int i=0; i < results.GetNumResults(); i++) {
752 				if(results[i].HasAlignments()) {
753 					num_of_seqs += results[i].GetSeqAlign()->Size();
754 				}
755 			}
756 			if(num_of_seqs > pre_fetch_limit) {
757 				return false;
758 			}
759 		}
760 	}
761 
762 	return s_IsPrefetchFormat(format_type);
763 }
764 
BlastFormatter_PreFetchSequenceData(const blast::CSearchResultSet & results,CRef<CScope> scope,blast::CFormattingArgs::EOutputFormat format_type)765 void BlastFormatter_PreFetchSequenceData(const blast::CSearchResultSet& results,
766 		                                 CRef<CScope> scope,
767 		                                 blast::CFormattingArgs::EOutputFormat format_type)
768 {
769     _ASSERT(scope.NotEmpty());
770     if (results.size() == 0) {
771         return;
772     }
773     if(!s_PreFetchSeqs(results, format_type)){
774     	return;
775     }
776     try {
777        CScope::TIds ids;
778        vector<TSeqRange> ranges;
779        s_ExtractSeqidsAndRanges(results, ids, ranges);
780        _TRACE("Prefetching " << ids.size() << " sequence lengths");
781        LoadSequencesToScope(ids, ranges, scope);
782 	} catch (CException& e) {
783 	   if(s_IsUsingRemoteBlastDbDataLoader()) {
784           NCBI_THROW(CBlastSystemException, eNetworkError,
785                      "Error fetching sequence data from BLAST databases at NCBI, "
786                      "please try again later");
787 	   }
788 	   else {
789           NCBI_RETHROW_SAME(e, "Error pre-fetching sequence data ");
790 	   }
791    }
792 
793 }
794 
795 /// Auxiliary function to extract the ancillary data from the PSSM.
796 CRef<CBlastAncillaryData>
ExtractPssmAncillaryData(const CPssmWithParameters & pssm)797 ExtractPssmAncillaryData(const CPssmWithParameters& pssm)
798 {
799     _ASSERT(pssm.CanGetPssm());
800     pair<double, double> lambda, k, h;
801     lambda.first = pssm.GetPssm().GetLambdaUngapped();
802     lambda.second = pssm.GetPssm().GetLambda();
803     k.first = pssm.GetPssm().GetKappaUngapped();
804     k.second = pssm.GetPssm().GetKappa();
805     h.first = pssm.GetPssm().GetHUngapped();
806     h.second = pssm.GetPssm().GetH();
807     return CRef<CBlastAncillaryData>(new CBlastAncillaryData(lambda, k, h, 0,
808                                                              true));
809 }
810 
811 void
CheckForFreqRatioFile(const string & rps_dbname,CRef<CBlastOptionsHandle> & opt_handle,bool isRpsblast)812 CheckForFreqRatioFile(const string& rps_dbname, CRef<CBlastOptionsHandle>  & opt_handle, bool isRpsblast)
813 {
814     bool use_cbs = (opt_handle->GetOptions().GetCompositionBasedStats() == eNoCompositionBasedStats) ? false : true;
815     if(use_cbs) {
816         vector<string> db;
817         NStr::Split(rps_dbname, " ", db);
818         list<string> failed_db;
819         for (unsigned int i=0; i < db.size(); i++) {
820     	    string path;
821     	    try {
822                 vector<string> dbpath;
823        	        CSeqDB::FindVolumePaths(db[i], CSeqDB::eProtein, dbpath);
824                 path = *dbpath.begin();
825             } catch (const CSeqDBException& e) {
826                  NCBI_RETHROW(e, CBlastException, eRpsInit,
827                               "Cannot retrieve path to RPS database");
828             }
829 
830     	    CFile f(path + ".freq");
831             if(!f.Exists()) {
832             	failed_db.push_back(db[i]);
833             }
834 
835         }
836         if(!failed_db.empty()) {
837         	opt_handle->SetOptions().SetCompositionBasedStats(eNoCompositionBasedStats);
838         	string all_failed = NStr::Join(failed_db, ", ");
839         	string prog_str = isRpsblast ? "RPSBLAST": "DELTABLAST";
840         	string msg = all_failed + " contain(s) no freq ratios " \
841                      	 + "needed for composition-based statistics.\n" \
842                      	 + prog_str + " will be run without composition-based statistics.";
843         	ERR_POST(Warning << msg);
844         }
845 
846     }
847     return;
848 }
849 
850 bool
IsIStreamEmpty(CNcbiIstream & in)851 IsIStreamEmpty(CNcbiIstream & in)
852 {
853 #ifdef NCBI_OS_MSWIN
854 	char c;
855 	in.setf(ios::skipws);
856 	if (!(in >> c))
857 		return true;
858 	in.unget();
859 	return false;
860 #else
861 	char c;
862 	CNcbiStreampos orig_p = in.tellg();
863 	// Piped input
864 	if(orig_p < 0)
865 		return false;
866 
867 	IOS_BASE::iostate orig_state = in.rdstate();
868 	IOS_BASE::fmtflags orig_flags = in.setf(ios::skipws);
869 
870 	if(! (in >> c))
871 		return true;
872 
873 	in.seekg(orig_p);
874 	in.flags(orig_flags);
875 	in.clear();
876 	in.setstate(orig_state);
877 
878 	return false;
879 #endif
880 }
881 
882 string
GetCmdlineArgs(const CNcbiArguments & a)883 GetCmdlineArgs(const CNcbiArguments & a)
884 {
885 	string cmd = kEmptyStr;
886 	for(unsigned int i=0; i < a.Size(); i++) {
887 		cmd += a[i] + " ";
888 	}
889 	return cmd;
890 }
891 
892 bool
UseXInclude(const CFormattingArgs & f,const string & s)893 UseXInclude(const CFormattingArgs & f, const string & s)
894 {
895 	CFormattingArgs::EOutputFormat fmt = f.GetFormattedOutputChoice();
896 	if((fmt ==  CFormattingArgs::eXml2) || (fmt ==  CFormattingArgs::eJson)) {
897 	   if (s == "-"){
898 		   string f_str = (fmt == CFormattingArgs::eXml2) ? "14.": "13.";
899 		   NCBI_THROW(CInputException, eEmptyUserInput,
900 		              "Please provide a file name for outfmt " + f_str);
901 	   }
902 	   return true;
903 	}
904 	return false;
905 }
906 
907 string
GetSubjectFile(const CArgs & args)908 GetSubjectFile(const CArgs& args)
909 {
910 	string filename="";
911 
912 	if (args.Exist(kArgSubject) && args[kArgSubject].HasValue())
913 		filename = args[kArgSubject].AsString();
914 
915 	return filename;
916 }
917 
PrintErrorArchive(const CArgs & a,const list<CRef<CBlast4_error>> & msg)918 void PrintErrorArchive(const CArgs & a, const list<CRef<CBlast4_error> > & msg)
919 {
920 	try {
921 		if(NStr::StringToInt(NStr::TruncateSpaces(a[align_format::kArgOutputFormat].AsString())) == CFormattingArgs::eArchiveFormat) {
922 			CRef<CBlast4_archive> archive (new CBlast4_archive);
923 
924 			CBlast4_request & req = archive->SetRequest();
925 			CBlast4_get_request_info_request & info= req.SetBody().SetGet_request_info();
926 			info.SetRequest_id("Error");
927 			CBlast4_get_search_results_reply & results = archive->SetResults();
928                         // Pacify unused varaible warning, the set above is used to populate mandatory field
929 			(void) results;
930 			archive->SetMessages() = msg;
931 			CBlastFormat::PrintArchive(archive, a[kArgOutput].AsOutputFile());
932 		}
933 	} catch (...) {}
934 }
935 
QueryBatchCleanup()936 void QueryBatchCleanup()
937 {
938 #if defined(NCBI_OS_LINUX) && HAVE_MALLOC_H
939 	malloc_trim(0);
940 #endif
941     return;
942 
943 }
944 
LogQueryInfo(CBlastUsageReport & report,const CBlastInput & q_info)945 void LogQueryInfo(CBlastUsageReport & report, const CBlastInput & q_info)
946 {
947 	report.AddParam(CBlastUsageReport::eTotalQueryLength, q_info.GetTotalLengthProcessed());
948 	report.AddParam(CBlastUsageReport::eNumQueries, q_info.GetNumSeqsProcessed());
949 }
950 
951 
LogRPSBlastOptions(blast::CBlastUsageReport & report,const CBlastOptions & opt)952 void LogRPSBlastOptions(blast::CBlastUsageReport & report, const CBlastOptions & opt)
953 {
954 	report.AddParam(CBlastUsageReport::eProgram, Blast_ProgramNameFromType(opt.GetProgramType()));
955 	report.AddParam(CBlastUsageReport::eEvalueThreshold, opt.GetEvalueThreshold());
956 	report.AddParam(CBlastUsageReport::eHitListSize, opt.GetHitlistSize());
957     report.AddParam(CBlastUsageReport::eCompBasedStats, opt.GetCompositionBasedStats());
958 }
959 
LogRPSCmdOptions(blast::CBlastUsageReport & report,const CBlastAppArgs & args)960 void LogRPSCmdOptions(blast::CBlastUsageReport & report, const CBlastAppArgs & args)
961 {
962 	if (args.GetBlastDatabaseArgs().NotEmpty() &&
963 		args.GetBlastDatabaseArgs()->GetSearchDatabase().NotEmpty() &&
964 		args.GetBlastDatabaseArgs()->GetSearchDatabase()->GetSeqDb().NotEmpty()) {
965 
966 		CRef<CSeqDB> db = args.GetBlastDatabaseArgs()->GetSearchDatabase()->GetSeqDb();
967 		string db_name = db->GetDBNameList();
968 		int off = db_name.find_last_of(CFile::GetPathSeparator());
969 	    if (off != -1) {
970 	    	db_name.erase(0, off+1);
971 		}
972 		report.AddParam(CBlastUsageReport::eDBName, db_name);
973 		report.AddParam(CBlastUsageReport::eDBLength, (Int8) db->GetTotalLength());
974 		report.AddParam(CBlastUsageReport::eDBNumSeqs, db->GetNumSeqs());
975 		report.AddParam(CBlastUsageReport::eDBDate, db->GetDate());
976 	}
977 
978 	if(args.GetFormattingArgs().NotEmpty()){
979 		report.AddParam(CBlastUsageReport::eOutputFmt, args.GetFormattingArgs()->GetFormattedOutputChoice());
980 	}
981 }
982 
GetMTByQueriesBatchSize(EProgram p,int num_threads)983 int GetMTByQueriesBatchSize(EProgram p, int num_threads)
984 {
985 	    int batch_size = 0;
986 
987 		char * mt_query_batch_env = getenv("BLAST_MT_QUERY_BATCH_SIZE");
988 		if (mt_query_batch_env) {
989 			batch_size = NStr::StringToInt(mt_query_batch_env);
990 		}
991 		else {
992 			int factor = 1;
993 			if ( EProgramToEBlastProgramType(p) == eBlastTypeBlastn ) {
994 				factor = 2;
995 			}
996 			batch_size = GetQueryBatchSize(p)/factor;
997 		}
998 		return batch_size;
999 }
1000 
CheckMTByQueries_DBSize(CRef<CLocalDbAdapter> & db_adapter,const CBlastOptions & opt)1001 void CheckMTByQueries_DBSize(CRef<CLocalDbAdapter> & db_adapter, const CBlastOptions & opt)
1002 {
1003 	CRef<CSearchDatabase> sdb = db_adapter->GetSearchDatabase();
1004 	const CRef<CSeqDBGiList>&  gi = sdb->GetGiList();
1005 	const CRef<CSeqDBGiList>& n_gi = sdb->GetNegativeGiList();
1006 
1007 	if (gi.NotEmpty() || n_gi.NotEmpty()) {
1008 		return;
1009 	}
1010 
1011 	EProgram prog = opt.GetProgram();
1012 	CRef<CSeqDB> seqdb = sdb->GetSeqDb();
1013 	Uint8 total_length = seqdb->GetTotalLength();
1014 	Uint8 length_limit = 0;
1015 
1016 	if (db_adapter->IsProtein()) {
1017 		static const Uint8 kMaxProtTotalLength = 2000000000;
1018 		length_limit = kMaxProtTotalLength;
1019 	}
1020 	else {
1021 		static const Uint8 kMaxNuclTotalLength = 14000000000;
1022 		length_limit = kMaxNuclTotalLength;
1023 	}
1024 
1025 	if(total_length > length_limit) {
1026 		string warn = "This database is probably too large to benefit from -mt_mode=1. " \
1027 					  "We suggest using -mt_mode=1 only if the database is less than";
1028 		if (db_adapter->IsProtein()) {
1029 			warn += " 2 billion residues ";
1030 		}
1031 		else {
1032 			warn += " 14 billion bases ";
1033 		}
1034        	ERR_POST(Warning <<   warn + "or if the search is limited by an option such as -taxids, -taxidlist or -gilist.");
1035 	}
1036 	return;
1037 }
1038 
CheckMTByQueries_QuerySize(EProgram prog,int batch_size)1039 void CheckMTByQueries_QuerySize(EProgram prog, int batch_size)
1040 {
1041 	string warning = "This set of queries is too small to fully benefit from the -mt_mode=1 option. "\
1042 			         "The total number of letters should be at least ";
1043 	warning += NStr::IntToString(batch_size);
1044 	EBlastProgramType p = EProgramToEBlastProgramType(prog);
1045 	if (Blast_QueryIsProtein(p)) {
1046 		warning += " residues per thread.";
1047 	}
1048 	else {
1049 		warning += " bases per thread.";
1050 	}
1051     ERR_POST(Warning << warning);
1052 }
1053 
1054 END_NCBI_SCOPE
1055