1 /* ===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author:  Christiam Camacho
26  *
27  */
28 
29 /** @file local_blast.cpp
30  * NOTE: This file contains work in progress and the APIs are likely to change,
31  * please do not rely on them until this notice is removed.
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <algo/blast/api/local_blast.hpp>
36 #include <algo/blast/api/uniform_search.hpp>
37 #include <algo/blast/api/blast_seqinfosrc.hpp>
38 #include "blast_aux_priv.hpp"
39 #include <objects/scoremat/PssmWithParameters.hpp>
40 #include <algo/blast/api/seqinfosrc_seqdb.hpp>
41 #include <algo/blast/api/blast_dbindex.hpp>
42 #include <util/profile/rtprofile.hpp>
43 
44 /** @addtogroup AlgoBlast
45  *
46  * @{
47  */
48 
49 BEGIN_NCBI_SCOPE
50 USING_SCOPE(objects);
BEGIN_SCOPE(blast)51 BEGIN_SCOPE(blast)
52 
53 size_t
54 SplitQuery_GetChunkSize(EProgram program)
55 {
56     size_t retval = 0;
57 
58     // used for experimentation purposes
59     char* chunk_sz_str = getenv("CHUNK_SIZE");
60     if (chunk_sz_str && !NStr::IsBlank(chunk_sz_str)) {
61         retval = NStr::StringToInt(chunk_sz_str);
62         _TRACE("Using query chunk size from environment " << retval);
63     } else {
64 
65         switch (program) {
66         case eBlastn:
67             retval = 1000000;
68             break;
69         case eMegablast:
70         case eDiscMegablast:
71         case eMapper:
72             retval = 5000000;
73             break;
74         case eTblastn:
75             retval = 20000;
76             break;
77         // if the query will be translated, round the chunk size up to the next
78         // multiple of 3, that way, when the nucleotide sequence(s) get(s)
79         // split, context N%6 in one chunk will have the same frame as context
80         // N%6 in the next chunk
81         case eBlastx:
82         case eTblastx:
83             // N.B.: the splitting is done on the nucleotide query sequences,
84             // then each of these chunks is translated
85             retval = 10002;
86             break;
87         case eVecScreen:
88         	// Disable query splitting for vecscreen
89         	retval =1;
90         	break;
91         case eBlastp:
92         default:
93             retval = 10000;
94             break;
95         }
96     }
97 
98     const EBlastProgramType prog_type(EProgramToEBlastProgramType(program));
99     if (Blast_QueryIsTranslated(prog_type) && !Blast_SubjectIsPssm(prog_type) &&
100         (retval % CODON_LENGTH) != 0) {
101         NCBI_THROW(CBlastException, eInvalidArgument,
102                    "Split query chunk size must be divisible by 3");
103     }
104     _TRACE("Returning query chunk size " << retval << " for " << EProgramToTaskName(program));
105     return retval;
106 }
107 
CLocalBlast(CRef<IQueryFactory> qf,CRef<CBlastOptionsHandle> opts_handle,const CSearchDatabase & dbinfo)108 CLocalBlast::CLocalBlast(CRef<IQueryFactory> qf,
109                          CRef<CBlastOptionsHandle> opts_handle,
110                          const CSearchDatabase& dbinfo)
111 : m_QueryFactory    (qf),
112   m_Opts            (const_cast<CBlastOptions*>(&opts_handle->GetOptions())),
113   m_InternalData    (0),
114   m_PrelimSearch    (new CBlastPrelimSearch(qf, m_Opts, dbinfo)),
115   m_TbackSearch     (0)
116 {}
117 
CLocalBlast(CRef<IQueryFactory> qf,CRef<CBlastOptionsHandle> opts_handle,CRef<CLocalDbAdapter> db)118 CLocalBlast::CLocalBlast(CRef<IQueryFactory> qf,
119                          CRef<CBlastOptionsHandle> opts_handle,
120                          CRef<CLocalDbAdapter> db)
121 : m_QueryFactory    (qf),
122   m_Opts            (const_cast<CBlastOptions*>(&opts_handle->GetOptions())),
123   m_InternalData    (0),
124   m_PrelimSearch    (new CBlastPrelimSearch(qf, m_Opts, db)),
125   m_TbackSearch     (0),
126   m_LocalDbAdapter  (db.GetNonNullPointer())
127 {}
128 
CLocalBlast(CRef<IQueryFactory> qf,CRef<CBlastOptionsHandle> opts_handle,BlastSeqSrc * seqsrc,CRef<IBlastSeqInfoSrc> seqInfoSrc)129 CLocalBlast::CLocalBlast(CRef<IQueryFactory> qf,
130                          CRef<CBlastOptionsHandle> opts_handle,
131                          BlastSeqSrc* seqsrc,
132                          CRef<IBlastSeqInfoSrc> seqInfoSrc)
133 : m_QueryFactory    (qf),
134   m_Opts            (const_cast<CBlastOptions*>(&opts_handle->GetOptions())),
135   m_InternalData    (0),
136   m_PrelimSearch    (new CBlastPrelimSearch(qf, m_Opts, seqsrc,
137                                             CRef<CPssmWithParameters>())),
138   m_TbackSearch     (0),
139   m_SeqInfoSrc      (seqInfoSrc)
140 {}
141 
142 /** FIXME: this should be removed as soon as we safely can
143  * We will be able to do this once we are guaranteed that every
144  * constructor to CLocalBlast takes or can construct a IBlastSeqInfoSrc
145  * on it's own.
146  * This function is dangerous as it assumes that the BlastSeqSrc
147  * is based upon CSeqDB, which is not guaranteed.
148  */
149 static IBlastSeqInfoSrc*
s_InitSeqInfoSrc(const BlastSeqSrc * seqsrc)150 s_InitSeqInfoSrc(const BlastSeqSrc* seqsrc)
151 {
152      string db_name;
153      if (const char* seqsrc_name = BlastSeqSrcGetName(seqsrc)) {
154          db_name.assign(seqsrc_name);
155      }
156      if (db_name.empty()) {
157          NCBI_THROW(CBlastException, eNotSupported,
158                     "BlastSeqSrc does not provide a name, probably it is not a"
159                     " BLAST database");
160      }
161      bool is_prot = BlastSeqSrcGetIsProt(seqsrc) ? true : false;
162      return new CSeqDbSeqInfoSrc(db_name, is_prot);
163 }
164 
165 
166 CRef<CSearchResultSet>
Run()167 CLocalBlast::Run()
168 {
169     _ASSERT(m_QueryFactory);
170     _ASSERT(m_PrelimSearch);
171     _ASSERT(m_Opts);
172     BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.SETUP.START") );
173     // Note: we need to pass the search messages ...
174     // filtered query regions should be masked in the BLAST_SequenceBlk
175     // already.
176 
177     int status = m_PrelimSearch->CheckInternalData();
178     if (status != 0)
179     {
180          // Search was not run, but we send back an empty CSearchResultSet.
181          CRef<ILocalQueryData> local_query_data = m_QueryFactory->MakeLocalQueryData(m_Opts);
182          // TSeqLocVector slv = m_QueryFactory.GetTSeqLocVector();
183          vector< CConstRef<objects::CSeq_id> > seqid_vec;
184          vector< CRef<CBlastAncillaryData> > ancill_vec;
185          TSeqAlignVector sa_vec;
186          size_t index;
187          EResultType res_type = eDatabaseSearch;
188          unsigned int num_subjects = 0;
189          if (m_LocalDbAdapter.NotEmpty() && !m_LocalDbAdapter->IsBlastDb() && !m_LocalDbAdapter->IsDbScanMode()) {
190         	 res_type = eSequenceComparison;
191              IBlastSeqInfoSrc *  subject_infosrc = m_LocalDbAdapter->MakeSeqInfoSrc();
192              if(subject_infosrc != NULL) {
193             	 num_subjects = subject_infosrc->Size();
194              }
195          }
196          TSearchMessages msg_vec;
197          for (index=0; index<local_query_data->GetNumQueries(); index++)
198          {
199               CConstRef<objects::CSeq_id> query_id(local_query_data->GetSeq_loc(index)->GetId());
200               TQueryMessages q_msg;
201               local_query_data->GetQueryMessages(index, q_msg);
202               msg_vec.push_back(q_msg);
203               seqid_vec.push_back(query_id);
204               CRef<objects::CSeq_align_set> tmp_align;
205               sa_vec.push_back(tmp_align);
206               pair<double, double> tmp_pair(-1.0, -1.0);
207               CRef<CBlastAncillaryData>  tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
208               ancill_vec.push_back(tmp_ancillary_data);
209 
210               for(unsigned int i =1; i < num_subjects; i++)
211               {
212             	  TQueryMessages msg;
213                   msg_vec.push_back(msg);
214                   seqid_vec.push_back(query_id);
215                   CRef<objects::CSeq_align_set> tmp_align;
216                   sa_vec.push_back(tmp_align);
217            	      CRef<CBlastAncillaryData>  tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
218            	      ancill_vec.push_back(tmp_ancillary_data);
219               }
220          }
221          msg_vec.Combine(m_PrelimSearch->GetSearchMessages());
222 
223          CRef<CSearchResultSet> result_set(new CSearchResultSet(seqid_vec, sa_vec, msg_vec, ancill_vec, 0, res_type));
224          return result_set;
225     }
226 
227     BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.SETUP.STOP") );
228     BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.PRE.START") );
229 	try {
230 	    m_PrelimSearch->SetNumberOfThreads(GetNumberOfThreads());
231 	    m_InternalData = m_PrelimSearch->Run();
232 	} catch( CIndexedDbException & ) {
233 	    throw;
234 	} catch (CBlastException & e) {
235 		if(e.GetErrCode() == CBlastException::eCoreBlastError) {
236 			throw;
237 		}
238 	}catch (...) {
239 	}
240 
241     //_ASSERT(m_InternalData);
242     BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.PRE.STOP") );
243     BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.TB.START") );
244 
245     TSearchMessages search_msgs = m_PrelimSearch->GetSearchMessages();
246 
247     CRef<IBlastSeqInfoSrc> seqinfo_src;
248 
249     if (m_SeqInfoSrc.NotEmpty())
250     {
251         // Use the SeqInfoSrc provided by the user during construction
252         seqinfo_src = m_SeqInfoSrc;
253     }
254     else if (m_LocalDbAdapter.NotEmpty()) {
255         // This path is preferred because it preserves the GI list
256         // limitation if there is one.  DBs with both internal OID
257         // filtering and user GI list filtering will not do complete
258         // filtering during the traceback stage, which can cause
259         // 'Unknown defline' errors during formatting.
260 
261         seqinfo_src.Reset(m_LocalDbAdapter->MakeSeqInfoSrc());
262     } else {
263         seqinfo_src.Reset(s_InitSeqInfoSrc(m_InternalData->m_SeqSrc->GetPointer()));
264     }
265 
266     m_TbackSearch.Reset(new CBlastTracebackSearch(m_QueryFactory,
267                                                   m_InternalData,
268                                                   m_Opts,
269                                                   seqinfo_src,
270                                                   search_msgs));
271     if (m_LocalDbAdapter.NotEmpty() && !m_LocalDbAdapter->IsBlastDb()
272 	&& !m_LocalDbAdapter->IsDbScanMode()) {
273         m_TbackSearch->SetResultType(eSequenceComparison);
274     }
275     m_TbackSearch->SetNumberOfThreads(GetNumberOfThreads());
276     CRef<CSearchResultSet> retval = m_TbackSearch->Run();
277     retval->SetFilteredQueryRegions(m_PrelimSearch->GetFilteredQueryRegions());
278     m_Messages = m_TbackSearch->GetSearchMessages();
279 
280     BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.TB.STOP") );
281     return retval;
282 }
283 
GetNumExtensions()284 Int4 CLocalBlast::GetNumExtensions()
285 {
286     Int4 retv = 0;
287     if (m_InternalData) {
288         BlastDiagnostics * diag = m_InternalData->m_Diagnostics->GetPointer();
289         if (diag && diag->ungapped_stat) {
290              retv = diag->ungapped_stat->good_init_extends;
291         }
292     }
293     return retv;
294 }
295 
GetDiagnostics()296 BlastDiagnostics* CLocalBlast::GetDiagnostics()
297 {
298     BlastDiagnostics* diag=NULL;
299 
300     if (m_InternalData) {
301         diag = Blast_DiagnosticsCopy(m_InternalData->m_Diagnostics->GetPointer());
302     }
303 
304     return diag;
305 }
306 
307 END_SCOPE(blast)
308 END_NCBI_SCOPE
309 
310 /* @} */
311