1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 * Author: Christiam Camacho
26 *
27 */
28
29 /** @file local_blast.cpp
30 * NOTE: This file contains work in progress and the APIs are likely to change,
31 * please do not rely on them until this notice is removed.
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <algo/blast/api/local_blast.hpp>
36 #include <algo/blast/api/uniform_search.hpp>
37 #include <algo/blast/api/blast_seqinfosrc.hpp>
38 #include "blast_aux_priv.hpp"
39 #include <objects/scoremat/PssmWithParameters.hpp>
40 #include <algo/blast/api/seqinfosrc_seqdb.hpp>
41 #include <algo/blast/api/blast_dbindex.hpp>
42 #include <util/profile/rtprofile.hpp>
43
44 /** @addtogroup AlgoBlast
45 *
46 * @{
47 */
48
49 BEGIN_NCBI_SCOPE
50 USING_SCOPE(objects);
BEGIN_SCOPE(blast)51 BEGIN_SCOPE(blast)
52
53 size_t
54 SplitQuery_GetChunkSize(EProgram program)
55 {
56 size_t retval = 0;
57
58 // used for experimentation purposes
59 char* chunk_sz_str = getenv("CHUNK_SIZE");
60 if (chunk_sz_str && !NStr::IsBlank(chunk_sz_str)) {
61 retval = NStr::StringToInt(chunk_sz_str);
62 _TRACE("Using query chunk size from environment " << retval);
63 } else {
64
65 switch (program) {
66 case eBlastn:
67 retval = 1000000;
68 break;
69 case eMegablast:
70 case eDiscMegablast:
71 case eMapper:
72 retval = 5000000;
73 break;
74 case eTblastn:
75 retval = 20000;
76 break;
77 // if the query will be translated, round the chunk size up to the next
78 // multiple of 3, that way, when the nucleotide sequence(s) get(s)
79 // split, context N%6 in one chunk will have the same frame as context
80 // N%6 in the next chunk
81 case eBlastx:
82 case eTblastx:
83 // N.B.: the splitting is done on the nucleotide query sequences,
84 // then each of these chunks is translated
85 retval = 10002;
86 break;
87 case eVecScreen:
88 // Disable query splitting for vecscreen
89 retval =1;
90 break;
91 case eBlastp:
92 default:
93 retval = 10000;
94 break;
95 }
96 }
97
98 const EBlastProgramType prog_type(EProgramToEBlastProgramType(program));
99 if (Blast_QueryIsTranslated(prog_type) && !Blast_SubjectIsPssm(prog_type) &&
100 (retval % CODON_LENGTH) != 0) {
101 NCBI_THROW(CBlastException, eInvalidArgument,
102 "Split query chunk size must be divisible by 3");
103 }
104 _TRACE("Returning query chunk size " << retval << " for " << EProgramToTaskName(program));
105 return retval;
106 }
107
CLocalBlast(CRef<IQueryFactory> qf,CRef<CBlastOptionsHandle> opts_handle,const CSearchDatabase & dbinfo)108 CLocalBlast::CLocalBlast(CRef<IQueryFactory> qf,
109 CRef<CBlastOptionsHandle> opts_handle,
110 const CSearchDatabase& dbinfo)
111 : m_QueryFactory (qf),
112 m_Opts (const_cast<CBlastOptions*>(&opts_handle->GetOptions())),
113 m_InternalData (0),
114 m_PrelimSearch (new CBlastPrelimSearch(qf, m_Opts, dbinfo)),
115 m_TbackSearch (0)
116 {}
117
CLocalBlast(CRef<IQueryFactory> qf,CRef<CBlastOptionsHandle> opts_handle,CRef<CLocalDbAdapter> db)118 CLocalBlast::CLocalBlast(CRef<IQueryFactory> qf,
119 CRef<CBlastOptionsHandle> opts_handle,
120 CRef<CLocalDbAdapter> db)
121 : m_QueryFactory (qf),
122 m_Opts (const_cast<CBlastOptions*>(&opts_handle->GetOptions())),
123 m_InternalData (0),
124 m_PrelimSearch (new CBlastPrelimSearch(qf, m_Opts, db)),
125 m_TbackSearch (0),
126 m_LocalDbAdapter (db.GetNonNullPointer())
127 {}
128
CLocalBlast(CRef<IQueryFactory> qf,CRef<CBlastOptionsHandle> opts_handle,BlastSeqSrc * seqsrc,CRef<IBlastSeqInfoSrc> seqInfoSrc)129 CLocalBlast::CLocalBlast(CRef<IQueryFactory> qf,
130 CRef<CBlastOptionsHandle> opts_handle,
131 BlastSeqSrc* seqsrc,
132 CRef<IBlastSeqInfoSrc> seqInfoSrc)
133 : m_QueryFactory (qf),
134 m_Opts (const_cast<CBlastOptions*>(&opts_handle->GetOptions())),
135 m_InternalData (0),
136 m_PrelimSearch (new CBlastPrelimSearch(qf, m_Opts, seqsrc,
137 CRef<CPssmWithParameters>())),
138 m_TbackSearch (0),
139 m_SeqInfoSrc (seqInfoSrc)
140 {}
141
142 /** FIXME: this should be removed as soon as we safely can
143 * We will be able to do this once we are guaranteed that every
144 * constructor to CLocalBlast takes or can construct a IBlastSeqInfoSrc
145 * on it's own.
146 * This function is dangerous as it assumes that the BlastSeqSrc
147 * is based upon CSeqDB, which is not guaranteed.
148 */
149 static IBlastSeqInfoSrc*
s_InitSeqInfoSrc(const BlastSeqSrc * seqsrc)150 s_InitSeqInfoSrc(const BlastSeqSrc* seqsrc)
151 {
152 string db_name;
153 if (const char* seqsrc_name = BlastSeqSrcGetName(seqsrc)) {
154 db_name.assign(seqsrc_name);
155 }
156 if (db_name.empty()) {
157 NCBI_THROW(CBlastException, eNotSupported,
158 "BlastSeqSrc does not provide a name, probably it is not a"
159 " BLAST database");
160 }
161 bool is_prot = BlastSeqSrcGetIsProt(seqsrc) ? true : false;
162 return new CSeqDbSeqInfoSrc(db_name, is_prot);
163 }
164
165
166 CRef<CSearchResultSet>
Run()167 CLocalBlast::Run()
168 {
169 _ASSERT(m_QueryFactory);
170 _ASSERT(m_PrelimSearch);
171 _ASSERT(m_Opts);
172 BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.SETUP.START") );
173 // Note: we need to pass the search messages ...
174 // filtered query regions should be masked in the BLAST_SequenceBlk
175 // already.
176
177 int status = m_PrelimSearch->CheckInternalData();
178 if (status != 0)
179 {
180 // Search was not run, but we send back an empty CSearchResultSet.
181 CRef<ILocalQueryData> local_query_data = m_QueryFactory->MakeLocalQueryData(m_Opts);
182 // TSeqLocVector slv = m_QueryFactory.GetTSeqLocVector();
183 vector< CConstRef<objects::CSeq_id> > seqid_vec;
184 vector< CRef<CBlastAncillaryData> > ancill_vec;
185 TSeqAlignVector sa_vec;
186 size_t index;
187 EResultType res_type = eDatabaseSearch;
188 unsigned int num_subjects = 0;
189 if (m_LocalDbAdapter.NotEmpty() && !m_LocalDbAdapter->IsBlastDb() && !m_LocalDbAdapter->IsDbScanMode()) {
190 res_type = eSequenceComparison;
191 IBlastSeqInfoSrc * subject_infosrc = m_LocalDbAdapter->MakeSeqInfoSrc();
192 if(subject_infosrc != NULL) {
193 num_subjects = subject_infosrc->Size();
194 }
195 }
196 TSearchMessages msg_vec;
197 for (index=0; index<local_query_data->GetNumQueries(); index++)
198 {
199 CConstRef<objects::CSeq_id> query_id(local_query_data->GetSeq_loc(index)->GetId());
200 TQueryMessages q_msg;
201 local_query_data->GetQueryMessages(index, q_msg);
202 msg_vec.push_back(q_msg);
203 seqid_vec.push_back(query_id);
204 CRef<objects::CSeq_align_set> tmp_align;
205 sa_vec.push_back(tmp_align);
206 pair<double, double> tmp_pair(-1.0, -1.0);
207 CRef<CBlastAncillaryData> tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
208 ancill_vec.push_back(tmp_ancillary_data);
209
210 for(unsigned int i =1; i < num_subjects; i++)
211 {
212 TQueryMessages msg;
213 msg_vec.push_back(msg);
214 seqid_vec.push_back(query_id);
215 CRef<objects::CSeq_align_set> tmp_align;
216 sa_vec.push_back(tmp_align);
217 CRef<CBlastAncillaryData> tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
218 ancill_vec.push_back(tmp_ancillary_data);
219 }
220 }
221 msg_vec.Combine(m_PrelimSearch->GetSearchMessages());
222
223 CRef<CSearchResultSet> result_set(new CSearchResultSet(seqid_vec, sa_vec, msg_vec, ancill_vec, 0, res_type));
224 return result_set;
225 }
226
227 BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.SETUP.STOP") );
228 BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.PRE.START") );
229 try {
230 m_PrelimSearch->SetNumberOfThreads(GetNumberOfThreads());
231 m_InternalData = m_PrelimSearch->Run();
232 } catch( CIndexedDbException & ) {
233 throw;
234 } catch (CBlastException & e) {
235 if(e.GetErrCode() == CBlastException::eCoreBlastError) {
236 throw;
237 }
238 }catch (...) {
239 }
240
241 //_ASSERT(m_InternalData);
242 BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.PRE.STOP") );
243 BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.TB.START") );
244
245 TSearchMessages search_msgs = m_PrelimSearch->GetSearchMessages();
246
247 CRef<IBlastSeqInfoSrc> seqinfo_src;
248
249 if (m_SeqInfoSrc.NotEmpty())
250 {
251 // Use the SeqInfoSrc provided by the user during construction
252 seqinfo_src = m_SeqInfoSrc;
253 }
254 else if (m_LocalDbAdapter.NotEmpty()) {
255 // This path is preferred because it preserves the GI list
256 // limitation if there is one. DBs with both internal OID
257 // filtering and user GI list filtering will not do complete
258 // filtering during the traceback stage, which can cause
259 // 'Unknown defline' errors during formatting.
260
261 seqinfo_src.Reset(m_LocalDbAdapter->MakeSeqInfoSrc());
262 } else {
263 seqinfo_src.Reset(s_InitSeqInfoSrc(m_InternalData->m_SeqSrc->GetPointer()));
264 }
265
266 m_TbackSearch.Reset(new CBlastTracebackSearch(m_QueryFactory,
267 m_InternalData,
268 m_Opts,
269 seqinfo_src,
270 search_msgs));
271 if (m_LocalDbAdapter.NotEmpty() && !m_LocalDbAdapter->IsBlastDb()
272 && !m_LocalDbAdapter->IsDbScanMode()) {
273 m_TbackSearch->SetResultType(eSequenceComparison);
274 }
275 m_TbackSearch->SetNumberOfThreads(GetNumberOfThreads());
276 CRef<CSearchResultSet> retval = m_TbackSearch->Run();
277 retval->SetFilteredQueryRegions(m_PrelimSearch->GetFilteredQueryRegions());
278 m_Messages = m_TbackSearch->GetSearchMessages();
279
280 BLAST_PROF_MARK2( m_batch_num_str + string("_BLAST.TB.STOP") );
281 return retval;
282 }
283
GetNumExtensions()284 Int4 CLocalBlast::GetNumExtensions()
285 {
286 Int4 retv = 0;
287 if (m_InternalData) {
288 BlastDiagnostics * diag = m_InternalData->m_Diagnostics->GetPointer();
289 if (diag && diag->ungapped_stat) {
290 retv = diag->ungapped_stat->good_init_extends;
291 }
292 }
293 return retv;
294 }
295
GetDiagnostics()296 BlastDiagnostics* CLocalBlast::GetDiagnostics()
297 {
298 BlastDiagnostics* diag=NULL;
299
300 if (m_InternalData) {
301 diag = Blast_DiagnosticsCopy(m_InternalData->m_Diagnostics->GetPointer());
302 }
303
304 return diag;
305 }
306
307 END_SCOPE(blast)
308 END_NCBI_SCOPE
309
310 /* @} */
311