1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 * Author: Kevin Bealer
26 *
27 */
28
29 /** @file remote_search.cpp
30 * This file implements the uniform Blast search interface in terms of
31 * the blast4 network API via the CRemoteBlast library.
32 * NOTE: This file contains work in progress and the APIs are likely to change,
33 * please do not rely on them until this notice is removed.
34 */
35
36 #include <ncbi_pch.hpp>
37 #include <objects/seqalign/seqalign__.hpp>
38 #include <algo/blast/api/remote_search.hpp>
39 #include <algo/blast/api/blast_prot_options.hpp>
40 #include <objects/scoremat/PssmWithParameters.hpp>
41
42 /** @addtogroup AlgoBlast
43 *
44 * @{
45 */
46
47 BEGIN_NCBI_SCOPE
48 USING_SCOPE(objects);
BEGIN_SCOPE(blast)49 BEGIN_SCOPE(blast)
50
51 /// Supporting elements
52
53 //
54 // Factory
55 //
56
57 CRef<ISeqSearch>
58 CRemoteSearchFactory::GetSeqSearch()
59 {
60 return CRef<ISeqSearch>(new CRemoteSeqSearch());
61 }
62
63 CRef<IPssmSearch>
GetPssmSearch()64 CRemoteSearchFactory::GetPssmSearch()
65 {
66 return CRef<IPssmSearch>(new CRemotePssmSearch());
67 }
68
69 CRef<CBlastOptionsHandle>
GetOptions(EProgram program)70 CRemoteSearchFactory::GetOptions(EProgram program)
71 {
72 CRef<CBlastOptionsHandle> opts
73 (CBlastOptionsFactory::Create(program, CBlastOptions::eRemote));
74
75 return opts;
76 }
77
78 //
79 // Seq Search
80 //
81
x_RemoteBlast()82 CRemoteBlast & CRemoteSeqSearch::x_RemoteBlast()
83 {
84 if (m_RemoteBlast.Empty()) {
85 // Verify all parts accounted for....
86 if (m_SearchOpts.Empty()) {
87 NCBI_THROW(CSearchException, eConfigErr, "No options specified");
88 }
89
90 if (m_Queries.Empty()) {
91 NCBI_THROW(CSearchException, eConfigErr, "No queries specified");
92 }
93
94 if (m_Subject.Empty() || m_Subject->GetDatabaseName().empty()) {
95 NCBI_THROW(CSearchException, eConfigErr,
96 "No database name specified");
97 }
98
99 // .. Done...
100
101 m_RemoteBlast.Reset(new CRemoteBlast(& * m_SearchOpts));
102 m_RemoteBlast->SetDatabase(m_Subject->GetDatabaseName());
103 const string& kEntrezQuery = m_Subject->GetEntrezQueryLimitation();
104 if ( !kEntrezQuery.empty() ) {
105 m_RemoteBlast->SetEntrezQuery(kEntrezQuery.c_str());
106 }
107
108 const CSearchDatabase::TGiList& kGiList =
109 m_Subject->GetGiListLimitation();
110 if ( !kGiList.empty() ) {
111 list<TGi> temp(kGiList.begin(), kGiList.end());
112 m_RemoteBlast->SetGIList(temp);
113 }
114
115 CRef<CBioseq_set> bss = m_Queries->GetBioseqSet();
116 IRemoteQueryData::TSeqLocs sll = m_Queries->GetSeqLocs();
117
118 if ((bss.Empty()) && (sll.empty())) {
119 NCBI_THROW(CSearchException, eConfigErr,
120 "Empty queries object specified.");
121 }
122
123 if (bss.NotEmpty()) {
124 m_RemoteBlast->SetQueries(bss);
125 } else {
126 _ASSERT(! sll.empty());
127 m_RemoteBlast->SetQueries(sll);
128 }
129 }
130
131 return *m_RemoteBlast;
132 }
133
134 /// Build a result set from results in a remote blast search.
135 ///
136 /// The remote blast object will be queried for results and these will
137 /// be used to build a CSearchResultSet. If the search has not yet
138 /// completed, this function will wait until it has.
139 ///
140 /// @param rb The remote blast object representing the search.
141 /// @return The results of the search as a CSearchResultSet.
142 static CRef<CSearchResultSet>
143 s_BuildResultsRemote(CRemoteBlast & rb);
144
145 CRef<CSearchResultSet>
Run()146 CRemoteSeqSearch::Run()
147 {
148 // Calling Run() directly always queues a new search.
149 m_RemoteBlast.Reset();
150 //x_RemoteBlast().SetVerbose();
151 x_RemoteBlast().SubmitSync();
152
153 const vector<string> & w = x_RemoteBlast().GetWarningVector();
154 m_Warnings.insert(m_Warnings.end(), w.begin(), w.end());
155
156 return s_BuildResultsRemote(*m_RemoteBlast);
157 }
158
SetOptions(CRef<CBlastOptionsHandle> opts)159 void CRemoteSeqSearch::SetOptions(CRef<CBlastOptionsHandle> opts)
160 {
161 m_SearchOpts = opts;
162 }
163
SetSubject(CConstRef<CSearchDatabase> subject)164 void CRemoteSeqSearch::SetSubject(CConstRef<CSearchDatabase> subject)
165 {
166 m_Subject = subject;
167 }
168
SetQueryFactory(CRef<IQueryFactory> query_factory)169 void CRemoteSeqSearch::SetQueryFactory(CRef<IQueryFactory> query_factory)
170 {
171 if (query_factory.Empty()) {
172 NCBI_THROW(CSearchException, eConfigErr,
173 "CRemoteSeqSearch: empty query factory was specified.");
174 }
175
176 m_Queries.Reset(query_factory->MakeRemoteQueryData());
177 }
178
179 /// CRemoteBlast does not separate each hit to the query in discontinuous
180 /// Seq-aligns, so we do it here. This functionality might be merged with
181 /// CRemoteBlast::GetSeqAlignSets() in the future
182 static TSeqAlignVector
s_SplitAlignVectorBySubjects(TSeqAlignVector seqaligns)183 s_SplitAlignVectorBySubjects(TSeqAlignVector seqaligns)
184 {
185 // For each query...
186 NON_CONST_ITERATE(TSeqAlignVector, itr, seqaligns) {
187 CRef<CSeq_align_set> seq_align = *itr;
188
189 CRef<CSeq_align_set> new_seq_align(new CSeq_align_set);
190
191 // set the current Seq-id to an invalid gi
192 CConstRef<CSeq_id> current_subject(new CSeq_id(CSeq_id::e_Gi, 1));
193 // list of HSPs for a single query-subject pair
194 CRef<CSeq_align> current_hsp_list;
195
196 // for each HSP ...
197 ITERATE(CSeq_align_set::Tdata, hsp_itr, seq_align->Get()) {
198
199 const int kSubjectIndex = 1;
200 CConstRef<CSeq_id> subj_id(& (*hsp_itr)->GetSeq_id(kSubjectIndex));
201
202 // new subject sequence (hit) found
203 if (subj_id->Compare(*current_subject) == CSeq_id::e_NO) {
204
205 current_subject = subj_id;
206
207 if (current_hsp_list.NotEmpty()) {
208 new_seq_align->Set().push_back(current_hsp_list);
209 }
210 current_hsp_list.Reset(new CSeq_align);
211 current_hsp_list->SetType(CSeq_align::eType_disc);
212 current_hsp_list->SetDim(2);
213 current_hsp_list->SetSegs().SetDisc().Set().push_back(*hsp_itr);
214
215 } else {
216 // same subject sequence as in previous iteration
217 current_hsp_list->SetSegs().SetDisc().Set().push_back(*hsp_itr);
218 }
219 }
220 if (current_hsp_list.NotEmpty()) {
221 new_seq_align->Set().push_back(current_hsp_list);
222 }
223
224 *itr = new_seq_align;
225 }
226 return seqaligns;
227 }
228
229 static CRef<CSearchResultSet>
s_BuildResultsRemote(CRemoteBlast & rb)230 s_BuildResultsRemote(CRemoteBlast & rb)
231 {
232 // This cascades the warnings and errors: all queries get all
233 // errors and warnings. At the moment, none of the remote (or for
234 // that matter, local) code seems to have a way to categorize
235 // errors by type and query.
236
237 // If the query number were known, and the error number were
238 // known, it is possible that the user could (in some cases) cope
239 // with the error or possibly salvage data from the non-failing
240 // requests.
241
242 // Comments:
243 //
244 // 1. In how many (if any) client code scenarios does error
245 // recovery makes sense?
246 //
247 // 2. What kinds of errors that are recoverable?
248 //
249 // 3. Does the user ever need to know more than that a request
250 // found results, found nothing, or produced an error message?
251 //
252 // 4. If a single query fails, how do we avoid pairing the fatal
253 // error message with non-failing requests.
254
255 TQueryMessages msgs;
256 CRef<CSearchMessage> msg;
257
258 // Convert warnings and errors into CSearchMessage objects.
259
260 ITERATE(vector<string>, iter, rb.GetWarningVector()) {
261 msg.Reset(new CSearchMessage(eBlastSevError, -1, *iter));
262 msgs.push_back(msg);
263 }
264
265 ITERATE(vector<string>, iter, rb.GetErrorVector()) {
266 msg.Reset(new CSearchMessage(eBlastSevError, -1, *iter));
267 msgs.push_back(msg);
268 }
269
270 TSeqAlignVector aligns =
271 s_SplitAlignVectorBySubjects(rb.GetSeqAlignSets());
272
273 // Cascade the messages -- this will result in a lot of CRef<>
274 // sharing but hopefully not too much actual computation.
275
276 TSearchMessages msg_vec;
277
278 for(size_t i = 0; i<aligns.size(); i++) {
279 msg_vec.push_back(msgs);
280 }
281
282 return CRef<CSearchResultSet>(new CSearchResultSet(aligns, msg_vec));
283 }
284
285
286 //
287 // Psi Search
288 //
289
SetOptions(CRef<CBlastOptionsHandle> opts)290 void CRemotePssmSearch::SetOptions(CRef<CBlastOptionsHandle> opts)
291 {
292 m_SearchOpts = opts;
293 m_RemoteBlast.Reset(new CRemoteBlast(& * opts));
294 }
295
SetSubject(CConstRef<CSearchDatabase> subject)296 void CRemotePssmSearch::SetSubject(CConstRef<CSearchDatabase> subject)
297 {
298 m_Subject = subject;
299 }
300
x_RemoteBlast()301 CRemoteBlast & CRemotePssmSearch::x_RemoteBlast()
302 {
303 if (m_RemoteBlast.Empty()) {
304 // Verify all parts accounted for....
305 if (m_SearchOpts.Empty()) {
306 NCBI_THROW(CSearchException, eConfigErr, "No options specified");
307 }
308
309 if (m_Pssm.Empty()) {
310 NCBI_THROW(CSearchException, eConfigErr, "No queries specified");
311 }
312
313 if (m_Subject.Empty() || m_Subject->GetDatabaseName().empty()) {
314 NCBI_THROW(CSearchException, eConfigErr,
315 "No database name specified");
316 }
317
318 // .. Done...
319
320 m_RemoteBlast.Reset(new CRemoteBlast(& * m_SearchOpts));
321 m_RemoteBlast->SetDatabase(m_Subject->GetDatabaseName());
322 m_RemoteBlast->SetQueries(m_Pssm);
323
324 const string& kEntrezQuery = m_Subject->GetEntrezQueryLimitation();
325 if ( !kEntrezQuery.empty() ) {
326 m_RemoteBlast->SetEntrezQuery(kEntrezQuery.c_str());
327 }
328
329 const CSearchDatabase::TGiList& kGiList =
330 m_Subject->GetGiListLimitation();
331 if ( !kGiList.empty() ) {
332 list<TGi> temp(kGiList.begin(), kGiList.end());
333 m_RemoteBlast->SetGIList(temp);
334 }
335 }
336
337 return *m_RemoteBlast;
338 }
339
340 CRef<CSearchResultSet>
Run()341 CRemotePssmSearch::Run()
342 {
343 // Calling Run() directly always queues a new search.
344 m_RemoteBlast.Reset();
345 //x_RemoteBlast().SetVerbose();
346
347 x_RemoteBlast().SubmitSync();
348
349 const vector<string> & w = x_RemoteBlast().GetWarningVector();
350 m_Warnings.insert(m_Warnings.end(), w.begin(), w.end());
351
352 return s_BuildResultsRemote(*m_RemoteBlast);
353 }
354
355
SetQuery(CRef<objects::CPssmWithParameters> pssm)356 void CRemotePssmSearch::SetQuery(CRef<objects::CPssmWithParameters> pssm)
357 {
358 if (pssm.Empty()) {
359 NCBI_THROW(CSearchException, eConfigErr,
360 "CRemotePssmSearch: empty query object was specified.");
361 }
362
363 m_Pssm = pssm;
364 }
365
366 END_SCOPE(blast)
367 END_NCBI_SCOPE
368
369 /* @} */
370