1 /* ===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author:  Kevin Bealer
26  *
27  */
28 
29 /** @file remote_search.cpp
30  * This file implements the uniform Blast search interface in terms of
31  * the blast4 network API via the CRemoteBlast library.
32  * NOTE: This file contains work in progress and the APIs are likely to change,
33  * please do not rely on them until this notice is removed.
34  */
35 
36 #include <ncbi_pch.hpp>
37 #include <objects/seqalign/seqalign__.hpp>
38 #include <algo/blast/api/remote_search.hpp>
39 #include <algo/blast/api/blast_prot_options.hpp>
40 #include <objects/scoremat/PssmWithParameters.hpp>
41 
42 /** @addtogroup AlgoBlast
43  *
44  * @{
45  */
46 
47 BEGIN_NCBI_SCOPE
48 USING_SCOPE(objects);
BEGIN_SCOPE(blast)49 BEGIN_SCOPE(blast)
50 
51 /// Supporting elements
52 
53 //
54 // Factory
55 //
56 
57 CRef<ISeqSearch>
58 CRemoteSearchFactory::GetSeqSearch()
59 {
60     return CRef<ISeqSearch>(new CRemoteSeqSearch());
61 }
62 
63 CRef<IPssmSearch>
GetPssmSearch()64 CRemoteSearchFactory::GetPssmSearch()
65 {
66     return CRef<IPssmSearch>(new CRemotePssmSearch());
67 }
68 
69 CRef<CBlastOptionsHandle>
GetOptions(EProgram program)70 CRemoteSearchFactory::GetOptions(EProgram program)
71 {
72     CRef<CBlastOptionsHandle> opts
73         (CBlastOptionsFactory::Create(program, CBlastOptions::eRemote));
74 
75     return opts;
76 }
77 
78 //
79 // Seq Search
80 //
81 
x_RemoteBlast()82 CRemoteBlast & CRemoteSeqSearch::x_RemoteBlast()
83 {
84     if (m_RemoteBlast.Empty()) {
85         // Verify all parts accounted for....
86         if (m_SearchOpts.Empty()) {
87             NCBI_THROW(CSearchException, eConfigErr, "No options specified");
88         }
89 
90         if (m_Queries.Empty()) {
91             NCBI_THROW(CSearchException, eConfigErr, "No queries specified");
92         }
93 
94         if (m_Subject.Empty() || m_Subject->GetDatabaseName().empty()) {
95             NCBI_THROW(CSearchException, eConfigErr,
96                        "No database name specified");
97         }
98 
99         // .. Done...
100 
101         m_RemoteBlast.Reset(new CRemoteBlast(& * m_SearchOpts));
102         m_RemoteBlast->SetDatabase(m_Subject->GetDatabaseName());
103         const string& kEntrezQuery = m_Subject->GetEntrezQueryLimitation();
104         if ( !kEntrezQuery.empty() ) {
105             m_RemoteBlast->SetEntrezQuery(kEntrezQuery.c_str());
106         }
107 
108         const CSearchDatabase::TGiList& kGiList =
109             m_Subject->GetGiListLimitation();
110         if ( !kGiList.empty() ) {
111             list<TGi> temp(kGiList.begin(), kGiList.end());
112             m_RemoteBlast->SetGIList(temp);
113         }
114 
115         CRef<CBioseq_set> bss        = m_Queries->GetBioseqSet();
116         IRemoteQueryData::TSeqLocs sll = m_Queries->GetSeqLocs();
117 
118         if ((bss.Empty()) && (sll.empty())) {
119             NCBI_THROW(CSearchException, eConfigErr,
120                        "Empty queries object specified.");
121         }
122 
123         if (bss.NotEmpty()) {
124             m_RemoteBlast->SetQueries(bss);
125         } else {
126             _ASSERT(! sll.empty());
127             m_RemoteBlast->SetQueries(sll);
128         }
129     }
130 
131     return *m_RemoteBlast;
132 }
133 
134 /// Build a result set from results in a remote blast search.
135 ///
136 /// The remote blast object will be queried for results and these will
137 /// be used to build a CSearchResultSet.  If the search has not yet
138 /// completed, this function will wait until it has.
139 ///
140 /// @param rb The remote blast object representing the search.
141 /// @return The results of the search as a CSearchResultSet.
142 static CRef<CSearchResultSet>
143 s_BuildResultsRemote(CRemoteBlast & rb);
144 
145 CRef<CSearchResultSet>
Run()146 CRemoteSeqSearch::Run()
147 {
148     // Calling Run() directly always queues a new search.
149     m_RemoteBlast.Reset();
150     //x_RemoteBlast().SetVerbose();
151     x_RemoteBlast().SubmitSync();
152 
153     const vector<string> & w = x_RemoteBlast().GetWarningVector();
154     m_Warnings.insert(m_Warnings.end(), w.begin(), w.end());
155 
156     return s_BuildResultsRemote(*m_RemoteBlast);
157 }
158 
SetOptions(CRef<CBlastOptionsHandle> opts)159 void CRemoteSeqSearch::SetOptions(CRef<CBlastOptionsHandle> opts)
160 {
161     m_SearchOpts = opts;
162 }
163 
SetSubject(CConstRef<CSearchDatabase> subject)164 void CRemoteSeqSearch::SetSubject(CConstRef<CSearchDatabase> subject)
165 {
166     m_Subject = subject;
167 }
168 
SetQueryFactory(CRef<IQueryFactory> query_factory)169 void CRemoteSeqSearch::SetQueryFactory(CRef<IQueryFactory> query_factory)
170 {
171     if (query_factory.Empty()) {
172         NCBI_THROW(CSearchException, eConfigErr,
173                    "CRemoteSeqSearch: empty query factory was specified.");
174     }
175 
176     m_Queries.Reset(query_factory->MakeRemoteQueryData());
177 }
178 
179 /// CRemoteBlast does not separate each hit to the query in discontinuous
180 /// Seq-aligns, so we do it here. This functionality might be merged with
181 /// CRemoteBlast::GetSeqAlignSets() in the future
182 static TSeqAlignVector
s_SplitAlignVectorBySubjects(TSeqAlignVector seqaligns)183 s_SplitAlignVectorBySubjects(TSeqAlignVector seqaligns)
184 {
185     // For each query...
186     NON_CONST_ITERATE(TSeqAlignVector, itr, seqaligns) {
187         CRef<CSeq_align_set> seq_align = *itr;
188 
189         CRef<CSeq_align_set> new_seq_align(new CSeq_align_set);
190 
191         // set the current Seq-id to an invalid gi
192         CConstRef<CSeq_id> current_subject(new CSeq_id(CSeq_id::e_Gi, 1));
193         // list of HSPs for a single query-subject pair
194         CRef<CSeq_align> current_hsp_list;
195 
196         // for each HSP ...
197         ITERATE(CSeq_align_set::Tdata, hsp_itr, seq_align->Get()) {
198 
199             const int kSubjectIndex = 1;
200             CConstRef<CSeq_id> subj_id(& (*hsp_itr)->GetSeq_id(kSubjectIndex));
201 
202             // new subject sequence (hit) found
203             if (subj_id->Compare(*current_subject) == CSeq_id::e_NO) {
204 
205                 current_subject = subj_id;
206 
207                 if (current_hsp_list.NotEmpty()) {
208                     new_seq_align->Set().push_back(current_hsp_list);
209                 }
210                 current_hsp_list.Reset(new CSeq_align);
211                 current_hsp_list->SetType(CSeq_align::eType_disc);
212                 current_hsp_list->SetDim(2);
213                 current_hsp_list->SetSegs().SetDisc().Set().push_back(*hsp_itr);
214 
215             } else {
216                 // same subject sequence as in previous iteration
217                 current_hsp_list->SetSegs().SetDisc().Set().push_back(*hsp_itr);
218             }
219         }
220         if (current_hsp_list.NotEmpty()) {
221             new_seq_align->Set().push_back(current_hsp_list);
222         }
223 
224         *itr = new_seq_align;
225     }
226     return seqaligns;
227 }
228 
229 static CRef<CSearchResultSet>
s_BuildResultsRemote(CRemoteBlast & rb)230 s_BuildResultsRemote(CRemoteBlast & rb)
231 {
232     // This cascades the warnings and errors: all queries get all
233     // errors and warnings.  At the moment, none of the remote (or for
234     // that matter, local) code seems to have a way to categorize
235     // errors by type and query.
236 
237     // If the query number were known, and the error number were
238     // known, it is possible that the user could (in some cases) cope
239     // with the error or possibly salvage data from the non-failing
240     // requests.
241 
242     // Comments:
243     //
244     // 1. In how many (if any) client code scenarios does error
245     //    recovery makes sense?
246     //
247     // 2. What kinds of errors that are recoverable?
248     //
249     // 3. Does the user ever need to know more than that a request
250     //    found results, found nothing, or produced an error message?
251     //
252     // 4. If a single query fails, how do we avoid pairing the fatal
253     //    error message with non-failing requests.
254 
255     TQueryMessages msgs;
256     CRef<CSearchMessage> msg;
257 
258     // Convert warnings and errors into CSearchMessage objects.
259 
260     ITERATE(vector<string>, iter, rb.GetWarningVector()) {
261         msg.Reset(new CSearchMessage(eBlastSevError, -1, *iter));
262         msgs.push_back(msg);
263     }
264 
265     ITERATE(vector<string>, iter, rb.GetErrorVector()) {
266         msg.Reset(new CSearchMessage(eBlastSevError, -1, *iter));
267         msgs.push_back(msg);
268     }
269 
270     TSeqAlignVector aligns =
271         s_SplitAlignVectorBySubjects(rb.GetSeqAlignSets());
272 
273     // Cascade the messages -- this will result in a lot of CRef<>
274     // sharing but hopefully not too much actual computation.
275 
276     TSearchMessages msg_vec;
277 
278     for(size_t i = 0; i<aligns.size(); i++) {
279         msg_vec.push_back(msgs);
280     }
281 
282     return CRef<CSearchResultSet>(new CSearchResultSet(aligns, msg_vec));
283 }
284 
285 
286 //
287 // Psi Search
288 //
289 
SetOptions(CRef<CBlastOptionsHandle> opts)290 void CRemotePssmSearch::SetOptions(CRef<CBlastOptionsHandle> opts)
291 {
292     m_SearchOpts  = opts;
293     m_RemoteBlast.Reset(new CRemoteBlast(& * opts));
294 }
295 
SetSubject(CConstRef<CSearchDatabase> subject)296 void CRemotePssmSearch::SetSubject(CConstRef<CSearchDatabase> subject)
297 {
298     m_Subject = subject;
299 }
300 
x_RemoteBlast()301 CRemoteBlast & CRemotePssmSearch::x_RemoteBlast()
302 {
303     if (m_RemoteBlast.Empty()) {
304         // Verify all parts accounted for....
305         if (m_SearchOpts.Empty()) {
306             NCBI_THROW(CSearchException, eConfigErr, "No options specified");
307         }
308 
309         if (m_Pssm.Empty()) {
310             NCBI_THROW(CSearchException, eConfigErr, "No queries specified");
311         }
312 
313         if (m_Subject.Empty() || m_Subject->GetDatabaseName().empty()) {
314             NCBI_THROW(CSearchException, eConfigErr,
315                        "No database name specified");
316         }
317 
318         // .. Done...
319 
320         m_RemoteBlast.Reset(new CRemoteBlast(& * m_SearchOpts));
321         m_RemoteBlast->SetDatabase(m_Subject->GetDatabaseName());
322         m_RemoteBlast->SetQueries(m_Pssm);
323 
324         const string& kEntrezQuery = m_Subject->GetEntrezQueryLimitation();
325         if ( !kEntrezQuery.empty() ) {
326             m_RemoteBlast->SetEntrezQuery(kEntrezQuery.c_str());
327         }
328 
329         const CSearchDatabase::TGiList& kGiList =
330             m_Subject->GetGiListLimitation();
331         if ( !kGiList.empty() ) {
332             list<TGi> temp(kGiList.begin(), kGiList.end());
333             m_RemoteBlast->SetGIList(temp);
334         }
335     }
336 
337     return *m_RemoteBlast;
338 }
339 
340 CRef<CSearchResultSet>
Run()341 CRemotePssmSearch::Run()
342 {
343     // Calling Run() directly always queues a new search.
344     m_RemoteBlast.Reset();
345     //x_RemoteBlast().SetVerbose();
346 
347     x_RemoteBlast().SubmitSync();
348 
349     const vector<string> & w = x_RemoteBlast().GetWarningVector();
350     m_Warnings.insert(m_Warnings.end(), w.begin(), w.end());
351 
352     return s_BuildResultsRemote(*m_RemoteBlast);
353 }
354 
355 
SetQuery(CRef<objects::CPssmWithParameters> pssm)356 void CRemotePssmSearch::SetQuery(CRef<objects::CPssmWithParameters> pssm)
357 {
358     if (pssm.Empty()) {
359         NCBI_THROW(CSearchException, eConfigErr,
360                    "CRemotePssmSearch: empty query object was specified.");
361     }
362 
363     m_Pssm = pssm;
364 }
365 
366 END_SCOPE(blast)
367 END_NCBI_SCOPE
368 
369 /* @} */
370