1 /* $Id: kblastapi.cpp 555442 2018-01-18 12:23:05Z madden $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Tom Madden
27 *
28 * File Description:
29 * API for KMER BLAST searches
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <objmgr/object_manager.hpp>
36 #include <objtools/blast/seqdb_reader/seqdb.hpp>
37 #include <algo/blast/blastinput/blast_scope_src.hpp>
38 #include <algo/blast/api/uniform_search.hpp>
39 #include <algo/blast/api/blast_prot_options.hpp>
40 #include <algo/blast/api/blast_advprot_options.hpp>
41 #include <algo/blast/api/local_blast.hpp>
42 #include <algo/blast/api/objmgr_query_data.hpp>
43 #include <math.h>
44
45 #include <algo/blast/api/seqsrc_multiseq.hpp>
46 #include <algo/blast/api/seqinfosrc_seqvec.hpp>
47
48 #include <util/sequtil/sequtil_convert.hpp>
49
50 #include <algo/blast/proteinkmer/blastkmer.hpp>
51 #include <algo/blast/proteinkmer/blastkmerresults.hpp>
52 #include <algo/blast/proteinkmer/blastkmeroptions.hpp>
53 #include <algo/blast/proteinkmer/kblastapi.hpp>
54
55 USING_NCBI_SCOPE;
56 USING_SCOPE(objects);
57 USING_SCOPE(blast);
58
59 /// Places all the needed subject sequences into a scope.
60 /// The database loader (or cache perhaps?) does not seem to be thread safe, so it
61 /// is avoided.
62 static void
s_GetSequencesIntoScope(CRef<CBlastKmerResultsSet> resultSet,CRef<CScope> scope,CRef<CSeqDB> seqdb)63 s_GetSequencesIntoScope(CRef<CBlastKmerResultsSet> resultSet, CRef<CScope> scope, CRef<CSeqDB> seqdb)
64 {
65 int numSearches=resultSet->GetNumQueries();
66 vector<int> oid_v;
67 for (int index=0; index<numSearches; index++)
68 {
69 CBlastKmerResults& results = (*resultSet)[index];
70 TBlastKmerScoreVector scores = results.GetScores();
71 int oid;
72 for(TBlastKmerScoreVector::const_iterator iter=scores.begin(); iter != scores.end(); ++iter)
73 {
74 CRef<CSeq_id> sid = (*iter).first;
75 if (sid->IsGi())
76 seqdb->GiToOid(sid->GetGi(), oid);
77 else
78 seqdb->SeqidToOid(*sid, oid);
79 oid_v.push_back(oid);
80 }
81 }
82 if (oid_v.size() == 0) // Nothing to be loaded.
83 return;
84
85 sort(oid_v.begin(), oid_v.end());
86
87 for(vector<int>::iterator iter=oid_v.begin(); iter!=oid_v.end(); ++iter)
88 {
89 CRef<CBioseq> bioseq = seqdb->GetBioseq(*iter);
90 scope->AddBioseq(*bioseq);
91 }
92 return;
93 }
94
s_AddNewResultSet(CRef<CSearchResultSet> resultSet,CRef<CSearchResultSet> myResultSet)95 void s_AddNewResultSet(CRef<CSearchResultSet> resultSet, CRef<CSearchResultSet> myResultSet)
96 {
97 for (CSearchResultSet::iterator iter=myResultSet->begin(); iter!=myResultSet->end(); ++iter)
98 {
99 resultSet->push_back(*iter);
100 }
101 }
102
s_MakeEmptyResults(CRef<IQueryFactory> qf,const CBlastOptions & opts,CRef<CLocalDbAdapter> dbAdapter,TSearchMessages & msg_vec)103 CRef<CSearchResultSet> s_MakeEmptyResults(CRef<IQueryFactory> qf, const CBlastOptions& opts,
104 CRef<CLocalDbAdapter> dbAdapter, TSearchMessages& msg_vec)
105 {
106 // Search was not run, but we send back an empty CSearchResultSet.
107 CRef<ILocalQueryData> local_query_data = qf->MakeLocalQueryData(&opts);
108 vector< CConstRef<objects::CSeq_id> > seqid_vec;
109 vector< CRef<CBlastAncillaryData> > ancill_vec;
110 TSeqAlignVector sa_vec;
111 size_t index;
112 EResultType res_type = eDatabaseSearch;
113 unsigned int num_subjects = 0;
114 if (dbAdapter.NotEmpty() && !dbAdapter->IsBlastDb() && !dbAdapter->IsDbScanMode()) {
115 res_type = eSequenceComparison;
116 IBlastSeqInfoSrc * subject_infosrc = dbAdapter->MakeSeqInfoSrc();
117 if(subject_infosrc != NULL) {
118 num_subjects = subject_infosrc->Size();
119 }
120 }
121 for (index=0; index<local_query_data->GetNumQueries(); index++)
122 {
123 CConstRef<objects::CSeq_id> query_id(local_query_data->GetSeq_loc(index)->GetId());
124 TQueryMessages q_msg;
125 /// FIXME, PROBLEM??
126 // local_query_data->GetQueryMessages(index, q_msg);
127 // msg_vec.push_back(q_msg);
128 seqid_vec.push_back(query_id);
129 CRef<objects::CSeq_align_set> tmp_align;
130 sa_vec.push_back(tmp_align);
131 pair<double, double> tmp_pair(-1.0, -1.0);
132 CRef<CBlastAncillaryData> tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
133 ancill_vec.push_back(tmp_ancillary_data);
134
135 for(unsigned int i =1; i < num_subjects; i++)
136 {
137 TQueryMessages msg;
138 msg_vec.push_back(msg);
139 seqid_vec.push_back(query_id);
140 CRef<objects::CSeq_align_set> tmp_align;
141 sa_vec.push_back(tmp_align);
142 CRef<CBlastAncillaryData> tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
143 ancill_vec.push_back(tmp_ancillary_data);
144 }
145 }
146 msg_vec.resize(seqid_vec.size()); // FIXME
147 CRef<CSearchResultSet> result_set(new CSearchResultSet(seqid_vec, sa_vec, msg_vec, ancill_vec, 0, res_type));
148 return result_set;
149 }
150
151 /////////////////////////////////////////////////////////////////////////////
152 // Perform a KMER search then a BLAST search.
153
Run(void)154 CRef<CSearchResultSet> CBlastKmerSearch::Run(void)
155 {
156 CRef<CSeqDB> seqdb(new CSeqDB(m_Database->GetDatabaseName(), CSeqDB::eProtein));
157 seqdb->SetNumberOfThreads(1, true);
158
159 if (m_OptsHandle->GetDbLength() == 0)
160 m_OptsHandle->SetDbLength(seqdb->GetTotalLength());
161
162 CBlastpKmerOptionsHandle* kmerOptHndl = dynamic_cast<CBlastpKmerOptionsHandle*> (&*m_OptsHandle);
163 CRef<CBlastKmerOptions> opts(new CBlastKmerOptions()); // KMER specific options.
164 opts->SetThresh(kmerOptHndl->GetThresh());
165 opts->SetMinHits(kmerOptHndl->GetMinHits());
166 opts->SetNumTargetSeqs(kmerOptHndl->GetCandidateSeqs());
167
168 CObjMgr_QueryFactory* objmgr_qf = NULL;
169 TSeqLocVector tsl_v;
170 if ( (objmgr_qf = dynamic_cast<CObjMgr_QueryFactory*>(&*m_QueryFactory)) )
171 {
172 tsl_v = objmgr_qf->GetTSeqLocVector();
173 _ASSERT(!tsl_v.empty());
174 }
175 CRef<CBlastKmer> blastkmer(new CBlastKmer(tsl_v, opts, seqdb));
176 if (!m_GIList.Empty())
177 blastkmer->SetGiListLimit(m_GIList);
178 else if(!m_NegGIList.Empty())
179 blastkmer->SetGiListLimit(m_NegGIList);
180
181 CRef<CBlastKmerResultsSet> resultSet = blastkmer->RunSearches();
182
183 //FIXME: check if all are same or use all?
184 vector< CRef<CScope> > scope_v = objmgr_qf->ExtractScopes();
185 s_GetSequencesIntoScope(resultSet, scope_v[0], seqdb);
186
187 int numSearches=resultSet->GetNumQueries();
188 CRef<CSearchResultSet> search_results(new CSearchResultSet());
189 for (int index=0; index<numSearches; index++)
190 {
191 CBlastKmerResults& results = (*resultSet)[index];
192 if (results.HasErrors() || results.HasWarnings())
193 {
194 TQueryMessages msg = results.GetErrors(eBlastSevWarning);
195 string queryId = msg.GetQueryId();
196 for(TQueryMessages::const_iterator iter=msg.begin(); iter != msg.end(); ++iter)
197 {
198 cerr << queryId << " " << (*iter)->GetMessage() << '\n';
199 }
200 }
201 const TBlastKmerScoreVector& scores = results.GetScores();
202
203 if (scores.size() > 0)
204 {
205 TSeqLocVector subjectTSL;
206 results.GetTSL(subjectTSL, scope_v[0]);
207
208 TSeqLocVector query_vector_temp;
209 query_vector_temp.push_back(tsl_v[index]);
210 CRef<IQueryFactory> qfactory(new CObjMgr_QueryFactory(query_vector_temp));
211
212 BlastSeqSrc* seq_src = MultiSeqBlastSeqSrcInit(subjectTSL, eBlastTypeBlastp, true);
213 CRef<IBlastSeqInfoSrc> seqinfo_src(new CSeqVecSeqInfoSrc(subjectTSL));
214
215 CLocalBlast lcl_blast(qfactory, CRef<CBlastOptionsHandle> (m_OptsHandle.GetPointer()), seq_src, seqinfo_src);
216 CRef<CSearchResultSet> my_blast_results = lcl_blast.Run();
217 s_AddNewResultSet(search_results, my_blast_results);
218 seq_src = BlastSeqSrcFree(seq_src);
219 }
220 else
221 {
222 TSeqLocVector query_vector_temp;
223 query_vector_temp.push_back(tsl_v[index]);
224 CRef<IQueryFactory> qfactory(new CObjMgr_QueryFactory(query_vector_temp));
225 TSearchMessages msg;
226 if (results.HasErrors() || results.HasWarnings())
227 {
228 TQueryMessages qmsg = results.GetErrors(eBlastSevWarning);
229 msg.push_back(qmsg);
230 }
231 CRef<CSearchResultSet> my_blast_results=s_MakeEmptyResults(qfactory, m_OptsHandle->GetOptions(), CRef<CLocalDbAdapter>(NULL), msg);
232 s_AddNewResultSet(search_results, my_blast_results);
233 }
234 }
235
236 return search_results;
237 }
238
239