1 /*  $Id: kblastapi.cpp 555442 2018-01-18 12:23:05Z madden $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Tom Madden
27  *
28  * File Description:
29  *   API for KMER BLAST searches
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <objmgr/object_manager.hpp>
36 #include <objtools/blast/seqdb_reader/seqdb.hpp>
37 #include <algo/blast/blastinput/blast_scope_src.hpp>
38 #include <algo/blast/api/uniform_search.hpp>
39 #include <algo/blast/api/blast_prot_options.hpp>
40 #include <algo/blast/api/blast_advprot_options.hpp>
41 #include <algo/blast/api/local_blast.hpp>
42 #include <algo/blast/api/objmgr_query_data.hpp>
43 #include <math.h>
44 
45 #include <algo/blast/api/seqsrc_multiseq.hpp>
46 #include <algo/blast/api/seqinfosrc_seqvec.hpp>
47 
48 #include <util/sequtil/sequtil_convert.hpp>
49 
50 #include <algo/blast/proteinkmer/blastkmer.hpp>
51 #include <algo/blast/proteinkmer/blastkmerresults.hpp>
52 #include <algo/blast/proteinkmer/blastkmeroptions.hpp>
53 #include <algo/blast/proteinkmer/kblastapi.hpp>
54 
55 USING_NCBI_SCOPE;
56 USING_SCOPE(objects);
57 USING_SCOPE(blast);
58 
59 /// Places all the needed subject sequences into a scope.
60 /// The database loader (or cache perhaps?) does not seem to be thread safe, so it
61 /// is avoided.
62 static void
s_GetSequencesIntoScope(CRef<CBlastKmerResultsSet> resultSet,CRef<CScope> scope,CRef<CSeqDB> seqdb)63 s_GetSequencesIntoScope(CRef<CBlastKmerResultsSet> resultSet, CRef<CScope> scope, CRef<CSeqDB> seqdb)
64 {
65         int numSearches=resultSet->GetNumQueries();
66         vector<int> oid_v;
67         for (int index=0; index<numSearches; index++)
68         {
69                 CBlastKmerResults& results = (*resultSet)[index];
70                 TBlastKmerScoreVector scores = results.GetScores();
71                 int oid;
72                 for(TBlastKmerScoreVector::const_iterator iter=scores.begin(); iter != scores.end(); ++iter)
73                 {
74                         CRef<CSeq_id> sid = (*iter).first;
75 			if (sid->IsGi())
76 				seqdb->GiToOid(sid->GetGi(), oid);
77 			else
78                         	seqdb->SeqidToOid(*sid, oid);
79                         oid_v.push_back(oid);
80                 }
81         }
82         if (oid_v.size() == 0) // Nothing to be loaded.
83                 return;
84 
85         sort(oid_v.begin(), oid_v.end());
86 
87         for(vector<int>::iterator iter=oid_v.begin(); iter!=oid_v.end(); ++iter)
88         {
89                 CRef<CBioseq> bioseq = seqdb->GetBioseq(*iter);
90                 scope->AddBioseq(*bioseq);
91         }
92         return;
93 }
94 
s_AddNewResultSet(CRef<CSearchResultSet> resultSet,CRef<CSearchResultSet> myResultSet)95 void s_AddNewResultSet(CRef<CSearchResultSet> resultSet, CRef<CSearchResultSet> myResultSet)
96 {
97 	for (CSearchResultSet::iterator iter=myResultSet->begin(); iter!=myResultSet->end(); ++iter)
98 	{
99 		resultSet->push_back(*iter);
100 	}
101 }
102 
s_MakeEmptyResults(CRef<IQueryFactory> qf,const CBlastOptions & opts,CRef<CLocalDbAdapter> dbAdapter,TSearchMessages & msg_vec)103 CRef<CSearchResultSet> s_MakeEmptyResults(CRef<IQueryFactory> qf,  const CBlastOptions& opts,
104 CRef<CLocalDbAdapter> dbAdapter, TSearchMessages& msg_vec)
105 {
106          // Search was not run, but we send back an empty CSearchResultSet.
107          CRef<ILocalQueryData> local_query_data = qf->MakeLocalQueryData(&opts);
108          vector< CConstRef<objects::CSeq_id> > seqid_vec;
109          vector< CRef<CBlastAncillaryData> > ancill_vec;
110          TSeqAlignVector sa_vec;
111          size_t index;
112          EResultType res_type = eDatabaseSearch;
113          unsigned int num_subjects = 0;
114          if (dbAdapter.NotEmpty() && !dbAdapter->IsBlastDb() && !dbAdapter->IsDbScanMode()) {
115         	 res_type = eSequenceComparison;
116              IBlastSeqInfoSrc *  subject_infosrc = dbAdapter->MakeSeqInfoSrc();
117              if(subject_infosrc != NULL) {
118             	 num_subjects = subject_infosrc->Size();
119              }
120          }
121          for (index=0; index<local_query_data->GetNumQueries(); index++)
122          {
123               CConstRef<objects::CSeq_id> query_id(local_query_data->GetSeq_loc(index)->GetId());
124               TQueryMessages q_msg;
125 	      /// FIXME, PROBLEM??
126               // local_query_data->GetQueryMessages(index, q_msg);
127               // msg_vec.push_back(q_msg);
128               seqid_vec.push_back(query_id);
129               CRef<objects::CSeq_align_set> tmp_align;
130               sa_vec.push_back(tmp_align);
131               pair<double, double> tmp_pair(-1.0, -1.0);
132               CRef<CBlastAncillaryData>  tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
133               ancill_vec.push_back(tmp_ancillary_data);
134 
135               for(unsigned int i =1; i < num_subjects; i++)
136               {
137             	  TQueryMessages msg;
138                   msg_vec.push_back(msg);
139                   seqid_vec.push_back(query_id);
140                   CRef<objects::CSeq_align_set> tmp_align;
141                   sa_vec.push_back(tmp_align);
142            	      CRef<CBlastAncillaryData>  tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
143            	      ancill_vec.push_back(tmp_ancillary_data);
144               }
145          }
146 	 msg_vec.resize(seqid_vec.size());  // FIXME
147          CRef<CSearchResultSet> result_set(new CSearchResultSet(seqid_vec, sa_vec, msg_vec, ancill_vec, 0, res_type));
148          return result_set;
149 }
150 
151 /////////////////////////////////////////////////////////////////////////////
152 //  Perform a KMER search then a BLAST search.
153 
Run(void)154 CRef<CSearchResultSet> CBlastKmerSearch::Run(void)
155 {
156 	CRef<CSeqDB> seqdb(new CSeqDB(m_Database->GetDatabaseName(), CSeqDB::eProtein));
157 	seqdb->SetNumberOfThreads(1, true);
158 
159 	if (m_OptsHandle->GetDbLength() == 0)
160 		m_OptsHandle->SetDbLength(seqdb->GetTotalLength());
161 
162 	CBlastpKmerOptionsHandle* kmerOptHndl = dynamic_cast<CBlastpKmerOptionsHandle*> (&*m_OptsHandle);
163 	CRef<CBlastKmerOptions> opts(new CBlastKmerOptions()); // KMER specific options.
164 	opts->SetThresh(kmerOptHndl->GetThresh());
165 	opts->SetMinHits(kmerOptHndl->GetMinHits());
166 	opts->SetNumTargetSeqs(kmerOptHndl->GetCandidateSeqs());
167 
168 	CObjMgr_QueryFactory* objmgr_qf = NULL;
169 	TSeqLocVector tsl_v;
170 	if ( (objmgr_qf = dynamic_cast<CObjMgr_QueryFactory*>(&*m_QueryFactory)) )
171     	{
172 		tsl_v = objmgr_qf->GetTSeqLocVector();
173 		_ASSERT(!tsl_v.empty());
174 	}
175 	CRef<CBlastKmer> blastkmer(new CBlastKmer(tsl_v, opts, seqdb));
176 	if (!m_GIList.Empty())
177 		blastkmer->SetGiListLimit(m_GIList);
178 	else if(!m_NegGIList.Empty())
179 		blastkmer->SetGiListLimit(m_NegGIList);
180 
181 	CRef<CBlastKmerResultsSet> resultSet = blastkmer->RunSearches();
182 
183 	//FIXME: check if all are same or use all?
184 	vector< CRef<CScope> > scope_v = objmgr_qf->ExtractScopes();
185 	s_GetSequencesIntoScope(resultSet, scope_v[0], seqdb);
186 
187 	int numSearches=resultSet->GetNumQueries();
188 	CRef<CSearchResultSet> search_results(new CSearchResultSet());
189 	for (int index=0; index<numSearches; index++)
190 	{
191 		CBlastKmerResults& results = (*resultSet)[index];
192 		if (results.HasErrors() || results.HasWarnings())
193 		{
194 			TQueryMessages msg = results.GetErrors(eBlastSevWarning);
195 			string queryId = msg.GetQueryId();
196 			for(TQueryMessages::const_iterator iter=msg.begin(); iter != msg.end(); ++iter)
197 			{
198 				cerr << queryId << " " << (*iter)->GetMessage() << '\n';
199 			}
200 		}
201 		const TBlastKmerScoreVector& scores = results.GetScores();
202 
203      		if (scores.size() > 0)
204      		{
205 			TSeqLocVector subjectTSL;
206 			results.GetTSL(subjectTSL, scope_v[0]);
207 
208 			TSeqLocVector query_vector_temp;
209 			query_vector_temp.push_back(tsl_v[index]);
210 			CRef<IQueryFactory> qfactory(new CObjMgr_QueryFactory(query_vector_temp));
211 
212 			BlastSeqSrc* seq_src = MultiSeqBlastSeqSrcInit(subjectTSL, eBlastTypeBlastp, true);
213 			CRef<IBlastSeqInfoSrc> seqinfo_src(new CSeqVecSeqInfoSrc(subjectTSL));
214 
215 			CLocalBlast lcl_blast(qfactory, CRef<CBlastOptionsHandle> (m_OptsHandle.GetPointer()), seq_src, seqinfo_src);
216 			CRef<CSearchResultSet> my_blast_results = lcl_blast.Run();
217 			s_AddNewResultSet(search_results, my_blast_results);
218 			seq_src = BlastSeqSrcFree(seq_src);
219 		}
220 		else
221 		{
222 			TSeqLocVector query_vector_temp;
223 			query_vector_temp.push_back(tsl_v[index]);
224 			CRef<IQueryFactory> qfactory(new CObjMgr_QueryFactory(query_vector_temp));
225 			TSearchMessages msg;
226 			if (results.HasErrors() || results.HasWarnings())
227 			{
228 				TQueryMessages qmsg = results.GetErrors(eBlastSevWarning);
229 				msg.push_back(qmsg);
230 			}
231 			CRef<CSearchResultSet> my_blast_results=s_MakeEmptyResults(qfactory, m_OptsHandle->GetOptions(), CRef<CLocalDbAdapter>(NULL), msg);
232 			s_AddNewResultSet(search_results, my_blast_results);
233 		}
234      }
235 
236      return search_results;
237 }
238 
239