1 /* $Id: cuSimpleB2SWrapper.cpp 341083 2011-10-17 14:21:18Z lanczyck $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Chris Lanczycki
27  *
28  * File Description:
29  *
30  *       Simplified API for a single blast-two-sequences call.
31  *       Does not involve CDs, and NOT optimized (or intended) to be called
32  *       in batch mode.  If you need to make many calls, use CdBlaster!
33  *
34  * ===========================================================================
35  */
36 
37 #include <ncbi_pch.hpp>
38 #include <vector>
39 #include <string>
40 #include <list>
41 
42 #include <objmgr/scope.hpp>
43 #include <objects/seq/Bioseq.hpp>
44 #include <objects/seqalign/Seq_align.hpp>
45 #include <objects/seqalign/Score.hpp>
46 #include <objects/seqloc/Seq_id.hpp>
47 #include <objects/seqset/Seq_entry.hpp>
48 #include <objects/general/Object_id.hpp>
49 #include <algo/blast/api/blast_prot_options.hpp>
50 #include <algo/blast/api/blast_advprot_options.hpp>
51 #include <algo/blast/api/psiblast_options.hpp>
52 #include <algo/blast/api/objmgr_query_data.hpp>
53 #include <algo/structure/cd_utils/cuCdCore.hpp>
54 #include <objects/seqalign/Dense_seg.hpp>
55 
56 #include <algo/structure/cd_utils/cuAlign.hpp>
57 #include <algo/structure/cd_utils/cuSequence.hpp>
58 #include <algo/structure/cd_utils/cuCdReadWriteASN.hpp>
59 #include <algo/structure/cd_utils/cuSimpleB2SWrapper.hpp>
60 
61 BEGIN_NCBI_SCOPE
62 USING_SCOPE(blast);
63 BEGIN_SCOPE(cd_utils)
64 
65 const unsigned int CSimpleB2SWrapper::HITLIST_SIZE_DEFAULT     = 100;
66 const unsigned int CSimpleB2SWrapper::MAX_HITLIST_SIZE         = 10000;
67 const Int8   CSimpleB2SWrapper::CDD_DATABASE_SIZE              = 1000000;
68 const double CSimpleB2SWrapper::E_VAL_DEFAULT                  = 10.0;    // default e-value threshold
69 const double CSimpleB2SWrapper::E_VAL_WHEN_NO_SEQ_ALIGN        = 1000000; // e-value when Blast doesn't return a seq-align
70 const double CSimpleB2SWrapper::SCORE_WHEN_NO_SEQ_ALIGN        = -1.0;
71 const ECompoAdjustModes CSimpleB2SWrapper::COMPOSITION_ADJ_DEF = eNoCompositionBasedStats;
72 const string CSimpleB2SWrapper::SCORING_MATRIX_DEFAULT         = BLOSUM62NAME;
73 const double CSimpleB2SWrapper::DO_NOT_USE_PERC_ID_THRESHOLD   = -1.0;
74 const Int8   CSimpleB2SWrapper::DO_NOT_USE_EFF_SEARCH_SPACE    = -1;
75 
CSimpleB2SWrapper(double percIdThold,string matrixName)76 CSimpleB2SWrapper::CSimpleB2SWrapper(double percIdThold, string matrixName)
77 {
78     InitializeToDefaults();     //  must be called before any Set...() methods
79     SetPercIdThreshold(percIdThold);
80     SetMatrixName(matrixName);
81 }
82 
CSimpleB2SWrapper(CRef<CBioseq> & seq1,CRef<CBioseq> & seq2,double percIdThold,string matrixName)83 CSimpleB2SWrapper::CSimpleB2SWrapper(CRef<CBioseq>& seq1, CRef<CBioseq>& seq2, double percIdThold, string matrixName)
84 {
85     InitializeToDefaults();    //  must be called before any Set...() methods
86     SetSeq(seq1, true, 0, 0);
87     SetSeq(seq2, false, 0, 0);
88     SetPercIdThreshold(percIdThold);
89     SetMatrixName(matrixName);
90 }
91 
InitializeToDefaults()92 void CSimpleB2SWrapper::InitializeToDefaults()
93 {
94     m_hitlistSize = HITLIST_SIZE_DEFAULT;
95     m_dbLength = CDD_DATABASE_SIZE;
96     m_eValueThold = E_VAL_DEFAULT;
97     m_effSearchSpace = DO_NOT_USE_EFF_SEARCH_SPACE;
98     m_caMode = COMPOSITION_ADJ_DEF;
99     m_scoringMatrix = SCORING_MATRIX_DEFAULT;
100 
101 
102 	m_options.Reset(new CBlastAdvancedProteinOptionsHandle);
103 
104     //  Fill object with defaults we want to set.
105     if (m_options.NotEmpty()) {
106         m_options->SetEvalueThreshold(E_VAL_DEFAULT);
107         m_options->SetHitlistSize(HITLIST_SIZE_DEFAULT);
108         m_options->SetMatrixName(SCORING_MATRIX_DEFAULT.c_str());
109         m_options->SetSegFiltering(false);
110         m_options->SetDbLength(CDD_DATABASE_SIZE);
111         m_options->SetCompositionBasedStats(COMPOSITION_ADJ_DEF);
112 
113         m_options->SetDbSeqNum(1);
114     }
115 
116 }
117 
SetSeq(CRef<CBioseq> & seq,bool isSeq1,unsigned int from,unsigned int to)118 void CSimpleB2SWrapper::SetSeq(CRef<CBioseq>& seq, bool isSeq1, unsigned int from, unsigned int to)
119 {
120     bool full = (from == 0 && to == 0);
121     unsigned int len = GetSeqLength(*seq);
122 
123     //  If invalid range, also use the full sequence.
124     if (full || from > to) {
125         full = true;
126         from = 0;
127 		to = (seq.NotEmpty()) ? len - 1 : 0;
128 //		to = (seq.NotEmpty()) ? seq->GetInst().GetLength() - 1 : 0;
129     }
130 
131     //  Clip end of range so it does not extend beyond end of the sequence
132     if (to >= len) {
133         to = len - 1;
134     }
135 
136     SB2SSeq tmp = {full, from, to, seq};
137     if (isSeq1) {
138         m_seq1 = tmp;
139     } else {
140         m_seq2 = tmp;
141     }
142 }
143 
FillOutSeqLoc(const SB2SSeq & s,CSeq_loc & seqLoc)144 bool CSimpleB2SWrapper::FillOutSeqLoc(const SB2SSeq& s, CSeq_loc& seqLoc)
145 {
146     bool result = true;
147     CSeq_interval& seqInt = seqLoc.SetInt();
148     CSeq_id& seqId = seqInt.SetId();
149     seqInt.SetFrom(s.from);
150     seqInt.SetTo(s.to);
151 
152     //  Assign the first identifier from the bioseq
153     if (s.bs.NotEmpty() && s.bs->GetId().size() > 0) {
154         seqId.Assign(*(s.bs->GetId().front()));
155     } else {
156         result = false;
157     }
158 
159     return result;
160 }
161 
162 
SetPercIdThreshold(double percIdThold)163 double CSimpleB2SWrapper::SetPercIdThreshold(double percIdThold)
164 {
165     if ((percIdThold == DO_NOT_USE_PERC_ID_THRESHOLD || (percIdThold >= 0 && percIdThold <= 100.0)) && m_options.NotEmpty()) {
166         m_percIdThold = percIdThold;
167         m_options->SetPercentIdentity(m_percIdThold);
168     }
169     return m_percIdThold;
170 }
171 
SetHitlistSize(unsigned int hitlistSize)172 unsigned int CSimpleB2SWrapper::SetHitlistSize(unsigned int hitlistSize)
173 {
174     if (hitlistSize > 0 && hitlistSize <= MAX_HITLIST_SIZE && m_options.NotEmpty()) {
175         m_hitlistSize = hitlistSize;
176         m_options->SetHitlistSize(m_hitlistSize);
177     }
178     return m_hitlistSize;
179 }
180 
SetDbLength(Int8 dbLength)181 Int8 CSimpleB2SWrapper::SetDbLength(Int8 dbLength)
182 {
183     if (dbLength > 0 && m_options.NotEmpty()) {
184         m_dbLength = dbLength;
185         m_options->SetDbLength(m_dbLength);
186     }
187     return m_dbLength;
188 }
189 
SetEffSearchSpace(Int8 effSearchSpace)190 Int8 CSimpleB2SWrapper::SetEffSearchSpace(Int8 effSearchSpace)
191 {
192     if (effSearchSpace > 0 && m_options.NotEmpty()) {
193         m_effSearchSpace = effSearchSpace;
194         m_options->SetEffectiveSearchSpace(m_effSearchSpace);
195     }
196     return m_effSearchSpace;
197 }
198 
SetCompoAdjustMode(ECompoAdjustModes caMode)199 ECompoAdjustModes CSimpleB2SWrapper::SetCompoAdjustMode(ECompoAdjustModes caMode)
200 {
201     if (m_options.NotEmpty()) {
202         m_caMode = caMode;
203         m_options->SetCompositionBasedStats(m_caMode);
204     }
205     return m_caMode;
206 }
207 
SetEValueThreshold(double eValueThold)208 double CSimpleB2SWrapper::SetEValueThreshold(double eValueThold)
209 {
210     if (eValueThold >= 0 && m_options.NotEmpty()) {
211         m_eValueThold = eValueThold;
212         m_options->SetEvalueThreshold(m_eValueThold);
213     }
214     return m_eValueThold;
215 }
216 
SetMatrixName(string matrixName)217 string CSimpleB2SWrapper::SetMatrixName(string matrixName)
218 {
219     bool isNameKnown = false;
220 
221     if (matrixName == BLOSUM62NAME || matrixName == BLOSUM45NAME || matrixName == BLOSUM80NAME ||
222         matrixName == PAM30NAME || matrixName == PAM70NAME || matrixName == PAM250NAME) {
223         isNameKnown = true;
224     }
225 
226     if (isNameKnown && m_options.NotEmpty()) {
227         m_scoringMatrix = matrixName;
228         m_options->SetMatrixName(m_scoringMatrix.c_str());
229     }
230     return m_scoringMatrix;
231 }
232 
DoBlast2Seqs()233 bool CSimpleB2SWrapper::DoBlast2Seqs()
234 {
235     if (m_options.Empty()) return false;
236 
237 /*
238 	CRef<CBlastAdvancedProteinOptionsHandle> options(new CBlastAdvancedProteinOptionsHandle);
239 	options->SetEvalueThreshold(m_eValueThold);
240 	options->SetMatrixName(m_scoringMatrix.c_str());
241 	options->SetSegFiltering(false);
242 	options->SetDbLength(CDD_DATABASE_SIZE);
243     options->SetHitlistSize(m_hitlistSize);
244 	options->SetDbSeqNum(1);
245 	options->SetCompositionBasedStats(eNoCompositionBasedStats);
246 
247     if (m_percIdThold != DO_NOT_USE_PERC_ID_THRESHOLD) options->SetPercentIdentity(m_percIdThold);
248 //    cout << "m_percIdThold = " << m_percIdThold << ";  DO_... = " << DO_NOT_USE_PERC_ID_THRESHOLD << endl;
249 */
250 
251     CSeq_loc querySeqLoc, subjectSeqLoc;
252     if (!FillOutSeqLoc(m_seq1, querySeqLoc) || !FillOutSeqLoc(m_seq2, subjectSeqLoc)) {
253         return false;
254     }
255 
256     RemoveAllDataLoaders();
257 
258     CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
259     CScope scope(*objmgr);
260     CBioseq_Handle handle1 = scope.AddBioseq(*m_seq1.bs);
261     CBioseq_Handle handle2 = scope.AddBioseq(*m_seq2.bs);
262 
263 
264     CRef<CBlastSearchQuery> bsqQuery(new CBlastSearchQuery(querySeqLoc, scope));
265     CRef<CBlastSearchQuery> bsqSubject(new CBlastSearchQuery(subjectSeqLoc, scope));
266     CBlastQueryVector queryVector, subjectVector;
267     queryVector.AddQuery(bsqQuery);
268     subjectVector.AddQuery(bsqSubject);
269 
270 
271     CRef<IQueryFactory> query(new CObjMgr_QueryFactory(queryVector));
272     CRef<IQueryFactory> subject(new CObjMgr_QueryFactory(subjectVector));
273 	CRef<CBlastProteinOptionsHandle> blastOptions((CBlastProteinOptionsHandle*)m_options.GetPointer());
274 
275 
276     //  perform the blast 2 sequences and process the results...
277     CPsiBl2Seq blaster(query, subject, blastOptions);
278     CSearchResultSet& hits = *blaster.Run();
279     int total = hits.GetNumResults();
280     for (int index=0; index<total; index++)
281        processBlastHits(hits[index]);
282 
283 //  dump the CSearchResults object...
284 //    string err;
285 //    if (total > 0) WriteASNToStream(cout, hits[0], false, &err);
286 
287     return (m_alignments.size() > 0);
288 }
289 
getBestB2SAlignment(double * score,double * eval,double * percIdent) const290 CRef<CSeq_align> CSimpleB2SWrapper::getBestB2SAlignment(double* score, double* eval, double* percIdent) const
291 {
292     if (m_alignments.size() == 0) {
293         CRef< CSeq_align > nullRef;
294         return nullRef;
295     }
296 
297     if (score && m_scores.size() > 0) {
298         *score = m_scores[0];
299     }
300     if (eval && m_evals.size() > 0) {
301         *eval = m_evals[0];
302     }
303     if (percIdent && m_percIdents.size() > 0) {
304         *percIdent = m_percIdents[0];
305     }
306 
307     return m_alignments[0];
308 }
309 
310 
getPairwiseBlastAlignment(unsigned int hitNum,CRef<CSeq_align> & seqAlign) const311 bool CSimpleB2SWrapper::getPairwiseBlastAlignment(unsigned int hitNum, CRef< CSeq_align >& seqAlign) const
312 {
313     bool result = false;
314     if (hitNum < m_alignments.size()) {
315         if (m_alignments[hitNum].NotEmpty()) {
316             seqAlign->Assign(*m_alignments[hitNum]);
317             result = true;
318         }
319     }
320     return result;
321 }
322 
getPairwiseScore(unsigned int hitNum) const323 double  CSimpleB2SWrapper::getPairwiseScore(unsigned int hitNum) const
324 {
325 	return (m_scores.size() > hitNum) ? m_scores[hitNum] : SCORE_WHEN_NO_SEQ_ALIGN;
326 }
327 
getPairwiseEValue(unsigned int hitNum) const328 double  CSimpleB2SWrapper::getPairwiseEValue(unsigned int hitNum) const
329 {
330 	return (m_evals.size() > hitNum) ? m_evals[hitNum] : E_VAL_WHEN_NO_SEQ_ALIGN;
331 }
332 
getPairwisePercIdent(unsigned int hitNum) const333 double  CSimpleB2SWrapper::getPairwisePercIdent(unsigned int hitNum) const
334 {
335 	return (m_percIdents.size() > hitNum) ? m_percIdents[hitNum] : DO_NOT_USE_PERC_ID_THRESHOLD;
336 }
337 
processBlastHits(CSearchResults & hits)338 void CSimpleB2SWrapper::processBlastHits(CSearchResults& hits)
339 {
340     unsigned int len1 = (m_seq1.to >= m_seq1.from) ? m_seq1.to - m_seq1.from + 1 : 0;
341     double invLen1 = (len1) ? 1.0/(double) len1 : 0;
342 	const list< CRef< CSeq_align > >& seqAlignList = hits.GetSeqAlign()->Get();
343 
344 //    unsigned int len2 = (m_seq2.to >= m_seq2.from) ? m_seq2.to - m_seq2.from + 1 : 0;
345 //    cout << "Processing " << seqAlignList.size() << " blast hits (len1 = " << len1 << ", len2 = " << len2 << ").\n";
346 
347     m_scores.clear();
348     m_evals.clear();
349     m_percIdents.clear();
350     m_alignments.clear();
351 
352     if (seqAlignList.size() == 0) return;
353 
354     int nIdent = 0;
355 	double score = 0.0, eVal = kMax_Double, percIdent = 0.0;
356 	CRef< CSeq_align > sa = ExtractFirstSeqAlign(*(seqAlignList.begin()));
357 	if (!sa.Empty())
358 	{
359 		sa->GetNamedScore(CSeq_align::eScore_Score, score);
360 		sa->GetNamedScore(CSeq_align::eScore_EValue, eVal);
361         if (sa->GetNamedScore(CSeq_align::eScore_IdentityCount, nIdent)) {
362             percIdent = 100.0 * invLen1 * (double) nIdent;
363 //            cout << "nIdent = " << nIdent << "; percIdent = " << percIdent << endl;
364 //        } else {
365 //            cout << "????  Didn't find identity count\n";
366         }
367 
368 
369 //        if (!sa->GetNamedScore(CSeq_align::eScore_PercentIdentity, percIdent))
370 //            cout << "????  Didn't find percent identity\n";
371 //        cout << "saving values:   score = " <<score << "; eval = " << eVal << "; id% = " << percIdent << endl;
372 
373         m_scores.push_back(score);
374         m_evals.push_back(eVal);
375         m_percIdents.push_back(percIdent);
376         m_alignments.push_back(sa);
377 	}
378 
379 }
380 
381 /*
382 bool CSimpleB2SWrapper::DoBlast2Seqs_OMFree()
383 {
384     if (m_options.Empty()) return false;
385 
386 //	CRef<CBlastAdvancedProteinOptionsHandle> options(new CBlastAdvancedProteinOptionsHandle);
387 //	options->SetEvalueThreshold(m_eValueThold);
388 //	options->SetMatrixName(m_scoringMatrix.c_str());
389 //	options->SetSegFiltering(false);
390 //	options->SetDbLength(CDD_DATABASE_SIZE);
391 //    options->SetHitlistSize(m_hitlistSize);
392 //	options->SetDbSeqNum(1);
393 //	options->SetCompositionBasedStats(eNoCompositionBasedStats);
394 
395 //    if (m_percIdThold != DO_NOT_USE_PERC_ID_THRESHOLD) options->SetPercentIdentity(m_percIdThold);
396 //    cout << "m_percIdThold = " << m_percIdThold << ";  DO_... = " << DO_NOT_USE_PERC_ID_THRESHOLD << endl;
397 
398 	CRef<CBlastProteinOptionsHandle> blastOptions((CBlastProteinOptionsHandle*)m_options.GetPointer());
399 
400     //  construct 'query' from m_seq1
401     CRef< CBioseq > queryBioseq = m_seq1.bs;
402     CRef<IQueryFactory> query(new CObjMgrFree_QueryFactory(queryBioseq));
403 
404 
405     //  construct 'subject' from m_seq2
406 	CRef< CBioseq_set > bioseqset(new CBioseq_set);
407 	list< CRef< CSeq_entry > >& seqEntryList = bioseqset->SetSeq_set();
408     CRef< CSeq_entry > seqEntry(new CSeq_entry);
409     seqEntry->SetSeq(*m_seq2.bs);
410     seqEntryList.push_back(seqEntry);
411     CRef<IQueryFactory> subject(new CObjMgrFree_QueryFactory(bioseqset));
412 
413     //  perform the blast 2 sequences and process the results...
414     CPsiBl2Seq blaster(query,subject,blastOptions);
415     CSearchResultSet& hits = *blaster.Run();
416     int total = hits.GetNumResults();
417     for (int index=0; index<total; index++)
418        processBlastHits_OMFree(hits[index]);
419 
420 //  dump the CSearchResults object...
421 //    string err;
422 //    if (total > 0) WriteASNToStream(cout, hits[0], false, &err);
423 
424     return (m_alignments.size() > 0);
425 }
426 
427 void CSimpleB2SWrapper::processBlastHits_OMFree(CSearchResults& hits)
428 {
429     unsigned int len1 = (m_seq1.to >= m_seq1.from) ? m_seq1.to - m_seq1.from + 1 : 0;
430     unsigned int len2 = (m_seq2.to >= m_seq2.from) ? m_seq2.to - m_seq2.from + 1 : 0;
431     double invLen1 = (len1) ? 1.0/(double) len1 : 0;
432 	const list< CRef< CSeq_align > >& seqAlignList = hits.GetSeqAlign()->Get();
433 
434     cout << "Processing " << seqAlignList.size() << " blast hits (len1 = " << len1 << ", len2 = " << len2 << ").\n";
435 
436     m_scores.clear();
437     m_evals.clear();
438     m_percIdents.clear();
439     m_alignments.clear();
440 
441     if (seqAlignList.size() == 0) return;
442 
443     int nIdent = 0;
444 	double score = 0.0, eVal = kMax_Double, percIdent = 0.0;
445 	CRef< CSeq_align > sa = ExtractFirstSeqAlign(*(seqAlignList.begin()));
446 	if (!sa.Empty())
447 	{
448 		sa->GetNamedScore(CSeq_align::eScore_Score, score);
449 		sa->GetNamedScore(CSeq_align::eScore_EValue, eVal);
450         if (sa->GetNamedScore(CSeq_align::eScore_IdentityCount, nIdent)) {
451             percIdent = 100.0 * invLen1 * (double) nIdent;
452 //            cout << "nIdent = " << nIdent << "; percIdent = " << percIdent << endl;
453 //        } else {
454 //            cout << "????  Didn't find identity count\n";
455         }
456 
457 
458 //        if (!sa->GetNamedScore(CSeq_align::eScore_PercentIdentity, percIdent))
459 //            cout << "????  Didn't find percent identity\n";
460 //        cout << "saving values:   score = " <<score << "; eval = " << eVal << "; id% = " << percIdent << endl;
461 
462         m_scores.push_back(score);
463         m_evals.push_back(eVal);
464         m_percIdents.push_back(percIdent);
465         m_alignments.push_back(sa);
466 	}
467 }
468 */
469 
470 END_SCOPE(cd_utils)
471 END_NCBI_SCOPE
472