1 /* $Id: cuSimpleB2SWrapper.cpp 341083 2011-10-17 14:21:18Z lanczyck $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Chris Lanczycki
27 *
28 * File Description:
29 *
30 * Simplified API for a single blast-two-sequences call.
31 * Does not involve CDs, and NOT optimized (or intended) to be called
32 * in batch mode. If you need to make many calls, use CdBlaster!
33 *
34 * ===========================================================================
35 */
36
37 #include <ncbi_pch.hpp>
38 #include <vector>
39 #include <string>
40 #include <list>
41
42 #include <objmgr/scope.hpp>
43 #include <objects/seq/Bioseq.hpp>
44 #include <objects/seqalign/Seq_align.hpp>
45 #include <objects/seqalign/Score.hpp>
46 #include <objects/seqloc/Seq_id.hpp>
47 #include <objects/seqset/Seq_entry.hpp>
48 #include <objects/general/Object_id.hpp>
49 #include <algo/blast/api/blast_prot_options.hpp>
50 #include <algo/blast/api/blast_advprot_options.hpp>
51 #include <algo/blast/api/psiblast_options.hpp>
52 #include <algo/blast/api/objmgr_query_data.hpp>
53 #include <algo/structure/cd_utils/cuCdCore.hpp>
54 #include <objects/seqalign/Dense_seg.hpp>
55
56 #include <algo/structure/cd_utils/cuAlign.hpp>
57 #include <algo/structure/cd_utils/cuSequence.hpp>
58 #include <algo/structure/cd_utils/cuCdReadWriteASN.hpp>
59 #include <algo/structure/cd_utils/cuSimpleB2SWrapper.hpp>
60
61 BEGIN_NCBI_SCOPE
62 USING_SCOPE(blast);
63 BEGIN_SCOPE(cd_utils)
64
65 const unsigned int CSimpleB2SWrapper::HITLIST_SIZE_DEFAULT = 100;
66 const unsigned int CSimpleB2SWrapper::MAX_HITLIST_SIZE = 10000;
67 const Int8 CSimpleB2SWrapper::CDD_DATABASE_SIZE = 1000000;
68 const double CSimpleB2SWrapper::E_VAL_DEFAULT = 10.0; // default e-value threshold
69 const double CSimpleB2SWrapper::E_VAL_WHEN_NO_SEQ_ALIGN = 1000000; // e-value when Blast doesn't return a seq-align
70 const double CSimpleB2SWrapper::SCORE_WHEN_NO_SEQ_ALIGN = -1.0;
71 const ECompoAdjustModes CSimpleB2SWrapper::COMPOSITION_ADJ_DEF = eNoCompositionBasedStats;
72 const string CSimpleB2SWrapper::SCORING_MATRIX_DEFAULT = BLOSUM62NAME;
73 const double CSimpleB2SWrapper::DO_NOT_USE_PERC_ID_THRESHOLD = -1.0;
74 const Int8 CSimpleB2SWrapper::DO_NOT_USE_EFF_SEARCH_SPACE = -1;
75
CSimpleB2SWrapper(double percIdThold,string matrixName)76 CSimpleB2SWrapper::CSimpleB2SWrapper(double percIdThold, string matrixName)
77 {
78 InitializeToDefaults(); // must be called before any Set...() methods
79 SetPercIdThreshold(percIdThold);
80 SetMatrixName(matrixName);
81 }
82
CSimpleB2SWrapper(CRef<CBioseq> & seq1,CRef<CBioseq> & seq2,double percIdThold,string matrixName)83 CSimpleB2SWrapper::CSimpleB2SWrapper(CRef<CBioseq>& seq1, CRef<CBioseq>& seq2, double percIdThold, string matrixName)
84 {
85 InitializeToDefaults(); // must be called before any Set...() methods
86 SetSeq(seq1, true, 0, 0);
87 SetSeq(seq2, false, 0, 0);
88 SetPercIdThreshold(percIdThold);
89 SetMatrixName(matrixName);
90 }
91
InitializeToDefaults()92 void CSimpleB2SWrapper::InitializeToDefaults()
93 {
94 m_hitlistSize = HITLIST_SIZE_DEFAULT;
95 m_dbLength = CDD_DATABASE_SIZE;
96 m_eValueThold = E_VAL_DEFAULT;
97 m_effSearchSpace = DO_NOT_USE_EFF_SEARCH_SPACE;
98 m_caMode = COMPOSITION_ADJ_DEF;
99 m_scoringMatrix = SCORING_MATRIX_DEFAULT;
100
101
102 m_options.Reset(new CBlastAdvancedProteinOptionsHandle);
103
104 // Fill object with defaults we want to set.
105 if (m_options.NotEmpty()) {
106 m_options->SetEvalueThreshold(E_VAL_DEFAULT);
107 m_options->SetHitlistSize(HITLIST_SIZE_DEFAULT);
108 m_options->SetMatrixName(SCORING_MATRIX_DEFAULT.c_str());
109 m_options->SetSegFiltering(false);
110 m_options->SetDbLength(CDD_DATABASE_SIZE);
111 m_options->SetCompositionBasedStats(COMPOSITION_ADJ_DEF);
112
113 m_options->SetDbSeqNum(1);
114 }
115
116 }
117
SetSeq(CRef<CBioseq> & seq,bool isSeq1,unsigned int from,unsigned int to)118 void CSimpleB2SWrapper::SetSeq(CRef<CBioseq>& seq, bool isSeq1, unsigned int from, unsigned int to)
119 {
120 bool full = (from == 0 && to == 0);
121 unsigned int len = GetSeqLength(*seq);
122
123 // If invalid range, also use the full sequence.
124 if (full || from > to) {
125 full = true;
126 from = 0;
127 to = (seq.NotEmpty()) ? len - 1 : 0;
128 // to = (seq.NotEmpty()) ? seq->GetInst().GetLength() - 1 : 0;
129 }
130
131 // Clip end of range so it does not extend beyond end of the sequence
132 if (to >= len) {
133 to = len - 1;
134 }
135
136 SB2SSeq tmp = {full, from, to, seq};
137 if (isSeq1) {
138 m_seq1 = tmp;
139 } else {
140 m_seq2 = tmp;
141 }
142 }
143
FillOutSeqLoc(const SB2SSeq & s,CSeq_loc & seqLoc)144 bool CSimpleB2SWrapper::FillOutSeqLoc(const SB2SSeq& s, CSeq_loc& seqLoc)
145 {
146 bool result = true;
147 CSeq_interval& seqInt = seqLoc.SetInt();
148 CSeq_id& seqId = seqInt.SetId();
149 seqInt.SetFrom(s.from);
150 seqInt.SetTo(s.to);
151
152 // Assign the first identifier from the bioseq
153 if (s.bs.NotEmpty() && s.bs->GetId().size() > 0) {
154 seqId.Assign(*(s.bs->GetId().front()));
155 } else {
156 result = false;
157 }
158
159 return result;
160 }
161
162
SetPercIdThreshold(double percIdThold)163 double CSimpleB2SWrapper::SetPercIdThreshold(double percIdThold)
164 {
165 if ((percIdThold == DO_NOT_USE_PERC_ID_THRESHOLD || (percIdThold >= 0 && percIdThold <= 100.0)) && m_options.NotEmpty()) {
166 m_percIdThold = percIdThold;
167 m_options->SetPercentIdentity(m_percIdThold);
168 }
169 return m_percIdThold;
170 }
171
SetHitlistSize(unsigned int hitlistSize)172 unsigned int CSimpleB2SWrapper::SetHitlistSize(unsigned int hitlistSize)
173 {
174 if (hitlistSize > 0 && hitlistSize <= MAX_HITLIST_SIZE && m_options.NotEmpty()) {
175 m_hitlistSize = hitlistSize;
176 m_options->SetHitlistSize(m_hitlistSize);
177 }
178 return m_hitlistSize;
179 }
180
SetDbLength(Int8 dbLength)181 Int8 CSimpleB2SWrapper::SetDbLength(Int8 dbLength)
182 {
183 if (dbLength > 0 && m_options.NotEmpty()) {
184 m_dbLength = dbLength;
185 m_options->SetDbLength(m_dbLength);
186 }
187 return m_dbLength;
188 }
189
SetEffSearchSpace(Int8 effSearchSpace)190 Int8 CSimpleB2SWrapper::SetEffSearchSpace(Int8 effSearchSpace)
191 {
192 if (effSearchSpace > 0 && m_options.NotEmpty()) {
193 m_effSearchSpace = effSearchSpace;
194 m_options->SetEffectiveSearchSpace(m_effSearchSpace);
195 }
196 return m_effSearchSpace;
197 }
198
SetCompoAdjustMode(ECompoAdjustModes caMode)199 ECompoAdjustModes CSimpleB2SWrapper::SetCompoAdjustMode(ECompoAdjustModes caMode)
200 {
201 if (m_options.NotEmpty()) {
202 m_caMode = caMode;
203 m_options->SetCompositionBasedStats(m_caMode);
204 }
205 return m_caMode;
206 }
207
SetEValueThreshold(double eValueThold)208 double CSimpleB2SWrapper::SetEValueThreshold(double eValueThold)
209 {
210 if (eValueThold >= 0 && m_options.NotEmpty()) {
211 m_eValueThold = eValueThold;
212 m_options->SetEvalueThreshold(m_eValueThold);
213 }
214 return m_eValueThold;
215 }
216
SetMatrixName(string matrixName)217 string CSimpleB2SWrapper::SetMatrixName(string matrixName)
218 {
219 bool isNameKnown = false;
220
221 if (matrixName == BLOSUM62NAME || matrixName == BLOSUM45NAME || matrixName == BLOSUM80NAME ||
222 matrixName == PAM30NAME || matrixName == PAM70NAME || matrixName == PAM250NAME) {
223 isNameKnown = true;
224 }
225
226 if (isNameKnown && m_options.NotEmpty()) {
227 m_scoringMatrix = matrixName;
228 m_options->SetMatrixName(m_scoringMatrix.c_str());
229 }
230 return m_scoringMatrix;
231 }
232
DoBlast2Seqs()233 bool CSimpleB2SWrapper::DoBlast2Seqs()
234 {
235 if (m_options.Empty()) return false;
236
237 /*
238 CRef<CBlastAdvancedProteinOptionsHandle> options(new CBlastAdvancedProteinOptionsHandle);
239 options->SetEvalueThreshold(m_eValueThold);
240 options->SetMatrixName(m_scoringMatrix.c_str());
241 options->SetSegFiltering(false);
242 options->SetDbLength(CDD_DATABASE_SIZE);
243 options->SetHitlistSize(m_hitlistSize);
244 options->SetDbSeqNum(1);
245 options->SetCompositionBasedStats(eNoCompositionBasedStats);
246
247 if (m_percIdThold != DO_NOT_USE_PERC_ID_THRESHOLD) options->SetPercentIdentity(m_percIdThold);
248 // cout << "m_percIdThold = " << m_percIdThold << "; DO_... = " << DO_NOT_USE_PERC_ID_THRESHOLD << endl;
249 */
250
251 CSeq_loc querySeqLoc, subjectSeqLoc;
252 if (!FillOutSeqLoc(m_seq1, querySeqLoc) || !FillOutSeqLoc(m_seq2, subjectSeqLoc)) {
253 return false;
254 }
255
256 RemoveAllDataLoaders();
257
258 CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
259 CScope scope(*objmgr);
260 CBioseq_Handle handle1 = scope.AddBioseq(*m_seq1.bs);
261 CBioseq_Handle handle2 = scope.AddBioseq(*m_seq2.bs);
262
263
264 CRef<CBlastSearchQuery> bsqQuery(new CBlastSearchQuery(querySeqLoc, scope));
265 CRef<CBlastSearchQuery> bsqSubject(new CBlastSearchQuery(subjectSeqLoc, scope));
266 CBlastQueryVector queryVector, subjectVector;
267 queryVector.AddQuery(bsqQuery);
268 subjectVector.AddQuery(bsqSubject);
269
270
271 CRef<IQueryFactory> query(new CObjMgr_QueryFactory(queryVector));
272 CRef<IQueryFactory> subject(new CObjMgr_QueryFactory(subjectVector));
273 CRef<CBlastProteinOptionsHandle> blastOptions((CBlastProteinOptionsHandle*)m_options.GetPointer());
274
275
276 // perform the blast 2 sequences and process the results...
277 CPsiBl2Seq blaster(query, subject, blastOptions);
278 CSearchResultSet& hits = *blaster.Run();
279 int total = hits.GetNumResults();
280 for (int index=0; index<total; index++)
281 processBlastHits(hits[index]);
282
283 // dump the CSearchResults object...
284 // string err;
285 // if (total > 0) WriteASNToStream(cout, hits[0], false, &err);
286
287 return (m_alignments.size() > 0);
288 }
289
getBestB2SAlignment(double * score,double * eval,double * percIdent) const290 CRef<CSeq_align> CSimpleB2SWrapper::getBestB2SAlignment(double* score, double* eval, double* percIdent) const
291 {
292 if (m_alignments.size() == 0) {
293 CRef< CSeq_align > nullRef;
294 return nullRef;
295 }
296
297 if (score && m_scores.size() > 0) {
298 *score = m_scores[0];
299 }
300 if (eval && m_evals.size() > 0) {
301 *eval = m_evals[0];
302 }
303 if (percIdent && m_percIdents.size() > 0) {
304 *percIdent = m_percIdents[0];
305 }
306
307 return m_alignments[0];
308 }
309
310
getPairwiseBlastAlignment(unsigned int hitNum,CRef<CSeq_align> & seqAlign) const311 bool CSimpleB2SWrapper::getPairwiseBlastAlignment(unsigned int hitNum, CRef< CSeq_align >& seqAlign) const
312 {
313 bool result = false;
314 if (hitNum < m_alignments.size()) {
315 if (m_alignments[hitNum].NotEmpty()) {
316 seqAlign->Assign(*m_alignments[hitNum]);
317 result = true;
318 }
319 }
320 return result;
321 }
322
getPairwiseScore(unsigned int hitNum) const323 double CSimpleB2SWrapper::getPairwiseScore(unsigned int hitNum) const
324 {
325 return (m_scores.size() > hitNum) ? m_scores[hitNum] : SCORE_WHEN_NO_SEQ_ALIGN;
326 }
327
getPairwiseEValue(unsigned int hitNum) const328 double CSimpleB2SWrapper::getPairwiseEValue(unsigned int hitNum) const
329 {
330 return (m_evals.size() > hitNum) ? m_evals[hitNum] : E_VAL_WHEN_NO_SEQ_ALIGN;
331 }
332
getPairwisePercIdent(unsigned int hitNum) const333 double CSimpleB2SWrapper::getPairwisePercIdent(unsigned int hitNum) const
334 {
335 return (m_percIdents.size() > hitNum) ? m_percIdents[hitNum] : DO_NOT_USE_PERC_ID_THRESHOLD;
336 }
337
processBlastHits(CSearchResults & hits)338 void CSimpleB2SWrapper::processBlastHits(CSearchResults& hits)
339 {
340 unsigned int len1 = (m_seq1.to >= m_seq1.from) ? m_seq1.to - m_seq1.from + 1 : 0;
341 double invLen1 = (len1) ? 1.0/(double) len1 : 0;
342 const list< CRef< CSeq_align > >& seqAlignList = hits.GetSeqAlign()->Get();
343
344 // unsigned int len2 = (m_seq2.to >= m_seq2.from) ? m_seq2.to - m_seq2.from + 1 : 0;
345 // cout << "Processing " << seqAlignList.size() << " blast hits (len1 = " << len1 << ", len2 = " << len2 << ").\n";
346
347 m_scores.clear();
348 m_evals.clear();
349 m_percIdents.clear();
350 m_alignments.clear();
351
352 if (seqAlignList.size() == 0) return;
353
354 int nIdent = 0;
355 double score = 0.0, eVal = kMax_Double, percIdent = 0.0;
356 CRef< CSeq_align > sa = ExtractFirstSeqAlign(*(seqAlignList.begin()));
357 if (!sa.Empty())
358 {
359 sa->GetNamedScore(CSeq_align::eScore_Score, score);
360 sa->GetNamedScore(CSeq_align::eScore_EValue, eVal);
361 if (sa->GetNamedScore(CSeq_align::eScore_IdentityCount, nIdent)) {
362 percIdent = 100.0 * invLen1 * (double) nIdent;
363 // cout << "nIdent = " << nIdent << "; percIdent = " << percIdent << endl;
364 // } else {
365 // cout << "???? Didn't find identity count\n";
366 }
367
368
369 // if (!sa->GetNamedScore(CSeq_align::eScore_PercentIdentity, percIdent))
370 // cout << "???? Didn't find percent identity\n";
371 // cout << "saving values: score = " <<score << "; eval = " << eVal << "; id% = " << percIdent << endl;
372
373 m_scores.push_back(score);
374 m_evals.push_back(eVal);
375 m_percIdents.push_back(percIdent);
376 m_alignments.push_back(sa);
377 }
378
379 }
380
381 /*
382 bool CSimpleB2SWrapper::DoBlast2Seqs_OMFree()
383 {
384 if (m_options.Empty()) return false;
385
386 // CRef<CBlastAdvancedProteinOptionsHandle> options(new CBlastAdvancedProteinOptionsHandle);
387 // options->SetEvalueThreshold(m_eValueThold);
388 // options->SetMatrixName(m_scoringMatrix.c_str());
389 // options->SetSegFiltering(false);
390 // options->SetDbLength(CDD_DATABASE_SIZE);
391 // options->SetHitlistSize(m_hitlistSize);
392 // options->SetDbSeqNum(1);
393 // options->SetCompositionBasedStats(eNoCompositionBasedStats);
394
395 // if (m_percIdThold != DO_NOT_USE_PERC_ID_THRESHOLD) options->SetPercentIdentity(m_percIdThold);
396 // cout << "m_percIdThold = " << m_percIdThold << "; DO_... = " << DO_NOT_USE_PERC_ID_THRESHOLD << endl;
397
398 CRef<CBlastProteinOptionsHandle> blastOptions((CBlastProteinOptionsHandle*)m_options.GetPointer());
399
400 // construct 'query' from m_seq1
401 CRef< CBioseq > queryBioseq = m_seq1.bs;
402 CRef<IQueryFactory> query(new CObjMgrFree_QueryFactory(queryBioseq));
403
404
405 // construct 'subject' from m_seq2
406 CRef< CBioseq_set > bioseqset(new CBioseq_set);
407 list< CRef< CSeq_entry > >& seqEntryList = bioseqset->SetSeq_set();
408 CRef< CSeq_entry > seqEntry(new CSeq_entry);
409 seqEntry->SetSeq(*m_seq2.bs);
410 seqEntryList.push_back(seqEntry);
411 CRef<IQueryFactory> subject(new CObjMgrFree_QueryFactory(bioseqset));
412
413 // perform the blast 2 sequences and process the results...
414 CPsiBl2Seq blaster(query,subject,blastOptions);
415 CSearchResultSet& hits = *blaster.Run();
416 int total = hits.GetNumResults();
417 for (int index=0; index<total; index++)
418 processBlastHits_OMFree(hits[index]);
419
420 // dump the CSearchResults object...
421 // string err;
422 // if (total > 0) WriteASNToStream(cout, hits[0], false, &err);
423
424 return (m_alignments.size() > 0);
425 }
426
427 void CSimpleB2SWrapper::processBlastHits_OMFree(CSearchResults& hits)
428 {
429 unsigned int len1 = (m_seq1.to >= m_seq1.from) ? m_seq1.to - m_seq1.from + 1 : 0;
430 unsigned int len2 = (m_seq2.to >= m_seq2.from) ? m_seq2.to - m_seq2.from + 1 : 0;
431 double invLen1 = (len1) ? 1.0/(double) len1 : 0;
432 const list< CRef< CSeq_align > >& seqAlignList = hits.GetSeqAlign()->Get();
433
434 cout << "Processing " << seqAlignList.size() << " blast hits (len1 = " << len1 << ", len2 = " << len2 << ").\n";
435
436 m_scores.clear();
437 m_evals.clear();
438 m_percIdents.clear();
439 m_alignments.clear();
440
441 if (seqAlignList.size() == 0) return;
442
443 int nIdent = 0;
444 double score = 0.0, eVal = kMax_Double, percIdent = 0.0;
445 CRef< CSeq_align > sa = ExtractFirstSeqAlign(*(seqAlignList.begin()));
446 if (!sa.Empty())
447 {
448 sa->GetNamedScore(CSeq_align::eScore_Score, score);
449 sa->GetNamedScore(CSeq_align::eScore_EValue, eVal);
450 if (sa->GetNamedScore(CSeq_align::eScore_IdentityCount, nIdent)) {
451 percIdent = 100.0 * invLen1 * (double) nIdent;
452 // cout << "nIdent = " << nIdent << "; percIdent = " << percIdent << endl;
453 // } else {
454 // cout << "???? Didn't find identity count\n";
455 }
456
457
458 // if (!sa->GetNamedScore(CSeq_align::eScore_PercentIdentity, percIdent))
459 // cout << "???? Didn't find percent identity\n";
460 // cout << "saving values: score = " <<score << "; eval = " << eVal << "; id% = " << percIdent << endl;
461
462 m_scores.push_back(score);
463 m_evals.push_back(eVal);
464 m_percIdents.push_back(percIdent);
465 m_alignments.push_back(sa);
466 }
467 }
468 */
469
470 END_SCOPE(cd_utils)
471 END_NCBI_SCOPE
472