1 static char const rcsid[] = "$Id: blast.cpp 348556 2011-12-29 23:11:30Z boratyng $";
2
3 /*
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's offical duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================*/
27
28 /*****************************************************************************
29
30 File name: blast.cpp
31
32 Author: Jason Papadopoulos
33
34 Contents: Find local alignments between sequences
35
36 ******************************************************************************/
37
38 #include <ncbi_pch.hpp>
39 #include <util/range_coll.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include <objects/seqloc/Seq_loc.hpp>
42 #include <objects/seqloc/Seq_interval.hpp>
43 #include <objects/seqalign/Seq_align_set.hpp>
44 #include <objects/general/Object_id.hpp>
45 #include <objects/seqalign/Score.hpp>
46 #include <algo/blast/api/blast_prot_options.hpp>
47 #include <algo/blast/api/bl2seq.hpp>
48 #include <algo/cobalt/cobalt.hpp>
49
50 /// @file blast.cpp
51 /// Find local alignments between sequences
52
53 BEGIN_NCBI_SCOPE
54 BEGIN_SCOPE(cobalt)
55
56 USING_SCOPE(blast);
57 USING_SCOPE(objects);
58
59 /// Create a new query sequence that is a subset of a previous
60 /// query sequence
61 /// @param loc_list List of previously generated sequence fragments [in/out]
62 /// @param query Sequence that contains the current fragment [in]
63 /// @param from Start offset of fragment [in]
64 /// @param to End offset of fragment [in]
65 /// @param seg_list List of simplified representations of
66 /// previous fragments [in/out]
67 /// @param query_index Ordinal ID of 'query'
68 ///
69 void
x_AddNewSegment(vector<CRef<objects::CSeq_loc>> & loc_list,const CRef<objects::CSeq_loc> & query,TOffset from,TOffset to,vector<SSegmentLoc> & seg_list,int query_index)70 CMultiAligner::x_AddNewSegment(vector< CRef<objects::CSeq_loc> >& loc_list,
71 const CRef<objects::CSeq_loc>& query,
72 TOffset from, TOffset to,
73 vector<SSegmentLoc>& seg_list,
74 int query_index)
75 {
76 // Note that all offsets are zero-based
77
78 CRef<CSeq_loc> seqloc(new CSeq_loc());
79 seqloc->SetInt().SetFrom(from);
80 seqloc->SetInt().SetTo(to);
81 seqloc->SetInt().SetStrand(eNa_strand_unknown);
82 seqloc->SetInt().SetId().Assign(sequence::GetId(*query, m_Scope));
83 loc_list.push_back(seqloc);
84
85 seg_list.push_back(SSegmentLoc(query_index, from, to));
86 }
87
88 /// Turn all fragments of selected query sequence not already covered by
89 /// a domain hit into a separate query sequence, used as input
90 /// to a blast search
91 /// @param blastp_indices Indices of query sequences selected for blastp
92 /// search [in]
93 /// @param filler_locs List of generated sequences [out]
94 /// @param filler_segs Simplified representation of filler_locs [out]
95 ///
96 void
x_MakeFillerBlocks(const vector<int> & blastp_indices,vector<CRef<objects::CSeq_loc>> & filler_locs,vector<SSegmentLoc> & filler_segs)97 CMultiAligner::x_MakeFillerBlocks(const vector<int>& blastp_indices,
98 vector< CRef<objects::CSeq_loc> >& filler_locs,
99 vector<SSegmentLoc>& filler_segs)
100 {
101 int num_queries = m_QueryData.size();
102 vector<CRangeCollection<TOffset> > sorted_segs(num_queries);
103
104 // Merge the offset ranges of all the current domain hits
105
106 for (int i = 0; i < m_CombinedHits.Size(); i++) {
107 CHit *hit = m_CombinedHits.GetHit(i);
108 _ASSERT(hit->HasSubHits());
109
110 ITERATE(CHit::TSubHit, subitr, hit->GetSubHit()) {
111 CHit *subhit = *subitr;
112 sorted_segs[hit->m_SeqIndex1].CombineWith(
113 static_cast<CRange<TOffset> >(subhit->m_SeqRange1));
114 sorted_segs[hit->m_SeqIndex2].CombineWith(
115 static_cast<CRange<TOffset> >(subhit->m_SeqRange2));
116 }
117 }
118
119 // For each query sequence, mark off the regions
120 // not covered by a domain hit
121
122 ITERATE(vector<int>, it, blastp_indices) {
123 int i = *it;
124
125 CRangeCollection<TOffset>& collection(sorted_segs[i]);
126 TOffset seg_start = 0;
127
128 // Note that fragments of size less than kMinHitSize
129 // are ignored
130
131 ITERATE(CRangeCollection<TOffset>, itr, collection) {
132 if (itr->GetFrom() - seg_start > CHit::kMinHitSize) {
133 x_AddNewSegment(filler_locs, m_tQueries[i], seg_start,
134 itr->GetFrom() - 1, filler_segs, i);
135 }
136 seg_start = itr->GetToOpen();
137 }
138
139 // Handle the last fragment; this could actually
140 // envelop the entire sequence
141
142 int seq_length = sequence::GetLength(*m_tQueries[i], m_Scope);
143
144 if (seq_length - seg_start > CHit::kMinHitSize) {
145 x_AddNewSegment(filler_locs, m_tQueries[i], seg_start,
146 seq_length - 1, filler_segs, i);
147 }
148 }
149
150 if (m_Options->GetVerbose()) {
151 printf("Filler Segments:\n");
152 for (int i = 0; i < (int)filler_segs.size(); i++) {
153 printf("query %d %4d - %4d\n",
154 filler_segs[i].seq_index,
155 filler_segs[i].GetFrom(),
156 filler_segs[i].GetTo());
157 }
158 printf("\n\n");
159 }
160 }
161
162 /// Run blastp, aligning the collection of filler fragments
163 /// against the entire input dataset
164 /// @param queries List of queries selected for blastp alignment [in]
165 /// @param indices List of indices of each selected query in the queries
166 /// array [in]
167 /// @param filler_locs List of generated sequences [in]
168 /// @param filler_segs Simplified representation of filler_locs [in]
169 ///
170 void
x_AlignFillerBlocks(const TSeqLocVector & queries,const vector<int> & indices,vector<CRef<CSeq_loc>> & filler_locs,vector<SSegmentLoc> & filler_segs)171 CMultiAligner::x_AlignFillerBlocks(const TSeqLocVector& queries,
172 const vector<int>& indices,
173 vector< CRef<CSeq_loc> >& filler_locs,
174 vector<SSegmentLoc>& filler_segs)
175 {
176 const int kBlastBatchSize = 10000;
177 int num_full_queries = indices.size();
178
179 if (filler_locs.empty())
180 return;
181
182 // Set the blast options
183
184 double blastp_evalue = m_Options->GetBlastpEvalue();
185 CRef<CBlastProteinOptionsHandle> blastp_opts(new CBlastProteinOptionsHandle);
186 // deliberately set the cutoff e-value too high
187 blastp_opts->SetEvalueThreshold(max(blastp_evalue, 10.0));
188 //blastp_opts.SetGappedMode(false);
189 blastp_opts->SetSegFiltering(false);
190
191 // use blast on one batch of filler segments at a time
192
193 int batch_start = 0;
194 while (batch_start < (int)filler_locs.size()) {
195
196 TSeqLocVector curr_batch;
197 int batch_size = 0;
198
199 for (int i = batch_start; i < (int)filler_locs.size(); i++) {
200 const CSeq_loc& curr_loc = *filler_locs[i];
201 int fragment_size = curr_loc.GetInt().GetTo() -
202 curr_loc.GetInt().GetFrom() + 1;
203 if (batch_size + fragment_size >= kBlastBatchSize && batch_size > 0)
204 break;
205
206 curr_batch.push_back(SSeqLoc(*filler_locs[i], *m_Scope));
207 batch_size += fragment_size;
208 }
209
210 CBl2Seq blaster(curr_batch, queries, *blastp_opts);
211 TSeqAlignVector v = blaster.Run();
212
213 // check for interrupt
214 if (m_Interrupt && (*m_Interrupt)(&m_ProgressMonitor)) {
215 NCBI_THROW(CMultiAlignerException, eInterrupt,
216 "Alignment interrupted");
217 }
218
219 // Convert each resulting HSP into a CHit object
220
221 // iterate over query sequence fragments for the current batch
222
223 for (int i = 0; i < (int)curr_batch.size(); i++) {
224
225 int list1_oid = filler_segs[batch_start + i].seq_index;
226
227 for (int j = 0; j < num_full_queries; j++) {
228
229 // skip hits that map to the same query sequence
230
231 if (list1_oid == indices[j])
232 continue;
233
234 // iterate over hitlists
235
236 ITERATE(CSeq_align_set::Tdata, itr,
237 v[i * num_full_queries + j]->Get()) {
238
239 // iterate over hits
240
241 const CSeq_align& s = **itr;
242
243 if (s.GetSegs().Which() == CSeq_align::C_Segs::e_Denseg) {
244 // Dense-seg (1 hit)
245
246 const CDense_seg& denseg = s.GetSegs().GetDenseg();
247 int align_score = 0;
248 double evalue = 0;
249
250 ITERATE(CSeq_align::TScore, score_itr, s.GetScore()) {
251 const CScore& curr_score = **score_itr;
252 if (curr_score.GetId().GetStr() == "score")
253 align_score = curr_score.GetValue().GetInt();
254 else if (curr_score.GetId().GetStr() == "e_value")
255 evalue = curr_score.GetValue().GetReal();
256 }
257
258 // check if the hit is worth saving
259 if (evalue > blastp_evalue)
260 continue;
261
262 m_LocalHits.AddToHitList(new CHit(list1_oid, indices[j],
263 align_score, denseg));
264 }
265 else if (s.GetSegs().Which() ==
266 CSeq_align::C_Segs::e_Dendiag) {
267 // Dense-diag (all hits)
268
269 ITERATE(CSeq_align::C_Segs::TDendiag, diag_itr,
270 s.GetSegs().GetDendiag()) {
271 const CDense_diag& dendiag = **diag_itr;
272 int align_score = 0;
273 double evalue = 0;
274
275 // compute the score of the hit
276
277 ITERATE(CDense_diag::TScores, score_itr,
278 dendiag.GetScores()) {
279 const CScore& curr_score = **score_itr;
280 if (curr_score.GetId().GetStr() == "score") {
281 align_score =
282 curr_score.GetValue().GetInt();
283 }
284 else if (curr_score.GetId().GetStr() ==
285 "e_value") {
286 evalue = curr_score.GetValue().GetReal();
287 }
288 }
289
290 // check if the hit is worth saving
291 if (evalue > blastp_evalue)
292 continue;
293
294 m_LocalHits.AddToHitList(new CHit(list1_oid,
295 indices[j], align_score, dendiag));
296 }
297 }
298 }
299 }
300 }
301
302 // proceed to net batch of sequence fragments
303 batch_start += curr_batch.size();
304 }
305 }
306
307 void
x_FindLocalHits(const TSeqLocVector & queries,const vector<int> & indices)308 CMultiAligner::x_FindLocalHits(const TSeqLocVector& queries,
309 const vector<int>& indices)
310 {
311 m_ProgressMonitor.stage = eLocalHitsSearch;
312
313 // Clear off previous state if it exists
314
315 m_LocalHits.PurgeAllHits();
316 if (m_DomainHits.Empty()) {
317
318 m_CombinedHits.PurgeAllHits();
319
320 // Initialize the profile columns of the input sequences
321
322 x_AssignDefaultResFreqs();
323 }
324
325 // Produce another set of queries that consist of the 'filler'
326 // in the input data, i.e. all stretches of all sequences not
327 // covered by at least one domain hit. Then align all the
328 // filler regions against each other and add any new HSPs to
329 // 'results'
330
331 vector< CRef<objects::CSeq_loc> > filler_locs;
332 vector<SSegmentLoc> filler_segs;
333 x_MakeFillerBlocks(indices, filler_locs, filler_segs);
334 x_AlignFillerBlocks(queries, indices, filler_locs, filler_segs);
335
336 //-------------------------------------------------------
337 if (m_Options->GetVerbose()) {
338 printf("blastp hits:\n");
339 for (int i = 0; i < m_LocalHits.Size(); i++) {
340 CHit *hit = m_LocalHits.GetHit(i);
341 printf("query %d %4d - %4d query %d %4d - %4d score %d\n",
342 hit->m_SeqIndex1,
343 hit->m_SeqRange1.GetFrom(),
344 hit->m_SeqRange1.GetTo(),
345 hit->m_SeqIndex2,
346 hit->m_SeqRange2.GetFrom(),
347 hit->m_SeqRange2.GetTo(),
348 hit->m_Score);
349 }
350 printf("\n\n");
351 }
352 //-------------------------------------------------------
353
354 m_CombinedHits.Append(m_LocalHits);
355 }
356
357
358
x_AlignClusterQueries(const TPhyTreeNode * node)359 auto_ptr< vector<int> > CMultiAligner::x_AlignClusterQueries(
360 const TPhyTreeNode* node)
361 {
362 // Traverse cluster tree
363
364 // if leaf node, then create one-element list of node id
365 if (node->IsLeaf()) {
366 auto_ptr< vector<int> > result(new vector<int>());
367 result->push_back(node->GetValue().GetId());
368 return result;
369 }
370
371 // Traverse left and right subtree and gather node ids in the subtrees
372 TPhyTreeNode::TNodeList_CI child(node->SubNodeBegin());
373
374 auto_ptr< vector<int> > left_inds = x_AlignClusterQueries(*child);
375 child++;
376
377 _ASSERT(*child);
378 auto_ptr< vector<int> > right_inds = x_AlignClusterQueries(*child);
379 child++;
380 _ASSERT(child == node->SubNodeEnd());
381
382 // Get most similar sequences from different subtrees
383 const CClusterer::TDistMatrix& dmat = m_Clusterer.GetDistMatrix();
384
385 int left = -1, right = -1;
386 double dist = 0.0;
387 for (size_t i=0;i < left_inds->size();i++) {
388 for (size_t j=0;j < right_inds->size();j++) {
389 if (dist > dmat((*left_inds)[i], (*right_inds)[j]) || left < 0) {
390 left = (*left_inds)[i];
391 right = (*right_inds)[j];
392 dist = dmat(left, right);
393 }
394 }
395 }
396 _ASSERT(left != right);
397
398 // Align the found pair of sequences - one from each subtree
399 double blastp_evalue = m_Options->GetBlastpEvalue();
400 CRef<CBlastProteinOptionsHandle> blastp_opts(new CBlastProteinOptionsHandle);
401 // deliberately set the cutoff e-value too high
402 blastp_opts->SetEvalueThreshold(max(blastp_evalue, 10.0));
403 blastp_opts->SetSegFiltering(false);
404
405 SSeqLoc left_query(*m_tQueries[left], *m_Scope);
406 SSeqLoc right_query(*m_tQueries[right], *m_Scope);
407
408 CBl2Seq blaster(left_query, right_query, *blastp_opts);
409 CRef<CSearchResultSet> v = blaster.RunEx();
410
411 // Add hit to hitlist
412 ITERATE(CSeq_align_set::Tdata, itr, v->GetResults(0, 0).GetSeqAlign()->Get()) {
413
414 // iterate over hits
415
416 const CSeq_align& s = **itr;
417
418 if (s.GetSegs().Which() == CSeq_align::C_Segs::e_Denseg) {
419 // Dense-seg (1 hit)
420
421 const CDense_seg& denseg = s.GetSegs().GetDenseg();
422 int align_score = 0;
423 double evalue = 0;
424
425 ITERATE(CSeq_align::TScore, score_itr, s.GetScore()) {
426 const CScore& curr_score = **score_itr;
427 if (curr_score.GetId().GetStr() == "score")
428 align_score = curr_score.GetValue().GetInt();
429 else if (curr_score.GetId().GetStr() == "e_value")
430 evalue = curr_score.GetValue().GetReal();
431 }
432
433 // check if the hit is worth saving
434 if (evalue > blastp_evalue)
435 continue;
436
437 m_LocalInClusterHits.AddToHitList(new CHit(left, right, align_score,
438 denseg));
439 }
440 else if (s.GetSegs().Which() == CSeq_align::C_Segs::e_Dendiag) {
441 // Dense-diag (all hits)
442
443 ITERATE(CSeq_align::C_Segs::TDendiag, diag_itr,
444 s.GetSegs().GetDendiag()) {
445 const CDense_diag& dendiag = **diag_itr;
446 int align_score = 0;
447 double evalue = 0;
448
449 // compute the score of the hit
450
451 ITERATE(CDense_diag::TScores, score_itr, dendiag.GetScores()) {
452 const CScore& curr_score = **score_itr;
453 if (curr_score.GetId().GetStr() == "score") {
454 align_score =
455 curr_score.GetValue().GetInt();
456 }
457 else if (curr_score.GetId().GetStr() ==
458 "e_value") {
459 evalue = curr_score.GetValue().GetReal();
460 }
461 }
462
463 // check if the hit is worth saving
464 if (evalue > blastp_evalue)
465 continue;
466
467 m_LocalInClusterHits.AddToHitList(new CHit(left, right,
468 align_score, dendiag));
469 }
470 }
471 }
472
473
474 // Return sequences from this subtree
475 ITERATE(vector<int>, it, *right_inds) {
476 left_inds->push_back(*it);
477 }
478 return left_inds;
479 }
480
481
482
x_FindLocalInClusterHits(const vector<TPhyTreeNode * > & cluster_trees)483 void CMultiAligner::x_FindLocalInClusterHits(
484 const vector<TPhyTreeNode*>& cluster_trees)
485 {
486 m_LocalInClusterHits.PurgeAllHits();
487
488 // Traverse cluster trees and find local constraints for each left and
489 // right subtree of each substree
490 ITERATE(vector<TPhyTreeNode*>, it, cluster_trees) {
491 // NULL trees denore one-element clusters, nothing to do in such cases
492 if (*it) {
493 x_AlignClusterQueries(*it);
494 }
495 }
496
497 //-------------------------------------------------------
498 if (m_Options->GetVerbose()) {
499 printf("in-cluster blastp hits:\n");
500 for (int i = 0; i < m_LocalInClusterHits.Size(); i++) {
501 CHit *hit = m_LocalInClusterHits.GetHit(i);
502 printf("query %d %4d - %4d query %d %4d - %4d score %d\n",
503 hit->m_SeqIndex1,
504 hit->m_SeqRange1.GetFrom(),
505 hit->m_SeqRange1.GetTo(),
506 hit->m_SeqIndex2,
507 hit->m_SeqRange2.GetFrom(),
508 hit->m_SeqRange2.GetTo(),
509 hit->m_Score);
510 }
511 printf("\n\n");
512 }
513 //-------------------------------------------------------
514 }
515
516
517
518 END_SCOPE(cobalt)
519 END_NCBI_SCOPE
520