1 static char const rcsid[] = "$Id: blast.cpp 348556 2011-12-29 23:11:30Z boratyng $";
2 
3 /*
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's offical duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================*/
27 
28 /*****************************************************************************
29 
30 File name: blast.cpp
31 
32 Author: Jason Papadopoulos
33 
34 Contents: Find local alignments between sequences
35 
36 ******************************************************************************/
37 
38 #include <ncbi_pch.hpp>
39 #include <util/range_coll.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include <objects/seqloc/Seq_loc.hpp>
42 #include <objects/seqloc/Seq_interval.hpp>
43 #include <objects/seqalign/Seq_align_set.hpp>
44 #include <objects/general/Object_id.hpp>
45 #include <objects/seqalign/Score.hpp>
46 #include <algo/blast/api/blast_prot_options.hpp>
47 #include <algo/blast/api/bl2seq.hpp>
48 #include <algo/cobalt/cobalt.hpp>
49 
50 /// @file blast.cpp
51 /// Find local alignments between sequences
52 
53 BEGIN_NCBI_SCOPE
54 BEGIN_SCOPE(cobalt)
55 
56 USING_SCOPE(blast);
57 USING_SCOPE(objects);
58 
59 /// Create a new query sequence that is a subset of a previous
60 /// query sequence
61 /// @param loc_list List of previously generated sequence fragments [in/out]
62 /// @param query Sequence that contains the current fragment [in]
63 /// @param from Start offset of fragment [in]
64 /// @param to End offset of fragment [in]
65 /// @param seg_list List of simplified representations of
66 ///                 previous fragments [in/out]
67 /// @param query_index Ordinal ID of 'query'
68 ///
69 void
x_AddNewSegment(vector<CRef<objects::CSeq_loc>> & loc_list,const CRef<objects::CSeq_loc> & query,TOffset from,TOffset to,vector<SSegmentLoc> & seg_list,int query_index)70 CMultiAligner::x_AddNewSegment(vector< CRef<objects::CSeq_loc> >& loc_list,
71                                const CRef<objects::CSeq_loc>& query,
72                                TOffset from, TOffset to,
73                                vector<SSegmentLoc>& seg_list,
74                                int query_index)
75 {
76     // Note that all offsets are zero-based
77 
78     CRef<CSeq_loc> seqloc(new CSeq_loc());
79     seqloc->SetInt().SetFrom(from);
80     seqloc->SetInt().SetTo(to);
81     seqloc->SetInt().SetStrand(eNa_strand_unknown);
82     seqloc->SetInt().SetId().Assign(sequence::GetId(*query, m_Scope));
83     loc_list.push_back(seqloc);
84 
85     seg_list.push_back(SSegmentLoc(query_index, from, to));
86 }
87 
88 /// Turn all fragments of selected query sequence not already covered by
89 /// a domain hit into a separate query sequence, used as input
90 /// to a blast search
91 /// @param blastp_indices Indices of query sequences selected for blastp
92 /// search [in]
93 /// @param filler_locs List of generated sequences [out]
94 /// @param filler_segs Simplified representation of filler_locs [out]
95 ///
96 void
x_MakeFillerBlocks(const vector<int> & blastp_indices,vector<CRef<objects::CSeq_loc>> & filler_locs,vector<SSegmentLoc> & filler_segs)97 CMultiAligner::x_MakeFillerBlocks(const vector<int>& blastp_indices,
98                                vector< CRef<objects::CSeq_loc> >& filler_locs,
99                                vector<SSegmentLoc>& filler_segs)
100 {
101     int num_queries = m_QueryData.size();
102     vector<CRangeCollection<TOffset> > sorted_segs(num_queries);
103 
104     // Merge the offset ranges of all the current domain hits
105 
106     for (int i = 0; i < m_CombinedHits.Size(); i++) {
107         CHit *hit = m_CombinedHits.GetHit(i);
108         _ASSERT(hit->HasSubHits());
109 
110         ITERATE(CHit::TSubHit, subitr, hit->GetSubHit()) {
111             CHit *subhit = *subitr;
112             sorted_segs[hit->m_SeqIndex1].CombineWith(
113                           static_cast<CRange<TOffset> >(subhit->m_SeqRange1));
114             sorted_segs[hit->m_SeqIndex2].CombineWith(
115                           static_cast<CRange<TOffset> >(subhit->m_SeqRange2));
116         }
117     }
118 
119     // For each query sequence, mark off the regions
120     // not covered by a domain hit
121 
122     ITERATE(vector<int>, it, blastp_indices) {
123         int i = *it;
124 
125         CRangeCollection<TOffset>& collection(sorted_segs[i]);
126         TOffset seg_start = 0;
127 
128         // Note that fragments of size less than kMinHitSize
129         // are ignored
130 
131         ITERATE(CRangeCollection<TOffset>, itr, collection) {
132             if (itr->GetFrom() - seg_start > CHit::kMinHitSize) {
133                 x_AddNewSegment(filler_locs, m_tQueries[i], seg_start,
134                                 itr->GetFrom() - 1, filler_segs, i);
135             }
136             seg_start = itr->GetToOpen();
137         }
138 
139         // Handle the last fragment; this could actually
140         // envelop the entire sequence
141 
142         int seq_length = sequence::GetLength(*m_tQueries[i], m_Scope);
143 
144         if (seq_length - seg_start > CHit::kMinHitSize) {
145             x_AddNewSegment(filler_locs, m_tQueries[i], seg_start,
146                             seq_length - 1, filler_segs, i);
147         }
148     }
149 
150     if (m_Options->GetVerbose()) {
151         printf("Filler Segments:\n");
152         for (int i = 0; i < (int)filler_segs.size(); i++) {
153             printf("query %d %4d - %4d\n",
154                    filler_segs[i].seq_index,
155                    filler_segs[i].GetFrom(),
156                    filler_segs[i].GetTo());
157         }
158         printf("\n\n");
159     }
160 }
161 
162 /// Run blastp, aligning the collection of filler fragments
163 /// against the entire input dataset
164 /// @param queries List of queries selected for blastp alignment [in]
165 /// @param indices List of indices of each selected query in the queries
166 /// array [in]
167 /// @param filler_locs List of generated sequences [in]
168 /// @param filler_segs Simplified representation of filler_locs [in]
169 ///
170 void
x_AlignFillerBlocks(const TSeqLocVector & queries,const vector<int> & indices,vector<CRef<CSeq_loc>> & filler_locs,vector<SSegmentLoc> & filler_segs)171 CMultiAligner::x_AlignFillerBlocks(const TSeqLocVector& queries,
172                                    const vector<int>& indices,
173                                    vector< CRef<CSeq_loc> >& filler_locs,
174                                    vector<SSegmentLoc>& filler_segs)
175 {
176     const int kBlastBatchSize = 10000;
177     int num_full_queries = indices.size();
178 
179     if (filler_locs.empty())
180         return;
181 
182     // Set the blast options
183 
184     double blastp_evalue = m_Options->GetBlastpEvalue();
185     CRef<CBlastProteinOptionsHandle> blastp_opts(new CBlastProteinOptionsHandle);
186     // deliberately set the cutoff e-value too high
187     blastp_opts->SetEvalueThreshold(max(blastp_evalue, 10.0));
188     //blastp_opts.SetGappedMode(false);
189     blastp_opts->SetSegFiltering(false);
190 
191     // use blast on one batch of filler segments at a time
192 
193     int batch_start = 0;
194     while (batch_start < (int)filler_locs.size()) {
195 
196         TSeqLocVector curr_batch;
197         int batch_size = 0;
198 
199         for (int i = batch_start; i < (int)filler_locs.size(); i++) {
200             const CSeq_loc& curr_loc = *filler_locs[i];
201             int fragment_size = curr_loc.GetInt().GetTo() -
202                                 curr_loc.GetInt().GetFrom() + 1;
203             if (batch_size + fragment_size >= kBlastBatchSize && batch_size > 0)
204                 break;
205 
206             curr_batch.push_back(SSeqLoc(*filler_locs[i], *m_Scope));
207             batch_size += fragment_size;
208         }
209 
210         CBl2Seq blaster(curr_batch, queries, *blastp_opts);
211         TSeqAlignVector v = blaster.Run();
212 
213         // check for interrupt
214         if (m_Interrupt && (*m_Interrupt)(&m_ProgressMonitor)) {
215             NCBI_THROW(CMultiAlignerException, eInterrupt,
216                        "Alignment interrupted");
217         }
218 
219         // Convert each resulting HSP into a CHit object
220 
221         // iterate over query sequence fragments for the current batch
222 
223         for (int i = 0; i < (int)curr_batch.size(); i++) {
224 
225             int list1_oid = filler_segs[batch_start + i].seq_index;
226 
227             for (int j = 0; j < num_full_queries; j++) {
228 
229                 // skip hits that map to the same query sequence
230 
231                 if (list1_oid == indices[j])
232                     continue;
233 
234                 // iterate over hitlists
235 
236                 ITERATE(CSeq_align_set::Tdata, itr,
237                                    v[i * num_full_queries + j]->Get()) {
238 
239                     // iterate over hits
240 
241                     const CSeq_align& s = **itr;
242 
243                     if (s.GetSegs().Which() == CSeq_align::C_Segs::e_Denseg) {
244                         // Dense-seg (1 hit)
245 
246                         const CDense_seg& denseg = s.GetSegs().GetDenseg();
247                         int align_score = 0;
248                         double evalue = 0;
249 
250                         ITERATE(CSeq_align::TScore, score_itr, s.GetScore()) {
251                             const CScore& curr_score = **score_itr;
252                             if (curr_score.GetId().GetStr() == "score")
253                                 align_score = curr_score.GetValue().GetInt();
254                             else if (curr_score.GetId().GetStr() == "e_value")
255                                 evalue = curr_score.GetValue().GetReal();
256                         }
257 
258                         // check if the hit is worth saving
259                         if (evalue > blastp_evalue)
260                             continue;
261 
262                         m_LocalHits.AddToHitList(new CHit(list1_oid, indices[j],
263                                                       align_score, denseg));
264                     }
265                     else if (s.GetSegs().Which() ==
266                                              CSeq_align::C_Segs::e_Dendiag) {
267                         // Dense-diag (all hits)
268 
269                         ITERATE(CSeq_align::C_Segs::TDendiag, diag_itr,
270                                                     s.GetSegs().GetDendiag()) {
271                             const CDense_diag& dendiag = **diag_itr;
272                             int align_score = 0;
273                             double evalue = 0;
274 
275                             // compute the score of the hit
276 
277                             ITERATE(CDense_diag::TScores, score_itr,
278                                                         dendiag.GetScores()) {
279                                 const CScore& curr_score = **score_itr;
280                                 if (curr_score.GetId().GetStr() == "score") {
281                                     align_score =
282                                         curr_score.GetValue().GetInt();
283                                 }
284                                 else if (curr_score.GetId().GetStr() ==
285                                                                 "e_value") {
286                                     evalue = curr_score.GetValue().GetReal();
287                                 }
288                             }
289 
290                             // check if the hit is worth saving
291                             if (evalue > blastp_evalue)
292                                 continue;
293 
294                             m_LocalHits.AddToHitList(new CHit(list1_oid,
295                                              indices[j], align_score, dendiag));
296                         }
297                     }
298                 }
299             }
300         }
301 
302         // proceed to net batch of sequence fragments
303         batch_start += curr_batch.size();
304     }
305 }
306 
307 void
x_FindLocalHits(const TSeqLocVector & queries,const vector<int> & indices)308 CMultiAligner::x_FindLocalHits(const TSeqLocVector& queries,
309                                const vector<int>& indices)
310 {
311     m_ProgressMonitor.stage = eLocalHitsSearch;
312 
313     // Clear off previous state if it exists
314 
315     m_LocalHits.PurgeAllHits();
316     if (m_DomainHits.Empty()) {
317 
318         m_CombinedHits.PurgeAllHits();
319 
320         // Initialize the profile columns of the input sequences
321 
322         x_AssignDefaultResFreqs();
323     }
324 
325     // Produce another set of queries that consist of the 'filler'
326     // in the input data, i.e. all stretches of all sequences not
327     // covered by at least one domain hit. Then align all the
328     // filler regions against each other and add any new HSPs to
329     // 'results'
330 
331     vector< CRef<objects::CSeq_loc> > filler_locs;
332     vector<SSegmentLoc> filler_segs;
333     x_MakeFillerBlocks(indices, filler_locs, filler_segs);
334     x_AlignFillerBlocks(queries, indices, filler_locs, filler_segs);
335 
336     //-------------------------------------------------------
337     if (m_Options->GetVerbose()) {
338         printf("blastp hits:\n");
339         for (int i = 0; i < m_LocalHits.Size(); i++) {
340             CHit *hit = m_LocalHits.GetHit(i);
341             printf("query %d %4d - %4d query %d %4d - %4d score %d\n",
342                          hit->m_SeqIndex1,
343                          hit->m_SeqRange1.GetFrom(),
344                          hit->m_SeqRange1.GetTo(),
345                          hit->m_SeqIndex2,
346                          hit->m_SeqRange2.GetFrom(),
347                          hit->m_SeqRange2.GetTo(),
348                          hit->m_Score);
349         }
350         printf("\n\n");
351     }
352     //-------------------------------------------------------
353 
354     m_CombinedHits.Append(m_LocalHits);
355 }
356 
357 
358 
x_AlignClusterQueries(const TPhyTreeNode * node)359 auto_ptr< vector<int> > CMultiAligner::x_AlignClusterQueries(
360                                                      const TPhyTreeNode* node)
361 {
362     // Traverse cluster tree
363 
364     // if leaf node, then create one-element list of node id
365     if (node->IsLeaf()) {
366         auto_ptr< vector<int> > result(new vector<int>());
367         result->push_back(node->GetValue().GetId());
368         return result;
369     }
370 
371     // Traverse left and right subtree and gather node ids in the subtrees
372     TPhyTreeNode::TNodeList_CI child(node->SubNodeBegin());
373 
374     auto_ptr< vector<int> > left_inds = x_AlignClusterQueries(*child);
375     child++;
376 
377     _ASSERT(*child);
378     auto_ptr< vector<int> > right_inds = x_AlignClusterQueries(*child);
379     child++;
380     _ASSERT(child == node->SubNodeEnd());
381 
382     // Get most similar sequences from different subtrees
383     const CClusterer::TDistMatrix& dmat = m_Clusterer.GetDistMatrix();
384 
385     int left = -1, right = -1;
386     double dist = 0.0;
387     for (size_t i=0;i < left_inds->size();i++) {
388         for (size_t j=0;j < right_inds->size();j++) {
389             if (dist > dmat((*left_inds)[i], (*right_inds)[j]) || left < 0) {
390                 left = (*left_inds)[i];
391                 right = (*right_inds)[j];
392                 dist = dmat(left, right);
393             }
394         }
395     }
396     _ASSERT(left != right);
397 
398     // Align the found pair of sequences - one from each subtree
399     double blastp_evalue = m_Options->GetBlastpEvalue();
400     CRef<CBlastProteinOptionsHandle> blastp_opts(new CBlastProteinOptionsHandle);
401     // deliberately set the cutoff e-value too high
402     blastp_opts->SetEvalueThreshold(max(blastp_evalue, 10.0));
403     blastp_opts->SetSegFiltering(false);
404 
405     SSeqLoc left_query(*m_tQueries[left], *m_Scope);
406     SSeqLoc right_query(*m_tQueries[right], *m_Scope);
407 
408     CBl2Seq blaster(left_query, right_query, *blastp_opts);
409     CRef<CSearchResultSet> v = blaster.RunEx();
410 
411     // Add hit to hitlist
412     ITERATE(CSeq_align_set::Tdata, itr, v->GetResults(0, 0).GetSeqAlign()->Get()) {
413 
414         // iterate over hits
415 
416         const CSeq_align& s = **itr;
417 
418         if (s.GetSegs().Which() == CSeq_align::C_Segs::e_Denseg) {
419             // Dense-seg (1 hit)
420 
421             const CDense_seg& denseg = s.GetSegs().GetDenseg();
422             int align_score = 0;
423             double evalue = 0;
424 
425             ITERATE(CSeq_align::TScore, score_itr, s.GetScore()) {
426                 const CScore& curr_score = **score_itr;
427                 if (curr_score.GetId().GetStr() == "score")
428                     align_score = curr_score.GetValue().GetInt();
429                 else if (curr_score.GetId().GetStr() == "e_value")
430                     evalue = curr_score.GetValue().GetReal();
431             }
432 
433             // check if the hit is worth saving
434             if (evalue > blastp_evalue)
435                 continue;
436 
437             m_LocalInClusterHits.AddToHitList(new CHit(left, right, align_score,
438                                                        denseg));
439         }
440         else if (s.GetSegs().Which() == CSeq_align::C_Segs::e_Dendiag) {
441             // Dense-diag (all hits)
442 
443             ITERATE(CSeq_align::C_Segs::TDendiag, diag_itr,
444                     s.GetSegs().GetDendiag()) {
445                 const CDense_diag& dendiag = **diag_itr;
446                 int align_score = 0;
447                 double evalue = 0;
448 
449                 // compute the score of the hit
450 
451                 ITERATE(CDense_diag::TScores, score_itr, dendiag.GetScores()) {
452                     const CScore& curr_score = **score_itr;
453                     if (curr_score.GetId().GetStr() == "score") {
454                         align_score =
455                             curr_score.GetValue().GetInt();
456                     }
457                     else if (curr_score.GetId().GetStr() ==
458                              "e_value") {
459                         evalue = curr_score.GetValue().GetReal();
460                     }
461                 }
462 
463                 // check if the hit is worth saving
464                 if (evalue > blastp_evalue)
465                     continue;
466 
467                 m_LocalInClusterHits.AddToHitList(new CHit(left, right,
468                                                    align_score, dendiag));
469             }
470         }
471     }
472 
473 
474     // Return sequences from this subtree
475     ITERATE(vector<int>, it, *right_inds) {
476         left_inds->push_back(*it);
477     }
478     return left_inds;
479 }
480 
481 
482 
x_FindLocalInClusterHits(const vector<TPhyTreeNode * > & cluster_trees)483 void CMultiAligner::x_FindLocalInClusterHits(
484                                      const vector<TPhyTreeNode*>& cluster_trees)
485 {
486     m_LocalInClusterHits.PurgeAllHits();
487 
488     // Traverse cluster trees and find local constraints for each left and
489     // right subtree of each substree
490     ITERATE(vector<TPhyTreeNode*>, it, cluster_trees) {
491         // NULL trees denore one-element clusters, nothing to do in such cases
492         if (*it) {
493             x_AlignClusterQueries(*it);
494         }
495     }
496 
497     //-------------------------------------------------------
498     if (m_Options->GetVerbose()) {
499         printf("in-cluster blastp hits:\n");
500         for (int i = 0; i < m_LocalInClusterHits.Size(); i++) {
501             CHit *hit = m_LocalInClusterHits.GetHit(i);
502             printf("query %d %4d - %4d query %d %4d - %4d score %d\n",
503                          hit->m_SeqIndex1,
504                          hit->m_SeqRange1.GetFrom(),
505                          hit->m_SeqRange1.GetTo(),
506                          hit->m_SeqIndex2,
507                          hit->m_SeqRange2.GetFrom(),
508                          hit->m_SeqRange2.GetTo(),
509                          hit->m_Score);
510         }
511         printf("\n\n");
512     }
513     //-------------------------------------------------------
514 }
515 
516 
517 
518 END_SCOPE(cobalt)
519 END_NCBI_SCOPE
520