1 /*  $Id: banded_aligner.cpp 494142 2016-03-03 20:39:42Z whlavina $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Nathan Bouk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiexpt.hpp>
34 #include <corelib/ncbi_system.hpp>
35 #include <corelib/ncbi_signal.hpp>
36 #include <math.h>
37 #include <algo/align/ngalign/banded_aligner.hpp>
38 //#include "align_instance.hpp"
39 
40 #include <objects/seqloc/Seq_loc.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42 #include <objmgr/scope.hpp>
43 #include <algo/blast/api/blast_types.hpp>
44 #include <algo/blast/api/bl2seq.hpp>
45 #include <algo/blast/api/blast_options_handle.hpp>
46 #include <algo/blast/api/blast_nucl_options.hpp>
47 #include <algo/blast/api/disc_nucl_options.hpp>
48 #include <algo/align/util/score_builder.hpp>
49 #include <objects/seqalign/Seq_align.hpp>
50 #include <objects/seqalign/Seq_align_set.hpp>
51 #include <objects/seqalign/Dense_seg.hpp>
52 #include <objmgr/seq_vector.hpp>
53 
54 #include <algo/blast/api/local_blast.hpp>
55 #include <algo/blast/blastinput/blastn_args.hpp>
56 
57 #include <algo/align/contig_assembly/contig_assembly.hpp>
58 #include <algo/align/nw/nw_band_aligner.hpp>
59 #include <algo/align/nw/mm_aligner.hpp>
60 
61 #include <algo/align/util/blast_tabular.hpp>
62 #include <algo/align/util/compartment_finder.hpp>
63 #include <algo/sequence/align_cleanup.hpp>
64 
65 
66 BEGIN_SCOPE(ncbi)
67 USING_SCOPE(objects);
68 USING_SCOPE(blast);
69 
70 
71 
72 
s_CoverageSeqLoc(TAlignSetRef Alignments,int Row,CScope & Scope)73 CRef<CSeq_loc> s_CoverageSeqLoc(TAlignSetRef Alignments, int Row, CScope& Scope)
74 {
75     CRef<CSeq_loc> Accum(new CSeq_loc);
76     ITERATE(CSeq_align_set::Tdata, Iter, Alignments->Get()) {
77 
78         const CSeq_align& Curr = **Iter;
79         const CDense_seg& Denseg = Curr.GetSegs().GetDenseg();
80 
81         size_t Dims = Denseg.GetDim();
82         size_t SegCount = Denseg.GetNumseg();
83         for(size_t CurrSeg = 0; CurrSeg < SegCount; ++CurrSeg) {
84             int Index = (Dims*CurrSeg)+Row;
85             int CurrStart = Denseg.GetStarts()[Index];
86             if( CurrStart != -1) {
87                 CRef<CSeq_loc> CurrLoc(new CSeq_loc);
88                 CurrLoc->SetInt().SetId().Assign( *Denseg.GetIds()[Row] );
89                 CurrLoc->SetInt().SetFrom() = CurrStart;
90                 CurrLoc->SetInt().SetTo() = CurrStart + Denseg.GetLens()[CurrSeg];
91                 if(Denseg.CanGetStrands())
92                     CurrLoc->SetInt().SetStrand() = Denseg.GetStrands()[Index];
93                 Accum->SetMix().Set().push_back(CurrLoc);
94             }
95         }
96     }
97 
98     CRef<CSeq_loc> Merged;
99     Merged = sequence::Seq_loc_Merge(*Accum, CSeq_loc::fSortAndMerge_All, &Scope);
100 
101     return Merged;
102 }
103 
104 
s_CalcCoverageCount(TAlignSetRef Alignments,int Row,CScope & Scope)105 TSeqPos s_CalcCoverageCount(TAlignSetRef Alignments, int Row, CScope& Scope)
106 {
107 
108     CRef<CSeq_loc> Merged;
109 
110     Merged = s_CoverageSeqLoc(Alignments, Row, Scope);
111     //cout << MSerial_AsnText << *Merged;
112 
113     Merged->ChangeToMix();
114     TSeqPos PosCoveredBases = 0, NegCoveredBases = 0;
115     ITERATE(CSeq_loc_mix::Tdata, LocIter, Merged->GetMix().Get()) {
116 
117 //    ITERATE(CPacked_seqint_Base::Tdata, IntIter, Merged->GetPacked_int().Get())
118         if( (*LocIter)->Which() == CSeq_loc::e_Int) {
119             if( (*LocIter)->GetInt().GetStrand() == eNa_strand_plus)
120                 PosCoveredBases += (*LocIter)->GetInt().GetLength();
121             else
122                 NegCoveredBases += (*LocIter)->GetInt().GetLength();
123         }
124     }
125     //cerr << "Covered: " << PosCoveredBases << " or " << NegCoveredBases
126     //     << "  len: " << m_Scope->GetBioseqHandle(*Merged->GetId()).GetBioseqLength() << endl;
127 
128     return max(PosCoveredBases, NegCoveredBases);
129 }
130 
131 
132 ///////
133 
134 
135 
GenerateAlignments(objects::CScope & Scope,ISequenceSet * QuerySet,ISequenceSet * SubjectSet,TAlignResultsRef AccumResults)136 TAlignResultsRef CInstancedAligner::GenerateAlignments(objects::CScope& Scope,
137                                                        ISequenceSet* QuerySet,
138                                                        ISequenceSet* SubjectSet,
139                                                        TAlignResultsRef AccumResults)
140 {
141     TAlignResultsRef NewResults(new CAlignResultsSet);
142 
143     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter,
144                       AccumResults->Get()) {
145         int BestRank = QueryIter->second->GetBestRank();
146         if(BestRank > m_Threshold || BestRank == -1) {
147 
148             _TRACE("Determined ID: "
149                    << QueryIter->second->GetQueryId()->AsFastaString()
150                    << " needs Instanced MM Aligner.");
151             if(!x_MinCoverageCheck(*QueryIter->second)) {
152                 ERR_POST(Info << "ID: "
153                           << QueryIter->second->GetQueryId()->AsFastaString()
154                           << " fails the minimum percent coverage cutoff. Skipping.");
155                 continue;
156             }
157             x_RunAligner(Scope, *QueryIter->second, NewResults);
158         }
159     }
160 
161     return NewResults;
162 }
163 
164 
165 
x_RunAligner(objects::CScope & Scope,CQuerySet & QueryAligns,TAlignResultsRef Results)166 void CInstancedAligner::x_RunAligner(objects::CScope& Scope,
167                                      CQuerySet& QueryAligns,
168                                      TAlignResultsRef Results)
169 {
170     CRef<CSeq_align_set> ResultSet(new CSeq_align_set);
171 
172     vector<CRef<CInstance> > Instances;
173     x_GetCleanupInstances(QueryAligns, Scope, Instances);
174     x_GetDistanceInstances(QueryAligns, Scope, Instances);
175     x_FilterInstances(Instances, m_MaxRatio);
176     if(Instances.empty()) {
177         ERR_POST(Info << " No instances of " << QueryAligns.GetQueryId()->AsFastaString() << " found.");
178         return;
179     }
180     ERR_POST(Info << " Instance Count: " << Instances.size());
181 
182 
183     CRef<CSeq_align> Result(new CSeq_align);
184     ITERATE(vector<CRef<CInstance> >, InstIter, Instances) {
185         if (CSignal::IsSignaled()) {
186             NCBI_THROW(CException, eUnknown, "trapped signal");
187         }
188 
189         const CInstance& Inst = **InstIter;
190 
191         ERR_POST(Info << " Aligning " << Inst.Query.GetId().AsFastaString() << " to "
192                                      << Inst.Subject.GetId().AsFastaString());
193         ERR_POST(Info << "  q:" << Inst.Query.GetFrom() << ":"
194                                 << Inst.Query.GetTo() << ":"
195                                 << (Inst.Query.GetStrand() == eNa_strand_plus ? "+" : "-")
196                       << " and s: " << Inst.Subject.GetFrom() << ":"
197                                     << Inst.Subject.GetTo() << ":"
198                                     << (Inst.Subject.GetStrand() == eNa_strand_plus ? "+" : "-")
199                       << "  ratio: " << Inst.SubjToQueryRatio());
200 
201         CRef<CDense_seg> GlobalDs;
202         try {
203             GlobalDs = x_RunMMGlobal(
204                             Inst.Query.GetId(), Inst.Subject.GetId(),
205                             Inst.Query.GetStrand(),
206                             Inst.Query.GetFrom(), Inst.Query.GetTo(),
207                             Inst.Subject.GetFrom(), Inst.Subject.GetTo(),
208                             Scope);
209         } catch(CException& e) {
210             ERR_POST(Error << e.ReportAll());
211             ERR_POST(Error << "CInstancedBandedAligner failed.");
212             throw e;
213         }
214 
215         GetDiagContext().Extra()
216             .Print("instance_query", Inst.Query.GetId().AsFastaString())
217             .Print("instance_subject", Inst.Subject.GetId().AsFastaString())
218             .Print("instance_align", (GlobalDs.IsNull() ? "false" : "true"));
219 
220         if(GlobalDs.IsNull())
221             continue;
222 
223         Result->SetSegs().SetDenseg().Assign(*GlobalDs);
224         Result->SetType() = CSeq_align::eType_partial;
225         Result->SetNamedScore(GetName(), 1);
226         ResultSet->Set().push_back(Result);
227 
228         ERR_POST(Info << " Found alignment ");
229 
230 /*
231         // CContigAssembly::BestLocalSubAlignmentrescores, and finds better-scoring
232         // sub-regions of the Denseg, and keeps only the subregion.
233         // The practical result is, if the alignment has a large gap near the edges
234         //  that gap might be trimmed. If there is more matching region past the
235         //  too-big gap, that matching region is lost. Sometimes we don't want that
236         //  especially in the Instance case, because we trust the region so much
237         // So, (see below) if this function changes the Denseg, we return both, the
238         //  trimmed and the untrimmed.
239         CRef<CDense_seg> LocalDs;
240         LocalDs = CContigAssembly::BestLocalSubAlignment(*GlobalDs, Scope);
241 
242         Result->SetSegs().SetDenseg().Assign(*LocalDs);
243         Result->SetType() = CSeq_align::eType_partial;
244         ResultSet->Set().push_back(Result);
245 
246         if(!LocalDs->Equals(*GlobalDs)) {
247             CRef<CSeq_align> Result(new CSeq_align);
248             Result->SetSegs().SetDenseg().Assign(*GlobalDs);
249             Result->SetType() = CSeq_align::eType_partial;
250             ResultSet->Set().push_back(Result);
251         }
252 */
253     }
254 
255     if(!ResultSet->Get().empty()) {
256         Results->Insert(CRef<CQuerySet>(new CQuerySet(*ResultSet)));
257     }
258 }
259 
260 
261 struct SCallbackData
262 {
263     CTime   StartTime;
264     int     TimeOutSeconds;
265     size_t  PreviousCount;
266     bool    TimedOut;
267 };
268 
269 
s_ProgressCallback(CNWAligner::SProgressInfo * ProgressInfo)270 bool s_ProgressCallback(CNWAligner::SProgressInfo* ProgressInfo)
271 {
272     SCallbackData* CallbackData = (SCallbackData*)ProgressInfo->m_data;
273     CTime CurrTime(CTime::eCurrent);
274     CTimeSpan Span = (CurrTime-CallbackData->StartTime);
275 
276     if(CallbackData->TimedOut) {
277         return true;
278     }
279 
280     if(CallbackData->PreviousCount == ProgressInfo->m_iter_done ||
281        Span.GetAsDouble() < 1.0) {
282         CallbackData->PreviousCount = ProgressInfo->m_iter_done;
283         return false;
284     }
285 
286     CallbackData->PreviousCount = ProgressInfo->m_iter_done;
287 
288 //    ERR_POST(Info << "Counts: " << ProgressInfo->m_iter_done << " of " << ProgressInfo->m_iter_total);
289 
290     double PercentComplete = (double(ProgressInfo->m_iter_done)/ProgressInfo->m_iter_total);
291     double PercentRemaining = 1.0-PercentComplete;
292 
293     double Factor = PercentRemaining/PercentComplete;
294 
295 
296     if( Span.GetCompleteSeconds() > CallbackData->TimeOutSeconds) {
297         ERR_POST(Error << " Instanced Aligner took over 5 minutes. Timed out.");
298         CallbackData->TimedOut = true;
299         return true;
300     }
301 
302     double TimeEstimated = Span.GetAsDouble() * Factor;
303 
304     if(TimeEstimated > CallbackData->TimeOutSeconds) {
305         ERR_POST(Error << " Instanced Aligner expected to take " << TimeEstimated
306                        << " seconds. More than " << (CallbackData->TimeOutSeconds/60.0)
307                        << " minutes. Terminating Early.");
308         CallbackData->TimedOut = true;
309         return true;
310     }
311 
312 
313     return false;
314 }
315 
316 
317 
x_RunMMGlobal(const CSeq_id & QueryId,const CSeq_id & SubjectId,ENa_strand Strand,TSeqPos QueryStart,TSeqPos QueryStop,TSeqPos SubjectStart,TSeqPos SubjectStop,CScope & Scope)318 CRef<CDense_seg> CInstancedAligner::x_RunMMGlobal(const CSeq_id& QueryId,
319                                                     const CSeq_id& SubjectId,
320                                                     ENa_strand Strand,
321                                                     TSeqPos QueryStart,
322                                                     TSeqPos QueryStop,
323                                                     TSeqPos SubjectStart,
324                                                     TSeqPos SubjectStop,
325                                                     CScope& Scope)
326 {
327 
328     // ASSUMPTION: Query is short, Subject is long.
329     //     We should be able to align all of Query against a subsection of Subject
330     // Banded Memory useage is Longer Sequence * Band Width * unit size (1)
331     // The common case is that Subject will be a very sizeable Scaffold.
332     //  a bandwidth of anything more than 10-500 can pop RAM on a 16G machine.
333     //  So we're gonna snip Subject to a subregion
334     // CMMAligner uses linear RAM, but we still use the restricted region because
335     //  we know where the regions are.
336 
337     CBioseq_Handle::EVectorStrand VecStrand;
338     VecStrand = (Strand == eNa_strand_plus ? CBioseq_Handle::eStrand_Plus
339                                            : CBioseq_Handle::eStrand_Minus);
340 
341     CBioseq_Handle QueryHandle, SubjectHandle;
342     QueryHandle = Scope.GetBioseqHandle(QueryId);
343     SubjectHandle = Scope.GetBioseqHandle(SubjectId);
344 
345     CSeqVector QueryVec, SubjectVec;
346     QueryVec = QueryHandle.GetSeqVector(CBioseq_Handle::eCoding_Iupac, VecStrand);
347     SubjectVec = SubjectHandle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
348 
349 
350     // CSeqVector, when making a minus vector, not only compliments the bases,
351     // flips the order to. But the start and stops provided are in Plus space.
352     // So we need to swap them to Minus space
353     TSeqPos QueryStrandedStart, QueryStrandedStop;
354     if(Strand == eNa_strand_plus) {
355         QueryStrandedStart = QueryStart;
356         QueryStrandedStop = QueryStop;
357     } else {
358         QueryStrandedStart = ( (QueryVec.size()-1) - QueryStop);
359         QueryStrandedStop = ( (QueryVec.size()-1) - QueryStart);
360     }
361 
362     string QuerySeq, SubjectSeq;
363     QueryVec.GetSeqData(QueryStrandedStart, QueryStrandedStop+1, QuerySeq);
364     SubjectVec.GetSeqData(SubjectStart, SubjectStop+1, SubjectSeq);
365 
366 /*  ERR_POST(Info << " Query " << QueryStart << " to " << QueryStop << " against");
367     ERR_POST(Info << " QStranded " << QueryStrandedStart << " to " << QueryStrandedStop << " against");
368     ERR_POST(Info << " Subject " << SubjectStart << " to " << SubjectStop);
369 */
370 
371     SCallbackData CallbackData;
372     CallbackData.StartTime = CTime(CTime::eCurrent);
373     CallbackData.TimeOutSeconds = m_TimeOutSeconds;
374     CallbackData.PreviousCount = 0;
375     CallbackData.TimedOut = false;
376 
377     TSeqPos ExtractQueryStart = ((Strand == eNa_strand_plus) ? 0 : QuerySeq.size()-1);
378     CRef<CDense_seg> ResultDenseg;
379 
380     CMMAligner Aligner(QuerySeq, SubjectSeq);
381     Aligner.SetWm(m_Match);
382     Aligner.SetWms(m_Mismatch);
383     Aligner.SetWg(m_GapOpen);
384     Aligner.SetWs(m_GapExtend);
385     Aligner.SetScoreMatrix(NULL);
386     Aligner.SetProgressCallback(s_ProgressCallback, (void*)&CallbackData);
387     try {
388         int Score;
389         Score = Aligner.Run();
390         if(Score == numeric_limits<int>::min()) {
391             return CRef<CDense_seg>();
392         }
393         ResultDenseg = Aligner.GetDense_seg(ExtractQueryStart, Strand, QueryId,
394                                             0, eNa_strand_plus, SubjectId);
395     } catch(CException& e) {
396         if(!CallbackData.TimedOut) {
397             ERR_POST(Error << "MMAligner: " << e.ReportAll());
398         }
399         return CRef<CDense_seg>();
400     }
401 
402 
403     if(ResultDenseg->GetNumseg() > 0) {
404         ResultDenseg->OffsetRow(0, QueryStart);
405         ResultDenseg->OffsetRow(1, SubjectStart);
406 
407         // TrimEndGaps Segfaults on the only one segment which is a gap case
408         if(ResultDenseg->GetNumseg() == 1 &&
409            (ResultDenseg->GetStarts()[0] == -1 || ResultDenseg->GetStarts()[1] == -1)) {
410             return CRef<CDense_seg>();
411         } else {
412             ResultDenseg->TrimEndGaps();
413         }
414 
415     } else {
416         return CRef<CDense_seg>();
417     }
418 
419     return ResultDenseg;
420 }
421 
422 
423 CRef<objects::CSeq_align_set>
x_RunCleanup(const objects::CSeq_align_set & AlignSet,objects::CScope & Scope)424 CInstancedAligner::x_RunCleanup(const objects::CSeq_align_set& AlignSet,
425                                 objects::CScope& Scope)
426 {
427     CAlignCleanup Cleaner(Scope);
428     //Cleaner.FillUnaligned(true);
429 
430     list<CConstRef<CSeq_align> > In;
431     ITERATE(CSeq_align_set::Tdata, AlignIter, AlignSet.Get()) {
432         CConstRef<CSeq_align> Align(*AlignIter);
433         In.push_back(Align);
434     }
435 
436     CRef<CSeq_align_set> Out(new CSeq_align_set);
437 
438     try {
439         Cleaner.Cleanup(In, Out->Set());
440     } catch(CException& e) {
441         ERR_POST(Error << "Cleanup Error: " << e.ReportAll());
442         throw e;
443     }
444 
445     NON_CONST_ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
446         CDense_seg& Denseg = (*AlignIter)->SetSegs().SetDenseg();
447 
448         if(!Denseg.CanGetStrands() || Denseg.GetStrands().empty()) {
449             Denseg.SetStrands().resize(Denseg.GetDim()*Denseg.GetNumseg(), eNa_strand_plus);
450         }
451 
452         if(Denseg.GetSeqStrand(1) != eNa_strand_plus) {
453             Denseg.Reverse();
454         }
455         //CRef<CDense_seg> Filled = Denseg.FillUnaligned();
456         //Denseg.Assign(*Filled);
457     }
458 
459     return Out;
460 }
461 
462 
x_GetCleanupInstances(CQuerySet & QueryAligns,CScope & Scope,vector<CRef<CInstance>> & Instances)463 void CInstancedAligner::x_GetCleanupInstances(CQuerySet& QueryAligns, CScope& Scope,
464                                        vector<CRef<CInstance> >& Instances)
465 {
466 
467     ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
468     ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
469     //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
470 
471         CRef<CSeq_align_set> Set = SubjectIter->second;
472         CRef<CSeq_align_set> Out;
473 
474         CRef<CSeq_align_set> Pluses(new CSeq_align_set),
475                              Minuses(new CSeq_align_set);
476 
477         ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
478             if( (*AlignIter)->GetSeqStrand(0) == eNa_strand_plus)
479                 Pluses->Set().push_back(*AlignIter);
480             else if( (*AlignIter)->GetSeqStrand(0) == eNa_strand_minus)
481                 Minuses->Set().push_back(*AlignIter);
482         }
483 
484         if(!Pluses->Set().empty()) {
485             Out = x_RunCleanup(*Pluses, Scope);
486             if(!Out.IsNull()) {
487                 ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
488                     CRef<CInstance> Inst(new CInstance(*AlignIter));
489                     Instances.push_back(Inst);
490                 }
491             }
492         }
493         if(!Minuses->Set().empty()) {
494             Out = x_RunCleanup(*Minuses, Scope);
495             if(!Out.IsNull()) {
496                 ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
497                     CRef<CInstance> Inst(new CInstance(*AlignIter));
498                     Instances.push_back(Inst);
499                 }
500             }
501         }
502     }
503     }
504 
505 }
506 
507 
508 
CInstance(CRef<CSeq_align> Align)509 CInstance::CInstance(CRef<CSeq_align> Align)
510 {
511     Query.SetId().Assign(Align->GetSeq_id(0));
512     Subject.SetId().Assign(Align->GetSeq_id(1));
513 
514     Query.SetStrand() = Align->GetSeqStrand(0);
515     Subject.SetStrand() = Align->GetSeqStrand(1);
516 
517     Query.SetFrom() = Align->GetSeqStart(0);
518     Subject.SetFrom() = Align->GetSeqStart(1);
519 
520     Query.SetTo() = Align->GetSeqStop(0);
521     Subject.SetTo() = Align->GetSeqStop(1);
522 
523     Alignments.Set().push_back(Align);
524 }
525 
526 
CInstance(const CSeq_align_set & AlignSet)527 CInstance::CInstance(const CSeq_align_set& AlignSet)
528 {
529     Query.SetId().Assign(AlignSet.Get().front()->GetSeq_id(0));
530     Subject.SetId().Assign(AlignSet.Get().front()->GetSeq_id(1));
531 
532     Query.SetStrand() = AlignSet.Get().front()->GetSeqStrand(0);
533     Subject.SetStrand() = AlignSet.Get().front()->GetSeqStrand(1);
534 
535     Query.SetFrom() = numeric_limits<TSeqPos>::max();
536     Subject.SetFrom() = numeric_limits<TSeqPos>::max();
537 
538     Query.SetTo() = numeric_limits<TSeqPos>::min();
539     Subject.SetTo() = numeric_limits<TSeqPos>::min();
540 
541     ITERATE(CSeq_align_set::Tdata, AlignIter, AlignSet.Get()) {
542         Query.SetFrom(min(Query.GetFrom(), (*AlignIter)->GetSeqStart(0)));
543         Subject.SetFrom(min(Subject.GetFrom(), (*AlignIter)->GetSeqStart(1)));
544 
545         Query.SetTo(max(Query.GetTo(), (*AlignIter)->GetSeqStop(0)));
546         Subject.SetTo(max(Subject.GetTo(), (*AlignIter)->GetSeqStop(1)));
547     }
548 }
549 
550 
IsAlignmentContained(const CSeq_align & Align) const551 bool CInstance::IsAlignmentContained(const CSeq_align& Align) const
552 {
553     if(Query.GetStrand() != Align.GetSeqStrand(0) ||
554        Subject.GetStrand() != Align.GetSeqStrand(1))
555         return false;
556 
557     if( Align.GetSeqStart(0) >= Query.GetFrom() &&
558         Align.GetSeqStop(0)  <= Query.GetTo() &&
559         Align.GetSeqStart(1) >= Subject.GetFrom() &&
560         Align.GetSeqStop(1)  <= Subject.GetTo()) {
561         return true;
562     }
563     else {
564         return false;
565     }
566 }
567 
568 
GapDistance(const CSeq_align & Align) const569 int CInstance::GapDistance(const CSeq_align& Align) const
570 {
571     int LongestGap = 0;
572     LongestGap = max((int)Align.GetSeqStart(0)-(int)Query.GetTo(), LongestGap);
573     LongestGap = max((int)Align.GetSeqStart(1)-(int)Subject.GetTo(), LongestGap);
574     LongestGap = max((int)Query.GetFrom()-(int)Align.GetSeqStop(0), LongestGap);
575     LongestGap = max((int)Subject.GetFrom()-(int)Align.GetSeqStop(1), LongestGap);
576    // cerr << "Longest Gap: " << LongestGap << endl;
577     return LongestGap;
578 }
579 
580 
SubjToQueryRatio() const581 double CInstance::SubjToQueryRatio() const
582 {
583     return (Subject.GetLength() / double(Query.GetLength()));
584 }
585 
586 
QueryLength() const587 TSeqPos CInstance::QueryLength() const
588 {
589     return Query.GetLength();
590 }
591 
592 
MergeIn(const CRef<CSeq_align> Align)593 void CInstance::MergeIn(const CRef<CSeq_align> Align)
594 {
595     Query.SetFrom(min(Query.GetFrom(), Align->GetSeqStart(0)));
596     Subject.SetFrom(min(Subject.GetFrom(), Align->GetSeqStart(1)));
597 
598     Query.SetTo(max(Query.GetTo(), Align->GetSeqStop(0)));
599     Subject.SetTo(max(Subject.GetTo(), Align->GetSeqStop(1)));
600 
601     Alignments.Set().push_back(Align);
602 }
603 
x_GetDistanceInstances(CQuerySet & QueryAligns,CScope & Scope,vector<CRef<CInstance>> & Instances)604 void CInstancedAligner::x_GetDistanceInstances(CQuerySet& QueryAligns, CScope& Scope,
605                                        vector<CRef<CInstance> >& Instances)
606 {
607     typedef pair<CRef<CInstance>, CRef<CSeq_align_set> > TInstPair;
608 
609     // Identify the best pct_coverage for every Subject Id.
610     // This will be used to filter out subjects with relatively low
611     // best pct_coverage, so that they can be skipped.
612     typedef map<string, double> TSubjectCoverage;
613     TSubjectCoverage BestCoverage;
614     double MaxCoverage = 0;
615 
616     ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
617     ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
618     //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
619 
620         CRef<CSeq_align_set> Set = SubjectIter->second;
621         string IdStr = Set->Get().front()->GetSeq_id(1).AsFastaString();
622         double SubjCoverage = 0;
623         ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
624             double PctCov;
625             (*AlignIter)->GetNamedScore("pct_coverage", PctCov);
626             SubjCoverage = max(SubjCoverage, PctCov);
627         }
628         BestCoverage[IdStr] = SubjCoverage;
629         MaxCoverage = max(SubjCoverage, MaxCoverage);
630     }
631     }
632 
633     typedef vector<CRef<CInstance> > TInstVector;
634     ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
635     ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
636     //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
637 
638         TInstVector SubjInstances;
639 
640         CRef<CSeq_align_set> Set = SubjectIter->second;
641         string SubjIdStr = Set->Get().front()->GetSeq_id(1).AsFastaString();
642         if(BestCoverage[SubjIdStr] < (MaxCoverage*0.10)) {
643             continue; // Skip any subject id that has less than 10% the best pct_coverage
644         }
645 
646         // Group together alignments that are not contained within each other,
647         // but are within a 'small' gap distance of each other into initial
648         // instances
649         ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
650 
651             if((*AlignIter)->GetSegs().Which() != CSeq_align::C_Segs::e_Denseg)
652                 continue;
653 
654             bool Inserted = false;
655             bool Contained = false;
656             ITERATE(TInstVector, InstIter, SubjInstances) {
657                 bool CurrContained = (*InstIter)->IsAlignmentContained(**AlignIter);
658                 Contained |= CurrContained;
659             }
660             if(Contained)
661                 continue;
662             NON_CONST_ITERATE(TInstVector, InstIter, SubjInstances) {
663                 int GapDist = (*InstIter)->GapDistance(**AlignIter);
664                 if(GapDist < 20000) {
665                     (*InstIter)->MergeIn(*AlignIter);
666                     Inserted = true;
667                     break;
668                 }
669             }
670             if(!Inserted) {
671                 CRef<CInstance> Inst(new CInstance(*AlignIter));
672                 SubjInstances.push_back(Inst);
673             }
674         }
675 
676         // For these initial instances, run CAlignCleanup on the instance
677         //  CSeq_align_sets. Then make instances from CAlignCleanup results.
678         TInstVector CleanedInstances;
679 
680         ITERATE(TInstVector, InstIter, SubjInstances) {
681             // If the instance only has one alignment in it, skip it.
682             // we already have an identical BLAST result.
683             if((*InstIter)->Alignments.Get().size() <= 1)
684                 continue;
685 
686             CRef<CSeq_align_set> Cleaned;
687             Cleaned = x_RunCleanup((*InstIter)->Alignments, Scope);
688             if(!Cleaned.IsNull()) {
689                 ITERATE(CSeq_align_set::Tdata, AlignIter, Cleaned->Get()) {
690                     // If any of these cleaned alignments are dupes of existing alignments,
691                     // skip them
692                     bool DupeFound = false;
693                     ITERATE(CSeq_align_set::Tdata, SourceIter, (*InstIter)->Alignments.Get()) {
694                         if( (*AlignIter)->GetSeqStart(0) == (*SourceIter)->GetSeqStart(0) &&
695                             (*AlignIter)->GetSeqStart(1) == (*SourceIter)->GetSeqStart(1) &&
696                             (*AlignIter)->GetSeqStop(0) == (*SourceIter)->GetSeqStop(0) &&
697                             (*AlignIter)->GetSeqStop(1) == (*SourceIter)->GetSeqStop(1)) {
698                             DupeFound = true;
699                             break;
700                         }
701                     }
702                     if(DupeFound)
703                         continue;
704 
705                     // And if any of these cleaned alignments are contained in
706                     // established instances, throw them out too.
707                     bool Contained = false;
708                     ITERATE(TInstVector, CleanIter, CleanedInstances) {
709                         bool Curr = (*CleanIter)->IsAlignmentContained(**AlignIter);
710                         Contained |= Curr;
711                     }
712                     if(Contained)
713                         continue;
714 
715                     // Any Cleaned Seq_align that got this far is a potential instance.
716                     bool Dupe = false;
717                     CRef<CInstance> Inst(new CInstance(*AlignIter));
718                     ITERATE(vector<CRef<CInstance> >, OldInstIter, CleanedInstances) {
719                         Dupe |= ((*OldInstIter)->Query.Equals(Inst->Query)
720                               && (*OldInstIter)->Subject.Equals(Inst->Subject));
721                     }
722                     if(!Dupe)
723                         CleanedInstances.push_back(Inst);
724                 }
725             }
726         }
727 
728         copy(CleanedInstances.begin(), CleanedInstances.end(),
729             insert_iterator<TInstVector>(Instances, Instances.end()));
730     }
731     }
732 
733 }
734 
735 
x_FilterInstances(vector<CRef<CInstance>> & Instances,double MaxRatio)736 void CInstancedAligner::x_FilterInstances(vector<CRef<CInstance> >& Instances, double MaxRatio)
737 {
738     // Filters out Instances of overly lopsided ratios
739     vector<CRef<CInstance> >::iterator Curr;
740     Curr = Instances.begin();
741     for(Curr = Instances.begin(); Curr != Instances.end(); ) {
742         if( (*Curr)->SubjToQueryRatio() > MaxRatio ||
743             (*Curr)->SubjToQueryRatio() < 0.10 )
744             Curr = Instances.erase(Curr);
745         else
746             ++Curr;
747     }
748 
749     // Find the longest instance now
750     TSeqPos LongestInstance = 0;
751     for(Curr = Instances.begin(); Curr != Instances.end(); ++Curr) {
752         TSeqPos CurrLength = (*Curr)->QueryLength();
753         LongestInstance = max(CurrLength, LongestInstance);
754     }
755 
756     // Filters out instances that are too tiny to both with
757     for(Curr = Instances.begin(); Curr != Instances.end(); ) {
758         if( (*Curr)->QueryLength() <= (LongestInstance*0.05))
759             Curr = Instances.erase(Curr);
760         else
761             ++Curr;
762     }
763 
764     vector<CRef<CInstance> >::iterator Outer, Inner;
765     for(Outer = Instances.begin(); Outer != Instances.end(); ++Outer) {
766         for(Inner = Outer+1; Inner != Instances.end(); ) {
767             if( (*Outer)->Query.Equals((*Inner)->Query) &&
768                 (*Outer)->Subject.Equals((*Inner)->Subject) ) {
769                 Inner = Instances.erase(Inner);
770             } else {
771                 ++Inner;
772             }
773         }
774     }
775 
776 }
777 
778 
x_MinCoverageCheck(const CQuerySet & QueryAligns)779 bool CInstancedAligner::x_MinCoverageCheck(const CQuerySet& QueryAligns)
780 {
781     double BestPctCoverage = -1.0;
782 
783     ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
784     ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
785     //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
786 
787         CRef<CSeq_align_set> Set = SubjectIter->second;
788 
789         ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
790             double PctCoverage = -1.0;
791             (*AlignIter)->GetNamedScore("pct_coverage", PctCoverage);
792             BestPctCoverage = max(BestPctCoverage, PctCoverage);
793         }
794     }
795     }
796 
797 
798     return (BestPctCoverage >= m_MinPctCoverage || BestPctCoverage == -1.0);
799 }
800 
801 END_SCOPE(ncbi)
802 
803