1 /* $Id: banded_aligner.cpp 494142 2016-03-03 20:39:42Z whlavina $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Nathan Bouk
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiexpt.hpp>
34 #include <corelib/ncbi_system.hpp>
35 #include <corelib/ncbi_signal.hpp>
36 #include <math.h>
37 #include <algo/align/ngalign/banded_aligner.hpp>
38 //#include "align_instance.hpp"
39
40 #include <objects/seqloc/Seq_loc.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42 #include <objmgr/scope.hpp>
43 #include <algo/blast/api/blast_types.hpp>
44 #include <algo/blast/api/bl2seq.hpp>
45 #include <algo/blast/api/blast_options_handle.hpp>
46 #include <algo/blast/api/blast_nucl_options.hpp>
47 #include <algo/blast/api/disc_nucl_options.hpp>
48 #include <algo/align/util/score_builder.hpp>
49 #include <objects/seqalign/Seq_align.hpp>
50 #include <objects/seqalign/Seq_align_set.hpp>
51 #include <objects/seqalign/Dense_seg.hpp>
52 #include <objmgr/seq_vector.hpp>
53
54 #include <algo/blast/api/local_blast.hpp>
55 #include <algo/blast/blastinput/blastn_args.hpp>
56
57 #include <algo/align/contig_assembly/contig_assembly.hpp>
58 #include <algo/align/nw/nw_band_aligner.hpp>
59 #include <algo/align/nw/mm_aligner.hpp>
60
61 #include <algo/align/util/blast_tabular.hpp>
62 #include <algo/align/util/compartment_finder.hpp>
63 #include <algo/sequence/align_cleanup.hpp>
64
65
66 BEGIN_SCOPE(ncbi)
67 USING_SCOPE(objects);
68 USING_SCOPE(blast);
69
70
71
72
s_CoverageSeqLoc(TAlignSetRef Alignments,int Row,CScope & Scope)73 CRef<CSeq_loc> s_CoverageSeqLoc(TAlignSetRef Alignments, int Row, CScope& Scope)
74 {
75 CRef<CSeq_loc> Accum(new CSeq_loc);
76 ITERATE(CSeq_align_set::Tdata, Iter, Alignments->Get()) {
77
78 const CSeq_align& Curr = **Iter;
79 const CDense_seg& Denseg = Curr.GetSegs().GetDenseg();
80
81 size_t Dims = Denseg.GetDim();
82 size_t SegCount = Denseg.GetNumseg();
83 for(size_t CurrSeg = 0; CurrSeg < SegCount; ++CurrSeg) {
84 int Index = (Dims*CurrSeg)+Row;
85 int CurrStart = Denseg.GetStarts()[Index];
86 if( CurrStart != -1) {
87 CRef<CSeq_loc> CurrLoc(new CSeq_loc);
88 CurrLoc->SetInt().SetId().Assign( *Denseg.GetIds()[Row] );
89 CurrLoc->SetInt().SetFrom() = CurrStart;
90 CurrLoc->SetInt().SetTo() = CurrStart + Denseg.GetLens()[CurrSeg];
91 if(Denseg.CanGetStrands())
92 CurrLoc->SetInt().SetStrand() = Denseg.GetStrands()[Index];
93 Accum->SetMix().Set().push_back(CurrLoc);
94 }
95 }
96 }
97
98 CRef<CSeq_loc> Merged;
99 Merged = sequence::Seq_loc_Merge(*Accum, CSeq_loc::fSortAndMerge_All, &Scope);
100
101 return Merged;
102 }
103
104
s_CalcCoverageCount(TAlignSetRef Alignments,int Row,CScope & Scope)105 TSeqPos s_CalcCoverageCount(TAlignSetRef Alignments, int Row, CScope& Scope)
106 {
107
108 CRef<CSeq_loc> Merged;
109
110 Merged = s_CoverageSeqLoc(Alignments, Row, Scope);
111 //cout << MSerial_AsnText << *Merged;
112
113 Merged->ChangeToMix();
114 TSeqPos PosCoveredBases = 0, NegCoveredBases = 0;
115 ITERATE(CSeq_loc_mix::Tdata, LocIter, Merged->GetMix().Get()) {
116
117 // ITERATE(CPacked_seqint_Base::Tdata, IntIter, Merged->GetPacked_int().Get())
118 if( (*LocIter)->Which() == CSeq_loc::e_Int) {
119 if( (*LocIter)->GetInt().GetStrand() == eNa_strand_plus)
120 PosCoveredBases += (*LocIter)->GetInt().GetLength();
121 else
122 NegCoveredBases += (*LocIter)->GetInt().GetLength();
123 }
124 }
125 //cerr << "Covered: " << PosCoveredBases << " or " << NegCoveredBases
126 // << " len: " << m_Scope->GetBioseqHandle(*Merged->GetId()).GetBioseqLength() << endl;
127
128 return max(PosCoveredBases, NegCoveredBases);
129 }
130
131
132 ///////
133
134
135
GenerateAlignments(objects::CScope & Scope,ISequenceSet * QuerySet,ISequenceSet * SubjectSet,TAlignResultsRef AccumResults)136 TAlignResultsRef CInstancedAligner::GenerateAlignments(objects::CScope& Scope,
137 ISequenceSet* QuerySet,
138 ISequenceSet* SubjectSet,
139 TAlignResultsRef AccumResults)
140 {
141 TAlignResultsRef NewResults(new CAlignResultsSet);
142
143 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter,
144 AccumResults->Get()) {
145 int BestRank = QueryIter->second->GetBestRank();
146 if(BestRank > m_Threshold || BestRank == -1) {
147
148 _TRACE("Determined ID: "
149 << QueryIter->second->GetQueryId()->AsFastaString()
150 << " needs Instanced MM Aligner.");
151 if(!x_MinCoverageCheck(*QueryIter->second)) {
152 ERR_POST(Info << "ID: "
153 << QueryIter->second->GetQueryId()->AsFastaString()
154 << " fails the minimum percent coverage cutoff. Skipping.");
155 continue;
156 }
157 x_RunAligner(Scope, *QueryIter->second, NewResults);
158 }
159 }
160
161 return NewResults;
162 }
163
164
165
x_RunAligner(objects::CScope & Scope,CQuerySet & QueryAligns,TAlignResultsRef Results)166 void CInstancedAligner::x_RunAligner(objects::CScope& Scope,
167 CQuerySet& QueryAligns,
168 TAlignResultsRef Results)
169 {
170 CRef<CSeq_align_set> ResultSet(new CSeq_align_set);
171
172 vector<CRef<CInstance> > Instances;
173 x_GetCleanupInstances(QueryAligns, Scope, Instances);
174 x_GetDistanceInstances(QueryAligns, Scope, Instances);
175 x_FilterInstances(Instances, m_MaxRatio);
176 if(Instances.empty()) {
177 ERR_POST(Info << " No instances of " << QueryAligns.GetQueryId()->AsFastaString() << " found.");
178 return;
179 }
180 ERR_POST(Info << " Instance Count: " << Instances.size());
181
182
183 CRef<CSeq_align> Result(new CSeq_align);
184 ITERATE(vector<CRef<CInstance> >, InstIter, Instances) {
185 if (CSignal::IsSignaled()) {
186 NCBI_THROW(CException, eUnknown, "trapped signal");
187 }
188
189 const CInstance& Inst = **InstIter;
190
191 ERR_POST(Info << " Aligning " << Inst.Query.GetId().AsFastaString() << " to "
192 << Inst.Subject.GetId().AsFastaString());
193 ERR_POST(Info << " q:" << Inst.Query.GetFrom() << ":"
194 << Inst.Query.GetTo() << ":"
195 << (Inst.Query.GetStrand() == eNa_strand_plus ? "+" : "-")
196 << " and s: " << Inst.Subject.GetFrom() << ":"
197 << Inst.Subject.GetTo() << ":"
198 << (Inst.Subject.GetStrand() == eNa_strand_plus ? "+" : "-")
199 << " ratio: " << Inst.SubjToQueryRatio());
200
201 CRef<CDense_seg> GlobalDs;
202 try {
203 GlobalDs = x_RunMMGlobal(
204 Inst.Query.GetId(), Inst.Subject.GetId(),
205 Inst.Query.GetStrand(),
206 Inst.Query.GetFrom(), Inst.Query.GetTo(),
207 Inst.Subject.GetFrom(), Inst.Subject.GetTo(),
208 Scope);
209 } catch(CException& e) {
210 ERR_POST(Error << e.ReportAll());
211 ERR_POST(Error << "CInstancedBandedAligner failed.");
212 throw e;
213 }
214
215 GetDiagContext().Extra()
216 .Print("instance_query", Inst.Query.GetId().AsFastaString())
217 .Print("instance_subject", Inst.Subject.GetId().AsFastaString())
218 .Print("instance_align", (GlobalDs.IsNull() ? "false" : "true"));
219
220 if(GlobalDs.IsNull())
221 continue;
222
223 Result->SetSegs().SetDenseg().Assign(*GlobalDs);
224 Result->SetType() = CSeq_align::eType_partial;
225 Result->SetNamedScore(GetName(), 1);
226 ResultSet->Set().push_back(Result);
227
228 ERR_POST(Info << " Found alignment ");
229
230 /*
231 // CContigAssembly::BestLocalSubAlignmentrescores, and finds better-scoring
232 // sub-regions of the Denseg, and keeps only the subregion.
233 // The practical result is, if the alignment has a large gap near the edges
234 // that gap might be trimmed. If there is more matching region past the
235 // too-big gap, that matching region is lost. Sometimes we don't want that
236 // especially in the Instance case, because we trust the region so much
237 // So, (see below) if this function changes the Denseg, we return both, the
238 // trimmed and the untrimmed.
239 CRef<CDense_seg> LocalDs;
240 LocalDs = CContigAssembly::BestLocalSubAlignment(*GlobalDs, Scope);
241
242 Result->SetSegs().SetDenseg().Assign(*LocalDs);
243 Result->SetType() = CSeq_align::eType_partial;
244 ResultSet->Set().push_back(Result);
245
246 if(!LocalDs->Equals(*GlobalDs)) {
247 CRef<CSeq_align> Result(new CSeq_align);
248 Result->SetSegs().SetDenseg().Assign(*GlobalDs);
249 Result->SetType() = CSeq_align::eType_partial;
250 ResultSet->Set().push_back(Result);
251 }
252 */
253 }
254
255 if(!ResultSet->Get().empty()) {
256 Results->Insert(CRef<CQuerySet>(new CQuerySet(*ResultSet)));
257 }
258 }
259
260
261 struct SCallbackData
262 {
263 CTime StartTime;
264 int TimeOutSeconds;
265 size_t PreviousCount;
266 bool TimedOut;
267 };
268
269
s_ProgressCallback(CNWAligner::SProgressInfo * ProgressInfo)270 bool s_ProgressCallback(CNWAligner::SProgressInfo* ProgressInfo)
271 {
272 SCallbackData* CallbackData = (SCallbackData*)ProgressInfo->m_data;
273 CTime CurrTime(CTime::eCurrent);
274 CTimeSpan Span = (CurrTime-CallbackData->StartTime);
275
276 if(CallbackData->TimedOut) {
277 return true;
278 }
279
280 if(CallbackData->PreviousCount == ProgressInfo->m_iter_done ||
281 Span.GetAsDouble() < 1.0) {
282 CallbackData->PreviousCount = ProgressInfo->m_iter_done;
283 return false;
284 }
285
286 CallbackData->PreviousCount = ProgressInfo->m_iter_done;
287
288 // ERR_POST(Info << "Counts: " << ProgressInfo->m_iter_done << " of " << ProgressInfo->m_iter_total);
289
290 double PercentComplete = (double(ProgressInfo->m_iter_done)/ProgressInfo->m_iter_total);
291 double PercentRemaining = 1.0-PercentComplete;
292
293 double Factor = PercentRemaining/PercentComplete;
294
295
296 if( Span.GetCompleteSeconds() > CallbackData->TimeOutSeconds) {
297 ERR_POST(Error << " Instanced Aligner took over 5 minutes. Timed out.");
298 CallbackData->TimedOut = true;
299 return true;
300 }
301
302 double TimeEstimated = Span.GetAsDouble() * Factor;
303
304 if(TimeEstimated > CallbackData->TimeOutSeconds) {
305 ERR_POST(Error << " Instanced Aligner expected to take " << TimeEstimated
306 << " seconds. More than " << (CallbackData->TimeOutSeconds/60.0)
307 << " minutes. Terminating Early.");
308 CallbackData->TimedOut = true;
309 return true;
310 }
311
312
313 return false;
314 }
315
316
317
x_RunMMGlobal(const CSeq_id & QueryId,const CSeq_id & SubjectId,ENa_strand Strand,TSeqPos QueryStart,TSeqPos QueryStop,TSeqPos SubjectStart,TSeqPos SubjectStop,CScope & Scope)318 CRef<CDense_seg> CInstancedAligner::x_RunMMGlobal(const CSeq_id& QueryId,
319 const CSeq_id& SubjectId,
320 ENa_strand Strand,
321 TSeqPos QueryStart,
322 TSeqPos QueryStop,
323 TSeqPos SubjectStart,
324 TSeqPos SubjectStop,
325 CScope& Scope)
326 {
327
328 // ASSUMPTION: Query is short, Subject is long.
329 // We should be able to align all of Query against a subsection of Subject
330 // Banded Memory useage is Longer Sequence * Band Width * unit size (1)
331 // The common case is that Subject will be a very sizeable Scaffold.
332 // a bandwidth of anything more than 10-500 can pop RAM on a 16G machine.
333 // So we're gonna snip Subject to a subregion
334 // CMMAligner uses linear RAM, but we still use the restricted region because
335 // we know where the regions are.
336
337 CBioseq_Handle::EVectorStrand VecStrand;
338 VecStrand = (Strand == eNa_strand_plus ? CBioseq_Handle::eStrand_Plus
339 : CBioseq_Handle::eStrand_Minus);
340
341 CBioseq_Handle QueryHandle, SubjectHandle;
342 QueryHandle = Scope.GetBioseqHandle(QueryId);
343 SubjectHandle = Scope.GetBioseqHandle(SubjectId);
344
345 CSeqVector QueryVec, SubjectVec;
346 QueryVec = QueryHandle.GetSeqVector(CBioseq_Handle::eCoding_Iupac, VecStrand);
347 SubjectVec = SubjectHandle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
348
349
350 // CSeqVector, when making a minus vector, not only compliments the bases,
351 // flips the order to. But the start and stops provided are in Plus space.
352 // So we need to swap them to Minus space
353 TSeqPos QueryStrandedStart, QueryStrandedStop;
354 if(Strand == eNa_strand_plus) {
355 QueryStrandedStart = QueryStart;
356 QueryStrandedStop = QueryStop;
357 } else {
358 QueryStrandedStart = ( (QueryVec.size()-1) - QueryStop);
359 QueryStrandedStop = ( (QueryVec.size()-1) - QueryStart);
360 }
361
362 string QuerySeq, SubjectSeq;
363 QueryVec.GetSeqData(QueryStrandedStart, QueryStrandedStop+1, QuerySeq);
364 SubjectVec.GetSeqData(SubjectStart, SubjectStop+1, SubjectSeq);
365
366 /* ERR_POST(Info << " Query " << QueryStart << " to " << QueryStop << " against");
367 ERR_POST(Info << " QStranded " << QueryStrandedStart << " to " << QueryStrandedStop << " against");
368 ERR_POST(Info << " Subject " << SubjectStart << " to " << SubjectStop);
369 */
370
371 SCallbackData CallbackData;
372 CallbackData.StartTime = CTime(CTime::eCurrent);
373 CallbackData.TimeOutSeconds = m_TimeOutSeconds;
374 CallbackData.PreviousCount = 0;
375 CallbackData.TimedOut = false;
376
377 TSeqPos ExtractQueryStart = ((Strand == eNa_strand_plus) ? 0 : QuerySeq.size()-1);
378 CRef<CDense_seg> ResultDenseg;
379
380 CMMAligner Aligner(QuerySeq, SubjectSeq);
381 Aligner.SetWm(m_Match);
382 Aligner.SetWms(m_Mismatch);
383 Aligner.SetWg(m_GapOpen);
384 Aligner.SetWs(m_GapExtend);
385 Aligner.SetScoreMatrix(NULL);
386 Aligner.SetProgressCallback(s_ProgressCallback, (void*)&CallbackData);
387 try {
388 int Score;
389 Score = Aligner.Run();
390 if(Score == numeric_limits<int>::min()) {
391 return CRef<CDense_seg>();
392 }
393 ResultDenseg = Aligner.GetDense_seg(ExtractQueryStart, Strand, QueryId,
394 0, eNa_strand_plus, SubjectId);
395 } catch(CException& e) {
396 if(!CallbackData.TimedOut) {
397 ERR_POST(Error << "MMAligner: " << e.ReportAll());
398 }
399 return CRef<CDense_seg>();
400 }
401
402
403 if(ResultDenseg->GetNumseg() > 0) {
404 ResultDenseg->OffsetRow(0, QueryStart);
405 ResultDenseg->OffsetRow(1, SubjectStart);
406
407 // TrimEndGaps Segfaults on the only one segment which is a gap case
408 if(ResultDenseg->GetNumseg() == 1 &&
409 (ResultDenseg->GetStarts()[0] == -1 || ResultDenseg->GetStarts()[1] == -1)) {
410 return CRef<CDense_seg>();
411 } else {
412 ResultDenseg->TrimEndGaps();
413 }
414
415 } else {
416 return CRef<CDense_seg>();
417 }
418
419 return ResultDenseg;
420 }
421
422
423 CRef<objects::CSeq_align_set>
x_RunCleanup(const objects::CSeq_align_set & AlignSet,objects::CScope & Scope)424 CInstancedAligner::x_RunCleanup(const objects::CSeq_align_set& AlignSet,
425 objects::CScope& Scope)
426 {
427 CAlignCleanup Cleaner(Scope);
428 //Cleaner.FillUnaligned(true);
429
430 list<CConstRef<CSeq_align> > In;
431 ITERATE(CSeq_align_set::Tdata, AlignIter, AlignSet.Get()) {
432 CConstRef<CSeq_align> Align(*AlignIter);
433 In.push_back(Align);
434 }
435
436 CRef<CSeq_align_set> Out(new CSeq_align_set);
437
438 try {
439 Cleaner.Cleanup(In, Out->Set());
440 } catch(CException& e) {
441 ERR_POST(Error << "Cleanup Error: " << e.ReportAll());
442 throw e;
443 }
444
445 NON_CONST_ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
446 CDense_seg& Denseg = (*AlignIter)->SetSegs().SetDenseg();
447
448 if(!Denseg.CanGetStrands() || Denseg.GetStrands().empty()) {
449 Denseg.SetStrands().resize(Denseg.GetDim()*Denseg.GetNumseg(), eNa_strand_plus);
450 }
451
452 if(Denseg.GetSeqStrand(1) != eNa_strand_plus) {
453 Denseg.Reverse();
454 }
455 //CRef<CDense_seg> Filled = Denseg.FillUnaligned();
456 //Denseg.Assign(*Filled);
457 }
458
459 return Out;
460 }
461
462
x_GetCleanupInstances(CQuerySet & QueryAligns,CScope & Scope,vector<CRef<CInstance>> & Instances)463 void CInstancedAligner::x_GetCleanupInstances(CQuerySet& QueryAligns, CScope& Scope,
464 vector<CRef<CInstance> >& Instances)
465 {
466
467 ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
468 ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
469 //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
470
471 CRef<CSeq_align_set> Set = SubjectIter->second;
472 CRef<CSeq_align_set> Out;
473
474 CRef<CSeq_align_set> Pluses(new CSeq_align_set),
475 Minuses(new CSeq_align_set);
476
477 ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
478 if( (*AlignIter)->GetSeqStrand(0) == eNa_strand_plus)
479 Pluses->Set().push_back(*AlignIter);
480 else if( (*AlignIter)->GetSeqStrand(0) == eNa_strand_minus)
481 Minuses->Set().push_back(*AlignIter);
482 }
483
484 if(!Pluses->Set().empty()) {
485 Out = x_RunCleanup(*Pluses, Scope);
486 if(!Out.IsNull()) {
487 ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
488 CRef<CInstance> Inst(new CInstance(*AlignIter));
489 Instances.push_back(Inst);
490 }
491 }
492 }
493 if(!Minuses->Set().empty()) {
494 Out = x_RunCleanup(*Minuses, Scope);
495 if(!Out.IsNull()) {
496 ITERATE(CSeq_align_set::Tdata, AlignIter, Out->Set()) {
497 CRef<CInstance> Inst(new CInstance(*AlignIter));
498 Instances.push_back(Inst);
499 }
500 }
501 }
502 }
503 }
504
505 }
506
507
508
CInstance(CRef<CSeq_align> Align)509 CInstance::CInstance(CRef<CSeq_align> Align)
510 {
511 Query.SetId().Assign(Align->GetSeq_id(0));
512 Subject.SetId().Assign(Align->GetSeq_id(1));
513
514 Query.SetStrand() = Align->GetSeqStrand(0);
515 Subject.SetStrand() = Align->GetSeqStrand(1);
516
517 Query.SetFrom() = Align->GetSeqStart(0);
518 Subject.SetFrom() = Align->GetSeqStart(1);
519
520 Query.SetTo() = Align->GetSeqStop(0);
521 Subject.SetTo() = Align->GetSeqStop(1);
522
523 Alignments.Set().push_back(Align);
524 }
525
526
CInstance(const CSeq_align_set & AlignSet)527 CInstance::CInstance(const CSeq_align_set& AlignSet)
528 {
529 Query.SetId().Assign(AlignSet.Get().front()->GetSeq_id(0));
530 Subject.SetId().Assign(AlignSet.Get().front()->GetSeq_id(1));
531
532 Query.SetStrand() = AlignSet.Get().front()->GetSeqStrand(0);
533 Subject.SetStrand() = AlignSet.Get().front()->GetSeqStrand(1);
534
535 Query.SetFrom() = numeric_limits<TSeqPos>::max();
536 Subject.SetFrom() = numeric_limits<TSeqPos>::max();
537
538 Query.SetTo() = numeric_limits<TSeqPos>::min();
539 Subject.SetTo() = numeric_limits<TSeqPos>::min();
540
541 ITERATE(CSeq_align_set::Tdata, AlignIter, AlignSet.Get()) {
542 Query.SetFrom(min(Query.GetFrom(), (*AlignIter)->GetSeqStart(0)));
543 Subject.SetFrom(min(Subject.GetFrom(), (*AlignIter)->GetSeqStart(1)));
544
545 Query.SetTo(max(Query.GetTo(), (*AlignIter)->GetSeqStop(0)));
546 Subject.SetTo(max(Subject.GetTo(), (*AlignIter)->GetSeqStop(1)));
547 }
548 }
549
550
IsAlignmentContained(const CSeq_align & Align) const551 bool CInstance::IsAlignmentContained(const CSeq_align& Align) const
552 {
553 if(Query.GetStrand() != Align.GetSeqStrand(0) ||
554 Subject.GetStrand() != Align.GetSeqStrand(1))
555 return false;
556
557 if( Align.GetSeqStart(0) >= Query.GetFrom() &&
558 Align.GetSeqStop(0) <= Query.GetTo() &&
559 Align.GetSeqStart(1) >= Subject.GetFrom() &&
560 Align.GetSeqStop(1) <= Subject.GetTo()) {
561 return true;
562 }
563 else {
564 return false;
565 }
566 }
567
568
GapDistance(const CSeq_align & Align) const569 int CInstance::GapDistance(const CSeq_align& Align) const
570 {
571 int LongestGap = 0;
572 LongestGap = max((int)Align.GetSeqStart(0)-(int)Query.GetTo(), LongestGap);
573 LongestGap = max((int)Align.GetSeqStart(1)-(int)Subject.GetTo(), LongestGap);
574 LongestGap = max((int)Query.GetFrom()-(int)Align.GetSeqStop(0), LongestGap);
575 LongestGap = max((int)Subject.GetFrom()-(int)Align.GetSeqStop(1), LongestGap);
576 // cerr << "Longest Gap: " << LongestGap << endl;
577 return LongestGap;
578 }
579
580
SubjToQueryRatio() const581 double CInstance::SubjToQueryRatio() const
582 {
583 return (Subject.GetLength() / double(Query.GetLength()));
584 }
585
586
QueryLength() const587 TSeqPos CInstance::QueryLength() const
588 {
589 return Query.GetLength();
590 }
591
592
MergeIn(const CRef<CSeq_align> Align)593 void CInstance::MergeIn(const CRef<CSeq_align> Align)
594 {
595 Query.SetFrom(min(Query.GetFrom(), Align->GetSeqStart(0)));
596 Subject.SetFrom(min(Subject.GetFrom(), Align->GetSeqStart(1)));
597
598 Query.SetTo(max(Query.GetTo(), Align->GetSeqStop(0)));
599 Subject.SetTo(max(Subject.GetTo(), Align->GetSeqStop(1)));
600
601 Alignments.Set().push_back(Align);
602 }
603
x_GetDistanceInstances(CQuerySet & QueryAligns,CScope & Scope,vector<CRef<CInstance>> & Instances)604 void CInstancedAligner::x_GetDistanceInstances(CQuerySet& QueryAligns, CScope& Scope,
605 vector<CRef<CInstance> >& Instances)
606 {
607 typedef pair<CRef<CInstance>, CRef<CSeq_align_set> > TInstPair;
608
609 // Identify the best pct_coverage for every Subject Id.
610 // This will be used to filter out subjects with relatively low
611 // best pct_coverage, so that they can be skipped.
612 typedef map<string, double> TSubjectCoverage;
613 TSubjectCoverage BestCoverage;
614 double MaxCoverage = 0;
615
616 ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
617 ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
618 //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
619
620 CRef<CSeq_align_set> Set = SubjectIter->second;
621 string IdStr = Set->Get().front()->GetSeq_id(1).AsFastaString();
622 double SubjCoverage = 0;
623 ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
624 double PctCov;
625 (*AlignIter)->GetNamedScore("pct_coverage", PctCov);
626 SubjCoverage = max(SubjCoverage, PctCov);
627 }
628 BestCoverage[IdStr] = SubjCoverage;
629 MaxCoverage = max(SubjCoverage, MaxCoverage);
630 }
631 }
632
633 typedef vector<CRef<CInstance> > TInstVector;
634 ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
635 ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
636 //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
637
638 TInstVector SubjInstances;
639
640 CRef<CSeq_align_set> Set = SubjectIter->second;
641 string SubjIdStr = Set->Get().front()->GetSeq_id(1).AsFastaString();
642 if(BestCoverage[SubjIdStr] < (MaxCoverage*0.10)) {
643 continue; // Skip any subject id that has less than 10% the best pct_coverage
644 }
645
646 // Group together alignments that are not contained within each other,
647 // but are within a 'small' gap distance of each other into initial
648 // instances
649 ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
650
651 if((*AlignIter)->GetSegs().Which() != CSeq_align::C_Segs::e_Denseg)
652 continue;
653
654 bool Inserted = false;
655 bool Contained = false;
656 ITERATE(TInstVector, InstIter, SubjInstances) {
657 bool CurrContained = (*InstIter)->IsAlignmentContained(**AlignIter);
658 Contained |= CurrContained;
659 }
660 if(Contained)
661 continue;
662 NON_CONST_ITERATE(TInstVector, InstIter, SubjInstances) {
663 int GapDist = (*InstIter)->GapDistance(**AlignIter);
664 if(GapDist < 20000) {
665 (*InstIter)->MergeIn(*AlignIter);
666 Inserted = true;
667 break;
668 }
669 }
670 if(!Inserted) {
671 CRef<CInstance> Inst(new CInstance(*AlignIter));
672 SubjInstances.push_back(Inst);
673 }
674 }
675
676 // For these initial instances, run CAlignCleanup on the instance
677 // CSeq_align_sets. Then make instances from CAlignCleanup results.
678 TInstVector CleanedInstances;
679
680 ITERATE(TInstVector, InstIter, SubjInstances) {
681 // If the instance only has one alignment in it, skip it.
682 // we already have an identical BLAST result.
683 if((*InstIter)->Alignments.Get().size() <= 1)
684 continue;
685
686 CRef<CSeq_align_set> Cleaned;
687 Cleaned = x_RunCleanup((*InstIter)->Alignments, Scope);
688 if(!Cleaned.IsNull()) {
689 ITERATE(CSeq_align_set::Tdata, AlignIter, Cleaned->Get()) {
690 // If any of these cleaned alignments are dupes of existing alignments,
691 // skip them
692 bool DupeFound = false;
693 ITERATE(CSeq_align_set::Tdata, SourceIter, (*InstIter)->Alignments.Get()) {
694 if( (*AlignIter)->GetSeqStart(0) == (*SourceIter)->GetSeqStart(0) &&
695 (*AlignIter)->GetSeqStart(1) == (*SourceIter)->GetSeqStart(1) &&
696 (*AlignIter)->GetSeqStop(0) == (*SourceIter)->GetSeqStop(0) &&
697 (*AlignIter)->GetSeqStop(1) == (*SourceIter)->GetSeqStop(1)) {
698 DupeFound = true;
699 break;
700 }
701 }
702 if(DupeFound)
703 continue;
704
705 // And if any of these cleaned alignments are contained in
706 // established instances, throw them out too.
707 bool Contained = false;
708 ITERATE(TInstVector, CleanIter, CleanedInstances) {
709 bool Curr = (*CleanIter)->IsAlignmentContained(**AlignIter);
710 Contained |= Curr;
711 }
712 if(Contained)
713 continue;
714
715 // Any Cleaned Seq_align that got this far is a potential instance.
716 bool Dupe = false;
717 CRef<CInstance> Inst(new CInstance(*AlignIter));
718 ITERATE(vector<CRef<CInstance> >, OldInstIter, CleanedInstances) {
719 Dupe |= ((*OldInstIter)->Query.Equals(Inst->Query)
720 && (*OldInstIter)->Subject.Equals(Inst->Subject));
721 }
722 if(!Dupe)
723 CleanedInstances.push_back(Inst);
724 }
725 }
726 }
727
728 copy(CleanedInstances.begin(), CleanedInstances.end(),
729 insert_iterator<TInstVector>(Instances, Instances.end()));
730 }
731 }
732
733 }
734
735
x_FilterInstances(vector<CRef<CInstance>> & Instances,double MaxRatio)736 void CInstancedAligner::x_FilterInstances(vector<CRef<CInstance> >& Instances, double MaxRatio)
737 {
738 // Filters out Instances of overly lopsided ratios
739 vector<CRef<CInstance> >::iterator Curr;
740 Curr = Instances.begin();
741 for(Curr = Instances.begin(); Curr != Instances.end(); ) {
742 if( (*Curr)->SubjToQueryRatio() > MaxRatio ||
743 (*Curr)->SubjToQueryRatio() < 0.10 )
744 Curr = Instances.erase(Curr);
745 else
746 ++Curr;
747 }
748
749 // Find the longest instance now
750 TSeqPos LongestInstance = 0;
751 for(Curr = Instances.begin(); Curr != Instances.end(); ++Curr) {
752 TSeqPos CurrLength = (*Curr)->QueryLength();
753 LongestInstance = max(CurrLength, LongestInstance);
754 }
755
756 // Filters out instances that are too tiny to both with
757 for(Curr = Instances.begin(); Curr != Instances.end(); ) {
758 if( (*Curr)->QueryLength() <= (LongestInstance*0.05))
759 Curr = Instances.erase(Curr);
760 else
761 ++Curr;
762 }
763
764 vector<CRef<CInstance> >::iterator Outer, Inner;
765 for(Outer = Instances.begin(); Outer != Instances.end(); ++Outer) {
766 for(Inner = Outer+1; Inner != Instances.end(); ) {
767 if( (*Outer)->Query.Equals((*Inner)->Query) &&
768 (*Outer)->Subject.Equals((*Inner)->Subject) ) {
769 Inner = Instances.erase(Inner);
770 } else {
771 ++Inner;
772 }
773 }
774 }
775
776 }
777
778
x_MinCoverageCheck(const CQuerySet & QueryAligns)779 bool CInstancedAligner::x_MinCoverageCheck(const CQuerySet& QueryAligns)
780 {
781 double BestPctCoverage = -1.0;
782
783 ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryAligns.Get()) {
784 ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
785 //ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
786
787 CRef<CSeq_align_set> Set = SubjectIter->second;
788
789 ITERATE(CSeq_align_set::Tdata, AlignIter, Set->Get()) {
790 double PctCoverage = -1.0;
791 (*AlignIter)->GetNamedScore("pct_coverage", PctCoverage);
792 BestPctCoverage = max(BestPctCoverage, PctCoverage);
793 }
794 }
795 }
796
797
798 return (BestPctCoverage >= m_MinPctCoverage || BestPctCoverage == -1.0);
799 }
800
801 END_SCOPE(ncbi)
802
803