1 /*  $Id: alignment_scorer.cpp 450210 2014-10-23 16:04:54Z boukn $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Nathan Bouk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiexpt.hpp>
34 #include <corelib/ncbi_system.hpp>
35 #include <math.h>
36 
37 #include <objects/seqloc/Seq_loc.hpp>
38 #include <objects/seqloc/Seq_id.hpp>
39 #include <objmgr/scope.hpp>
40 #include <objmgr/annot_selector.hpp>
41 #include <objmgr/annot_ci.hpp>
42 #include <algo/blast/api/blast_types.hpp>
43 #include <algo/blast/api/bl2seq.hpp>
44 #include <algo/blast/api/blast_options_handle.hpp>
45 #include <algo/blast/api/blast_nucl_options.hpp>
46 #include <algo/blast/api/disc_nucl_options.hpp>
47 #include <algo/align/util/score_builder.hpp>
48 #include <objects/seqalign/Seq_align.hpp>
49 #include <objects/seqalign/Seq_align_set.hpp>
50 #include <objects/seqalign/Dense_seg.hpp>
51 #include <objects/seq/Seq_ext.hpp>
52 #include <objects/seq/Delta_ext.hpp>
53 #include <objects/seq/Delta_seq.hpp>
54 #include <objects/seq/Seq_literal.hpp>
55 #include <objmgr/seq_vector.hpp>
56 #include <objects/seqalign/Score.hpp>
57 
58 #include <objects/seq/Seq_descr.hpp>
59 #include <objects/seq/Seqdesc.hpp>
60 #include <objects/general/User_object.hpp>
61 #include <objects/general/User_field.hpp>
62 #include <objects/seq/Seq_hist.hpp>
63 #include <objmgr/util/sequence.hpp>
64 
65 #include <algo/blast/api/local_blast.hpp>
66 #include <algo/blast/blastinput/blastn_args.hpp>
67 
68 
69 
70 #include <algo/align/ngalign/alignment_scorer.hpp>
71 
72 BEGIN_SCOPE(ncbi)
73 USING_SCOPE(objects);
74 USING_SCOPE(blast);
75 
76 static const Int8 kEffectiveSearchSpace = NCBI_CONST_INT8(1050668186940);
77 
s_ScoreAlignments(const CSeq_align_set & alignments,CScope & Scope,CScoreBuilder & Scorer,bool skip_unsupported)78 static void s_ScoreAlignments(const CSeq_align_set &alignments,
79                               CScope& Scope, CScoreBuilder &Scorer,
80                               bool skip_unsupported)
81 {
82     ITERATE(CSeq_align_set::Tdata, Iter, alignments.Get()) {
83         CRef<CSeq_align> Curr(*Iter);
84 
85         double DummyScore;
86 
87         if ( Curr->GetSegs().IsDenseg()  ||  ! skip_unsupported) {
88             if(!Curr->GetNamedScore(CSeq_align::eScore_Blast, DummyScore)) {
89                 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_Blast);
90             }
91 
92             if(!Curr->GetNamedScore(CSeq_align::eScore_BitScore, DummyScore)) {
93                 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_BitScore);
94             }
95 
96             if(!Curr->GetNamedScore(CSeq_align::eScore_EValue, DummyScore)) {
97                 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_EValue);
98             }
99         }
100 
101         if(!Curr->GetNamedScore(CSeq_align::eScore_IdentityCount, DummyScore)) {
102             Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_IdentityCount);
103         }
104 
105         if (Curr->GetSegs().Which() == CSeq_align::C_Segs::e_Disc) {
106             /// Compute score for the components of a Disc-seg alignment
107             s_ScoreAlignments(Curr->GetSegs().GetDisc(), Scope, Scorer,
108                               skip_unsupported);
109         }
110     }
111 }
112 
113 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)114 void CBlastScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
115 {
116     bool skip_unsupported = m_Options & eSkipUnsupportedAlignments;
117     CScoreBuilder Scorer(blast::eMegablast);
118     Scorer.SetEffectiveSearchSpace (kEffectiveSearchSpace);
119 
120     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
121         NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
122             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
123             //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
124                 s_ScoreAlignments(*SubjectIter->second, Scope, Scorer,
125                                   skip_unsupported);
126             }
127         }
128     }
129 }
130 
131 
132 
133 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)134 void CPctIdentScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
135 {
136     //CScoreBuilder Scorer(*Options);
137     //Scorer.SetEffectiveSearchSpace(Options->GetEffectiveSearchSpace());
138     CScoreBuilder Scorer(blast::eMegablast);
139 
140     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
141         NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
142             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
143             //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
144 
145                 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
146 
147                     CRef<CSeq_align> Curr(*Iter);
148 
149                     //Scorer.AddScore(Scope, *Curr, CScoreBuilder::eScore_PercentIdentity);
150                     Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_PercentIdentity_Gapped);
151                     Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_PercentIdentity_Ungapped);
152                     Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_PercentIdentity_GapOpeningOnly);
153                 }
154             }
155         }
156     }
157 }
158 
159 
160 
161 
162 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)163 void CPctCoverageScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
164 {
165     //CScoreBuilder Scorer(*Options);
166     //Scorer.SetEffectiveSearchSpace(Options->GetEffectiveSearchSpace());
167     CScoreBuilder Scorer(blast::eMegablast);
168 
169     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
170         NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
171             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
172             //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
173 
174                 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
175 
176                     CRef<CSeq_align> Curr(*Iter);
177 
178                     Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_PercentCoverage);
179                 }
180             }
181         }
182     }
183 }
184 
185 
186 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)187 void CExpansionScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
188 {
189     CScoreBuilder Scorer;
190 
191     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
192         NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
193             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
194             //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
195 
196                 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
197 
198                     CRef<CSeq_align> Curr(*Iter);
199 
200                     TSeqPos AlignLen, AlignedLen;
201                     AlignedLen = Scorer.GetAlignLength(*Curr, true);
202                     AlignLen = Curr->GetSeqRange(1).GetLength();
203 
204                     Curr->SetNamedScore("expansion", double(AlignLen)/double(AlignedLen) );
205                 }
206             }
207         }
208     }
209 }
210 
211 
212 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)213 void CWeightedIdentityScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
214 {
215     CScoreBuilder Scorer;
216 
217     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
218         NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
219             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
220             //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
221 
222                 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
223 
224                     CRef<CSeq_align> Curr(*Iter);
225 
226 					CSeq_id QueryId;
227                     QueryId.Assign(Curr->GetSeq_id(0));
228                     CBioseq_Handle QueryHandle = Scope.GetBioseqHandle(QueryId);
229 
230 					double WeightedIdent;
231                     double NumIdent, Matches, QueryLen;
232 
233                     Curr->GetNamedScore(CSeq_align::eScore_IdentityCount, NumIdent);
234                     Matches = Curr->GetAlignLength(false);
235                     QueryLen = QueryHandle.GetInst_Length();
236 
237                     WeightedIdent = (NumIdent + (Matches * m_K)) / (Matches + (QueryLen * m_K));
238                     Curr->SetNamedScore("weighted_identity", WeightedIdent);
239                 }
240             }
241         }
242     }
243 }
244 
245 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)246 void CHangScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
247 {
248     CScoreBuilder Scorer(blast::eMegablast);
249 
250     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
251         CSeq_id QueryId;
252 		QueryId.Set(QueryIter->first);
253 		CBioseq_Handle Handle = Scope.GetBioseqHandle(QueryId);
254 		if(!Handle)
255 			continue;
256 		TSeqPos QueryLen = Handle.GetInst_Length();
257 
258 		NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
259             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
260 				CSeq_id TargetId;
261 				TargetId.Set(SubjectIter->first);
262 				CBioseq_Handle Handle = Scope.GetBioseqHandle(TargetId);
263 				if (!Handle || !Handle.CanGetInst_Ext() ||
264 					!Handle.GetInst_Ext().IsDelta()) {
265 					continue;
266 				}
267 				TSeqPos TargetLen = Handle.GetInst_Length();
268 				const CDelta_ext& DeltaExt = Handle.GetInst_Ext().GetDelta();
269 				// Get Gaps on Target
270 				list<TSeqRange> Gaps;
271 				TSeqPos CurrStart = 0;
272 				ITERATE(list<CRef<CDelta_seq> >, DeltaIter, DeltaExt.Get()) {
273 					if( (*DeltaIter)->IsLiteral() ) {
274 						TSeqPos Len = (*DeltaIter)->GetLiteral().GetLength();
275 						TSeqRange Gap(CurrStart, CurrStart+Len-1);
276 						Gaps.push_back(Gap);
277 						CurrStart += Len;
278 					} else if( (*DeltaIter)->IsLoc() ) {
279 						TSeqPos Len = (*DeltaIter)->GetLoc().GetInt().GetLength();
280 						TSeqPos CurrStop = CurrStart + Len - 1;
281 						CurrStart = CurrStop + 1;
282 					}
283 				}
284 
285                 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
286 
287                     CRef<CSeq_align> Curr(*Iter);
288 
289 					TSeqRange TargetRange = Curr->GetSeqRange(1);
290 					if(TargetRange.GetLength() == QueryLen) {
291 						Curr->SetNamedScore("hangs", 0);
292 						continue;
293 					}
294 
295 					// the ranges will normally be next to gaps, not intersect gaps
296 					// so expand the aligned range by 1 in both directions
297 					if(TargetRange.GetFrom() > 0)
298 						TargetRange.SetFrom(TargetRange.GetFrom()-1);
299 					TargetRange.SetTo(TargetRange.GetTo()+1);
300 
301 					int Hangs = 0;
302 					// check internal gaps
303 					ITERATE(list<TSeqRange>, GapIter, Gaps) {
304 						if(GapIter->IntersectingWith(TargetRange)) {
305 							Hangs = 1;
306 							break;
307 						}
308 					}
309 					// check the edges
310 					if(TargetRange.GetFrom() == 0)
311 						Hangs = 1;
312 					if(TargetRange.GetTo() >= (TargetLen-1))
313 						Hangs = 1;
314 					Curr->SetNamedScore("hangs", Hangs);
315                	}
316             }
317         }
318     }
319 }
320 
321 
322 
323 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)324 void COverlapScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
325 {
326     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
327         NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
328             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
329             //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
330 
331                 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
332 
333                     CRef<CSeq_align> Curr(*Iter);
334 
335                     if(!Curr->GetSegs().IsDenseg())
336                         continue;
337 
338                     if(Curr->GetSegs().GetDenseg().GetDim() != 2)
339                         continue;
340 
341                     vector<pair<TSeqPos, TSeqPos> > Tails;
342 
343                     for(int Row = 0;  Row < Curr->GetSegs().GetDenseg().GetDim(); ++Row) {
344                         TSeqPos Start = Curr->GetSeqStart(Row);
345                         TSeqPos Stop  = Curr->GetSeqStop(Row);
346 
347                         TSeqPos SeqLen = Scope.GetBioseqHandle(Curr->GetSeq_id(Row)).GetInst_Length();
348 
349                         pair<TSeqPos, TSeqPos> Pair;
350                         if(Curr->GetSeqStrand(Row) == eNa_strand_plus) {
351                             Pair.first = Start;
352                             Pair.second = SeqLen - Stop - 1;
353                         } else {
354                             Pair.second = Start;
355                             Pair.first = SeqLen - Stop - 1;
356                         }
357                         Tails.push_back(Pair);
358                     }
359 
360                     bool Full, Half;
361                     int Contained;
362                     TSeqPos TailLen;
363 
364                     Full = (Tails[0].second <= m_Slop && Tails[1].first <= m_Slop)
365                         || (Tails[0].first <= m_Slop && Tails[1].second <= m_Slop);
366 
367                     if(Tails[0].first <= m_Slop && Tails[0].second <= m_Slop)
368                         Contained = 0;
369                     else if(Tails[1].first <= m_Slop && Tails[1].second <= m_Slop)
370                         Contained = 1;
371                     else
372                         Contained = -1;
373 
374                     bool Forewards;
375                     Forewards = (Tails[0].first > Tails[0].second)
376                              && (Tails[1].first < Tails[1].second);
377 
378                     if(Forewards) {
379                         Half = Tails[0].second <= m_Slop || Tails[1].first <= m_Slop;
380                         TailLen = max(Tails[0].second, Tails[1].first);
381                     }
382                     else {
383                         Half = Tails[0].first <= m_Slop || Tails[1].second <= m_Slop;
384                         TailLen = max(Tails[0].first, Tails[1].second);
385                     }
386 
387                     Curr->SetNamedScore("full_dovetail", (int)Full);
388                     Curr->SetNamedScore("half_dovetail", (int)Half);
389                     Curr->SetNamedScore("contained", Contained);
390                     Curr->SetNamedScore("tail_length", (int)TailLen);
391                     //Curr->SetNamedScore("align_length_ungap", (int)Curr->GetAlignLength(false));
392                 }
393             }
394         }
395     }
396 }
397 
398 
399 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)400 void CClippedScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
401 {
402     CScoreBuilder Scorer;
403 
404     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
405         NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
406             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
407             //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
408 
409                 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
410 
411                     CRef<CSeq_align> Curr(*Iter);
412 
413                     if(!Curr->GetSegs().IsDenseg())
414                         continue;
415 
416                     CSeq_id EndId;
417                     EndId.Assign(Curr->GetSeq_id(0));
418 
419                     CBioseq_Handle EndHandle = Scope.GetBioseqHandle(EndId);
420 
421                     // Extract the Seq-annot.locs, for the clip region.
422                     CSeq_loc ClipLoc;
423                     ClipLoc.SetEmpty();
424 
425                     SAnnotSelector Sel(CSeqFeatData::e_Region);
426                     CAnnot_CI AnnotIter(EndHandle, Sel);
427 
428                     while(AnnotIter) {
429                         if (AnnotIter->IsFtable() &&
430                             AnnotIter->GetName() == "NCBI_GPIPE") {
431                             CConstRef<CSeq_annot> Annot = AnnotIter->GetCompleteSeq_annot();
432 
433                             ITERATE(CSeq_annot::C_Data::TFtable, FeatIter,
434                                     Annot->GetData().GetFtable()) {
435                                 CConstRef<CSeq_feat> Feat = *FeatIter;
436                                 if (Feat->CanGetLocation() &&
437                                     Feat->CanGetData() &&
438                                     Feat->GetData().IsRegion() &&
439                                     (Feat->GetData().GetRegion() == "high_quality" ||
440                                      Feat->GetData().GetRegion() == "hight_quality") ) {
441 
442                                     ClipLoc.Assign(Feat->GetLocation());
443                                 }
444                             }
445                         }
446                         ++AnnotIter;
447                     }
448 
449 
450                     if(ClipLoc.IsEmpty() && EndHandle.HasAnnots() ) {
451                         CConstRef<CBioseq> EndBioseq = EndHandle.GetCompleteBioseq();
452                         ITERATE(CBioseq::TAnnot, AnnotIter, EndBioseq->GetAnnot()) {
453                             if( (*AnnotIter)->GetData().IsLocs() ) {
454                                 ITERATE(CSeq_annot::C_Data::TLocs,
455                                         LocIter, (*AnnotIter)->GetData().GetLocs()) {
456                                     if( (*LocIter)->IsInt() &&
457                                         (*LocIter)->GetInt().GetId().Equals(EndId))
458                                         ClipLoc.Assign(**LocIter);
459                                 }
460                             }
461                         }
462                     }
463 
464                     if(ClipLoc.IsEmpty())
465                         continue;
466 
467 
468                     CRef<CSeq_id> ClipId(new CSeq_id);
469                     ClipId->SetLocal().SetStr("CLIPPED_SCORER_ID_"+EndId.GetSeqIdString(true) );
470 
471                     {{
472                         CBioseq_Handle ClipHandle;
473 
474                         try {
475                             ClipHandle = Scope.GetBioseqHandle(*ClipId);
476                         } catch(...) {
477                             ;
478                         }
479                         // if this sequence hasn't been added to the scope yet, add it
480                         if(!ClipHandle) {
481                             CRef<CBioseq> ClipBioseq(new CBioseq);
482                             ClipBioseq->SetId().push_back(ClipId);
483                             CRef<CDelta_seq> ClipDelta(new CDelta_seq);
484                             ClipDelta->SetLoc().Assign(ClipLoc);
485                             ClipBioseq->SetInst().SetExt().SetDelta().Set().push_back(ClipDelta);
486                             ClipBioseq->SetInst().SetRepr() = CSeq_inst::eRepr_virtual;
487                             ClipBioseq->SetInst().SetMol() = CSeq_inst::eMol_dna;
488                             ClipBioseq->SetInst().SetLength() = ClipLoc.GetInt().GetLength();
489                             ClipHandle = Scope.AddBioseq(*ClipBioseq);
490                         }
491                     }}
492 
493 
494                     // Fudge the New alignment.
495                     CRef<CSeq_align> ClipAlign(new CSeq_align);
496                     try {
497                         ClipAlign->Assign(*Curr);
498                         CRef<CDense_seg> ClipDenseg;
499                         CRange<TSeqPos> ClipRange(ClipLoc.GetInt().GetFrom(), ClipLoc.GetInt().GetTo());
500                         CRange<TSeqPos> AlignRange = ClipAlign->GetSeqRange(0);
501                         CRange<TSeqPos> Intersect = ClipRange.IntersectionWith(AlignRange);
502                         if(Intersect.Empty())
503                             continue;
504 
505                         ClipDenseg = ClipAlign->GetSegs().GetDenseg().ExtractSlice(0,
506                                         Intersect.GetFrom(), Intersect.GetTo() );
507                         ClipDenseg->SetIds().front()->Assign(*ClipId);
508                         ClipDenseg->OffsetRow(0, -(int)ClipLoc.GetInt().GetFrom());
509                         ClipAlign->SetSegs().SetDenseg().Assign(*ClipDenseg);
510                         ClipAlign->SetScore().clear();
511                     } catch(CException& e) {
512                         ERR_POST(Error << "Make Clip Align exception: " << e.ReportAll());
513                         ERR_POST("Probably means the clip region is invalid.");
514                         throw e;
515                     }
516 
517 
518                     double Temp;
519 
520                     Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_IdentityCount);
521                     ClipAlign->GetNamedScore(CSeq_align::eScore_IdentityCount, Temp);
522                     Curr->SetNamedScore("num_ident_clip", Temp);
523 
524                     Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_PercentIdentity_Gapped);
525                     Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_PercentIdentity_Ungapped);
526                     Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_PercentIdentity_GapOpeningOnly);
527                     ClipAlign->GetNamedScore(CSeq_align::eScore_PercentIdentity_GapOpeningOnly, Temp);
528                     Curr->SetNamedScore("pct_identity_gapopen_only_clip", Temp);
529 
530                     Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_PercentCoverage);
531                     ClipAlign->GetNamedScore(CSeq_align::eScore_PercentCoverage, Temp);
532                     Curr->SetNamedScore("pct_coverage_clip", Temp);
533 
534                     TSeqPos AlignLen = 0;
535                     TSeqPos AlignedLen = 0;
536                     AlignedLen = Scorer.GetAlignLength(*ClipAlign, true);
537                     try {
538                         // Sometimes the clipped region ends up being entirely gap, on the subject side.
539                         AlignLen = ClipAlign->GetSeqRange(1).GetLength();
540                         ClipAlign->SetNamedScore("expansion", double(AlignLen)/double(AlignedLen) );
541                         ClipAlign->GetNamedScore("expansion", Temp);
542                         Curr->SetNamedScore("expansion_clip", Temp);
543                     } catch(CException& e) {
544                         ERR_POST(Error << "Expansion exception, unable to get alignment length: " << e.ReportAll());
545                         throw e;
546                     }
547 
548                 } // end alignment loop
549             }
550         }
551     }
552 }
553 
554 
555 
556 
557 
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)558 void CCommonComponentScorer::ScoreAlignments(TAlignResultsRef AlignSet,
559                                              CScope& Scope)
560 {
561 
562     NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter,
563                       AlignSet->Get()) {
564         NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
565             NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
566             //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
567 
568                 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
569 
570                     CRef<CSeq_align> Curr(*Iter);
571 
572                     list<CRef<CSeq_id> > QueryIds, SubjectIds;
573 
574                     x_GetCompList(Curr->GetSeq_id(0),
575                                   Curr->GetSeqStart(0),
576                                   Curr->GetSeqStop(0),
577                                   QueryIds, Scope);
578                     x_GetCompList(Curr->GetSeq_id(1),
579                                   Curr->GetSeqStart(1),
580                                   Curr->GetSeqStop(1),
581                                   SubjectIds, Scope);
582 
583                     bool IsCommon = x_CompareCompLists(QueryIds, SubjectIds);
584 
585                     Curr->SetNamedScore("common_component", (int)IsCommon);
586                 }
587             }
588         }
589     }
590 }
591 
592 
x_GetUserCompList(CBioseq_Handle Handle,list<CRef<CSeq_id>> & CompIds)593 void CCommonComponentScorer::x_GetUserCompList(CBioseq_Handle Handle,
594                                                 list<CRef<CSeq_id> >& CompIds)
595 {
596     if(!Handle.CanGetDescr())
597         return;
598 
599 /*
600 user {
601       type
602         str "RefGeneTracking" ,
603       data {
604         {
605           label
606             str "Status" ,
607           data
608             str "Inferred" } ,
609         {
610           label
611             str "Assembly" ,
612           data
613             fields {
614               {
615                 label
616                   id 0 ,
617                 data
618                   fields {
619                     {
620                       label
621                         str "accession" ,
622                       data
623                         str "AC092447.5" } } } ,
624               {
625                 label
626                   id 0 ,
627                 data
628                   fields {
629                     {
630                       label
631                         str "gi" ,
632                       data
633                         int 123456 } } } } } } } ,
634 */
635 
636     const CSeq_descr& Descr = Handle.GetDescr();
637     ITERATE(CSeq_descr::Tdata, DescIter, Descr.Get()) {
638         const CSeqdesc& Desc = **DescIter;
639         if(!Desc.IsUser())
640             continue;
641 
642         const CUser_object& User = Desc.GetUser();
643         string TypeString;
644         if(User.CanGetType()  &&  User.GetType().IsStr())
645             TypeString = User.GetType().GetStr();
646         if(TypeString == "RefGeneTracking" && User.HasField("Assembly")) {
647 
648             const CUser_field& TopField = User.GetField("Assembly");
649             ITERATE(CUser_field::C_Data::TFields, FieldIter, TopField.GetData().GetFields()) {
650                 const CUser_field& MiddleField = **FieldIter;
651                 ITERATE(CUser_field::C_Data::TFields, FieldIter, MiddleField.GetData().GetFields()) {
652 
653                     const CUser_field& LastField = **FieldIter;
654                     string IdStr;
655                     if(LastField.GetData().IsStr())
656                         IdStr = LastField.GetData().GetStr();
657                     else if(LastField.GetData().IsInt())
658                         IdStr = NStr::IntToString(LastField.GetData().GetInt());
659                     else
660                         continue;
661 
662                     CRef<CSeq_id> Id;
663                     try {
664                         Id.Reset(new CSeq_id(IdStr));
665                     } catch(...) {
666                         continue; // Non-Seq-id entry, just skip it.
667                     }
668 
669                     CSeq_id_Handle CanonicalId;
670                     CanonicalId = sequence::GetId(*Id, Handle.GetScope(),
671                                                 sequence::eGetId_Canonical);
672                     Id->Assign(*CanonicalId.GetSeqId());
673                     CompIds.push_back(Id);
674                 }
675             }
676         }
677     }
678 
679 }
680 
681 
x_GetDeltaExtCompList(CBioseq_Handle Handle,TSeqPos Start,TSeqPos Stop,list<CRef<CSeq_id>> & CompIds)682 void CCommonComponentScorer::x_GetDeltaExtCompList(CBioseq_Handle Handle,
683                                            TSeqPos Start, TSeqPos Stop,
684                                            list<CRef<CSeq_id> >& CompIds)
685 {
686     if(!Handle.CanGetInst_Ext())
687         return;
688 
689     const CSeq_ext& Ext = Handle.GetInst_Ext();
690 
691     if(!Ext.IsDelta())
692         return;
693 
694     const CDelta_ext& DeltaExt = Ext.GetDelta();
695 
696     TSeqPos SegStart = 0;
697     ITERATE(CDelta_ext::Tdata, SegIter, DeltaExt.Get()) {
698         CRef<CDelta_seq> Seg(*SegIter);
699 
700         if(Seg->IsLiteral()) {
701             SegStart += Seg->GetLiteral().GetLength();
702             continue;
703         }
704 
705         const CSeq_loc& Loc = Seg->GetLoc();
706 
707         if(!Loc.IsInt()) {
708             // ??
709             continue;
710         }
711 
712         const CSeq_interval& Int = Loc.GetInt();
713 
714         if(Start <= (SegStart+Int.GetLength()) && Stop >= SegStart) {
715             CSeq_id_Handle CanonicalId;
716             CanonicalId = sequence::GetId(Int.GetId(), Handle.GetScope(),
717                                         sequence::eGetId_Canonical);
718 
719             CRef<CSeq_id> Id(new CSeq_id);
720             Id->Assign(*CanonicalId.GetSeqId());
721             CompIds.push_back(Id);
722         }
723 
724         SegStart += Int.GetLength();
725     }
726 }
727 
728 
x_GetSeqHistCompList(CBioseq_Handle Handle,TSeqPos Start,TSeqPos Stop,list<CRef<CSeq_id>> & CompIds)729 void CCommonComponentScorer::x_GetSeqHistCompList(CBioseq_Handle Handle,
730                                            TSeqPos Start, TSeqPos Stop,
731                                            list<CRef<CSeq_id> >& CompIds)
732 {
733     if(!Handle.CanGetInst_Hist())
734         return;
735 
736     const CSeq_hist& Hist = Handle.GetInst_Hist();
737 
738     if(!Hist.CanGetAssembly())
739         return;
740 
741     const CSeq_hist::TAssembly& Assembly = Hist.GetAssembly();
742 
743     CSeq_id_Handle HandleId = sequence::GetId(Handle.GetSeq_id_Handle(),
744                                 Handle.GetScope(), sequence::eGetId_Canonical);
745 
746     ITERATE(CSeq_hist::TAssembly, AlignIter, Assembly) {
747         const CSeq_align& Align = **AlignIter;
748 
749         // the Seq-aligns are expected to be in this row order
750         // but just in case, there is a check and swap later if needed
751         int query_row     = 0;
752         int component_row = 1;
753 
754         CSeq_id_Handle AlignIdHandle =
755             CSeq_id_Handle::GetHandle(Align.GetSeq_id(query_row));
756         AlignIdHandle = sequence::GetId(AlignIdHandle, Handle.GetScope(),
757                                         sequence::eGetId_Canonical);
758         if (AlignIdHandle != HandleId) {
759             std::swap(query_row, component_row);
760         }
761 
762         list< CConstRef<CSeq_align> > SplitAligns;
763         if (Align.GetSegs().IsDisc()) {
764             ITERATE (CSeq_align_set::Tdata, it,
765                      Align.GetSegs().GetDisc().Get()) {
766                 SplitAligns.push_back(*it);
767             }
768         } else {
769             SplitAligns.push_back(CConstRef<CSeq_align>(&Align));
770         }
771 
772         ITERATE(list<CConstRef<CSeq_align> >, SplitIter, SplitAligns) {
773             CSeq_id_Handle CompId =
774                 CSeq_id_Handle::GetHandle
775                 ((*SplitIter)->GetSeq_id(component_row));
776             CompId = sequence::GetId(CompId, Handle.GetScope(),
777                                    sequence::eGetId_Canonical);
778 
779             if(Start <= (*SplitIter)->GetSeqStop(query_row) &&
780                Stop >= (*SplitIter)->GetSeqStart(query_row)) {
781                 CRef<CSeq_id> Id(new CSeq_id);
782                 Id->Assign(*CompId.GetSeqId());
783                 CompIds.push_back(Id);
784             }
785         }
786     }
787 }
788 
789 
x_GetCompList(const CSeq_id & Id,TSeqPos Start,TSeqPos Stop,list<CRef<CSeq_id>> & CompIds,CScope & Scope)790 void CCommonComponentScorer::x_GetCompList(const CSeq_id& Id,
791                                            TSeqPos Start, TSeqPos Stop,
792                                            list<CRef<CSeq_id> >& CompIds,
793                                            CScope& Scope)
794 {
795     CBioseq_Handle Handle = Scope.GetBioseqHandle(Id);
796 
797     x_GetDeltaExtCompList(Handle, Start, Stop, CompIds);
798     if(CompIds.empty())
799         x_GetUserCompList(Handle, CompIds);
800     if(CompIds.empty())
801         x_GetSeqHistCompList(Handle, Start, Stop, CompIds);
802     if(CompIds.empty()) {
803         CRef<CSeq_id> SelfId(new CSeq_id);
804         SelfId->Assign(Id);
805         CompIds.push_back(SelfId);
806     }
807 }
808 
809 
x_CompareCompLists(list<CRef<CSeq_id>> & QueryIds,list<CRef<CSeq_id>> & SubjectIds)810 bool CCommonComponentScorer::x_CompareCompLists(list<CRef<CSeq_id> >& QueryIds,
811                                                 list<CRef<CSeq_id> >& SubjectIds)
812 {
813     list<CRef<CSeq_id> >::iterator QueryIter, SubjectIter;
814     for(QueryIter = QueryIds.begin(); QueryIter != QueryIds.end(); ++QueryIter) {
815         for(SubjectIter = SubjectIds.begin(); SubjectIter != SubjectIds.end(); ++SubjectIter) {
816             if( (*QueryIter)->Equals(**SubjectIter) )
817                 return true;
818         }
819     }
820 
821     return false;
822 }
823 
824 
825 
826 
827 
828 END_SCOPE(ncbi)
829 
830