1 /* $Id: alignment_scorer.cpp 450210 2014-10-23 16:04:54Z boukn $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Nathan Bouk
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiexpt.hpp>
34 #include <corelib/ncbi_system.hpp>
35 #include <math.h>
36
37 #include <objects/seqloc/Seq_loc.hpp>
38 #include <objects/seqloc/Seq_id.hpp>
39 #include <objmgr/scope.hpp>
40 #include <objmgr/annot_selector.hpp>
41 #include <objmgr/annot_ci.hpp>
42 #include <algo/blast/api/blast_types.hpp>
43 #include <algo/blast/api/bl2seq.hpp>
44 #include <algo/blast/api/blast_options_handle.hpp>
45 #include <algo/blast/api/blast_nucl_options.hpp>
46 #include <algo/blast/api/disc_nucl_options.hpp>
47 #include <algo/align/util/score_builder.hpp>
48 #include <objects/seqalign/Seq_align.hpp>
49 #include <objects/seqalign/Seq_align_set.hpp>
50 #include <objects/seqalign/Dense_seg.hpp>
51 #include <objects/seq/Seq_ext.hpp>
52 #include <objects/seq/Delta_ext.hpp>
53 #include <objects/seq/Delta_seq.hpp>
54 #include <objects/seq/Seq_literal.hpp>
55 #include <objmgr/seq_vector.hpp>
56 #include <objects/seqalign/Score.hpp>
57
58 #include <objects/seq/Seq_descr.hpp>
59 #include <objects/seq/Seqdesc.hpp>
60 #include <objects/general/User_object.hpp>
61 #include <objects/general/User_field.hpp>
62 #include <objects/seq/Seq_hist.hpp>
63 #include <objmgr/util/sequence.hpp>
64
65 #include <algo/blast/api/local_blast.hpp>
66 #include <algo/blast/blastinput/blastn_args.hpp>
67
68
69
70 #include <algo/align/ngalign/alignment_scorer.hpp>
71
72 BEGIN_SCOPE(ncbi)
73 USING_SCOPE(objects);
74 USING_SCOPE(blast);
75
76 static const Int8 kEffectiveSearchSpace = NCBI_CONST_INT8(1050668186940);
77
s_ScoreAlignments(const CSeq_align_set & alignments,CScope & Scope,CScoreBuilder & Scorer,bool skip_unsupported)78 static void s_ScoreAlignments(const CSeq_align_set &alignments,
79 CScope& Scope, CScoreBuilder &Scorer,
80 bool skip_unsupported)
81 {
82 ITERATE(CSeq_align_set::Tdata, Iter, alignments.Get()) {
83 CRef<CSeq_align> Curr(*Iter);
84
85 double DummyScore;
86
87 if ( Curr->GetSegs().IsDenseg() || ! skip_unsupported) {
88 if(!Curr->GetNamedScore(CSeq_align::eScore_Blast, DummyScore)) {
89 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_Blast);
90 }
91
92 if(!Curr->GetNamedScore(CSeq_align::eScore_BitScore, DummyScore)) {
93 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_BitScore);
94 }
95
96 if(!Curr->GetNamedScore(CSeq_align::eScore_EValue, DummyScore)) {
97 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_EValue);
98 }
99 }
100
101 if(!Curr->GetNamedScore(CSeq_align::eScore_IdentityCount, DummyScore)) {
102 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_IdentityCount);
103 }
104
105 if (Curr->GetSegs().Which() == CSeq_align::C_Segs::e_Disc) {
106 /// Compute score for the components of a Disc-seg alignment
107 s_ScoreAlignments(Curr->GetSegs().GetDisc(), Scope, Scorer,
108 skip_unsupported);
109 }
110 }
111 }
112
113
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)114 void CBlastScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
115 {
116 bool skip_unsupported = m_Options & eSkipUnsupportedAlignments;
117 CScoreBuilder Scorer(blast::eMegablast);
118 Scorer.SetEffectiveSearchSpace (kEffectiveSearchSpace);
119
120 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
121 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
122 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
123 //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
124 s_ScoreAlignments(*SubjectIter->second, Scope, Scorer,
125 skip_unsupported);
126 }
127 }
128 }
129 }
130
131
132
133
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)134 void CPctIdentScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
135 {
136 //CScoreBuilder Scorer(*Options);
137 //Scorer.SetEffectiveSearchSpace(Options->GetEffectiveSearchSpace());
138 CScoreBuilder Scorer(blast::eMegablast);
139
140 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
141 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
142 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
143 //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
144
145 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
146
147 CRef<CSeq_align> Curr(*Iter);
148
149 //Scorer.AddScore(Scope, *Curr, CScoreBuilder::eScore_PercentIdentity);
150 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_PercentIdentity_Gapped);
151 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_PercentIdentity_Ungapped);
152 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_PercentIdentity_GapOpeningOnly);
153 }
154 }
155 }
156 }
157 }
158
159
160
161
162
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)163 void CPctCoverageScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
164 {
165 //CScoreBuilder Scorer(*Options);
166 //Scorer.SetEffectiveSearchSpace(Options->GetEffectiveSearchSpace());
167 CScoreBuilder Scorer(blast::eMegablast);
168
169 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
170 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
171 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
172 //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
173
174 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
175
176 CRef<CSeq_align> Curr(*Iter);
177
178 Scorer.AddScore(Scope, *Curr, CSeq_align::eScore_PercentCoverage);
179 }
180 }
181 }
182 }
183 }
184
185
186
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)187 void CExpansionScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
188 {
189 CScoreBuilder Scorer;
190
191 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
192 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
193 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
194 //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
195
196 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
197
198 CRef<CSeq_align> Curr(*Iter);
199
200 TSeqPos AlignLen, AlignedLen;
201 AlignedLen = Scorer.GetAlignLength(*Curr, true);
202 AlignLen = Curr->GetSeqRange(1).GetLength();
203
204 Curr->SetNamedScore("expansion", double(AlignLen)/double(AlignedLen) );
205 }
206 }
207 }
208 }
209 }
210
211
212
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)213 void CWeightedIdentityScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
214 {
215 CScoreBuilder Scorer;
216
217 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
218 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
219 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
220 //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
221
222 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
223
224 CRef<CSeq_align> Curr(*Iter);
225
226 CSeq_id QueryId;
227 QueryId.Assign(Curr->GetSeq_id(0));
228 CBioseq_Handle QueryHandle = Scope.GetBioseqHandle(QueryId);
229
230 double WeightedIdent;
231 double NumIdent, Matches, QueryLen;
232
233 Curr->GetNamedScore(CSeq_align::eScore_IdentityCount, NumIdent);
234 Matches = Curr->GetAlignLength(false);
235 QueryLen = QueryHandle.GetInst_Length();
236
237 WeightedIdent = (NumIdent + (Matches * m_K)) / (Matches + (QueryLen * m_K));
238 Curr->SetNamedScore("weighted_identity", WeightedIdent);
239 }
240 }
241 }
242 }
243 }
244
245
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)246 void CHangScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
247 {
248 CScoreBuilder Scorer(blast::eMegablast);
249
250 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
251 CSeq_id QueryId;
252 QueryId.Set(QueryIter->first);
253 CBioseq_Handle Handle = Scope.GetBioseqHandle(QueryId);
254 if(!Handle)
255 continue;
256 TSeqPos QueryLen = Handle.GetInst_Length();
257
258 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
259 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
260 CSeq_id TargetId;
261 TargetId.Set(SubjectIter->first);
262 CBioseq_Handle Handle = Scope.GetBioseqHandle(TargetId);
263 if (!Handle || !Handle.CanGetInst_Ext() ||
264 !Handle.GetInst_Ext().IsDelta()) {
265 continue;
266 }
267 TSeqPos TargetLen = Handle.GetInst_Length();
268 const CDelta_ext& DeltaExt = Handle.GetInst_Ext().GetDelta();
269 // Get Gaps on Target
270 list<TSeqRange> Gaps;
271 TSeqPos CurrStart = 0;
272 ITERATE(list<CRef<CDelta_seq> >, DeltaIter, DeltaExt.Get()) {
273 if( (*DeltaIter)->IsLiteral() ) {
274 TSeqPos Len = (*DeltaIter)->GetLiteral().GetLength();
275 TSeqRange Gap(CurrStart, CurrStart+Len-1);
276 Gaps.push_back(Gap);
277 CurrStart += Len;
278 } else if( (*DeltaIter)->IsLoc() ) {
279 TSeqPos Len = (*DeltaIter)->GetLoc().GetInt().GetLength();
280 TSeqPos CurrStop = CurrStart + Len - 1;
281 CurrStart = CurrStop + 1;
282 }
283 }
284
285 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
286
287 CRef<CSeq_align> Curr(*Iter);
288
289 TSeqRange TargetRange = Curr->GetSeqRange(1);
290 if(TargetRange.GetLength() == QueryLen) {
291 Curr->SetNamedScore("hangs", 0);
292 continue;
293 }
294
295 // the ranges will normally be next to gaps, not intersect gaps
296 // so expand the aligned range by 1 in both directions
297 if(TargetRange.GetFrom() > 0)
298 TargetRange.SetFrom(TargetRange.GetFrom()-1);
299 TargetRange.SetTo(TargetRange.GetTo()+1);
300
301 int Hangs = 0;
302 // check internal gaps
303 ITERATE(list<TSeqRange>, GapIter, Gaps) {
304 if(GapIter->IntersectingWith(TargetRange)) {
305 Hangs = 1;
306 break;
307 }
308 }
309 // check the edges
310 if(TargetRange.GetFrom() == 0)
311 Hangs = 1;
312 if(TargetRange.GetTo() >= (TargetLen-1))
313 Hangs = 1;
314 Curr->SetNamedScore("hangs", Hangs);
315 }
316 }
317 }
318 }
319 }
320
321
322
323
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)324 void COverlapScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
325 {
326 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
327 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
328 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
329 //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
330
331 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
332
333 CRef<CSeq_align> Curr(*Iter);
334
335 if(!Curr->GetSegs().IsDenseg())
336 continue;
337
338 if(Curr->GetSegs().GetDenseg().GetDim() != 2)
339 continue;
340
341 vector<pair<TSeqPos, TSeqPos> > Tails;
342
343 for(int Row = 0; Row < Curr->GetSegs().GetDenseg().GetDim(); ++Row) {
344 TSeqPos Start = Curr->GetSeqStart(Row);
345 TSeqPos Stop = Curr->GetSeqStop(Row);
346
347 TSeqPos SeqLen = Scope.GetBioseqHandle(Curr->GetSeq_id(Row)).GetInst_Length();
348
349 pair<TSeqPos, TSeqPos> Pair;
350 if(Curr->GetSeqStrand(Row) == eNa_strand_plus) {
351 Pair.first = Start;
352 Pair.second = SeqLen - Stop - 1;
353 } else {
354 Pair.second = Start;
355 Pair.first = SeqLen - Stop - 1;
356 }
357 Tails.push_back(Pair);
358 }
359
360 bool Full, Half;
361 int Contained;
362 TSeqPos TailLen;
363
364 Full = (Tails[0].second <= m_Slop && Tails[1].first <= m_Slop)
365 || (Tails[0].first <= m_Slop && Tails[1].second <= m_Slop);
366
367 if(Tails[0].first <= m_Slop && Tails[0].second <= m_Slop)
368 Contained = 0;
369 else if(Tails[1].first <= m_Slop && Tails[1].second <= m_Slop)
370 Contained = 1;
371 else
372 Contained = -1;
373
374 bool Forewards;
375 Forewards = (Tails[0].first > Tails[0].second)
376 && (Tails[1].first < Tails[1].second);
377
378 if(Forewards) {
379 Half = Tails[0].second <= m_Slop || Tails[1].first <= m_Slop;
380 TailLen = max(Tails[0].second, Tails[1].first);
381 }
382 else {
383 Half = Tails[0].first <= m_Slop || Tails[1].second <= m_Slop;
384 TailLen = max(Tails[0].first, Tails[1].second);
385 }
386
387 Curr->SetNamedScore("full_dovetail", (int)Full);
388 Curr->SetNamedScore("half_dovetail", (int)Half);
389 Curr->SetNamedScore("contained", Contained);
390 Curr->SetNamedScore("tail_length", (int)TailLen);
391 //Curr->SetNamedScore("align_length_ungap", (int)Curr->GetAlignLength(false));
392 }
393 }
394 }
395 }
396 }
397
398
399
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)400 void CClippedScorer::ScoreAlignments(TAlignResultsRef AlignSet, CScope& Scope)
401 {
402 CScoreBuilder Scorer;
403
404 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter, AlignSet->Get()) {
405 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
406 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
407 //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
408
409 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
410
411 CRef<CSeq_align> Curr(*Iter);
412
413 if(!Curr->GetSegs().IsDenseg())
414 continue;
415
416 CSeq_id EndId;
417 EndId.Assign(Curr->GetSeq_id(0));
418
419 CBioseq_Handle EndHandle = Scope.GetBioseqHandle(EndId);
420
421 // Extract the Seq-annot.locs, for the clip region.
422 CSeq_loc ClipLoc;
423 ClipLoc.SetEmpty();
424
425 SAnnotSelector Sel(CSeqFeatData::e_Region);
426 CAnnot_CI AnnotIter(EndHandle, Sel);
427
428 while(AnnotIter) {
429 if (AnnotIter->IsFtable() &&
430 AnnotIter->GetName() == "NCBI_GPIPE") {
431 CConstRef<CSeq_annot> Annot = AnnotIter->GetCompleteSeq_annot();
432
433 ITERATE(CSeq_annot::C_Data::TFtable, FeatIter,
434 Annot->GetData().GetFtable()) {
435 CConstRef<CSeq_feat> Feat = *FeatIter;
436 if (Feat->CanGetLocation() &&
437 Feat->CanGetData() &&
438 Feat->GetData().IsRegion() &&
439 (Feat->GetData().GetRegion() == "high_quality" ||
440 Feat->GetData().GetRegion() == "hight_quality") ) {
441
442 ClipLoc.Assign(Feat->GetLocation());
443 }
444 }
445 }
446 ++AnnotIter;
447 }
448
449
450 if(ClipLoc.IsEmpty() && EndHandle.HasAnnots() ) {
451 CConstRef<CBioseq> EndBioseq = EndHandle.GetCompleteBioseq();
452 ITERATE(CBioseq::TAnnot, AnnotIter, EndBioseq->GetAnnot()) {
453 if( (*AnnotIter)->GetData().IsLocs() ) {
454 ITERATE(CSeq_annot::C_Data::TLocs,
455 LocIter, (*AnnotIter)->GetData().GetLocs()) {
456 if( (*LocIter)->IsInt() &&
457 (*LocIter)->GetInt().GetId().Equals(EndId))
458 ClipLoc.Assign(**LocIter);
459 }
460 }
461 }
462 }
463
464 if(ClipLoc.IsEmpty())
465 continue;
466
467
468 CRef<CSeq_id> ClipId(new CSeq_id);
469 ClipId->SetLocal().SetStr("CLIPPED_SCORER_ID_"+EndId.GetSeqIdString(true) );
470
471 {{
472 CBioseq_Handle ClipHandle;
473
474 try {
475 ClipHandle = Scope.GetBioseqHandle(*ClipId);
476 } catch(...) {
477 ;
478 }
479 // if this sequence hasn't been added to the scope yet, add it
480 if(!ClipHandle) {
481 CRef<CBioseq> ClipBioseq(new CBioseq);
482 ClipBioseq->SetId().push_back(ClipId);
483 CRef<CDelta_seq> ClipDelta(new CDelta_seq);
484 ClipDelta->SetLoc().Assign(ClipLoc);
485 ClipBioseq->SetInst().SetExt().SetDelta().Set().push_back(ClipDelta);
486 ClipBioseq->SetInst().SetRepr() = CSeq_inst::eRepr_virtual;
487 ClipBioseq->SetInst().SetMol() = CSeq_inst::eMol_dna;
488 ClipBioseq->SetInst().SetLength() = ClipLoc.GetInt().GetLength();
489 ClipHandle = Scope.AddBioseq(*ClipBioseq);
490 }
491 }}
492
493
494 // Fudge the New alignment.
495 CRef<CSeq_align> ClipAlign(new CSeq_align);
496 try {
497 ClipAlign->Assign(*Curr);
498 CRef<CDense_seg> ClipDenseg;
499 CRange<TSeqPos> ClipRange(ClipLoc.GetInt().GetFrom(), ClipLoc.GetInt().GetTo());
500 CRange<TSeqPos> AlignRange = ClipAlign->GetSeqRange(0);
501 CRange<TSeqPos> Intersect = ClipRange.IntersectionWith(AlignRange);
502 if(Intersect.Empty())
503 continue;
504
505 ClipDenseg = ClipAlign->GetSegs().GetDenseg().ExtractSlice(0,
506 Intersect.GetFrom(), Intersect.GetTo() );
507 ClipDenseg->SetIds().front()->Assign(*ClipId);
508 ClipDenseg->OffsetRow(0, -(int)ClipLoc.GetInt().GetFrom());
509 ClipAlign->SetSegs().SetDenseg().Assign(*ClipDenseg);
510 ClipAlign->SetScore().clear();
511 } catch(CException& e) {
512 ERR_POST(Error << "Make Clip Align exception: " << e.ReportAll());
513 ERR_POST("Probably means the clip region is invalid.");
514 throw e;
515 }
516
517
518 double Temp;
519
520 Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_IdentityCount);
521 ClipAlign->GetNamedScore(CSeq_align::eScore_IdentityCount, Temp);
522 Curr->SetNamedScore("num_ident_clip", Temp);
523
524 Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_PercentIdentity_Gapped);
525 Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_PercentIdentity_Ungapped);
526 Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_PercentIdentity_GapOpeningOnly);
527 ClipAlign->GetNamedScore(CSeq_align::eScore_PercentIdentity_GapOpeningOnly, Temp);
528 Curr->SetNamedScore("pct_identity_gapopen_only_clip", Temp);
529
530 Scorer.AddScore(Scope, *ClipAlign, CSeq_align::eScore_PercentCoverage);
531 ClipAlign->GetNamedScore(CSeq_align::eScore_PercentCoverage, Temp);
532 Curr->SetNamedScore("pct_coverage_clip", Temp);
533
534 TSeqPos AlignLen = 0;
535 TSeqPos AlignedLen = 0;
536 AlignedLen = Scorer.GetAlignLength(*ClipAlign, true);
537 try {
538 // Sometimes the clipped region ends up being entirely gap, on the subject side.
539 AlignLen = ClipAlign->GetSeqRange(1).GetLength();
540 ClipAlign->SetNamedScore("expansion", double(AlignLen)/double(AlignedLen) );
541 ClipAlign->GetNamedScore("expansion", Temp);
542 Curr->SetNamedScore("expansion_clip", Temp);
543 } catch(CException& e) {
544 ERR_POST(Error << "Expansion exception, unable to get alignment length: " << e.ReportAll());
545 throw e;
546 }
547
548 } // end alignment loop
549 }
550 }
551 }
552 }
553
554
555
556
557
ScoreAlignments(TAlignResultsRef AlignSet,CScope & Scope)558 void CCommonComponentScorer::ScoreAlignments(TAlignResultsRef AlignSet,
559 CScope& Scope)
560 {
561
562 NON_CONST_ITERATE(CAlignResultsSet::TQueryToSubjectSet, QueryIter,
563 AlignSet->Get()) {
564 NON_CONST_ITERATE(CQuerySet::TAssemblyToSubjectSet, AssemIter, QueryIter->second->Get()) {
565 NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, AssemIter->second) {
566 //NON_CONST_ITERATE(CQuerySet::TSubjectToAlignSet, SubjectIter, QueryIter->second->Get()) {
567
568 ITERATE(CSeq_align_set::Tdata, Iter, SubjectIter->second->Get()) {
569
570 CRef<CSeq_align> Curr(*Iter);
571
572 list<CRef<CSeq_id> > QueryIds, SubjectIds;
573
574 x_GetCompList(Curr->GetSeq_id(0),
575 Curr->GetSeqStart(0),
576 Curr->GetSeqStop(0),
577 QueryIds, Scope);
578 x_GetCompList(Curr->GetSeq_id(1),
579 Curr->GetSeqStart(1),
580 Curr->GetSeqStop(1),
581 SubjectIds, Scope);
582
583 bool IsCommon = x_CompareCompLists(QueryIds, SubjectIds);
584
585 Curr->SetNamedScore("common_component", (int)IsCommon);
586 }
587 }
588 }
589 }
590 }
591
592
x_GetUserCompList(CBioseq_Handle Handle,list<CRef<CSeq_id>> & CompIds)593 void CCommonComponentScorer::x_GetUserCompList(CBioseq_Handle Handle,
594 list<CRef<CSeq_id> >& CompIds)
595 {
596 if(!Handle.CanGetDescr())
597 return;
598
599 /*
600 user {
601 type
602 str "RefGeneTracking" ,
603 data {
604 {
605 label
606 str "Status" ,
607 data
608 str "Inferred" } ,
609 {
610 label
611 str "Assembly" ,
612 data
613 fields {
614 {
615 label
616 id 0 ,
617 data
618 fields {
619 {
620 label
621 str "accession" ,
622 data
623 str "AC092447.5" } } } ,
624 {
625 label
626 id 0 ,
627 data
628 fields {
629 {
630 label
631 str "gi" ,
632 data
633 int 123456 } } } } } } } ,
634 */
635
636 const CSeq_descr& Descr = Handle.GetDescr();
637 ITERATE(CSeq_descr::Tdata, DescIter, Descr.Get()) {
638 const CSeqdesc& Desc = **DescIter;
639 if(!Desc.IsUser())
640 continue;
641
642 const CUser_object& User = Desc.GetUser();
643 string TypeString;
644 if(User.CanGetType() && User.GetType().IsStr())
645 TypeString = User.GetType().GetStr();
646 if(TypeString == "RefGeneTracking" && User.HasField("Assembly")) {
647
648 const CUser_field& TopField = User.GetField("Assembly");
649 ITERATE(CUser_field::C_Data::TFields, FieldIter, TopField.GetData().GetFields()) {
650 const CUser_field& MiddleField = **FieldIter;
651 ITERATE(CUser_field::C_Data::TFields, FieldIter, MiddleField.GetData().GetFields()) {
652
653 const CUser_field& LastField = **FieldIter;
654 string IdStr;
655 if(LastField.GetData().IsStr())
656 IdStr = LastField.GetData().GetStr();
657 else if(LastField.GetData().IsInt())
658 IdStr = NStr::IntToString(LastField.GetData().GetInt());
659 else
660 continue;
661
662 CRef<CSeq_id> Id;
663 try {
664 Id.Reset(new CSeq_id(IdStr));
665 } catch(...) {
666 continue; // Non-Seq-id entry, just skip it.
667 }
668
669 CSeq_id_Handle CanonicalId;
670 CanonicalId = sequence::GetId(*Id, Handle.GetScope(),
671 sequence::eGetId_Canonical);
672 Id->Assign(*CanonicalId.GetSeqId());
673 CompIds.push_back(Id);
674 }
675 }
676 }
677 }
678
679 }
680
681
x_GetDeltaExtCompList(CBioseq_Handle Handle,TSeqPos Start,TSeqPos Stop,list<CRef<CSeq_id>> & CompIds)682 void CCommonComponentScorer::x_GetDeltaExtCompList(CBioseq_Handle Handle,
683 TSeqPos Start, TSeqPos Stop,
684 list<CRef<CSeq_id> >& CompIds)
685 {
686 if(!Handle.CanGetInst_Ext())
687 return;
688
689 const CSeq_ext& Ext = Handle.GetInst_Ext();
690
691 if(!Ext.IsDelta())
692 return;
693
694 const CDelta_ext& DeltaExt = Ext.GetDelta();
695
696 TSeqPos SegStart = 0;
697 ITERATE(CDelta_ext::Tdata, SegIter, DeltaExt.Get()) {
698 CRef<CDelta_seq> Seg(*SegIter);
699
700 if(Seg->IsLiteral()) {
701 SegStart += Seg->GetLiteral().GetLength();
702 continue;
703 }
704
705 const CSeq_loc& Loc = Seg->GetLoc();
706
707 if(!Loc.IsInt()) {
708 // ??
709 continue;
710 }
711
712 const CSeq_interval& Int = Loc.GetInt();
713
714 if(Start <= (SegStart+Int.GetLength()) && Stop >= SegStart) {
715 CSeq_id_Handle CanonicalId;
716 CanonicalId = sequence::GetId(Int.GetId(), Handle.GetScope(),
717 sequence::eGetId_Canonical);
718
719 CRef<CSeq_id> Id(new CSeq_id);
720 Id->Assign(*CanonicalId.GetSeqId());
721 CompIds.push_back(Id);
722 }
723
724 SegStart += Int.GetLength();
725 }
726 }
727
728
x_GetSeqHistCompList(CBioseq_Handle Handle,TSeqPos Start,TSeqPos Stop,list<CRef<CSeq_id>> & CompIds)729 void CCommonComponentScorer::x_GetSeqHistCompList(CBioseq_Handle Handle,
730 TSeqPos Start, TSeqPos Stop,
731 list<CRef<CSeq_id> >& CompIds)
732 {
733 if(!Handle.CanGetInst_Hist())
734 return;
735
736 const CSeq_hist& Hist = Handle.GetInst_Hist();
737
738 if(!Hist.CanGetAssembly())
739 return;
740
741 const CSeq_hist::TAssembly& Assembly = Hist.GetAssembly();
742
743 CSeq_id_Handle HandleId = sequence::GetId(Handle.GetSeq_id_Handle(),
744 Handle.GetScope(), sequence::eGetId_Canonical);
745
746 ITERATE(CSeq_hist::TAssembly, AlignIter, Assembly) {
747 const CSeq_align& Align = **AlignIter;
748
749 // the Seq-aligns are expected to be in this row order
750 // but just in case, there is a check and swap later if needed
751 int query_row = 0;
752 int component_row = 1;
753
754 CSeq_id_Handle AlignIdHandle =
755 CSeq_id_Handle::GetHandle(Align.GetSeq_id(query_row));
756 AlignIdHandle = sequence::GetId(AlignIdHandle, Handle.GetScope(),
757 sequence::eGetId_Canonical);
758 if (AlignIdHandle != HandleId) {
759 std::swap(query_row, component_row);
760 }
761
762 list< CConstRef<CSeq_align> > SplitAligns;
763 if (Align.GetSegs().IsDisc()) {
764 ITERATE (CSeq_align_set::Tdata, it,
765 Align.GetSegs().GetDisc().Get()) {
766 SplitAligns.push_back(*it);
767 }
768 } else {
769 SplitAligns.push_back(CConstRef<CSeq_align>(&Align));
770 }
771
772 ITERATE(list<CConstRef<CSeq_align> >, SplitIter, SplitAligns) {
773 CSeq_id_Handle CompId =
774 CSeq_id_Handle::GetHandle
775 ((*SplitIter)->GetSeq_id(component_row));
776 CompId = sequence::GetId(CompId, Handle.GetScope(),
777 sequence::eGetId_Canonical);
778
779 if(Start <= (*SplitIter)->GetSeqStop(query_row) &&
780 Stop >= (*SplitIter)->GetSeqStart(query_row)) {
781 CRef<CSeq_id> Id(new CSeq_id);
782 Id->Assign(*CompId.GetSeqId());
783 CompIds.push_back(Id);
784 }
785 }
786 }
787 }
788
789
x_GetCompList(const CSeq_id & Id,TSeqPos Start,TSeqPos Stop,list<CRef<CSeq_id>> & CompIds,CScope & Scope)790 void CCommonComponentScorer::x_GetCompList(const CSeq_id& Id,
791 TSeqPos Start, TSeqPos Stop,
792 list<CRef<CSeq_id> >& CompIds,
793 CScope& Scope)
794 {
795 CBioseq_Handle Handle = Scope.GetBioseqHandle(Id);
796
797 x_GetDeltaExtCompList(Handle, Start, Stop, CompIds);
798 if(CompIds.empty())
799 x_GetUserCompList(Handle, CompIds);
800 if(CompIds.empty())
801 x_GetSeqHistCompList(Handle, Start, Stop, CompIds);
802 if(CompIds.empty()) {
803 CRef<CSeq_id> SelfId(new CSeq_id);
804 SelfId->Assign(Id);
805 CompIds.push_back(SelfId);
806 }
807 }
808
809
x_CompareCompLists(list<CRef<CSeq_id>> & QueryIds,list<CRef<CSeq_id>> & SubjectIds)810 bool CCommonComponentScorer::x_CompareCompLists(list<CRef<CSeq_id> >& QueryIds,
811 list<CRef<CSeq_id> >& SubjectIds)
812 {
813 list<CRef<CSeq_id> >::iterator QueryIter, SubjectIter;
814 for(QueryIter = QueryIds.begin(); QueryIter != QueryIds.end(); ++QueryIter) {
815 for(SubjectIter = SubjectIds.begin(); SubjectIter != SubjectIds.end(); ++SubjectIter) {
816 if( (*QueryIter)->Equals(**SubjectIter) )
817 return true;
818 }
819 }
820
821 return false;
822 }
823
824
825
826
827
828 END_SCOPE(ncbi)
829
830