1 /* $Id: Seq_align.cpp 618884 2020-10-27 16:28:50Z gouriano $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: .......
27 *
28 * File Description:
29 * .......
30 *
31 * Remark:
32 * This code was originally generated by application DATATOOL
33 * using specifications from the data definition file
34 * 'seqalign.asn'.
35 */
36
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <objects/seqalign/seqalign_exception.hpp>
40 #include <objects/seqalign/Dense_seg.hpp>
41 #include <objects/seqalign/Dense_diag.hpp>
42 #include <objects/seqalign/Std_seg.hpp>
43 #include <objects/seqalign/Spliced_seg.hpp>
44 #include <objects/seqalign/Spliced_exon.hpp>
45 #include <objects/seqalign/Spliced_exon_chunk.hpp>
46 #include <objects/seqalign/Sparse_seg.hpp>
47 #include <objects/seqalign/Seq_align_set.hpp>
48 #include <objects/seqalign/Score.hpp>
49 #include <objects/general/Object_id.hpp>
50 #include <objects/general/User_object.hpp>
51 #include <objects/seqloc/Seq_loc.hpp>
52 #include <objects/seqloc/Seq_interval.hpp>
53 #include <objects/seqloc/Seq_point.hpp>
54 #include <objects/seq/seq_loc_mapper_base.hpp>
55 #include <serial/iterator.hpp>
56 #include <util/static_set.hpp>
57
58 // generated includes
59 #include <objects/seqalign/Seq_align.hpp>
60
61 // generated classes
62
63 BEGIN_NCBI_SCOPE
64
65 BEGIN_objects_SCOPE // namespace ncbi::objects::
66
67 // destructor
~CSeq_align(void)68 CSeq_align::~CSeq_align(void)
69 {
70 }
71
72
CheckNumRows(void) const73 CSeq_align::TDim CSeq_align::CheckNumRows(void) const
74 {
75 switch (GetSegs().Which()) {
76 case C_Segs::e_Denseg:
77 return GetSegs().GetDenseg().CheckNumRows();
78
79 //case C_Segs::e_Packed:
80 //return GetSegs().GetPacked().CheckNumRows();
81
82 case C_Segs::e_Sparse:
83 return GetSegs().GetSparse().CheckNumRows();
84
85 case C_Segs::e_Spliced:
86 {{
87 // spliced seg always has two rows: genomic and protein/transcript
88 return 2;
89 }}
90
91 case C_Segs::e_Disc:
92 {{
93 TDim numrows = 0;
94 ITERATE (C_Segs::TDisc::Tdata, std_i, GetSegs().GetDisc().Get()) {
95 TDim seg_numrows = (*std_i)->CheckNumRows();
96 if (numrows) {
97 if ( seg_numrows != numrows ) {
98 NCBI_THROW(CSeqalignException, eInvalidAlignment,
99 "CSeq_align::CheckNumRows(): Number of rows "
100 "is not the same for each disc seg.");
101 }
102 } else {
103 numrows = seg_numrows;
104 }
105 }
106 return numrows;
107 }}
108
109 case C_Segs::e_Std:
110 {{
111 TDim numrows = 0;
112 ITERATE (C_Segs::TStd, std_i, GetSegs().GetStd()) {
113 TDim seg_numrows = (*std_i)->CheckNumRows();
114 if (numrows) {
115 if (seg_numrows != numrows) {
116 NCBI_THROW(CSeqalignException, eInvalidAlignment,
117 "CSeq_align::CheckNumRows(): Number of rows "
118 "is not the same for each std seg.");
119 }
120 } else {
121 numrows = seg_numrows;
122 }
123 }
124 return numrows;
125 }}
126
127 case C_Segs::e_Dendiag:
128 {
129 TDim numrows = 0;
130 ITERATE (C_Segs::TDendiag, it, GetSegs().GetDendiag()) {
131 TDim seg_numrows = (*it)->CheckNumRows();
132 if (numrows) {
133 if ( seg_numrows != numrows ) {
134 NCBI_THROW(CSeqalignException, eInvalidAlignment,
135 "CSeq_align::CheckNumRows(): Number of rows "
136 "is not the same for each dendiag seg.");
137 }
138 } else {
139 numrows = seg_numrows;
140 }
141 }
142 return numrows;
143 }
144
145 default:
146 NCBI_THROW(CSeqalignException, eUnsupported,
147 "CSeq_align::CheckNumRows() currently does not handle "
148 "this type of alignment");
149 }
150 }
151
152
GetSeqRange(TDim row) const153 CRange<TSeqPos> CSeq_align::GetSeqRange(TDim row) const
154 {
155 switch (GetSegs().Which()) {
156 case C_Segs::e_Denseg:
157 return GetSegs().GetDenseg().GetSeqRange(row);
158
159 case C_Segs::e_Sparse:
160 return GetSegs().GetSparse().GetSeqRange(row);
161
162 case C_Segs::e_Dendiag:
163 {
164 CRange<TSeqPos> rng;
165 ITERATE (C_Segs::TDendiag, dendiag_i, GetSegs().GetDendiag()) {
166 rng.CombineWith((*dendiag_i)->GetSeqRange(row));
167 }
168 return rng;
169 }
170
171 case C_Segs::e_Std:
172 {
173 CRange<TSeqPos> rng;
174 CSeq_id seq_id;
175 size_t seg_i = 0;
176 ITERATE (C_Segs::TStd, std_i, GetSegs().GetStd()) {
177 TDim row_i = 0;
178 ITERATE (CStd_seg::TLoc, i, (*std_i)->GetLoc()) {
179 const CSeq_loc& loc = **i;
180 if (row_i++ == row) {
181 if (loc.IsInt()) {
182 if ( !seg_i ) {
183 seq_id.Assign(loc.GetInt().GetId());
184 } else if (seq_id.Compare(loc.GetInt().GetId())
185 != CSeq_id::e_YES) {
186 NCBI_THROW(CSeqalignException,
187 eInvalidRowNumber,
188 "CSeq_align::GetSeqRange():"
189 " Row seqids not consistent."
190 " Cannot determine range.");
191 }
192 rng.CombineWith
193 (CRange<TSeqPos>
194 (loc.GetInt().GetFrom(),
195 loc.GetInt().GetTo()));
196 } else if (loc.IsPnt()) {
197 if ( !seg_i ) {
198 seq_id.Assign(loc.GetPnt().GetId());
199 } else if (seq_id.Compare(loc.GetPnt().GetId())
200 != CSeq_id::e_YES) {
201 NCBI_THROW(CSeqalignException,
202 eInvalidRowNumber,
203 "CSeq_align::GetSeqRange():"
204 " Row seqids not consistent."
205 " Cannot determine range.");
206 }
207 rng.CombineWith
208 (CRange<TSeqPos>
209 (loc.GetPnt().GetPoint(),
210 loc.GetPnt().GetPoint()));
211 }
212 }
213 }
214 if (row < 0 || row >= row_i) {
215 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
216 "CSeq_align::GetSeqRange():"
217 " Invalid row number");
218 }
219 if (CanGetDim() && row_i != GetDim()) {
220 NCBI_THROW(CSeqalignException, eInvalidAlignment,
221 "CSeq_align::GetSeqRange():"
222 " loc.size is inconsistent with dim");
223 }
224 seg_i++;
225 }
226 if (rng.Empty()) {
227 NCBI_THROW(CSeqalignException, eInvalidAlignment,
228 "CSeq_align::GetSeqRange(): Row is empty");
229 }
230 return rng;
231 }
232
233 case C_Segs::e_Disc:
234 {
235 CRange<TSeqPos> rng;
236 ITERATE (C_Segs::TDisc::Tdata, disc_i, GetSegs().GetDisc().Get()) {
237 rng.CombineWith((*disc_i)->GetSeqRange(row));
238 }
239 return rng;
240 }
241
242 case C_Segs::e_Spliced:
243 return GetSegs().GetSpliced().GetSeqRange(row);
244
245 default:
246 NCBI_THROW(CSeqalignException, eUnsupported,
247 "CSeq_align::GetSeqRange() currently does not handle "
248 "this type of alignment.");
249 }
250 }
251
GetSeqStart(TDim row) const252 TSeqPos CSeq_align::GetSeqStart(TDim row) const
253 {
254 switch (GetSegs().Which()) {
255 case C_Segs::e_Denseg:
256 return GetSegs().GetDenseg().GetSeqStart(row);
257 case C_Segs::e_Sparse:
258 return GetSegs().GetSparse().GetSeqStart(row);
259 case C_Segs::e_Spliced:
260 return GetSegs().GetSpliced().GetSeqStart(row);
261 case C_Segs::e_Dendiag:
262 case C_Segs::e_Std:
263 case C_Segs::e_Disc:
264 return GetSeqRange(row).GetFrom();
265 default:
266 NCBI_THROW(CSeqalignException, eUnsupported,
267 "CSeq_align::GetSeqStart() currently does not handle "
268 "this type of alignment.");
269 }
270 }
271
272
GetSeqStop(TDim row) const273 TSeqPos CSeq_align::GetSeqStop (TDim row) const
274 {
275 switch (GetSegs().Which()) {
276 case C_Segs::e_Denseg:
277 return GetSegs().GetDenseg().GetSeqStop(row);
278 case C_Segs::e_Sparse:
279 return GetSegs().GetSparse().GetSeqStop(row);
280 case C_Segs::e_Spliced:
281 return GetSegs().GetSpliced().GetSeqStop(row);
282 case C_Segs::e_Dendiag:
283 case C_Segs::e_Std:
284 case C_Segs::e_Disc:
285 return GetSeqRange(row).GetTo();
286 default:
287 NCBI_THROW(CSeqalignException, eUnsupported,
288 "CSeq_align::GetSeqStop() currently does not handle "
289 "this type of alignment.");
290 }
291 }
292
293
GetSeqStrand(TDim row) const294 ENa_strand CSeq_align::GetSeqStrand(TDim row) const
295 {
296 switch (GetSegs().Which()) {
297 case C_Segs::e_Denseg:
298 return GetSegs().GetDenseg().GetSeqStrand(row);
299 case C_Segs::e_Sparse:
300 return GetSegs().GetSparse().GetSeqStrand(row);
301 case C_Segs::e_Spliced:
302 return GetSegs().GetSpliced().GetSeqStrand(row);
303 case C_Segs::e_Disc:
304 return GetSegs().GetDisc().Get().front()->GetSeqStrand(row);
305 case C_Segs::e_Std:
306 return GetSegs().GetStd().front()->GetLoc()[row]->GetStrand();
307 case C_Segs::e_Dendiag:
308 return GetSegs().GetDendiag().front()->GetSeqStrand(row);
309 default:
310 NCBI_THROW(CSeqalignException, eUnsupported,
311 "CSeq_align::GetSeqStrand() currently does not handle "
312 "this type of alignment.");
313 }
314 }
315
316
GetSeq_id(TDim row) const317 const CSeq_id& CSeq_align::GetSeq_id(TDim row) const
318 {
319 switch (GetSegs().Which()) {
320 case C_Segs::e_Denseg:
321 return GetSegs().GetDenseg().GetSeq_id(row);
322
323 case C_Segs::e_Sparse:
324 return GetSegs().GetSparse().GetSeq_id(row);
325
326 case C_Segs::e_Dendiag:
327 {{
328 // If segments have different number of rows, this will try
329 // to find the segment with enough rows to get the id.
330 ITERATE(CSeq_align::C_Segs::TDendiag, seg, GetSegs().GetDendiag()) {
331 if( (*seg)->IsSetIds() &&
332 (size_t)row < (*seg)->GetIds().size() ) {
333 return *(*seg)->GetIds()[row];
334 }
335 }
336 break;
337 }}
338
339 case C_Segs::e_Std:
340 {{
341 // If segments have different number of rows, this will try
342 // to find the segment with enough rows to get the id.
343 ITERATE(CSeq_align::C_Segs::TStd, seg, GetSegs().GetStd()) {
344 if ( (*seg)->IsSetIds() &&
345 (size_t)row < (*seg)->GetIds().size()) {
346 return *((*seg)->GetIds()[row]);
347 }
348 else if ( (*seg)->IsSetLoc() &&
349 (size_t)row < (*seg)->GetLoc().size() ) {
350 const CSeq_loc& loc = *(*seg)->GetLoc()[row];
351 CConstRef<CSeq_id> id(loc.GetId());
352 if (id) {
353 return *id;
354 }
355 }
356 }
357 }}
358 break;
359
360 case C_Segs::e_Disc:
361 {{
362 // Try to find a sub-alignment for which we can get a
363 // Seq-id for this row.
364 ITERATE (CSeq_align_set::Tdata, sub_aln, GetSegs().GetDisc().Get()) {
365 try {
366 const CSeq_id& rv = (*sub_aln)->GetSeq_id(row);
367 return rv;
368 }
369 catch (const CSeqalignException&) {
370 }
371 }
372 }}
373 break;
374
375 case C_Segs::e_Spliced:
376 {{
377 // Technically, there is no row order for product and product.
378 // However, in spliced seg, since product comes first, we assign it
379 // as row 0.
380 // Hence, the genomic is assigned to row 1.
381 const C_Segs::TSpliced& spliced_seg = GetSegs().GetSpliced();
382 if (row == 0 && spliced_seg.IsSetProduct_id()) {
383 return spliced_seg.GetProduct_id();
384 } else if (row == 1 && spliced_seg.IsSetGenomic_id()) {
385 return spliced_seg.GetGenomic_id();
386 }
387 }}
388 break;
389
390 default:
391 NCBI_THROW(CSeqalignException, eUnsupported,
392 "CSeq_align::GetSeq_id() currently does not handle "
393 "this type of alignment.");
394 }
395
396 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
397 "CSeq_align::GetSeq_id(): "
398 "can not get seq-id for the row requested.");
399 // return CSeq_id();
400 }
401
402
403 typedef SStaticPair<CSeq_align::EScoreType, const char*> TScoreNamePair;
404 static const TScoreNamePair sc_ScoreNames[] = {
405 { CSeq_align::eScore_Score, "score" },
406 { CSeq_align::eScore_Blast, "score" },
407 { CSeq_align::eScore_BitScore, "bit_score" },
408 { CSeq_align::eScore_EValue, "e_value" },
409 { CSeq_align::eScore_AlignLength, "align_length" },
410 { CSeq_align::eScore_IdentityCount, "num_ident" },
411 { CSeq_align::eScore_PositiveCount, "num_positives" },
412 { CSeq_align::eScore_NegativeCount, "num_negatives" },
413 { CSeq_align::eScore_MismatchCount, "num_mismatch" },
414 { CSeq_align::eScore_GapCount, "gap_count" },
415 { CSeq_align::eScore_PercentIdentity_Gapped, "pct_identity_gap" },
416 { CSeq_align::eScore_PercentIdentity_Ungapped, "pct_identity_ungap" },
417 { CSeq_align::eScore_PercentIdentity_GapOpeningOnly, "pct_identity_gapopen_only" },
418 { CSeq_align::eScore_PercentCoverage, "pct_coverage" },
419 { CSeq_align::eScore_SumEValue, "sum_e" },
420 { CSeq_align::eScore_CompAdjMethod, "comp_adjustment_method" },
421 { CSeq_align::eScore_HighQualityPercentCoverage, "pct_coverage_hiqual" },
422 { CSeq_align::eScore_Matches, "matches"},
423 { CSeq_align::eScore_OverallIdentity, "identity"},
424 { CSeq_align::eScore_Splices, "splices"},
425 { CSeq_align::eScore_ConsensusSplices, "consensus_splices"},
426 { CSeq_align::eScore_ProductCoverage, "product_coverage"},
427 { CSeq_align::eScore_ExonIdentity, "exon_identity"}
428 };
429
430 static const char* const sc_ScoreHelpText[] = {
431 "Blast score",
432 "Blast score",
433 "Blast-style bit score",
434 "Blast-style e-value",
435 "Length of the aligned segments, including the length of all gap segments",
436 "Count of identities",
437 "Count of positives; protein-to-DNA score",
438 "Count of negatives; protein-to-DNA score",
439 "Count of mismatches",
440 "Number of gaps in the alignment",
441 "Percent identity (0.0-100.0); count each base in a gap in any row as a mismatch",
442 "Percent identity (0.0-100.0); don't count gaps",
443 "Percent identity (0.0-100.0); count a gap of any length in any row as a mismatch of length 1",
444 "Percentage of query sequence aligned to subject (0.0-100.0)",
445 "Blast-style sum_e",
446 "Composition-adjustment method from BLAST",
447 "Percent coverage (0.0-100.0) of high quality region",
448 "Count of identities",
449 "Percent identity; count gaps within exons and introns on product",
450 "Number of splices; 2 x number of introns that have no gap on product",
451 "Number of splices with consensus splice sequence",
452 "Percentage of query sequence aligned to subject (0.0-100.0)",
453 "Percent identity; count gaps within exons only"
454 };
455
456 static const bool sc_IsInteger[] = {
457 true, // eScore_Score
458 true, // eScore_Blast
459 false, // eScore_BitScore
460 false, // eScore_EValue
461 true, // eScore_AlignLength
462 true, // eScore_IdentityCount
463 true, // eScore_PositiveCount
464 true, // eScore_NegativeCount
465 true, // eScore_MismatchCount
466 true, // eScore_GapCount
467 false, // eScore_PercentIdentity_Gapped
468 false, // eScore_PercentIdentity_Ungapped
469 false, // eScore_PercentIdentity_GapOpeningOnly
470 false, // eScore_PercentCoverage
471 false, // eScore_SumEValue
472 false, // eScore_CompAdjMethod
473 false, // eScore_HighQualityPercentCoverage
474 true, // eScore_Matches
475 false, // eScore_OverallIdentity
476 true, // eScore_Splices
477 true, // eScore_ConsensusSplices
478 false, // eScore_ProductCoverage
479 false // eScore_ExonIdentity
480 };
481
482
ScoreNameMap()483 const CSeq_align::TScoreNameMap &CSeq_align::ScoreNameMap()
484 {
485 DEFINE_STATIC_FAST_MUTEX(s_ScoreNameMap_mutex);
486 CFastMutexGuard guard(s_ScoreNameMap_mutex);
487 {
488 static TScoreNameMap m_ScoreNameMap;
489 if (m_ScoreNameMap.empty()) {
490 /// initialize map
491 for(unsigned score = eScore_Blast;
492 score <= eScore_ExonIdentity; ++score)
493 {
494 m_ScoreNameMap[sc_ScoreNames[score].second] =
495 sc_ScoreNames[score].first;
496 }
497 }
498
499 return m_ScoreNameMap;
500 }
501 }
502
ScoreName(EScoreType score)503 string CSeq_align::ScoreName(EScoreType score)
504 {
505 return sc_ScoreNames[score].second;
506 }
507
HelpText(EScoreType score)508 string CSeq_align::HelpText(EScoreType score)
509 {
510 return sc_ScoreHelpText[score];
511 }
512
IsIntegerScore(EScoreType score)513 bool CSeq_align::IsIntegerScore(EScoreType score)
514 {
515 return sc_IsInteger[score];
516 }
517
518
519 /// retrieve a named score object
x_GetNamedScore(const string & name) const520 CConstRef<CScore> CSeq_align::x_GetNamedScore(const string& name) const
521 {
522 CConstRef<CScore> score;
523 if (IsSetScore()) {
524 ITERATE (TScore, iter, GetScore()) {
525 if ((*iter)->IsSetId() && (*iter)->GetId().IsStr() &&
526 (*iter)->GetId().GetStr() == name) {
527 score = *iter;
528 break;
529 }
530 }
531 }
532
533 return score;
534 }
535
536
x_SetNamedScore(const string & name)537 CRef<CScore> CSeq_align::x_SetNamedScore(const string& name)
538 {
539 CRef<CScore> score;
540 if (IsSetScore()) {
541 NON_CONST_ITERATE (TScore, iter, SetScore()) {
542 if ((*iter)->IsSetId() && (*iter)->GetId().IsStr() &&
543 (*iter)->GetId().GetStr() == name) {
544 score = *iter;
545 break;
546 }
547 }
548 }
549
550 if ( !score ) {
551 score.Reset(new CScore);
552 score->SetId().SetStr(name);
553 SetScore().push_back(score);
554 }
555
556 return score;
557 }
558
559
560 ///---------------------------------------------------------------------------
561 /// PRE : name of score to return
562 /// POST: whether or not we found that score; score converted to int
GetNamedScore(const string & id,int & score) const563 bool CSeq_align::GetNamedScore(const string &id, int &score) const
564 {
565 CConstRef<CScore> ref = x_GetNamedScore(id);
566 if (ref) {
567 if (ref->GetValue().IsInt()) {
568 score = ref->GetValue().GetInt();
569 } else {
570 score = (int)ref->GetValue().GetReal();
571 }
572 return true;
573 }
574 return false;
575 }
576
577 ///---------------------------------------------------------------------------
578 /// PRE : name of score to return
579 /// POST: whether or not we found that score; score converted to double
GetNamedScore(const string & id,double & score) const580 bool CSeq_align::GetNamedScore(const string &id, double &score) const
581 {
582 CConstRef<CScore> ref = x_GetNamedScore(id);
583 if (ref) {
584 if (ref->GetValue().IsInt()) {
585 score = (double)ref->GetValue().GetInt();
586 } else {
587 score = ref->GetValue().GetReal();
588 }
589 return true;
590 }
591 return false;
592 }
593
594
GetNamedScore(EScoreType type,int & score) const595 bool CSeq_align::GetNamedScore(EScoreType type, int& score) const
596 {
597 return GetNamedScore(sc_ScoreNames[type].second, score);
598 }
599
GetNamedScore(EScoreType type,double & score) const600 bool CSeq_align::GetNamedScore(EScoreType type, double& score) const
601 {
602 return GetNamedScore(sc_ScoreNames[type].second, score);
603 }
604
605
ResetNamedScore(const string & name)606 void CSeq_align::ResetNamedScore(const string& name)
607 {
608 if (IsSetScore()) {
609 NON_CONST_ITERATE (TScore, iter, SetScore()) {
610 if ((*iter)->IsSetId() && (*iter)->GetId().IsStr() &&
611 (*iter)->GetId().GetStr() == name) {
612 SetScore().erase(iter);
613 break;
614 }
615 }
616 }
617 }
618
ResetNamedScore(EScoreType type)619 void CSeq_align::ResetNamedScore(EScoreType type)
620 {
621 ResetNamedScore(sc_ScoreNames[type].second);
622 }
623
SetNamedScore(EScoreType type,int score)624 void CSeq_align::SetNamedScore(EScoreType type, int score)
625 {
626 CRef<CScore> ref = x_SetNamedScore(sc_ScoreNames[type].second);
627 ref->SetValue().SetInt(score);
628 }
629
SetNamedScore(EScoreType type,double score)630 void CSeq_align::SetNamedScore(EScoreType type, double score)
631 {
632 CRef<CScore> ref = x_SetNamedScore(sc_ScoreNames[type].second);
633 ref->SetValue().SetReal(score);
634 }
635
SetNamedScore(const string & id,int score)636 void CSeq_align::SetNamedScore(const string& id, int score)
637 {
638 CRef<CScore> ref = x_SetNamedScore(id);
639 ref->SetValue().SetInt(score);
640 }
641
SetNamedScore(const string & id,double score)642 void CSeq_align::SetNamedScore(const string& id, double score)
643 {
644 CRef<CScore> ref = x_SetNamedScore(id);
645 ref->SetValue().SetReal(score);
646 }
647
648
Validate(bool full_test) const649 void CSeq_align::Validate(bool full_test) const
650 {
651 switch (GetSegs().Which()) {
652 case TSegs::e_Dendiag:
653 ITERATE(TSegs::TDendiag, dendiag_it, GetSegs().GetDendiag()) {
654 (*dendiag_it)->Validate();
655 }
656 break;
657 case C_Segs::e_Denseg:
658 GetSegs().GetDenseg().Validate(full_test);
659 break;
660 case C_Segs::e_Disc:
661 ITERATE(CSeq_align_set::Tdata, seq_align_it, GetSegs().GetDisc().Get()) {
662 (*seq_align_it)->Validate(full_test);
663 }
664 break;
665 case C_Segs::e_Std:
666 CheckNumRows();
667 break;
668 case C_Segs::e_Spliced:
669 GetSegs().GetSpliced().Validate(full_test);
670 break;
671 case C_Segs::e_Sparse:
672 GetSegs().GetSparse().Validate(full_test);
673 break;
674 default:
675 NCBI_THROW(CSeqalignException, eUnsupported,
676 "CSeq_align::Validate() currently does not handle "
677 "this type of alignment");
678 }
679 }
680
681
682 ///---------------------------------------------------------------------------
683 /// PRE : currently only implemented for dense-seg segments
684 /// POST: same alignment, opposite orientation
Reverse(void)685 void CSeq_align::Reverse(void)
686 {
687 switch (GetSegs().Which()) {
688 case C_Segs::e_Denseg:
689 SetSegs().SetDenseg().Reverse();
690 break;
691 default:
692 NCBI_THROW(CSeqalignException, eUnsupported,
693 "CSeq_align::Reverse() currently only handles dense-seg "
694 "alignments");
695 }
696 }
697
698 ///---------------------------------------------------------------------------
699 /// PRE : currently only implemented for dense-seg segments; two row numbers
700 /// POST: same alignment, position of the two rows has been swapped
SwapRows(TDim row1,TDim row2)701 void CSeq_align::SwapRows(TDim row1, TDim row2)
702 {
703 switch (GetSegs().Which()) {
704 case C_Segs::e_Denseg:
705 SetSegs().SetDenseg().SwapRows(row1, row2);
706 break;
707
708 case C_Segs::e_Std:
709 NON_CONST_ITERATE (C_Segs::TStd, it, SetSegs().SetStd()) {
710 (*it)->SwapRows(row1, row2);
711 }
712 break;
713
714 case C_Segs::e_Disc:
715 SetSegs().SetDisc().SwapRows(row1, row2);
716 break;
717 default:
718 NCBI_THROW(CSeqalignException, eUnsupported,
719 "CSeq_align::SwapRows currently only handles dense-seg "
720 "alignments");
721 }
722 }
723
724 ///----------------------------------------------------------------------------
725 /// PRE : the Seq-align has StdSeg segs
726 /// POST: Seq_align of type Dense-seg is created with m_Widths if necessary
727 CRef<CSeq_align>
CreateDensegFromStdseg(SSeqIdChooser * SeqIdChooser) const728 CSeq_align::CreateDensegFromStdseg(SSeqIdChooser* SeqIdChooser) const
729 {
730 if ( !GetSegs().IsStd() ) {
731 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
732 "CSeq_align::CreateDensegFromStdseg(): "
733 "Input Seq-align should have segs of type StdSeg!");
734 }
735
736 CRef<CSeq_align> sa(new CSeq_align);
737 sa->SetType(eType_not_set);
738 if (IsSetScore()) {
739 sa->SetScore() = GetScore();
740 }
741 CDense_seg& ds = sa->SetSegs().SetDenseg();
742
743 typedef CDense_seg::TDim TNumrow;
744 typedef CDense_seg::TNumseg TNumseg;
745
746 vector<TSeqPos> row_lens;
747 CDense_seg::TLens& seg_lens = ds.SetLens();
748 CDense_seg::TStarts& starts = ds.SetStarts();
749 CDense_seg::TStrands& strands = ds.SetStrands();
750 CDense_seg::TWidths& widths = ds.SetWidths();
751 vector<bool> widths_determined;
752
753 TSeqPos row_len;
754 TSeqPos from, to;
755 ENa_strand strand;
756
757
758 TNumseg seg = 0;
759 TNumrow dim = 0, row = 0;
760 ITERATE (C_Segs::TStd, std_i, GetSegs().GetStd()) {
761
762 const CStd_seg& ss = **std_i;
763
764 seg_lens.push_back(0);
765 TSeqPos& seg_len = seg_lens.back();
766 row_lens.clear();
767 widths_determined.push_back(false);
768
769 row = 0;
770 ITERATE (CStd_seg::TLoc, i, ss.GetLoc()) {
771
772 const CSeq_id* seq_id = 0;
773 // push back initialization values
774 if (seg == 0) {
775 widths.push_back(0);
776 strands.push_back(eNa_strand_unknown);
777 }
778
779 if ((*i)->IsInt()) {
780 const CSeq_interval& interval = (*i)->GetInt();
781
782 // determine start and len
783 from = interval.GetFrom();
784 to = interval.GetTo();
785 starts.push_back(from);
786 row_len = to - from + 1;
787 row_lens.push_back(row_len);
788
789 int width = 0;
790 // try to determine/check the seg_len and width
791 if (!seg_len) {
792 width = 0;
793 seg_len = row_len;
794 } else {
795 if (row_len * 3 == seg_len) {
796 seg_len /= 3;
797 width = 1;
798 } else if (row_len / 3 == seg_len) {
799 width = 3;
800 } else if (row_len != seg_len) {
801 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
802 "CreateDensegFromStdseg(): "
803 "Std-seg segment lengths not accurate!");
804 }
805 }
806 if (width > 0) {
807 widths_determined[seg] = true;
808 if (widths[row] > 0 && widths[row] != width) {
809 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
810 "CreateDensegFromStdseg(): "
811 "Std-seg segment lengths not accurate!");
812 } else {
813 widths[row] = width;
814 }
815 }
816
817 // get the id
818 seq_id = &(*i)->GetInt().GetId();
819
820 // determine/check the strand
821 if (interval.CanGetStrand()) {
822 strand = interval.GetStrand();
823 if (seg == 0 || strands[row] == eNa_strand_unknown) {
824 strands[row] = strand;
825 } else {
826 if (strands[row] != strand) {
827 NCBI_THROW(CSeqalignException,
828 eInvalidInputAlignment,
829 "CreateDensegFromStdseg(): "
830 "Inconsistent strands!");
831 }
832 }
833 }
834 } else if ((*i)->IsEmpty()) {
835 starts.push_back(-1);
836 if (seg == 0) {
837 strands[row] = eNa_strand_unknown;
838 }
839 seq_id = &(*i)->GetEmpty();
840 row_lens.push_back(0);
841 }
842
843 // determine/check the id
844 if (seg == 0) {
845 CRef<CSeq_id> id(new CSeq_id);
846 SerialAssign(*id, *seq_id);
847 ds.SetIds().push_back(id);
848 } else {
849 CSeq_id& id = *ds.SetIds()[row];
850 if (!SerialEquals(id, *seq_id)) {
851 if (SeqIdChooser) {
852 SeqIdChooser->ChooseSeqId(id, *seq_id);
853 } else {
854 string errstr =
855 string("CreateDensegFromStdseg(): Seq-ids: ") +
856 id.AsFastaString() + " and " +
857 seq_id->AsFastaString() + " are not identical!" +
858 " Without the OM it cannot be determined if they belong to"
859 " the same sequence."
860 " Define and pass ChooseSeqId to resolve seq-ids.";
861 NCBI_THROW(CSeqalignException, eInvalidInputAlignment, errstr);
862 }
863 }
864 }
865
866 // next row
867 row++;
868 if (seg == 0) {
869 dim++;
870 }
871 }
872 if (dim != ss.GetDim() || row != dim) {
873 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
874 "CreateDensegFromStdseg(): "
875 "Inconsistent dimentions!");
876 }
877
878 if (widths_determined[seg]) {
879 // go back and determine/check widths
880 for (row = 0; row < dim; row++) {
881 if ((row_len = row_lens[row]) > 0) {
882 int width = 0;
883 if (row_len == seg_len * 3) {
884 width = 3;
885 } else if (row_len == seg_len) {
886 width = 1;
887 }
888 if (widths[row] > 0 && widths[row] != width) {
889 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
890 "CreateDensegFromStdseg(): "
891 "Std-seg segment lengths not accurate!");
892 } else {
893 widths[row] = width;
894 }
895 }
896 }
897 }
898
899 // next seg
900 seg++;
901 }
902
903 ds.SetDim(dim);
904 ds.SetNumseg(seg);
905
906 // go back and finish lens determination
907 bool widths_failure = false;
908 bool widths_success = false;
909 for (seg = 0; seg < ds.GetNumseg(); seg++) {
910 if (!widths_determined[seg]) {
911 for(row = 0; row < dim; row++) {
912 if (starts[seg * dim + row] >= 0) {
913 int width = widths[row];
914 if (width == 3) {
915 seg_lens[seg] /= 3;
916 } else if (width == 0) {
917 widths_failure = true;
918 }
919 break;
920 }
921 }
922 } else {
923 widths_success = true;
924 }
925 }
926
927 if (widths_failure) {
928 if (widths_success) {
929 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
930 "CreateDensegFromStdseg(): "
931 "Some widths cannot be determined!");
932 } else {
933 ds.ResetWidths();
934 }
935 }
936
937 // finish the strands
938 for (seg = 1; seg < ds.GetNumseg(); seg++) {
939 for (row = 0; row < dim; row++) {
940 strands.push_back(strands[row]);
941 }
942 }
943
944 return sa;
945 }
946
947
948 ///----------------------------------------------------------------------------
949 /// PRE : the Seq-align is a Dense-seg of aligned nucleotide sequences
950 /// POST: Seq_align of type Dense-seg is created with each of the m_Widths
951 /// equal to 3 and m_Lengths devided by 3.
952 CRef<CSeq_align>
CreateTranslatedDensegFromNADenseg() const953 CSeq_align::CreateTranslatedDensegFromNADenseg() const
954 {
955 if ( !GetSegs().IsDenseg() ) {
956 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
957 "CSeq_align::CreateTranslatedDensegFromNADenseg(): "
958 "Input Seq-align should have segs of type Dense-seg!");
959 }
960
961 CRef<CSeq_align> sa(new CSeq_align);
962 sa->SetType(eType_not_set);
963
964 if (GetSegs().GetDenseg().IsSetWidths()) {
965 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
966 "CSeq_align::CreateTranslatedDensegFromNADenseg(): "
967 "Widths already exist for the original alignment");
968 }
969
970 // copy from the original
971 sa->Assign(*this);
972
973 CDense_seg& ds = sa->SetSegs().SetDenseg();
974
975 // fix the lengths
976 const CDense_seg::TLens& orig_lens = GetSegs().GetDenseg().GetLens();
977 CDense_seg::TLens& lens = ds.SetLens();
978
979 for (CDense_seg::TNumseg numseg = 0; numseg < ds.GetNumseg(); numseg++) {
980 if (orig_lens[numseg] % 3) {
981 string errstr =
982 string("CSeq_align::CreateTranslatedDensegFromNADenseg(): ") +
983 "Length of segment " + NStr::IntToString(numseg) +
984 " is not divisible by 3.";
985 NCBI_THROW(CSeqalignException, eInvalidInputAlignment, errstr);
986 } else {
987 lens[numseg] = orig_lens[numseg] / 3;
988 }
989 }
990
991 // add the widths
992 ds.SetWidths().resize(ds.GetDim(), 3);
993
994 #if _DEBUG
995 ds.Validate(true);
996 #endif
997
998 return sa;
999 }
1000
1001
1002 /// Strict weak ordering for pairs (by first)
1003 /// Used by CreateDensegFromDisc
1004 template <typename T, typename Pred = less<TSeqPos> >
1005 struct ds_cmp {
operator ()ds_cmp1006 bool operator()(const T& x, const T& y) {
1007 return m_Pred(x.first, y.first);
1008 }
1009 private:
1010 Pred m_Pred;
1011 };
1012
1013
1014 CRef<CSeq_align>
CreateDensegFromDisc(SSeqIdChooser * SeqIdChooser) const1015 CSeq_align::CreateDensegFromDisc(SSeqIdChooser* SeqIdChooser) const
1016 {
1017 if ( !GetSegs().IsDisc() ) {
1018 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
1019 "CSeq_align::CreateDensegFromDisc(): "
1020 "Input Seq-align should have segs of type StdSeg!");
1021 }
1022
1023 CRef<CSeq_align> new_sa(new CSeq_align);
1024 new_sa->SetType(eType_not_set);
1025 if (IsSetScore()) {
1026 new_sa->SetScore() = GetScore();
1027 }
1028
1029 CDense_seg& new_ds = new_sa->SetSegs().SetDenseg();
1030
1031 new_ds.SetDim(0);
1032 new_ds.SetNumseg(0);
1033
1034
1035 /// Order the discontinuous densegs
1036 typedef pair<TSeqPos, const CDense_seg *> TPosDsPair;
1037 typedef vector<TPosDsPair> TDsVec;
1038 TDsVec ds_vec;
1039 ds_vec.reserve(GetSegs().GetDisc().Get().size());
1040 int strand = -1;
1041 ITERATE (CSeq_align_set::Tdata, sa_i, GetSegs().GetDisc().Get()) {
1042 const CDense_seg& ds = (*sa_i)->GetSegs().GetDenseg();
1043 ds_vec.push_back(make_pair<TSeqPos, const CDense_seg *>(ds.GetSeqStart(0), &ds));
1044 if (ds.IsSetStrands() &&
1045 !ds.GetStrands().empty()) {
1046 if (strand < 0) {
1047 strand = ds.GetStrands()[0];
1048 } else {
1049 if (strand != ds.GetStrands()[0]) {
1050 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
1051 "CreateDensegFromDisc(): "
1052 "Inconsistent strands!");
1053 }
1054 }
1055 }
1056 }
1057 if ( !IsReverse(ENa_strand(strand)) ) {
1058 sort(ds_vec.begin(), ds_vec.end(),
1059 ds_cmp<TPosDsPair>());
1060 } else {
1061 sort(ds_vec.begin(), ds_vec.end(),
1062 ds_cmp<TPosDsPair, greater<TSeqPos> >());
1063 }
1064
1065
1066 /// First pass: determine dim & numseg
1067 CDense_seg::TStrands single_segment_strands;
1068 ITERATE(TDsVec, ds_i, ds_vec) {
1069 const CDense_seg& ds = *ds_i->second;
1070
1071 /// Numseg
1072 new_ds.SetNumseg() += ds.GetNumseg();
1073
1074 /// Dim
1075 if ( !new_ds.GetDim() ) {
1076 new_ds.SetDim(ds.GetDim());
1077 } else if (new_ds.GetDim() != ds.GetDim() ) {
1078 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
1079 "CreateDensegFromDisc(): "
1080 "All disc dense-segs need to have the same dimension!");
1081 }
1082
1083 /// Strands?
1084 if ( !ds.GetStrands().empty() ) {
1085 if (single_segment_strands.empty()) {
1086 single_segment_strands.assign(ds.GetStrands().begin(),
1087 ds.GetStrands().begin() + ds.GetDim());
1088 } else {
1089 if ( !equal(single_segment_strands.begin(),
1090 single_segment_strands.end(),
1091 ds.GetStrands().begin()) ) {
1092 NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
1093 "CreateDensegFromDisc(): "
1094 "All disc dense-segs need to have the same strands!");
1095 }
1096 }
1097 }
1098 }
1099
1100 new_ds.SetStarts().resize(new_ds.GetDim() * new_ds.GetNumseg());
1101 new_ds.SetLens().resize(new_ds.GetNumseg());
1102 if ( !single_segment_strands.empty() ) {
1103 /// Multiply the strands by the number of segments.
1104 new_ds.SetStrands().reserve(new_ds.GetDim() * new_ds.GetNumseg());
1105 for (CDense_seg::TNumseg seg = 0;
1106 seg < new_ds.GetNumseg();
1107 ++seg) {
1108 new_ds.SetStrands().insert(new_ds.SetStrands().end(),
1109 single_segment_strands.begin(),
1110 single_segment_strands.begin() + new_ds.GetDim());
1111 }
1112 }
1113
1114
1115 /// Second pass: validate and set ids and starts
1116
1117 CDense_seg::TNumseg new_seg = 0;
1118 int new_starts_i = 0;
1119
1120 ITERATE(TDsVec, ds_i, ds_vec) {
1121 const CDense_seg& ds = *ds_i->second;
1122
1123 _ASSERT(ds.GetStarts().size() == (size_t)ds.GetNumseg() * ds.GetDim());
1124 _ASSERT(new_ds.GetDim() == ds.GetDim());
1125
1126 /// Ids
1127 if (new_ds.GetIds().empty()) {
1128 new_ds.SetIds().resize(new_ds.GetDim());
1129 for (CDense_seg::TDim row = 0; row < ds.GetDim(); ++row) {
1130 CRef<CSeq_id> id(new CSeq_id);
1131 SerialAssign(*id, *ds.GetIds()[row]);
1132 new_ds.SetIds()[row] = id;
1133 }
1134 } else {
1135 if (SeqIdChooser) {
1136 for (CDense_seg::TDim row = 0; row < ds.GetDim(); ++row) {
1137 SeqIdChooser->ChooseSeqId(*new_ds.SetIds()[row], *ds.GetIds()[row]);
1138 }
1139 } else {
1140 #if _DEBUG
1141 for (CDense_seg::TDim row = 0; row < ds.GetDim(); ++row) {
1142 const CSeq_id& new_id = *new_ds.GetIds()[row];
1143 const CSeq_id& id = *ds.GetIds()[row];
1144 if ( !SerialEquals(new_id, id) ) {
1145 string errstr =
1146 string("CreateDensegFromDisc(): Seq-ids: ") +
1147 new_id.AsFastaString() + " and " +
1148 id.AsFastaString() + " are not identical!" +
1149 " Without the OM it cannot be determined if they belong to"
1150 " the same sequence."
1151 " Define and pass ChooseSeqId to resolve seq-ids.";
1152 NCBI_THROW(CSeqalignException, eInvalidInputAlignment, errstr);
1153 }
1154 }
1155 #endif
1156 }
1157 }
1158
1159 /// Starts
1160 int starts_i = 0;
1161 for (CDense_seg::TNumseg seg = 0;
1162 seg < ds.GetNumseg();
1163 ++new_seg, ++seg) {
1164
1165 new_ds.SetLens()[new_seg] = ds.GetLens()[seg];
1166
1167 for (CDense_seg::TDim row = 0;
1168 row < ds.GetDim();
1169 ++starts_i, ++new_starts_i, ++row) {
1170 new_ds.SetStarts()[new_starts_i] = ds.GetStarts()[starts_i];
1171 }
1172 }
1173 }
1174
1175
1176 /// Perform full test (including segment starts consistency check)
1177 new_sa->Validate(true);
1178
1179
1180 return new_sa;
1181 }
1182
s_Distance(const TSeqRange & range1,const TSeqRange & range2)1183 static TSeqPos s_Distance(const TSeqRange &range1,
1184 const TSeqRange &range2)
1185 {
1186 if (range1.IntersectingWith(range2)) {
1187 return 0;
1188 }
1189 return max(range1,range2).GetFrom() - min(range1,range2).GetTo();
1190 }
1191
1192 /// Join the alignments in the set into one alignment.
s_GetJoinedAlignment(const CSeq_align_set & aligns)1193 static CRef<CSeq_align> s_GetJoinedAlignment(const CSeq_align_set &aligns)
1194 {
1195 if (aligns.Get().size() == 1) {
1196 return aligns.Get().front();
1197 }
1198
1199 /// There's more than one, so create a disc alignment
1200 CRef<CSeq_align> align(new CSeq_align);
1201 align->SetDim(2);
1202 align->SetType(CSeq_align::eType_partial);
1203 align->SetSegs().SetDisc().Set() = aligns.Get();
1204
1205 try {
1206 align->Validate(true);
1207 }
1208 catch (CException& e) {
1209 ERR_POST(Error << e);
1210 cerr << MSerial_AsnText << aligns;
1211 cerr << MSerial_AsnText << *align;
1212 throw;
1213 }
1214 return align;
1215 }
1216
SplitOnLongDiscontinuity(list<CRef<CSeq_align>> & aligns,TSeqPos discontinuity_threshold) const1217 void CSeq_align::SplitOnLongDiscontinuity(list< CRef<CSeq_align> >& aligns,
1218 TSeqPos discontinuity_threshold) const
1219 {
1220 if (IsSetDim() && GetDim() > 2) {
1221 NCBI_THROW(CException, eUnknown,
1222 "SplitOnLongDiscontinuity() only implemented for pairwise alignments");
1223 }
1224
1225 size_t num_splits = 0;
1226 switch (GetSegs().Which()) {
1227 case TSegs::e_Denseg:
1228 {{
1229 // determine if we can extract any slice based on a discontinuity
1230 const CDense_seg& seg = GetSegs().GetDenseg();
1231 CDense_seg::TNumseg last_seg = 0;
1232 CDense_seg::TNumseg curr_seg = 0;
1233 TSeqRange curr_range0;
1234 TSeqRange curr_range1;
1235 for ( ; curr_seg < seg.GetNumseg(); ++curr_seg) {
1236 TSignedSeqPos p0_curr = seg.GetStarts()[ curr_seg * 2 ];
1237 TSignedSeqPos p1_curr = seg.GetStarts()[ curr_seg * 2 + 1];
1238 if (p0_curr < 0 || p1_curr < 0) {
1239 continue;
1240 }
1241
1242 TSeqPos curr_seg_len = seg.GetLens()[curr_seg];
1243
1244 TSeqRange prev_range0 = curr_range0;
1245 TSeqRange prev_range1 = curr_range1;
1246 curr_range0 = TSeqRange(p0_curr, p0_curr+curr_seg_len);
1247 curr_range1 = TSeqRange(p1_curr, p1_curr+curr_seg_len);
1248
1249 if (prev_range0.Empty()) {
1250 /// We're at start of alignment, so there's no gap to check for
1251 continue;
1252 }
1253
1254 if (prev_range0.NotEmpty() &&
1255 (s_Distance(curr_range0, prev_range0) > discontinuity_threshold ||
1256 s_Distance(curr_range1, prev_range1) > discontinuity_threshold))
1257 {
1258 aligns.push_back(x_CreateSubsegAlignment(last_seg, curr_seg - 1));
1259 ++num_splits;
1260 last_seg = curr_seg;
1261 }
1262 }
1263 if (num_splits) {
1264 /// We had to split the alignment; create a final subseg alignment
1265 aligns.push_back(x_CreateSubsegAlignment(last_seg, curr_seg - 1));
1266 ++num_splits;
1267 }
1268 }}
1269 break;
1270
1271 case TSegs::e_Disc:
1272 {{
1273 const CSeq_align_set &segs = GetSegs().GetDisc();
1274 CSeq_align_set curr_disc;
1275 TSeqRange curr_range0;
1276 TSeqRange curr_range1;
1277 ITERATE (list< CRef<CSeq_align> >, seg_it, segs.Get()) {
1278 TSeqRange prev_range0 = curr_range0;
1279 TSeqRange prev_range1 = curr_range1;
1280 curr_range0 = (*seg_it)->GetSeqRange(0);
1281 curr_range1 = (*seg_it)->GetSeqRange(1);
1282 list< CRef<CSeq_align> > seg_splits;
1283 (*seg_it)->SplitOnLongDiscontinuity(seg_splits,
1284 discontinuity_threshold);
1285
1286 if (prev_range0.Empty() ||
1287 (s_Distance(curr_range0, prev_range0) <= discontinuity_threshold &&
1288 s_Distance(curr_range1, prev_range1) <= discontinuity_threshold))
1289 {
1290 /// distance between this alignment and previous one is less than
1291 /// threshold; first split section of this alignment should be joined
1292 /// to previous one
1293 curr_disc.Set().push_back(seg_splits.front());
1294 seg_splits.pop_front();
1295 if (seg_splits.empty()) {
1296 continue;
1297 }
1298 }
1299
1300 /// Put current contents of curr_disc in aligns
1301 aligns.push_back(s_GetJoinedAlignment(curr_disc));
1302 ++num_splits;
1303
1304 /// Put results of current section's split in aligns except for last
1305 /// section; place last section in curr_disc, since it may have to be
1306 /// joined with next one
1307 num_splits += seg_splits.size();
1308 curr_disc.Set().clear();
1309 curr_disc.Set().push_back(seg_splits.back());
1310 seg_splits.pop_back();
1311
1312 aligns.splice(aligns.end(), seg_splits);
1313 }
1314 if (num_splits) {
1315 /// We had to split the alignment; create a final alignment from
1316 /// the contents of curr_disc
1317 aligns.push_back(s_GetJoinedAlignment(curr_disc));
1318 ++num_splits;
1319 }
1320 }}
1321 break;
1322
1323 default:
1324 break;
1325 }
1326
1327 if (!num_splits) {
1328 aligns.push_back(CRef<CSeq_align>(const_cast<CSeq_align *>(this)));
1329 }
1330 }
1331
OffsetRow(TDim row,TSignedSeqPos offset)1332 void CSeq_align::OffsetRow(TDim row,
1333 TSignedSeqPos offset)
1334 {
1335 if (offset == 0) return;
1336
1337 switch (SetSegs().Which()) {
1338 case TSegs::e_Dendiag:
1339 NON_CONST_ITERATE(TSegs::TDendiag, dendiag_it, SetSegs().SetDendiag()) {
1340 (*dendiag_it)->OffsetRow(row, offset);
1341 }
1342 break;
1343 case TSegs::e_Denseg:
1344 SetSegs().SetDenseg().OffsetRow(row, offset);
1345 break;
1346 case TSegs::e_Std:
1347 NON_CONST_ITERATE(TSegs::TStd, std_it, SetSegs().SetStd()) {
1348 (*std_it)->OffsetRow(row, offset);
1349 }
1350 break;
1351 case TSegs::e_Disc:
1352 NON_CONST_ITERATE(CSeq_align_set::Tdata, seq_align_it, SetSegs().SetDisc().Set()) {
1353 (*seq_align_it)->OffsetRow(row, offset);
1354 }
1355 break;
1356 default:
1357 NCBI_THROW(CSeqalignException, eUnsupported,
1358 "CSeq_align::OffsetRow() currently does not handle "
1359 "this type of alignment");
1360 }
1361 }
1362
1363
1364 /// @deprecated
RemapToLoc(TDim row,const CSeq_loc & dst_loc,bool ignore_strand)1365 void CSeq_align::RemapToLoc(TDim row,
1366 const CSeq_loc& dst_loc,
1367 bool ignore_strand)
1368 {
1369 // Limit to certain types of locs:
1370 switch (dst_loc.Which()) {
1371 case CSeq_loc::e_Whole:
1372 return;
1373 case CSeq_loc::e_Int:
1374 break;
1375 default:
1376 NCBI_THROW(CSeqalignException, eUnsupported,
1377 "CSeq_align::RemapToLoc only supports int target seq-locs");
1378 }
1379
1380 switch (SetSegs().Which()) {
1381 case TSegs::e_Denseg:
1382 SetSegs().SetDenseg().RemapToLoc(row, dst_loc, ignore_strand);
1383 break;
1384 case TSegs::e_Std:
1385 NON_CONST_ITERATE(TSegs::TStd, std_it, SetSegs().SetStd()) {
1386 (*std_it)->RemapToLoc(row, dst_loc, ignore_strand);
1387 }
1388 break;
1389 case TSegs::e_Disc:
1390 NON_CONST_ITERATE(CSeq_align_set::Tdata, seq_align_it, SetSegs().SetDisc().Set()) {
1391 (*seq_align_it)->RemapToLoc(row, dst_loc, ignore_strand);
1392 }
1393 break;
1394 default:
1395 NCBI_THROW(CSeqalignException, eUnsupported,
1396 "CSeq_align::RemapToLoc only supports Dense-seg and Std-seg alignments.");
1397 }
1398 }
1399
1400
RemapAlignToLoc(const CSeq_align & align,CSeq_align::TDim row,const CSeq_loc & loc)1401 CRef<CSeq_align> RemapAlignToLoc(const CSeq_align& align,
1402 CSeq_align::TDim row,
1403 const CSeq_loc& loc)
1404 {
1405 if ( loc.IsWhole() ) {
1406 CRef<CSeq_align> copy(new CSeq_align);
1407 copy->Assign(align);
1408 return copy;
1409 }
1410 const CSeq_id* orig_id = loc.GetId();
1411 if ( !orig_id ) {
1412 NCBI_THROW(CAnnotMapperException, eBadLocation,
1413 "Location with multiple ids can not be used to "
1414 "remap seq-aligns.");
1415 }
1416 CRef<CSeq_id> id(new CSeq_id);
1417 id->Assign(*orig_id);
1418
1419 // Create source seq-loc
1420 TSeqPos len = 0;
1421 for (CSeq_loc_CI it(loc); it; ++it) {
1422 if ( it.IsWhole() ) {
1423 NCBI_THROW(CAnnotMapperException, eBadLocation,
1424 "Whole seq-loc can not be used to "
1425 "remap seq-aligns.");
1426 }
1427 len += it.GetRange().GetLength();
1428 }
1429 CSeq_loc src_loc(*id, 0, len - 1);
1430 ENa_strand strand = loc.GetStrand();
1431 if (strand != eNa_strand_unknown) {
1432 src_loc.SetStrand(strand);
1433 }
1434 CSeq_loc_Mapper_Base mapper(src_loc, loc);
1435 return mapper.Map(align, row);
1436 }
1437
1438 /// Get length of intersection between a range and a range collection
s_IntersectionLength(const CRangeCollection<TSeqPos> & ranges,const TSeqRange & range)1439 static inline TSeqPos s_IntersectionLength(const CRangeCollection<TSeqPos> &ranges,
1440 const TSeqRange &range)
1441 {
1442 TSeqPos length = 0;
1443 ITERATE (CRangeCollection<TSeqPos>, it, ranges) {
1444 length += it->IntersectionWith(range).GetLength();
1445 }
1446 return length;
1447 }
1448
1449 /// Retrieves the number of gaps in an alignment
1450 /// @param align Alignment to examine [in]
1451 /// @param get_total_count if true, the total number of gaps is retrived,
1452 /// otherwise only the number of gap openings is retrieved
1453 /// @throws CSeqalignException if alignment type is not supported
1454 static TSeqPos
s_GetGapCount(const CSeq_align & align,CSeq_align::TDim row,const CRangeCollection<TSeqPos> & ranges,bool get_total_count)1455 s_GetGapCount(const CSeq_align& align, CSeq_align::TDim row,
1456 const CRangeCollection<TSeqPos> &ranges,
1457 bool get_total_count)
1458 {
1459 if (ranges.empty()) {
1460 return 0;
1461 }
1462
1463 TSeqPos retval = 0;
1464 switch (align.GetSegs().Which()) {
1465 case CSeq_align::TSegs::e_Denseg:
1466 {{
1467 const CDense_seg& ds = align.GetSegs().GetDenseg();
1468 for (CDense_seg::TNumseg i = 0; i < ds.GetNumseg(); ++i) {
1469 bool is_gapped = false;
1470 for (CDense_seg::TDim j = 0; j < ds.GetDim(); ++j) {
1471 if (ds.GetStarts()[i * ds.GetDim() + j] == -1 &&
1472 (row < 0 || row == j))
1473 {
1474 is_gapped = true;
1475 break;
1476 }
1477 }
1478 if (is_gapped) {
1479 TSeqPos gap_len = ds.GetLens()[i];
1480 if (!ranges.begin()->IsWhole()) {
1481 TSignedSeqPos gap_start = ds.GetStarts()[i * ds.GetDim()];
1482 if (gap_start >= 0) {
1483 gap_len = s_IntersectionLength(ranges,
1484 TSeqRange(gap_start, gap_start + gap_len - 1));
1485 } else {
1486 gap_start = ds.GetStarts()[(i-1) * ds.GetDim()]
1487 + ds.GetLens()[i-1];
1488 if (s_IntersectionLength(ranges,
1489 TSeqRange(gap_start, gap_start)) == 0)
1490 {
1491 gap_len = 0;
1492 }
1493 }
1494 }
1495 retval += (get_total_count ? gap_len : (gap_len ? 1 : 0));
1496 }
1497 }
1498 }}
1499 break;
1500
1501 case CSeq_align::TSegs::e_Disc:
1502 {{
1503 ITERATE(CSeq_align::TSegs::TDisc::Tdata, iter,
1504 align.GetSegs().GetDisc().Get()) {
1505 retval += s_GetGapCount(**iter, row, ranges, get_total_count);
1506 }
1507 }}
1508 break;
1509
1510 case CSeq_align::TSegs::e_Spliced:
1511 {{
1512 ITERATE (CSpliced_seg::TExons, iter, align.GetSegs().GetSpliced().GetExons()) {
1513 for (CSeq_align::TDim r = 0; r < 2; ++r) {
1514 if (row != r) {
1515 CRangeCollection<TSeqPos> insertions =
1516 (*iter)->GetRowSeq_insertions(
1517 r, align.GetSegs().GetSpliced(), ranges);
1518 retval += get_total_count ? insertions.GetCoveredLength()
1519 : insertions.size();
1520 }
1521 }
1522 }
1523 }}
1524 break;
1525
1526 case CSeq_align::TSegs::e_Std:
1527 if (row < 0 && !get_total_count && ranges.begin()->IsWhole()) {
1528 ITERATE (CSeq_align::TSegs::TStd, iter, align.GetSegs().GetStd()) {
1529 const CStd_seg& seg = **iter;
1530 ITERATE (CStd_seg::TLoc, it, seg.GetLoc()) {
1531 const CSeq_loc& loc = **it;
1532 if (loc.IsEmpty()) {
1533 ++retval;
1534 break;
1535 }
1536 }
1537 }
1538 break;
1539 }
1540
1541 default:
1542 NCBI_THROW(CSeqalignException, eUnsupported,
1543 "CSeq_align::GetGapCount() currently does not handle "
1544 "this type of alignment.");
1545 }
1546
1547 return retval;
1548 }
1549
GetTotalGapCount(TDim row) const1550 TSeqPos CSeq_align::GetTotalGapCount(TDim row) const
1551 {
1552 return s_GetGapCount(*this, row,
1553 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()),
1554 true);
1555 }
1556
GetNumGapOpenings(TDim row) const1557 TSeqPos CSeq_align::GetNumGapOpenings(TDim row) const
1558 {
1559 return s_GetGapCount(*this, row,
1560 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()),
1561 false);
1562 }
1563
GetTotalGapCountWithinRange(const TSeqRange & range,TDim row) const1564 TSeqPos CSeq_align::GetTotalGapCountWithinRange(
1565 const TSeqRange &range, TDim row) const
1566 {
1567 return s_GetGapCount(*this, row, CRangeCollection<TSeqPos>(range), true);
1568 }
1569
GetNumGapOpeningsWithinRange(const TSeqRange & range,TDim row) const1570 TSeqPos CSeq_align::GetNumGapOpeningsWithinRange(
1571 const TSeqRange &range, TDim row) const
1572 {
1573 return s_GetGapCount(*this, row, CRangeCollection<TSeqPos>(range), false);
1574 }
1575
GetTotalGapCountWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1576 TSeqPos CSeq_align::GetTotalGapCountWithinRanges(
1577 const CRangeCollection<TSeqPos> &ranges, TDim row) const
1578 {
1579 return s_GetGapCount(*this, row, ranges, true);
1580 }
1581
GetNumGapOpeningsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1582 TSeqPos CSeq_align::GetNumGapOpeningsWithinRanges(
1583 const CRangeCollection<TSeqPos> &ranges, TDim row) const
1584 {
1585 return s_GetGapCount(*this, row, ranges, false);
1586 }
1587
AsString() const1588 string CSeq_align::SIndel::AsString() const
1589 {
1590 return NStr::NumericToString(genomic_pos)
1591 + (row == 0 ? 'I' : 'D')
1592 + NStr::NumericToString(length);
1593 }
1594
1595 static vector<CSeq_align::SIndel>
s_GetIndels(const CSeq_align & align,CSeq_align::TDim row,const CRangeCollection<TSeqPos> & ranges,bool include_frameshifts,bool include_non_frameshifts)1596 s_GetIndels(const CSeq_align& align, CSeq_align::TDim row,
1597 const CRangeCollection<TSeqPos> &ranges,
1598 bool include_frameshifts, bool include_non_frameshifts)
1599 {
1600 vector<CSeq_align::SIndel> results;
1601 if (ranges.empty()) {
1602 return results;;
1603 }
1604
1605 CRef<CSeq_align> generated_denseg;
1606 switch (align.GetSegs().Which()) {
1607 case CSeq_align::TSegs::e_Denseg:
1608 break;
1609
1610 case CSeq_align::TSegs::e_Disc:
1611 generated_denseg = align.CreateDensegFromDisc();
1612 break;
1613
1614 case CSeq_align::TSegs::e_Spliced:
1615 {{
1616 CRef<CSeq_align> as_disc_seg =
1617 align.GetSegs().GetSpliced().AsDiscSeg();
1618 if (align.GetSeqStart(1) > align.GetSeqStop(1)) {
1619 /// genomic sequence is circular; won't work on the entire
1620 /// alignment, need to check separately on the parts before and
1621 /// after the origin
1622 CSeq_align before_origin, after_origin;
1623 before_origin.SetType(CSeq_align::eType_disc);
1624 after_origin.SetType(CSeq_align::eType_disc);
1625 for (const CRef<CSeq_align> &segment
1626 : as_disc_seg->GetSegs().GetDisc().Get())
1627 {
1628 (segment->GetSeqStart(1) < align.GetSeqStart(1)
1629 ? after_origin : before_origin)
1630 . SetSegs().SetDisc().Set().push_back(segment);
1631 }
1632 vector<CSeq_align::SIndel> before_origin_frameshifts =
1633 s_GetIndels(before_origin, row, ranges, include_frameshifts, include_non_frameshifts),
1634 after_origin_frameshifts =
1635 s_GetIndels(after_origin, row, ranges, include_frameshifts, include_non_frameshifts);
1636 results = before_origin_frameshifts;
1637 results.insert(results.end(), after_origin_frameshifts.begin(),
1638 after_origin_frameshifts.end());
1639 return results;
1640 }
1641 generated_denseg = as_disc_seg->CreateDensegFromDisc();
1642 }}
1643 break;
1644
1645 default:
1646 NCBI_THROW(CSeqalignException, eUnsupported,
1647 "CSeq_align::GetNumFrameshifts() currently does not handle "
1648 "this type of alignment.");
1649 }
1650
1651 if (generated_denseg) {
1652 generated_denseg->SetSegs().SetDenseg().OrderAdjacentGaps();
1653 generated_denseg->SetSegs().SetDenseg().Compact();
1654 generated_denseg->SetSegs().SetDenseg().RemovePureGapSegs();
1655 }
1656
1657 const CDense_seg &ds = (generated_denseg ? *generated_denseg : align)
1658 . GetSegs().GetDenseg();
1659
1660 for (CDense_seg::TNumseg i = 0; i < ds.GetNumseg(); ++i) {
1661 bool is_gapped = false;
1662 for (CDense_seg::TDim j = 0; j < ds.GetDim(); ++j) {
1663 if (ds.GetStarts()[i * ds.GetDim() + j] == -1 && row != j)
1664 {
1665 is_gapped = true;
1666 break;
1667 }
1668 }
1669 if (is_gapped) {
1670 TSeqPos gap_len = ds.GetLens()[i];
1671 if (!ranges.begin()->IsWhole()) {
1672 TSignedSeqPos gap_start = ds.GetStarts()[i * ds.GetDim()];
1673 if (gap_start >= 0) {
1674 gap_len = s_IntersectionLength(ranges,
1675 TSeqRange(gap_start, gap_start + gap_len - 1));
1676 } else {
1677 gap_start = ds.GetStarts()[(i-1) * ds.GetDim()]
1678 + ds.GetLens()[i-1];
1679 if (s_IntersectionLength(ranges,
1680 TSeqRange(gap_start, gap_start)) == 0)
1681 {
1682 gap_len = 0;
1683 }
1684 }
1685 }
1686 if (gap_len > 0 &&
1687 ((include_frameshifts && gap_len % 3 != 0) ||
1688 (include_non_frameshifts && gap_len % 3 == 0)))
1689 {
1690 TSignedSeqPos genomic_gap_start = ds.GetStarts()[i*ds.GetDim() + 1];
1691 CSeq_align::TDim inserted_row = 1;
1692 if (genomic_gap_start < 0) {
1693 ENa_strand genomic_strand = ds.IsSetStrands() ? ds.GetStrands()[(i-1) * ds.GetDim() + 1] : eNa_strand_plus;
1694 genomic_gap_start = genomic_strand == eNa_strand_minus
1695 ? ds.GetStarts()[(i+1) * ds.GetDim() + 1]
1696 + ds.GetLens()[i+1]
1697 : ds.GetStarts()[(i-1) * ds.GetDim() + 1]
1698 + ds.GetLens()[i-1];
1699 inserted_row = 0;
1700 }
1701 results.push_back(CSeq_align::SIndel(genomic_gap_start, inserted_row, gap_len));
1702 }
1703 }
1704 }
1705
1706 return results;
1707 }
1708
GetNumFrameshifts(TDim row) const1709 TSeqPos CSeq_align::GetNumFrameshifts(TDim row) const
1710 {
1711 return s_GetIndels(*this, row,
1712 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), true, false)
1713 .size();
1714 }
1715
GetNumFrameshiftsWithinRange(const TSeqRange & range,TDim row) const1716 TSeqPos CSeq_align::GetNumFrameshiftsWithinRange(
1717 const TSeqRange &range, TDim row) const
1718 {
1719 return s_GetIndels(*this, row, CRangeCollection<TSeqPos>(range), true, false)
1720 .size();
1721 }
1722
GetNumFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1723 TSeqPos CSeq_align::GetNumFrameshiftsWithinRanges(
1724 const CRangeCollection<TSeqPos> &ranges, TDim row) const
1725 {
1726 return s_GetIndels(*this, row, ranges, true, false)
1727 .size();
1728 }
1729
GetFrameshifts(TDim row) const1730 vector<CSeq_align::SIndel> CSeq_align::GetFrameshifts(TDim row) const
1731 {
1732 return s_GetIndels(*this, row,
1733 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), true, false);
1734 }
1735
GetFrameshiftsWithinRange(const TSeqRange & range,TDim row) const1736 vector<CSeq_align::SIndel> CSeq_align::GetFrameshiftsWithinRange(
1737 const TSeqRange &range, TDim row) const
1738 {
1739 return s_GetIndels(*this, row, CRangeCollection<TSeqPos>(range), true, false);
1740 }
1741
GetFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1742 vector<CSeq_align::SIndel> CSeq_align::GetFrameshiftsWithinRanges(
1743 const CRangeCollection<TSeqPos> &ranges, TDim row) const
1744 {
1745 return s_GetIndels(*this, row, ranges, true, false);
1746 }
1747
GetNonFrameshifts(TDim row) const1748 vector<CSeq_align::SIndel> CSeq_align::GetNonFrameshifts(TDim row) const
1749 {
1750 return s_GetIndels(*this, row,
1751 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), false, true);
1752 }
1753
GetNonFrameshiftsWithinRange(const TSeqRange & range,TDim row) const1754 vector<CSeq_align::SIndel> CSeq_align::GetNonFrameshiftsWithinRange(
1755 const TSeqRange &range, TDim row) const
1756 {
1757 return s_GetIndels(*this, row, CRangeCollection<TSeqPos>(range), false, true);
1758 }
1759
GetNonFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1760 vector<CSeq_align::SIndel> CSeq_align::GetNonFrameshiftsWithinRanges(
1761 const CRangeCollection<TSeqPos> &ranges, TDim row) const
1762 {
1763 return s_GetIndels(*this, row, ranges, false, true);
1764 }
1765
GetIndels(TDim row) const1766 vector<CSeq_align::SIndel> CSeq_align::GetIndels(TDim row) const
1767 {
1768 return s_GetIndels(*this, row,
1769 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), true, true);
1770 }
1771
GetIndelsWithinRange(const TSeqRange & range,TDim row) const1772 vector<CSeq_align::SIndel> CSeq_align::GetIndelsWithinRange(
1773 const TSeqRange &range, TDim row) const
1774 {
1775 return s_GetIndels(*this, row, CRangeCollection<TSeqPos>(range), true, true);
1776 }
1777
GetIndelsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1778 vector<CSeq_align::SIndel> CSeq_align::GetIndelsWithinRanges(
1779 const CRangeCollection<TSeqPos> &ranges, TDim row) const
1780 {
1781 return s_GetIndels(*this, row, ranges, true, true);
1782 }
1783
1784
GetAlignedBases(TDim row) const1785 CRangeCollection<TSeqPos> CSeq_align::GetAlignedBases(TDim row) const
1786 {
1787 CRangeCollection<TSeqPos> ranges;
1788 switch (GetSegs().Which()) {
1789 case CSeq_align::TSegs::e_Denseg:
1790 {{
1791 const CDense_seg& ds = GetSegs().GetDenseg();
1792 for (CDense_seg::TNumseg i = 0; i < ds.GetNumseg(); ++i) {
1793 bool is_gapped = false;
1794 for (CDense_seg::TDim j = 0; j < ds.GetDim(); ++j) {
1795 if (ds.GetStarts()[i * ds.GetDim() + j] == -1)
1796 {
1797 is_gapped = true;
1798 break;
1799 }
1800 }
1801 if (!is_gapped) {
1802 TSignedSeqPos start = ds.GetStarts()[i * ds.GetDim() + row];
1803 ranges += TSeqRange(start, start + ds.GetLens()[i] - 1);
1804 }
1805 }
1806 }}
1807 break;
1808
1809 case CSeq_align::TSegs::e_Disc:
1810 {{
1811 ITERATE(CSeq_align::TSegs::TDisc::Tdata, iter,
1812 GetSegs().GetDisc().Get()) {
1813 ranges += (*iter)->GetAlignedBases(row);
1814 }
1815 }}
1816 break;
1817
1818 case CSeq_align::TSegs::e_Spliced:
1819 {{
1820 ITERATE (CSpliced_seg::TExons, iter, GetSegs().GetSpliced().GetExons()) {
1821 ranges += (*iter)->GetRowSeq_range(row, true);
1822 ranges -= (*iter)->GetRowSeq_insertions(row, GetSegs().GetSpliced());
1823 }
1824 }}
1825 break;
1826
1827 default:
1828 NCBI_THROW(CSeqalignException, eUnsupported,
1829 "CSeq_align::GetInsertedRanges() currently does not handle "
1830 "this type of alignment.");
1831 }
1832
1833 return ranges;
1834 }
1835
1836
1837 /// Get length of dense-seg alignment within range
s_DenseSegLength(const CDense_seg & ds,CDense_seg::TNumseg seg,const CRangeCollection<TSeqPos> & ranges)1838 static inline TSeqPos s_DenseSegLength(const CDense_seg& ds,
1839 CDense_seg::TNumseg seg,
1840 const CRangeCollection<TSeqPos> &ranges)
1841 {
1842 if(ranges.begin()->IsWhole())
1843 return ds.GetLens()[seg];
1844
1845 TSignedSeqPos start = ds.GetStarts()[seg * ds.GetDim()];
1846 TSeqRange seg_range(start, start + ds.GetLens()[seg] - 1);
1847 return s_IntersectionLength(ranges, seg_range);
1848 }
1849
1850 //////////////////////////////////////////////////////////////////////////////
1851 ///
1852 /// calculate the length of our alignment within given range
1853 ///
s_GetAlignmentLength(const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,bool ungapped)1854 static size_t s_GetAlignmentLength(const CSeq_align& align,
1855 const CRangeCollection<TSeqPos> &ranges,
1856 bool ungapped)
1857 {
1858 if (ranges.empty()) {
1859 return 0;
1860 }
1861
1862 size_t len = 0;
1863 switch (align.GetSegs().Which()) {
1864 case CSeq_align::TSegs::e_Denseg:
1865 {{
1866 const CDense_seg& ds = align.GetSegs().GetDenseg();
1867 if (ungapped) {
1868 for (CDense_seg::TNumseg i = 0; i < ds.GetNumseg(); ++i) {
1869 bool is_gap_seg = false;
1870 for (CDense_seg::TDim j = 0;
1871 !is_gap_seg && j < ds.GetDim(); ++j) {
1872 //int start = ds.GetStarts()[i * ds.GetDim() + j];
1873 if (ds.GetStarts()[i * ds.GetDim() + j] == -1) {
1874 is_gap_seg = true;
1875 }
1876 }
1877 if ( !is_gap_seg ) {
1878 len += s_DenseSegLength(ds, i, ranges);
1879 }
1880 }
1881 } else {
1882 for (size_t i = 0; i < ds.GetLens().size(); ++i) {
1883 len += s_DenseSegLength(ds, i, ranges);
1884 }
1885 }
1886 }}
1887 break;
1888
1889 case CSeq_align::TSegs::e_Disc:
1890 ITERATE (CSeq_align::TSegs::TDisc::Tdata, iter,
1891 align.GetSegs().GetDisc().Get()) {
1892 len += s_GetAlignmentLength(**iter, ranges, ungapped);
1893 }
1894 break;
1895
1896 case CSeq_align::TSegs::e_Std:
1897 {{
1898 if(!ranges.begin()->IsWhole())
1899 NCBI_THROW(CSeqalignException, eUnsupported,
1900 "Can't calculate alignment length within a range "
1901 "for standard seg representation");
1902
1903 /// pass 1:
1904 /// find total ranges
1905 vector<TSeqPos> sizes;
1906
1907 size_t count = 0;
1908 ITERATE (CSeq_align::TSegs::TStd, iter, align.GetSegs().GetStd()) {
1909 const CStd_seg& seg = **iter;
1910
1911 bool is_gap = false;
1912 ITERATE (CStd_seg::TLoc, it, seg.GetLoc()) {
1913 if ((*it)->IsEmpty()) {
1914 is_gap = true;
1915 break;
1916 }
1917 }
1918
1919 if ( !ungapped || !is_gap) {
1920 size_t i = 0;
1921 ITERATE (CStd_seg::TLoc, it, seg.GetLoc()) {
1922 if ( !(*it)->IsEmpty() ) {
1923 if (sizes.empty()) {
1924 sizes.resize(seg.GetDim(), 0);
1925 } else if (sizes.size() != (size_t)seg.GetDim()) {
1926 NCBI_THROW(CException, eUnknown,
1927 "invalid std-seg: "
1928 "inconsistent number of locs");
1929 }
1930
1931 for (CSeq_loc_CI loc_it(**it); loc_it; ++loc_it) {
1932 sizes[i] += loc_it.GetRange().GetLength();
1933 }
1934 }
1935
1936 ++i;
1937 }
1938 }
1939
1940 ++count;
1941 }
1942
1943 /// pass 2: determine shortest length
1944 if ( sizes.empty() ) return 0;
1945 vector<TSeqPos>::iterator iter = sizes.begin();
1946 vector<TSeqPos>::iterator smallest = iter;
1947 for (++iter; iter != sizes.end(); ++iter) {
1948 if (*iter < *smallest) {
1949 smallest = iter;
1950 }
1951 }
1952 return *smallest;
1953 }}
1954 break;
1955
1956 case CSeq_align::TSegs::e_Spliced:
1957 ITERATE (CSpliced_seg::TExons, iter,
1958 align.GetSegs().GetSpliced().GetExons()) {
1959 CRangeCollection<TSeqPos> exon_ranges = ranges;
1960 exon_ranges &= (*iter)->GetRowSeq_range(0, true);
1961 len += exon_ranges.GetCoveredLength();
1962 if (ungapped) {
1963 len -= (*iter)->GetRowSeq_insertions(
1964 0, align.GetSegs().GetSpliced(), ranges).GetCoveredLength();
1965 } else {
1966 /// When computing length without gaps, genomic inserts
1967 /// are ignored; when computing with gaps, we need to add them
1968 /// to length if their starting point is within range
1969 len += (*iter)->GetRowSeq_insertions(
1970 1, align.GetSegs().GetSpliced(), ranges).GetCoveredLength();
1971 }
1972 }
1973 break;
1974
1975 default:
1976 _ASSERT(false);
1977 break;
1978 }
1979
1980 return len;
1981 }
1982
1983
GetAlignLength(bool include_gaps) const1984 TSeqPos CSeq_align::GetAlignLength(bool include_gaps) const
1985 {
1986 return s_GetAlignmentLength(*this,
1987 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()),
1988 !include_gaps );
1989 }
1990
1991
GetAlignLengthWithinRange(const TSeqRange & range,bool include_gaps) const1992 TSeqPos CSeq_align::GetAlignLengthWithinRange(const TSeqRange &range,
1993 bool include_gaps) const
1994 {
1995 return s_GetAlignmentLength(*this,
1996 CRangeCollection<TSeqPos>(range),
1997 !include_gaps );
1998 }
1999
2000
GetAlignLengthWithinRanges(const CRangeCollection<TSeqPos> & ranges,bool include_gaps) const2001 TSeqPos CSeq_align::GetAlignLengthWithinRanges(const CRangeCollection<TSeqPos> &ranges,
2002 bool include_gaps) const
2003 {
2004 return s_GetAlignmentLength(*this, ranges, !include_gaps );
2005 }
2006
2007
AlignLengthRatio() const2008 double CSeq_align::AlignLengthRatio() const {
2009 TSeqRange r0 = GetSeqRange(0);
2010 TSeqRange r1 = GetSeqRange(1);
2011 double r = 0;
2012 if (r0.GetLength()) {
2013 r = double(r1.GetLength()) / double(r0.GetLength());
2014 }
2015 return r;
2016 }
2017
2018
CreateRowSeq_loc(TDim row) const2019 CRef<CSeq_loc> CSeq_align::CreateRowSeq_loc(TDim row) const
2020 {
2021 CRef<CSeq_loc> loc(new CSeq_loc);
2022 switch (GetSegs().Which()) {
2023 case CSeq_align::TSegs::e_Dendiag:
2024 {
2025 ITERATE(TSegs::TDendiag, it, GetSegs().GetDendiag()) {
2026 loc->SetPacked_int().Set().push_back(
2027 (*it)->CreateRowSeq_interval(row));
2028 }
2029 break;
2030 }
2031 case CSeq_align::TSegs::e_Denseg:
2032 {
2033 loc->SetInt(*GetSegs().GetDenseg().CreateRowSeq_interval(row));
2034 break;
2035 }
2036 case CSeq_align::TSegs::e_Std:
2037 {
2038 ITERATE(TSegs::TStd, it, GetSegs().GetStd()) {
2039 // Std-seg may contain empty locations, so
2040 // we have to use mix rather than packed-int.
2041 loc->SetMix().Set().push_back(
2042 (*it)->CreateRowSeq_loc(row));
2043 }
2044 break;
2045 }
2046 case CSeq_align::TSegs::e_Disc:
2047 {
2048 ITERATE(TSegs::TDisc::Tdata, it, GetSegs().GetDisc().Get()) {
2049 loc->SetMix().Set().push_back((*it)->CreateRowSeq_loc(row));
2050 }
2051 break;
2052 }
2053 case CSeq_align::TSegs::e_Spliced:
2054 {
2055 if (row > 1) {
2056 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
2057 "CSeq_align::CreateRowSeq_loc() - row number must "
2058 "be 0 or 1 for spliced-segs.");
2059 }
2060 const CSpliced_seg& spl = GetSegs().GetSpliced();
2061 ITERATE(TSegs::TSpliced::TExons, ex, spl.GetExons()) {
2062 // Spliced-seg may be required to get seq-id if
2063 // it's not set in the exon.
2064 loc->SetPacked_int().Set().push_back(
2065 (*ex)->CreateRowSeq_interval(row, spl));
2066 }
2067 break;
2068 }
2069 case CSeq_align::TSegs::e_Packed:
2070 case CSeq_align::TSegs::e_Sparse:
2071 default:
2072 NCBI_THROW(CSeqalignException, eUnsupported,
2073 "CSeq_align::CreateRowSeq_loc() currently does not handle "
2074 "this type of alignment.");
2075 break;
2076 }
2077 return loc;
2078 }
2079
2080 struct SLengthRange
2081 {
SLengthRangeSLengthRange2082 SLengthRange() : range(numeric_limits<TSeqPos>::max(), 0) {}
AddLengthSLengthRange2083 void AddLength(TSeqPos length)
2084 {
2085 if (range.first > length) {
2086 range.first = length;
2087 }
2088 if (range.second < length) {
2089 range.second = length;
2090 }
2091 }
AddRangeSLengthRange2092 void AddRange(CSeq_align::TLengthRange new_range)
2093 {
2094 if (range.first > new_range.first) {
2095 range.first = new_range.first;
2096 }
2097 if (range.second < new_range.second) {
2098 range.second = new_range.second;
2099 }
2100 }
2101
2102 CSeq_align::TLengthRange range;
2103 };
2104
GapLengthRange() const2105 CSeq_align::TLengthRange CSeq_align::GapLengthRange() const
2106 {
2107 SLengthRange length_range;
2108 switch (GetSegs().Which()) {
2109 case CSeq_align::TSegs::e_Denseg:
2110 {{
2111 const CDense_seg& ds = GetSegs().GetDenseg();
2112 for (CDense_seg::TNumseg i = 0; i < ds.GetNumseg(); ++i) {
2113 bool is_gapped = false;
2114 for (CDense_seg::TDim j = 0; j < ds.GetDim(); ++j) {
2115 if (ds.GetStarts()[i * ds.GetDim() + j] == -1) {
2116 is_gapped = true;
2117 break;
2118 }
2119 }
2120 if (is_gapped) {
2121 length_range.AddLength(ds.GetLens()[i]);
2122 }
2123 }
2124 }}
2125 break;
2126
2127 case CSeq_align::TSegs::e_Disc:
2128 {{
2129 int num_rows = CheckNumRows();
2130 vector<TSeqRange> last_seg_ranges;
2131 ITERATE(CSeq_align::TSegs::TDisc::Tdata, iter,
2132 GetSegs().GetDisc().Get()) {
2133 length_range.AddRange((*iter)->GapLengthRange());
2134 vector<TSeqRange> seg_ranges;
2135 for (int i=0; i < num_rows; i++) {
2136 seg_ranges.push_back((*iter)->GetSeqRange(i));
2137 /// If this is not first segment, include gaps between last
2138 /// segment and this one
2139 if (!last_seg_ranges.empty()) {
2140 TSeqPos gap = s_Distance(seg_ranges[i], last_seg_ranges[i]);
2141 if (gap > 0) {
2142 length_range.AddLength(gap);
2143 }
2144 }
2145 }
2146 last_seg_ranges = seg_ranges;
2147 }
2148 }}
2149 break;
2150
2151 case CSeq_align::TSegs::e_Spliced:
2152 {{
2153 ITERATE (CSpliced_seg::TExons, iter,
2154 GetSegs().GetSpliced().GetExons()) {
2155 const CSpliced_exon& exon = **iter;
2156 for (unsigned r = 0; r < 2; ++r) {
2157 CRangeCollection<TSeqPos> insertions =
2158 exon.GetRowSeq_insertions(
2159 r, GetSegs().GetSpliced());
2160 ITERATE (CRangeCollection<TSeqPos>, ins_it, insertions)
2161 {
2162 length_range.AddLength(ins_it->GetLength());
2163 }
2164 }
2165 }
2166 }}
2167
2168 break;
2169
2170 default:
2171 NCBI_THROW(CSeqalignException, eUnsupported,
2172 "Can't get gap lengths for this type of alignment.");
2173 }
2174 return length_range.range;
2175 }
2176
IntronLengthRange() const2177 CSeq_align::TLengthRange CSeq_align::IntronLengthRange() const
2178 {
2179 if (!GetSegs().IsSpliced()) {
2180 NCBI_THROW(CSeqalignException, eUnsupported,
2181 "Requested exon lengths for a non-spliced alignment.");
2182 }
2183 SLengthRange length_range;
2184 const CSpliced_exon* previous_exon = NULL;
2185 bool minus_strand = GetSeqStrand(1) == eNa_strand_minus;
2186 ITERATE (CSpliced_seg::TExons, iter,
2187 GetSegs().GetSpliced().GetExons()) {
2188 const CSpliced_exon* exon = &**iter;
2189 if (previous_exon) {
2190 bool descending_pos = exon->GetGenomic_end() <
2191 previous_exon->GetGenomic_start();
2192 if (descending_pos != minus_strand) {
2193 /// Crossed origins; don't count as intron size
2194 continue;
2195 }
2196 TSeqRange intron((minus_strand ? exon : previous_exon)->GetGenomic_end() + 1,
2197 (minus_strand ? previous_exon : exon)->GetGenomic_start() - 1);
2198 length_range.AddLength(intron.GetLength());
2199 }
2200 previous_exon = exon;
2201 }
2202 return length_range.range;
2203 }
2204
ExonLengthRange() const2205 CSeq_align::TLengthRange CSeq_align::ExonLengthRange() const
2206 {
2207 if (!GetSegs().IsSpliced()) {
2208 NCBI_THROW(CSeqalignException, eUnsupported,
2209 "Requested exon lengths for a non-spliced alignment.");
2210 }
2211 SLengthRange length_range;
2212 ITERATE (CSpliced_seg::TExons, iter,
2213 GetSegs().GetSpliced().GetExons()) {
2214 const CSpliced_exon* exon = &**iter;
2215 length_range.AddLength(
2216 exon->GetGenomic_end() - exon->GetGenomic_start() + 1);
2217 }
2218 return length_range.range;
2219 }
2220
x_CreateSubsegAlignment(int from,int to) const2221 CRef<CSeq_align> CSeq_align::x_CreateSubsegAlignment(int from, int to) const
2222 {
2223 // break at this segment
2224 CRef<CSeq_align> align(new CSeq_align);
2225 align->SetDim(2);
2226 align->SetType(CSeq_align::eType_partial);
2227 const CDense_seg& seg = GetSegs().GetDenseg();
2228 CDense_seg& subseg = align->SetSegs().SetDenseg();
2229
2230 subseg.SetIds() = seg.GetIds();
2231 subseg.SetDim(2);
2232 subseg.SetNumseg(to - from + 1);
2233 subseg.SetStarts().reserve(subseg.GetNumseg()*2);
2234 subseg.SetLens().reserve(subseg.GetNumseg());
2235 if (seg.IsSetStrands()) {
2236 subseg.SetStrands().reserve(subseg.GetNumseg()*2);
2237 }
2238
2239 int i;
2240
2241 for (i = from; i <= to; ++i) {
2242 subseg.SetLens().push_back(seg.GetLens()[i]);
2243 subseg.SetStarts().push_back(seg.GetStarts()[i * 2]);
2244 subseg.SetStarts().push_back(seg.GetStarts()[i * 2 + 1]);
2245 if (seg.IsSetStrands()) {
2246 subseg.SetStrands().push_back(seg.GetStrands()[i * 2]);
2247 subseg.SetStrands().push_back(seg.GetStrands()[i * 2 + 1]);
2248 }
2249 }
2250
2251 subseg.TrimEndGaps();
2252 try {
2253 align->Validate(true);
2254 }
2255 catch (CException& e) {
2256 ERR_POST(Error << e);
2257 cerr << MSerial_AsnText << seg;
2258 cerr << MSerial_AsnText << *align;
2259 throw;
2260 }
2261 return align;
2262 }
2263
2264
FindExt(const string & ext_type) const2265 CConstRef<CUser_object> CSeq_align::FindExt(const string& ext_type) const
2266 {
2267 CConstRef<CUser_object> ret;
2268 if ( IsSetExt() ) {
2269 ITERATE(TExt, it, GetExt()) {
2270 const CObject_id& obj_type = (*it)->GetType();
2271 if ( obj_type.IsStr() && obj_type.GetStr() == ext_type ) {
2272 ret.Reset(it->GetPointer());
2273 break;
2274 }
2275 }
2276 }
2277 return ret;
2278 }
2279
FindExt(const string & ext_type)2280 CRef<CUser_object> CSeq_align::FindExt(const string& ext_type)
2281 {
2282 CRef<CUser_object> ret;
2283 if ( IsSetExt() ) {
2284 NON_CONST_ITERATE(TExt, it, SetExt()) {
2285 const CObject_id& obj_type = (*it)->GetType();
2286 if ( obj_type.IsStr() && obj_type.GetStr() == ext_type ) {
2287 ret.Reset(it->GetPointer());
2288 break;
2289 }
2290 }
2291 }
2292 return ret;
2293 }
2294
2295 END_objects_SCOPE // namespace ncbi::objects::
2296
2297 END_NCBI_SCOPE
2298