1 /* $Id: Seq_align.cpp 618884 2020-10-27 16:28:50Z gouriano $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  .......
27  *
28  * File Description:
29  *   .......
30  *
31  * Remark:
32  *   This code was originally generated by application DATATOOL
33  *   using specifications from the data definition file
34  *   'seqalign.asn'.
35  */
36 
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <objects/seqalign/seqalign_exception.hpp>
40 #include <objects/seqalign/Dense_seg.hpp>
41 #include <objects/seqalign/Dense_diag.hpp>
42 #include <objects/seqalign/Std_seg.hpp>
43 #include <objects/seqalign/Spliced_seg.hpp>
44 #include <objects/seqalign/Spliced_exon.hpp>
45 #include <objects/seqalign/Spliced_exon_chunk.hpp>
46 #include <objects/seqalign/Sparse_seg.hpp>
47 #include <objects/seqalign/Seq_align_set.hpp>
48 #include <objects/seqalign/Score.hpp>
49 #include <objects/general/Object_id.hpp>
50 #include <objects/general/User_object.hpp>
51 #include <objects/seqloc/Seq_loc.hpp>
52 #include <objects/seqloc/Seq_interval.hpp>
53 #include <objects/seqloc/Seq_point.hpp>
54 #include <objects/seq/seq_loc_mapper_base.hpp>
55 #include <serial/iterator.hpp>
56 #include <util/static_set.hpp>
57 
58 // generated includes
59 #include <objects/seqalign/Seq_align.hpp>
60 
61 // generated classes
62 
63 BEGIN_NCBI_SCOPE
64 
65 BEGIN_objects_SCOPE // namespace ncbi::objects::
66 
67 // destructor
~CSeq_align(void)68 CSeq_align::~CSeq_align(void)
69 {
70 }
71 
72 
CheckNumRows(void) const73 CSeq_align::TDim CSeq_align::CheckNumRows(void) const
74 {
75     switch (GetSegs().Which()) {
76     case C_Segs::e_Denseg:
77         return GetSegs().GetDenseg().CheckNumRows();
78 
79         //case C_Segs::e_Packed:
80         //return GetSegs().GetPacked().CheckNumRows();
81 
82     case C_Segs::e_Sparse:
83         return GetSegs().GetSparse().CheckNumRows();
84 
85     case C_Segs::e_Spliced:
86         {{
87             // spliced seg always has two rows: genomic and protein/transcript
88             return 2;
89         }}
90 
91     case C_Segs::e_Disc:
92         {{
93             TDim numrows = 0;
94             ITERATE (C_Segs::TDisc::Tdata, std_i, GetSegs().GetDisc().Get()) {
95                 TDim seg_numrows = (*std_i)->CheckNumRows();
96                 if (numrows) {
97                     if ( seg_numrows != numrows ) {
98                         NCBI_THROW(CSeqalignException, eInvalidAlignment,
99                                    "CSeq_align::CheckNumRows(): Number of rows "
100                                    "is not the same for each disc seg.");
101                     }
102                 } else {
103                     numrows = seg_numrows;
104                 }
105             }
106             return numrows;
107         }}
108 
109     case C_Segs::e_Std:
110         {{
111             TDim numrows = 0;
112             ITERATE (C_Segs::TStd, std_i, GetSegs().GetStd()) {
113                 TDim seg_numrows = (*std_i)->CheckNumRows();
114                 if (numrows) {
115                     if (seg_numrows != numrows) {
116                         NCBI_THROW(CSeqalignException, eInvalidAlignment,
117                                    "CSeq_align::CheckNumRows(): Number of rows "
118                                    "is not the same for each std seg.");
119                     }
120                 } else {
121                     numrows = seg_numrows;
122                 }
123             }
124             return numrows;
125         }}
126 
127     case C_Segs::e_Dendiag:
128         {
129             TDim numrows = 0;
130             ITERATE (C_Segs::TDendiag, it, GetSegs().GetDendiag()) {
131                 TDim seg_numrows = (*it)->CheckNumRows();
132                 if (numrows) {
133                     if ( seg_numrows != numrows ) {
134                         NCBI_THROW(CSeqalignException, eInvalidAlignment,
135                                    "CSeq_align::CheckNumRows(): Number of rows "
136                                    "is not the same for each dendiag seg.");
137                     }
138                 } else {
139                     numrows = seg_numrows;
140                 }
141             }
142             return numrows;
143         }
144 
145     default:
146         NCBI_THROW(CSeqalignException, eUnsupported,
147                    "CSeq_align::CheckNumRows() currently does not handle "
148                    "this type of alignment");
149     }
150 }
151 
152 
GetSeqRange(TDim row) const153 CRange<TSeqPos> CSeq_align::GetSeqRange(TDim row) const
154 {
155     switch (GetSegs().Which()) {
156     case C_Segs::e_Denseg:
157         return GetSegs().GetDenseg().GetSeqRange(row);
158 
159     case C_Segs::e_Sparse:
160         return GetSegs().GetSparse().GetSeqRange(row);
161 
162     case C_Segs::e_Dendiag:
163         {
164             CRange<TSeqPos> rng;
165             ITERATE (C_Segs::TDendiag, dendiag_i, GetSegs().GetDendiag()) {
166                 rng.CombineWith((*dendiag_i)->GetSeqRange(row));
167             }
168             return rng;
169         }
170 
171     case C_Segs::e_Std:
172         {
173             CRange<TSeqPos> rng;
174             CSeq_id seq_id;
175             size_t seg_i = 0;
176             ITERATE (C_Segs::TStd, std_i, GetSegs().GetStd()) {
177                 TDim row_i = 0;
178                 ITERATE (CStd_seg::TLoc, i, (*std_i)->GetLoc()) {
179                     const CSeq_loc& loc = **i;
180                     if (row_i++ == row) {
181                         if (loc.IsInt()) {
182                             if ( !seg_i ) {
183                                 seq_id.Assign(loc.GetInt().GetId());
184                             } else if (seq_id.Compare(loc.GetInt().GetId())
185                                        != CSeq_id::e_YES) {
186                                 NCBI_THROW(CSeqalignException,
187                                            eInvalidRowNumber,
188                                            "CSeq_align::GetSeqRange():"
189                                            " Row seqids not consistent."
190                                            " Cannot determine range.");
191                             }
192                             rng.CombineWith
193                                 (CRange<TSeqPos>
194                                  (loc.GetInt().GetFrom(),
195                                   loc.GetInt().GetTo()));
196                         } else if (loc.IsPnt()) {
197                             if ( !seg_i ) {
198                                 seq_id.Assign(loc.GetPnt().GetId());
199                             } else if (seq_id.Compare(loc.GetPnt().GetId())
200                                        != CSeq_id::e_YES) {
201                                 NCBI_THROW(CSeqalignException,
202                                            eInvalidRowNumber,
203                                            "CSeq_align::GetSeqRange():"
204                                            " Row seqids not consistent."
205                                            " Cannot determine range.");
206                             }
207                             rng.CombineWith
208                                 (CRange<TSeqPos>
209                                  (loc.GetPnt().GetPoint(),
210                                   loc.GetPnt().GetPoint()));
211                         }
212                     }
213                 }
214                 if (row < 0  ||  row >= row_i) {
215                     NCBI_THROW(CSeqalignException, eInvalidRowNumber,
216                                "CSeq_align::GetSeqRange():"
217                                " Invalid row number");
218                 }
219                 if (CanGetDim()  &&  row_i != GetDim()) {
220                     NCBI_THROW(CSeqalignException, eInvalidAlignment,
221                                "CSeq_align::GetSeqRange():"
222                                " loc.size is inconsistent with dim");
223                 }
224                 seg_i++;
225             }
226             if (rng.Empty()) {
227                 NCBI_THROW(CSeqalignException, eInvalidAlignment,
228                            "CSeq_align::GetSeqRange(): Row is empty");
229             }
230             return rng;
231         }
232 
233     case C_Segs::e_Disc:
234         {
235             CRange<TSeqPos> rng;
236             ITERATE (C_Segs::TDisc::Tdata, disc_i, GetSegs().GetDisc().Get()) {
237                 rng.CombineWith((*disc_i)->GetSeqRange(row));
238             }
239             return rng;
240         }
241 
242     case C_Segs::e_Spliced:
243         return GetSegs().GetSpliced().GetSeqRange(row);
244 
245     default:
246         NCBI_THROW(CSeqalignException, eUnsupported,
247                    "CSeq_align::GetSeqRange() currently does not handle "
248                    "this type of alignment.");
249     }
250 }
251 
GetSeqStart(TDim row) const252 TSeqPos CSeq_align::GetSeqStart(TDim row) const
253 {
254     switch (GetSegs().Which()) {
255     case C_Segs::e_Denseg:
256         return GetSegs().GetDenseg().GetSeqStart(row);
257     case C_Segs::e_Sparse:
258         return GetSegs().GetSparse().GetSeqStart(row);
259     case C_Segs::e_Spliced:
260         return GetSegs().GetSpliced().GetSeqStart(row);
261     case C_Segs::e_Dendiag:
262     case C_Segs::e_Std:
263     case C_Segs::e_Disc:
264         return GetSeqRange(row).GetFrom();
265     default:
266         NCBI_THROW(CSeqalignException, eUnsupported,
267                    "CSeq_align::GetSeqStart() currently does not handle "
268                    "this type of alignment.");
269     }
270 }
271 
272 
GetSeqStop(TDim row) const273 TSeqPos CSeq_align::GetSeqStop (TDim row) const
274 {
275     switch (GetSegs().Which()) {
276     case C_Segs::e_Denseg:
277         return GetSegs().GetDenseg().GetSeqStop(row);
278     case C_Segs::e_Sparse:
279         return GetSegs().GetSparse().GetSeqStop(row);
280     case C_Segs::e_Spliced:
281         return GetSegs().GetSpliced().GetSeqStop(row);
282     case C_Segs::e_Dendiag:
283     case C_Segs::e_Std:
284     case C_Segs::e_Disc:
285         return GetSeqRange(row).GetTo();
286     default:
287         NCBI_THROW(CSeqalignException, eUnsupported,
288                    "CSeq_align::GetSeqStop() currently does not handle "
289                    "this type of alignment.");
290     }
291 }
292 
293 
GetSeqStrand(TDim row) const294 ENa_strand CSeq_align::GetSeqStrand(TDim row) const
295 {
296     switch (GetSegs().Which()) {
297     case C_Segs::e_Denseg:
298         return GetSegs().GetDenseg().GetSeqStrand(row);
299     case C_Segs::e_Sparse:
300         return GetSegs().GetSparse().GetSeqStrand(row);
301     case C_Segs::e_Spliced:
302         return GetSegs().GetSpliced().GetSeqStrand(row);
303     case C_Segs::e_Disc:
304         return GetSegs().GetDisc().Get().front()->GetSeqStrand(row);
305     case C_Segs::e_Std:
306         return GetSegs().GetStd().front()->GetLoc()[row]->GetStrand();
307     case C_Segs::e_Dendiag:
308         return GetSegs().GetDendiag().front()->GetSeqStrand(row);
309     default:
310         NCBI_THROW(CSeqalignException, eUnsupported,
311                    "CSeq_align::GetSeqStrand() currently does not handle "
312                    "this type of alignment.");
313     }
314 }
315 
316 
GetSeq_id(TDim row) const317 const CSeq_id& CSeq_align::GetSeq_id(TDim row) const
318 {
319     switch (GetSegs().Which()) {
320     case C_Segs::e_Denseg:
321         return GetSegs().GetDenseg().GetSeq_id(row);
322 
323     case C_Segs::e_Sparse:
324         return GetSegs().GetSparse().GetSeq_id(row);
325 
326     case C_Segs::e_Dendiag:
327         {{
328             // If segments have different number of rows, this will try
329             // to find the segment with enough rows to get the id.
330             ITERATE(CSeq_align::C_Segs::TDendiag, seg, GetSegs().GetDendiag()) {
331                 if( (*seg)->IsSetIds()  &&
332                     (size_t)row < (*seg)->GetIds().size() ) {
333                     return *(*seg)->GetIds()[row];
334                 }
335             }
336             break;
337         }}
338 
339     case C_Segs::e_Std:
340         {{
341             // If segments have different number of rows, this will try
342             // to find the segment with enough rows to get the id.
343             ITERATE(CSeq_align::C_Segs::TStd, seg, GetSegs().GetStd()) {
344                 if ( (*seg)->IsSetIds()  &&
345                      (size_t)row < (*seg)->GetIds().size()) {
346                     return *((*seg)->GetIds()[row]);
347                 }
348                 else if ( (*seg)->IsSetLoc()  &&
349                      (size_t)row < (*seg)->GetLoc().size() ) {
350                     const CSeq_loc& loc = *(*seg)->GetLoc()[row];
351                     CConstRef<CSeq_id> id(loc.GetId());
352                     if (id) {
353                         return *id;
354                     }
355                 }
356             }
357         }}
358         break;
359 
360     case C_Segs::e_Disc:
361         {{
362             // Try to find a sub-alignment for which we can get a
363             // Seq-id for this row.
364             ITERATE (CSeq_align_set::Tdata, sub_aln, GetSegs().GetDisc().Get()) {
365                 try {
366                     const CSeq_id& rv = (*sub_aln)->GetSeq_id(row);
367                     return rv;
368                 }
369                 catch (const CSeqalignException&) {
370                 }
371             }
372         }}
373         break;
374 
375     case C_Segs::e_Spliced:
376         {{
377             // Technically, there is no row order for product and product.
378             // However, in spliced seg, since product comes first, we assign it
379             // as row 0.
380             // Hence, the genomic is assigned to row 1.
381             const C_Segs::TSpliced& spliced_seg = GetSegs().GetSpliced();
382             if (row == 0 && spliced_seg.IsSetProduct_id()) {
383                 return spliced_seg.GetProduct_id();
384             } else if (row == 1 && spliced_seg.IsSetGenomic_id()) {
385                 return spliced_seg.GetGenomic_id();
386             }
387         }}
388         break;
389 
390     default:
391         NCBI_THROW(CSeqalignException, eUnsupported,
392                    "CSeq_align::GetSeq_id() currently does not handle "
393                    "this type of alignment.");
394     }
395 
396     NCBI_THROW(CSeqalignException, eInvalidRowNumber,
397                "CSeq_align::GetSeq_id(): "
398                "can not get seq-id for the row requested.");
399     // return CSeq_id();
400 }
401 
402 
403 typedef SStaticPair<CSeq_align::EScoreType, const char*> TScoreNamePair;
404 static const TScoreNamePair sc_ScoreNames[] = {
405     { CSeq_align::eScore_Score,           "score" },
406     { CSeq_align::eScore_Blast,           "score" },
407     { CSeq_align::eScore_BitScore,        "bit_score" },
408     { CSeq_align::eScore_EValue,          "e_value" },
409     { CSeq_align::eScore_AlignLength,     "align_length" },
410     { CSeq_align::eScore_IdentityCount,   "num_ident" },
411     { CSeq_align::eScore_PositiveCount,   "num_positives" },
412     { CSeq_align::eScore_NegativeCount,   "num_negatives" },
413     { CSeq_align::eScore_MismatchCount,   "num_mismatch" },
414     { CSeq_align::eScore_GapCount,        "gap_count" },
415     { CSeq_align::eScore_PercentIdentity_Gapped, "pct_identity_gap" },
416     { CSeq_align::eScore_PercentIdentity_Ungapped, "pct_identity_ungap" },
417     { CSeq_align::eScore_PercentIdentity_GapOpeningOnly, "pct_identity_gapopen_only" },
418     { CSeq_align::eScore_PercentCoverage, "pct_coverage" },
419     { CSeq_align::eScore_SumEValue,       "sum_e" },
420     { CSeq_align::eScore_CompAdjMethod,   "comp_adjustment_method" },
421     { CSeq_align::eScore_HighQualityPercentCoverage, "pct_coverage_hiqual" },
422     { CSeq_align::eScore_Matches,         "matches"},
423     { CSeq_align::eScore_OverallIdentity, "identity"},
424     { CSeq_align::eScore_Splices,         "splices"},
425     { CSeq_align::eScore_ConsensusSplices, "consensus_splices"},
426     { CSeq_align::eScore_ProductCoverage, "product_coverage"},
427     { CSeq_align::eScore_ExonIdentity,    "exon_identity"}
428 };
429 
430 static const char* const sc_ScoreHelpText[] = {
431     "Blast score",
432     "Blast score",
433     "Blast-style bit score",
434     "Blast-style e-value",
435     "Length of the aligned segments, including the length of all gap segments",
436     "Count of identities",
437     "Count of positives; protein-to-DNA score",
438     "Count of negatives; protein-to-DNA score",
439     "Count of mismatches",
440     "Number of gaps in the alignment",
441     "Percent identity (0.0-100.0); count each base in a gap in any row as a mismatch",
442     "Percent identity (0.0-100.0); don't count gaps",
443     "Percent identity (0.0-100.0); count a gap of any length in any row as a mismatch of length 1",
444     "Percentage of query sequence aligned to subject (0.0-100.0)",
445     "Blast-style sum_e",
446     "Composition-adjustment method from BLAST",
447     "Percent coverage (0.0-100.0) of high quality region",
448     "Count of identities",
449     "Percent identity; count gaps within exons and introns on product",
450     "Number of splices; 2 x number of introns that have no gap on product",
451     "Number of splices with consensus splice sequence",
452     "Percentage of query sequence aligned to subject (0.0-100.0)",
453     "Percent identity; count gaps within exons only"
454 };
455 
456 static const bool sc_IsInteger[] = {
457     true,   // eScore_Score
458     true,   // eScore_Blast
459     false,  // eScore_BitScore
460     false,  // eScore_EValue
461     true,   // eScore_AlignLength
462     true,   // eScore_IdentityCount
463     true,   // eScore_PositiveCount
464     true,   // eScore_NegativeCount
465     true,   // eScore_MismatchCount
466     true,   // eScore_GapCount
467     false,  // eScore_PercentIdentity_Gapped
468     false,  // eScore_PercentIdentity_Ungapped
469     false,  // eScore_PercentIdentity_GapOpeningOnly
470     false,  // eScore_PercentCoverage
471     false,  // eScore_SumEValue
472     false,  // eScore_CompAdjMethod
473     false,  // eScore_HighQualityPercentCoverage
474     true,   // eScore_Matches
475     false,  // eScore_OverallIdentity
476     true,   // eScore_Splices
477     true,   // eScore_ConsensusSplices
478     false,  // eScore_ProductCoverage
479     false   // eScore_ExonIdentity
480 };
481 
482 
ScoreNameMap()483 const CSeq_align::TScoreNameMap &CSeq_align::ScoreNameMap()
484 {
485     DEFINE_STATIC_FAST_MUTEX(s_ScoreNameMap_mutex);
486     CFastMutexGuard guard(s_ScoreNameMap_mutex);
487     {
488         static TScoreNameMap m_ScoreNameMap;
489         if (m_ScoreNameMap.empty()) {
490             /// initialize map
491             for(unsigned score = eScore_Blast;
492                 score <= eScore_ExonIdentity; ++score)
493                 {
494                     m_ScoreNameMap[sc_ScoreNames[score].second] =
495                         sc_ScoreNames[score].first;
496                 }
497         }
498 
499         return m_ScoreNameMap;
500     }
501 }
502 
ScoreName(EScoreType score)503 string CSeq_align::ScoreName(EScoreType score)
504 {
505     return sc_ScoreNames[score].second;
506 }
507 
HelpText(EScoreType score)508 string CSeq_align::HelpText(EScoreType score)
509 {
510     return sc_ScoreHelpText[score];
511 }
512 
IsIntegerScore(EScoreType score)513 bool CSeq_align::IsIntegerScore(EScoreType score)
514 {
515     return sc_IsInteger[score];
516 }
517 
518 
519 /// retrieve a named score object
x_GetNamedScore(const string & name) const520 CConstRef<CScore> CSeq_align::x_GetNamedScore(const string& name) const
521 {
522     CConstRef<CScore> score;
523     if (IsSetScore()) {
524         ITERATE (TScore, iter, GetScore()) {
525             if ((*iter)->IsSetId()  &&  (*iter)->GetId().IsStr()  &&
526                 (*iter)->GetId().GetStr() == name) {
527                 score = *iter;
528                 break;
529             }
530         }
531     }
532 
533     return score;
534 }
535 
536 
x_SetNamedScore(const string & name)537 CRef<CScore> CSeq_align::x_SetNamedScore(const string& name)
538 {
539     CRef<CScore> score;
540     if (IsSetScore()) {
541         NON_CONST_ITERATE (TScore, iter, SetScore()) {
542             if ((*iter)->IsSetId()  &&  (*iter)->GetId().IsStr()  &&
543                 (*iter)->GetId().GetStr() == name) {
544                 score = *iter;
545                 break;
546             }
547         }
548     }
549 
550     if ( !score ) {
551         score.Reset(new CScore);
552         score->SetId().SetStr(name);
553         SetScore().push_back(score);
554     }
555 
556     return score;
557 }
558 
559 
560 ///---------------------------------------------------------------------------
561 /// PRE : name of score to return
562 /// POST: whether or not we found that score; score converted to int
GetNamedScore(const string & id,int & score) const563 bool CSeq_align::GetNamedScore(const string &id, int &score) const
564 {
565     CConstRef<CScore> ref = x_GetNamedScore(id);
566     if (ref) {
567         if (ref->GetValue().IsInt()) {
568             score = ref->GetValue().GetInt();
569         } else {
570             score = (int)ref->GetValue().GetReal();
571         }
572         return true;
573     }
574     return false;
575 }
576 
577 ///---------------------------------------------------------------------------
578 /// PRE : name of score to return
579 /// POST: whether or not we found that score; score converted to double
GetNamedScore(const string & id,double & score) const580 bool CSeq_align::GetNamedScore(const string &id, double &score) const
581 {
582     CConstRef<CScore> ref = x_GetNamedScore(id);
583     if (ref) {
584         if (ref->GetValue().IsInt()) {
585             score = (double)ref->GetValue().GetInt();
586         } else {
587             score = ref->GetValue().GetReal();
588         }
589         return true;
590     }
591     return false;
592 }
593 
594 
GetNamedScore(EScoreType type,int & score) const595 bool CSeq_align::GetNamedScore(EScoreType type, int& score) const
596 {
597     return GetNamedScore(sc_ScoreNames[type].second, score);
598 }
599 
GetNamedScore(EScoreType type,double & score) const600 bool CSeq_align::GetNamedScore(EScoreType type, double& score) const
601 {
602     return GetNamedScore(sc_ScoreNames[type].second, score);
603 }
604 
605 
ResetNamedScore(const string & name)606 void CSeq_align::ResetNamedScore(const string& name)
607 {
608     if (IsSetScore()) {
609         NON_CONST_ITERATE (TScore, iter, SetScore()) {
610             if ((*iter)->IsSetId()  &&  (*iter)->GetId().IsStr()  &&
611                 (*iter)->GetId().GetStr() == name) {
612                 SetScore().erase(iter);
613                 break;
614             }
615         }
616     }
617 }
618 
ResetNamedScore(EScoreType type)619 void CSeq_align::ResetNamedScore(EScoreType type)
620 {
621     ResetNamedScore(sc_ScoreNames[type].second);
622 }
623 
SetNamedScore(EScoreType type,int score)624 void CSeq_align::SetNamedScore(EScoreType type, int score)
625 {
626     CRef<CScore> ref = x_SetNamedScore(sc_ScoreNames[type].second);
627     ref->SetValue().SetInt(score);
628 }
629 
SetNamedScore(EScoreType type,double score)630 void CSeq_align::SetNamedScore(EScoreType type, double score)
631 {
632     CRef<CScore> ref = x_SetNamedScore(sc_ScoreNames[type].second);
633     ref->SetValue().SetReal(score);
634 }
635 
SetNamedScore(const string & id,int score)636 void CSeq_align::SetNamedScore(const string& id, int score)
637 {
638     CRef<CScore> ref = x_SetNamedScore(id);
639     ref->SetValue().SetInt(score);
640 }
641 
SetNamedScore(const string & id,double score)642 void CSeq_align::SetNamedScore(const string& id, double score)
643 {
644     CRef<CScore> ref = x_SetNamedScore(id);
645     ref->SetValue().SetReal(score);
646 }
647 
648 
Validate(bool full_test) const649 void CSeq_align::Validate(bool full_test) const
650 {
651     switch (GetSegs().Which()) {
652     case TSegs::e_Dendiag:
653         ITERATE(TSegs::TDendiag, dendiag_it, GetSegs().GetDendiag()) {
654             (*dendiag_it)->Validate();
655         }
656         break;
657     case C_Segs::e_Denseg:
658         GetSegs().GetDenseg().Validate(full_test);
659         break;
660     case C_Segs::e_Disc:
661         ITERATE(CSeq_align_set::Tdata, seq_align_it, GetSegs().GetDisc().Get()) {
662             (*seq_align_it)->Validate(full_test);
663         }
664         break;
665     case C_Segs::e_Std:
666         CheckNumRows();
667         break;
668     case C_Segs::e_Spliced:
669         GetSegs().GetSpliced().Validate(full_test);
670         break;
671     case C_Segs::e_Sparse:
672         GetSegs().GetSparse().Validate(full_test);
673         break;
674     default:
675         NCBI_THROW(CSeqalignException, eUnsupported,
676                    "CSeq_align::Validate() currently does not handle "
677                    "this type of alignment");
678     }
679 }
680 
681 
682 ///---------------------------------------------------------------------------
683 /// PRE : currently only implemented for dense-seg segments
684 /// POST: same alignment, opposite orientation
Reverse(void)685 void CSeq_align::Reverse(void)
686 {
687     switch (GetSegs().Which()) {
688     case C_Segs::e_Denseg:
689         SetSegs().SetDenseg().Reverse();
690         break;
691     default:
692         NCBI_THROW(CSeqalignException, eUnsupported,
693                    "CSeq_align::Reverse() currently only handles dense-seg "
694                    "alignments");
695     }
696 }
697 
698 ///---------------------------------------------------------------------------
699 /// PRE : currently only implemented for dense-seg segments; two row numbers
700 /// POST: same alignment, position of the two rows has been swapped
SwapRows(TDim row1,TDim row2)701 void CSeq_align::SwapRows(TDim row1, TDim row2)
702 {
703     switch (GetSegs().Which()) {
704     case C_Segs::e_Denseg:
705         SetSegs().SetDenseg().SwapRows(row1, row2);
706         break;
707 
708     case C_Segs::e_Std:
709         NON_CONST_ITERATE (C_Segs::TStd, it, SetSegs().SetStd()) {
710             (*it)->SwapRows(row1, row2);
711         }
712         break;
713 
714     case C_Segs::e_Disc:
715         SetSegs().SetDisc().SwapRows(row1, row2);
716         break;
717     default:
718         NCBI_THROW(CSeqalignException, eUnsupported,
719                    "CSeq_align::SwapRows currently only handles dense-seg "
720                    "alignments");
721     }
722 }
723 
724 ///----------------------------------------------------------------------------
725 /// PRE : the Seq-align has StdSeg segs
726 /// POST: Seq_align of type Dense-seg is created with m_Widths if necessary
727 CRef<CSeq_align>
CreateDensegFromStdseg(SSeqIdChooser * SeqIdChooser) const728 CSeq_align::CreateDensegFromStdseg(SSeqIdChooser* SeqIdChooser) const
729 {
730     if ( !GetSegs().IsStd() ) {
731         NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
732                    "CSeq_align::CreateDensegFromStdseg(): "
733                    "Input Seq-align should have segs of type StdSeg!");
734     }
735 
736     CRef<CSeq_align> sa(new CSeq_align);
737     sa->SetType(eType_not_set);
738     if (IsSetScore()) {
739         sa->SetScore() = GetScore();
740     }
741     CDense_seg& ds = sa->SetSegs().SetDenseg();
742 
743     typedef CDense_seg::TDim    TNumrow;
744     typedef CDense_seg::TNumseg TNumseg;
745 
746     vector<TSeqPos>       row_lens;
747     CDense_seg::TLens&    seg_lens = ds.SetLens();
748     CDense_seg::TStarts&  starts   = ds.SetStarts();
749     CDense_seg::TStrands& strands  = ds.SetStrands();
750     CDense_seg::TWidths&  widths   = ds.SetWidths();
751     vector<bool>          widths_determined;
752 
753     TSeqPos row_len;
754     TSeqPos from, to;
755     ENa_strand strand;
756 
757 
758     TNumseg seg = 0;
759     TNumrow dim = 0, row = 0;
760     ITERATE (C_Segs::TStd, std_i, GetSegs().GetStd()) {
761 
762         const CStd_seg& ss = **std_i;
763 
764         seg_lens.push_back(0);
765         TSeqPos& seg_len = seg_lens.back();
766         row_lens.clear();
767         widths_determined.push_back(false);
768 
769         row = 0;
770         ITERATE (CStd_seg::TLoc, i, ss.GetLoc()) {
771 
772             const CSeq_id* seq_id = 0;
773             // push back initialization values
774             if (seg == 0) {
775                 widths.push_back(0);
776                 strands.push_back(eNa_strand_unknown);
777             }
778 
779             if ((*i)->IsInt()) {
780                 const CSeq_interval& interval = (*i)->GetInt();
781 
782                 // determine start and len
783                 from = interval.GetFrom();
784                 to = interval.GetTo();
785                 starts.push_back(from);
786                 row_len = to - from + 1;
787                 row_lens.push_back(row_len);
788 
789                 int width = 0;
790                 // try to determine/check the seg_len and width
791                 if (!seg_len) {
792                     width = 0;
793                     seg_len = row_len;
794                 } else {
795                     if (row_len * 3 == seg_len) {
796                         seg_len /= 3;
797                         width = 1;
798                     } else if (row_len / 3 == seg_len) {
799                         width = 3;
800                     } else if (row_len != seg_len) {
801                         NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
802                                    "CreateDensegFromStdseg(): "
803                                    "Std-seg segment lengths not accurate!");
804                     }
805                 }
806                 if (width > 0) {
807                     widths_determined[seg] = true;
808                     if (widths[row] > 0  &&  widths[row] != width) {
809                         NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
810                                    "CreateDensegFromStdseg(): "
811                                    "Std-seg segment lengths not accurate!");
812                     } else {
813                         widths[row] = width;
814                     }
815                 }
816 
817                 // get the id
818                 seq_id = &(*i)->GetInt().GetId();
819 
820                 // determine/check the strand
821                 if (interval.CanGetStrand()) {
822                     strand = interval.GetStrand();
823                     if (seg == 0  ||  strands[row] == eNa_strand_unknown) {
824                         strands[row] = strand;
825                     } else {
826                         if (strands[row] != strand) {
827                             NCBI_THROW(CSeqalignException,
828                                        eInvalidInputAlignment,
829                                        "CreateDensegFromStdseg(): "
830                                        "Inconsistent strands!");
831                         }
832                     }
833                 }
834             } else if ((*i)->IsEmpty()) {
835                 starts.push_back(-1);
836                 if (seg == 0) {
837                     strands[row] = eNa_strand_unknown;
838                 }
839                 seq_id = &(*i)->GetEmpty();
840                 row_lens.push_back(0);
841             }
842 
843             // determine/check the id
844             if (seg == 0) {
845                 CRef<CSeq_id> id(new CSeq_id);
846                 SerialAssign(*id, *seq_id);
847                 ds.SetIds().push_back(id);
848             } else {
849                 CSeq_id& id = *ds.SetIds()[row];
850                 if (!SerialEquals(id, *seq_id)) {
851                     if (SeqIdChooser) {
852                         SeqIdChooser->ChooseSeqId(id, *seq_id);
853                     } else {
854                         string errstr =
855                             string("CreateDensegFromStdseg(): Seq-ids: ") +
856                             id.AsFastaString() + " and " +
857                             seq_id->AsFastaString() + " are not identical!" +
858                             " Without the OM it cannot be determined if they belong to"
859                             " the same sequence."
860                             " Define and pass ChooseSeqId to resolve seq-ids.";
861                         NCBI_THROW(CSeqalignException, eInvalidInputAlignment, errstr);
862                     }
863                 }
864             }
865 
866             // next row
867             row++;
868             if (seg == 0) {
869                 dim++;
870             }
871         }
872         if (dim != ss.GetDim()  ||  row != dim) {
873             NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
874                        "CreateDensegFromStdseg(): "
875                        "Inconsistent dimentions!");
876         }
877 
878         if (widths_determined[seg]) {
879             // go back and determine/check widths
880             for (row = 0; row < dim; row++) {
881                 if ((row_len = row_lens[row]) > 0) {
882                     int width = 0;
883                     if (row_len == seg_len * 3) {
884                         width = 3;
885                     } else if (row_len == seg_len) {
886                         width = 1;
887                     }
888                     if (widths[row] > 0  &&  widths[row] != width) {
889                         NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
890                                    "CreateDensegFromStdseg(): "
891                                    "Std-seg segment lengths not accurate!");
892                     } else {
893                         widths[row] = width;
894                     }
895                 }
896             }
897         }
898 
899         // next seg
900         seg++;
901     }
902 
903     ds.SetDim(dim);
904     ds.SetNumseg(seg);
905 
906     // go back and finish lens determination
907     bool widths_failure = false;
908     bool widths_success = false;
909     for (seg = 0; seg < ds.GetNumseg(); seg++) {
910         if (!widths_determined[seg]) {
911             for(row = 0; row < dim; row++) {
912                 if (starts[seg * dim + row] >= 0) {
913                     int width = widths[row];
914                     if (width == 3) {
915                         seg_lens[seg] /= 3;
916                     } else if (width == 0) {
917                         widths_failure = true;
918                     }
919                     break;
920                 }
921             }
922         } else {
923             widths_success = true;
924         }
925     }
926 
927     if (widths_failure) {
928         if (widths_success) {
929             NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
930                        "CreateDensegFromStdseg(): "
931                        "Some widths cannot be determined!");
932         } else {
933             ds.ResetWidths();
934         }
935     }
936 
937     // finish the strands
938     for (seg = 1; seg < ds.GetNumseg(); seg++) {
939         for (row = 0; row < dim; row++) {
940             strands.push_back(strands[row]);
941         }
942     }
943 
944     return sa;
945 }
946 
947 
948 ///----------------------------------------------------------------------------
949 /// PRE : the Seq-align is a Dense-seg of aligned nucleotide sequences
950 /// POST: Seq_align of type Dense-seg is created with each of the m_Widths
951 ///       equal to 3 and m_Lengths devided by 3.
952 CRef<CSeq_align>
CreateTranslatedDensegFromNADenseg() const953 CSeq_align::CreateTranslatedDensegFromNADenseg() const
954 {
955     if ( !GetSegs().IsDenseg() ) {
956         NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
957                    "CSeq_align::CreateTranslatedDensegFromNADenseg(): "
958                    "Input Seq-align should have segs of type Dense-seg!");
959     }
960 
961     CRef<CSeq_align> sa(new CSeq_align);
962     sa->SetType(eType_not_set);
963 
964     if (GetSegs().GetDenseg().IsSetWidths()) {
965         NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
966                    "CSeq_align::CreateTranslatedDensegFromNADenseg(): "
967                    "Widths already exist for the original alignment");
968     }
969 
970     // copy from the original
971     sa->Assign(*this);
972 
973     CDense_seg& ds = sa->SetSegs().SetDenseg();
974 
975     // fix the lengths
976     const CDense_seg::TLens& orig_lens = GetSegs().GetDenseg().GetLens();
977     CDense_seg::TLens&       lens      = ds.SetLens();
978 
979     for (CDense_seg::TNumseg numseg = 0; numseg < ds.GetNumseg(); numseg++) {
980         if (orig_lens[numseg] % 3) {
981             string errstr =
982                 string("CSeq_align::CreateTranslatedDensegFromNADenseg(): ") +
983                 "Length of segment " + NStr::IntToString(numseg) +
984                 " is not divisible by 3.";
985             NCBI_THROW(CSeqalignException, eInvalidInputAlignment, errstr);
986         } else {
987             lens[numseg] = orig_lens[numseg] / 3;
988         }
989     }
990 
991     // add the widths
992     ds.SetWidths().resize(ds.GetDim(), 3);
993 
994 #if _DEBUG
995     ds.Validate(true);
996 #endif
997 
998     return sa;
999 }
1000 
1001 
1002 /// Strict weak ordering for pairs (by first)
1003 /// Used by CreateDensegFromDisc
1004 template <typename T, typename Pred = less<TSeqPos> >
1005 struct ds_cmp {
operator ()ds_cmp1006     bool operator()(const T& x, const T& y) {
1007         return m_Pred(x.first, y.first);
1008     }
1009 private:
1010     Pred m_Pred;
1011 };
1012 
1013 
1014 CRef<CSeq_align>
CreateDensegFromDisc(SSeqIdChooser * SeqIdChooser) const1015 CSeq_align::CreateDensegFromDisc(SSeqIdChooser* SeqIdChooser) const
1016 {
1017     if ( !GetSegs().IsDisc() ) {
1018         NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
1019                    "CSeq_align::CreateDensegFromDisc(): "
1020                    "Input Seq-align should have segs of type StdSeg!");
1021     }
1022 
1023     CRef<CSeq_align> new_sa(new CSeq_align);
1024     new_sa->SetType(eType_not_set);
1025     if (IsSetScore()) {
1026         new_sa->SetScore() = GetScore();
1027     }
1028 
1029     CDense_seg& new_ds = new_sa->SetSegs().SetDenseg();
1030 
1031     new_ds.SetDim(0);
1032     new_ds.SetNumseg(0);
1033 
1034 
1035     /// Order the discontinuous densegs
1036     typedef pair<TSeqPos, const CDense_seg *> TPosDsPair;
1037     typedef vector<TPosDsPair> TDsVec;
1038     TDsVec ds_vec;
1039     ds_vec.reserve(GetSegs().GetDisc().Get().size());
1040     int strand = -1;
1041     ITERATE (CSeq_align_set::Tdata, sa_i, GetSegs().GetDisc().Get()) {
1042         const CDense_seg& ds = (*sa_i)->GetSegs().GetDenseg();
1043         ds_vec.push_back(make_pair<TSeqPos, const CDense_seg *>(ds.GetSeqStart(0), &ds));
1044         if (ds.IsSetStrands()  &&
1045             !ds.GetStrands().empty()) {
1046             if (strand < 0) {
1047                 strand = ds.GetStrands()[0];
1048             } else {
1049                 if (strand != ds.GetStrands()[0]) {
1050                     NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
1051                                "CreateDensegFromDisc(): "
1052                                "Inconsistent strands!");
1053                 }
1054             }
1055         }
1056     }
1057     if ( !IsReverse(ENa_strand(strand)) ) {
1058         sort(ds_vec.begin(), ds_vec.end(),
1059              ds_cmp<TPosDsPair>());
1060     } else {
1061         sort(ds_vec.begin(), ds_vec.end(),
1062              ds_cmp<TPosDsPair, greater<TSeqPos> >());
1063     }
1064 
1065 
1066     /// First pass: determine dim & numseg
1067     CDense_seg::TStrands single_segment_strands;
1068     ITERATE(TDsVec, ds_i, ds_vec) {
1069         const CDense_seg& ds = *ds_i->second;
1070 
1071         /// Numseg
1072         new_ds.SetNumseg() += ds.GetNumseg();
1073 
1074         /// Dim
1075         if ( !new_ds.GetDim() ) {
1076             new_ds.SetDim(ds.GetDim());
1077         } else if (new_ds.GetDim() != ds.GetDim() ) {
1078             NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
1079                        "CreateDensegFromDisc(): "
1080                        "All disc dense-segs need to have the same dimension!");
1081         }
1082 
1083         /// Strands?
1084         if ( !ds.GetStrands().empty() ) {
1085             if (single_segment_strands.empty()) {
1086                 single_segment_strands.assign(ds.GetStrands().begin(),
1087                     ds.GetStrands().begin() + ds.GetDim());
1088             } else {
1089                 if ( !equal(single_segment_strands.begin(),
1090                             single_segment_strands.end(),
1091                             ds.GetStrands().begin()) ) {
1092                     NCBI_THROW(CSeqalignException, eInvalidInputAlignment,
1093                                "CreateDensegFromDisc(): "
1094                                "All disc dense-segs need to have the same strands!");
1095                 }
1096             }
1097         }
1098     }
1099 
1100     new_ds.SetStarts().resize(new_ds.GetDim() * new_ds.GetNumseg());
1101     new_ds.SetLens().resize(new_ds.GetNumseg());
1102     if ( !single_segment_strands.empty() ) {
1103         /// Multiply the strands by the number of segments.
1104         new_ds.SetStrands().reserve(new_ds.GetDim() * new_ds.GetNumseg());
1105         for (CDense_seg::TNumseg seg = 0;
1106              seg < new_ds.GetNumseg();
1107              ++seg) {
1108             new_ds.SetStrands().insert(new_ds.SetStrands().end(),
1109                 single_segment_strands.begin(),
1110                 single_segment_strands.begin() + new_ds.GetDim());
1111         }
1112     }
1113 
1114 
1115     /// Second pass: validate and set ids and starts
1116 
1117     CDense_seg::TNumseg new_seg = 0;
1118     int                 new_starts_i = 0;
1119 
1120     ITERATE(TDsVec, ds_i, ds_vec) {
1121         const CDense_seg& ds = *ds_i->second;
1122 
1123         _ASSERT(ds.GetStarts().size() == (size_t)ds.GetNumseg() * ds.GetDim());
1124         _ASSERT(new_ds.GetDim() == ds.GetDim());
1125 
1126         /// Ids
1127         if (new_ds.GetIds().empty()) {
1128             new_ds.SetIds().resize(new_ds.GetDim());
1129             for (CDense_seg::TDim row = 0;  row < ds.GetDim();  ++row) {
1130                 CRef<CSeq_id> id(new CSeq_id);
1131                 SerialAssign(*id, *ds.GetIds()[row]);
1132                 new_ds.SetIds()[row] = id;
1133             }
1134         } else {
1135             if (SeqIdChooser) {
1136                 for (CDense_seg::TDim row = 0;  row < ds.GetDim();  ++row) {
1137                     SeqIdChooser->ChooseSeqId(*new_ds.SetIds()[row], *ds.GetIds()[row]);
1138                 }
1139             } else {
1140 #if _DEBUG
1141                 for (CDense_seg::TDim row = 0;  row < ds.GetDim();  ++row) {
1142                     const CSeq_id& new_id = *new_ds.GetIds()[row];
1143                     const CSeq_id& id     = *ds.GetIds()[row];
1144                     if ( !SerialEquals(new_id, id) ) {
1145                         string errstr =
1146                             string("CreateDensegFromDisc(): Seq-ids: ") +
1147                             new_id.AsFastaString() + " and " +
1148                             id.AsFastaString() + " are not identical!" +
1149                             " Without the OM it cannot be determined if they belong to"
1150                             " the same sequence."
1151                             " Define and pass ChooseSeqId to resolve seq-ids.";
1152                         NCBI_THROW(CSeqalignException, eInvalidInputAlignment, errstr);
1153                     }
1154                 }
1155 #endif
1156             }
1157         }
1158 
1159         /// Starts
1160         int starts_i = 0;
1161         for (CDense_seg::TNumseg seg = 0;
1162              seg < ds.GetNumseg();
1163              ++new_seg, ++seg) {
1164 
1165             new_ds.SetLens()[new_seg] = ds.GetLens()[seg];
1166 
1167             for (CDense_seg::TDim row = 0;
1168                  row < ds.GetDim();
1169                  ++starts_i, ++new_starts_i, ++row) {
1170                 new_ds.SetStarts()[new_starts_i] = ds.GetStarts()[starts_i];
1171             }
1172         }
1173     }
1174 
1175 
1176     /// Perform full test (including segment starts consistency check)
1177     new_sa->Validate(true);
1178 
1179 
1180     return new_sa;
1181 }
1182 
s_Distance(const TSeqRange & range1,const TSeqRange & range2)1183 static TSeqPos s_Distance(const TSeqRange &range1,
1184                           const TSeqRange &range2)
1185 {
1186     if (range1.IntersectingWith(range2)) {
1187         return 0;
1188     }
1189     return max(range1,range2).GetFrom() - min(range1,range2).GetTo();
1190 }
1191 
1192 /// Join the alignments in the set into one alignment.
s_GetJoinedAlignment(const CSeq_align_set & aligns)1193 static CRef<CSeq_align> s_GetJoinedAlignment(const CSeq_align_set &aligns)
1194 {
1195     if (aligns.Get().size() == 1) {
1196         return aligns.Get().front();
1197     }
1198 
1199     /// There's more than one, so create a disc alignment
1200     CRef<CSeq_align> align(new CSeq_align);
1201     align->SetDim(2);
1202     align->SetType(CSeq_align::eType_partial);
1203     align->SetSegs().SetDisc().Set() = aligns.Get();
1204 
1205     try {
1206         align->Validate(true);
1207     }
1208     catch (CException& e) {
1209         ERR_POST(Error << e);
1210         cerr << MSerial_AsnText << aligns;
1211         cerr << MSerial_AsnText << *align;
1212         throw;
1213     }
1214     return align;
1215 }
1216 
SplitOnLongDiscontinuity(list<CRef<CSeq_align>> & aligns,TSeqPos discontinuity_threshold) const1217 void CSeq_align::SplitOnLongDiscontinuity(list< CRef<CSeq_align> >& aligns,
1218                                           TSeqPos discontinuity_threshold) const
1219 {
1220     if (IsSetDim() && GetDim() > 2) {
1221         NCBI_THROW(CException, eUnknown,
1222             "SplitOnLongDiscontinuity() only implemented for pairwise alignments");
1223     }
1224 
1225     size_t num_splits = 0;
1226     switch (GetSegs().Which()) {
1227     case TSegs::e_Denseg:
1228     {{
1229         // determine if we can extract any slice based on a discontinuity
1230         const CDense_seg& seg = GetSegs().GetDenseg();
1231         CDense_seg::TNumseg last_seg = 0;
1232         CDense_seg::TNumseg curr_seg = 0;
1233         TSeqRange curr_range0;
1234         TSeqRange curr_range1;
1235         for ( ;  curr_seg < seg.GetNumseg();  ++curr_seg) {
1236             TSignedSeqPos p0_curr = seg.GetStarts()[ curr_seg * 2 ];
1237             TSignedSeqPos p1_curr = seg.GetStarts()[ curr_seg * 2 + 1];
1238             if (p0_curr < 0 || p1_curr < 0) {
1239                 continue;
1240             }
1241 
1242             TSeqPos curr_seg_len = seg.GetLens()[curr_seg];
1243 
1244             TSeqRange prev_range0 = curr_range0;
1245             TSeqRange prev_range1 = curr_range1;
1246             curr_range0 = TSeqRange(p0_curr, p0_curr+curr_seg_len);
1247             curr_range1 = TSeqRange(p1_curr, p1_curr+curr_seg_len);
1248 
1249             if (prev_range0.Empty()) {
1250                 /// We're at start of alignment, so there's no gap to check for
1251                 continue;
1252             }
1253 
1254             if (prev_range0.NotEmpty() &&
1255                 (s_Distance(curr_range0, prev_range0) > discontinuity_threshold ||
1256                  s_Distance(curr_range1, prev_range1) > discontinuity_threshold))
1257             {
1258                 aligns.push_back(x_CreateSubsegAlignment(last_seg, curr_seg - 1));
1259                 ++num_splits;
1260                 last_seg = curr_seg;
1261             }
1262         }
1263         if (num_splits) {
1264             /// We had to split the alignment; create a final subseg alignment
1265             aligns.push_back(x_CreateSubsegAlignment(last_seg, curr_seg - 1));
1266             ++num_splits;
1267         }
1268     }}
1269     break;
1270 
1271     case TSegs::e_Disc:
1272     {{
1273         const CSeq_align_set &segs = GetSegs().GetDisc();
1274         CSeq_align_set curr_disc;
1275         TSeqRange curr_range0;
1276         TSeqRange curr_range1;
1277         ITERATE (list< CRef<CSeq_align> >, seg_it, segs.Get()) {
1278             TSeqRange prev_range0 = curr_range0;
1279             TSeqRange prev_range1 = curr_range1;
1280             curr_range0 = (*seg_it)->GetSeqRange(0);
1281             curr_range1 = (*seg_it)->GetSeqRange(1);
1282             list< CRef<CSeq_align> > seg_splits;
1283             (*seg_it)->SplitOnLongDiscontinuity(seg_splits,
1284                                                 discontinuity_threshold);
1285 
1286             if (prev_range0.Empty() ||
1287                 (s_Distance(curr_range0, prev_range0) <= discontinuity_threshold &&
1288                  s_Distance(curr_range1, prev_range1) <= discontinuity_threshold))
1289             {
1290                 /// distance between this alignment and previous one is less than
1291                 /// threshold; first split section of this alignment should be joined
1292                 /// to previous one
1293                 curr_disc.Set().push_back(seg_splits.front());
1294                 seg_splits.pop_front();
1295                 if (seg_splits.empty()) {
1296                     continue;
1297                 }
1298             }
1299 
1300             /// Put current contents of curr_disc in aligns
1301             aligns.push_back(s_GetJoinedAlignment(curr_disc));
1302             ++num_splits;
1303 
1304             /// Put results of current section's split in aligns except for last
1305             /// section; place last section in curr_disc, since it may have to be
1306             /// joined with next one
1307             num_splits += seg_splits.size();
1308             curr_disc.Set().clear();
1309             curr_disc.Set().push_back(seg_splits.back());
1310             seg_splits.pop_back();
1311 
1312             aligns.splice(aligns.end(), seg_splits);
1313         }
1314         if (num_splits) {
1315             /// We had to split the alignment; create a final alignment from
1316             /// the contents of curr_disc
1317             aligns.push_back(s_GetJoinedAlignment(curr_disc));
1318             ++num_splits;
1319         }
1320     }}
1321     break;
1322 
1323     default:
1324     break;
1325     }
1326 
1327     if (!num_splits) {
1328         aligns.push_back(CRef<CSeq_align>(const_cast<CSeq_align *>(this)));
1329     }
1330 }
1331 
OffsetRow(TDim row,TSignedSeqPos offset)1332 void CSeq_align::OffsetRow(TDim row,
1333                           TSignedSeqPos offset)
1334 {
1335     if (offset == 0) return;
1336 
1337     switch (SetSegs().Which()) {
1338     case TSegs::e_Dendiag:
1339         NON_CONST_ITERATE(TSegs::TDendiag, dendiag_it, SetSegs().SetDendiag()) {
1340             (*dendiag_it)->OffsetRow(row, offset);
1341         }
1342         break;
1343     case TSegs::e_Denseg:
1344         SetSegs().SetDenseg().OffsetRow(row, offset);
1345         break;
1346     case TSegs::e_Std:
1347         NON_CONST_ITERATE(TSegs::TStd, std_it, SetSegs().SetStd()) {
1348             (*std_it)->OffsetRow(row, offset);
1349         }
1350         break;
1351     case TSegs::e_Disc:
1352         NON_CONST_ITERATE(CSeq_align_set::Tdata, seq_align_it, SetSegs().SetDisc().Set()) {
1353             (*seq_align_it)->OffsetRow(row, offset);
1354         }
1355         break;
1356     default:
1357         NCBI_THROW(CSeqalignException, eUnsupported,
1358                    "CSeq_align::OffsetRow() currently does not handle "
1359                    "this type of alignment");
1360     }
1361 }
1362 
1363 
1364 /// @deprecated
RemapToLoc(TDim row,const CSeq_loc & dst_loc,bool ignore_strand)1365 void CSeq_align::RemapToLoc(TDim row,
1366                             const CSeq_loc& dst_loc,
1367                             bool ignore_strand)
1368 {
1369     // Limit to certain types of locs:
1370     switch (dst_loc.Which()) {
1371     case CSeq_loc::e_Whole:
1372         return;
1373     case CSeq_loc::e_Int:
1374         break;
1375     default:
1376         NCBI_THROW(CSeqalignException, eUnsupported,
1377                    "CSeq_align::RemapToLoc only supports int target seq-locs");
1378     }
1379 
1380     switch (SetSegs().Which()) {
1381     case TSegs::e_Denseg:
1382         SetSegs().SetDenseg().RemapToLoc(row, dst_loc, ignore_strand);
1383         break;
1384     case TSegs::e_Std:
1385         NON_CONST_ITERATE(TSegs::TStd, std_it, SetSegs().SetStd()) {
1386             (*std_it)->RemapToLoc(row, dst_loc, ignore_strand);
1387         }
1388         break;
1389     case TSegs::e_Disc:
1390         NON_CONST_ITERATE(CSeq_align_set::Tdata, seq_align_it, SetSegs().SetDisc().Set()) {
1391             (*seq_align_it)->RemapToLoc(row, dst_loc, ignore_strand);
1392         }
1393         break;
1394     default:
1395         NCBI_THROW(CSeqalignException, eUnsupported,
1396                    "CSeq_align::RemapToLoc only supports Dense-seg and Std-seg alignments.");
1397     }
1398 }
1399 
1400 
RemapAlignToLoc(const CSeq_align & align,CSeq_align::TDim row,const CSeq_loc & loc)1401 CRef<CSeq_align> RemapAlignToLoc(const CSeq_align& align,
1402                                  CSeq_align::TDim  row,
1403                                  const CSeq_loc&   loc)
1404 {
1405     if ( loc.IsWhole() ) {
1406         CRef<CSeq_align> copy(new CSeq_align);
1407         copy->Assign(align);
1408         return copy;
1409     }
1410     const CSeq_id* orig_id = loc.GetId();
1411     if ( !orig_id ) {
1412         NCBI_THROW(CAnnotMapperException, eBadLocation,
1413                    "Location with multiple ids can not be used to "
1414                    "remap seq-aligns.");
1415     }
1416     CRef<CSeq_id> id(new CSeq_id);
1417     id->Assign(*orig_id);
1418 
1419     // Create source seq-loc
1420     TSeqPos len = 0;
1421     for (CSeq_loc_CI it(loc); it; ++it) {
1422         if ( it.IsWhole() ) {
1423             NCBI_THROW(CAnnotMapperException, eBadLocation,
1424                     "Whole seq-loc can not be used to "
1425                     "remap seq-aligns.");
1426         }
1427         len += it.GetRange().GetLength();
1428     }
1429     CSeq_loc src_loc(*id, 0, len - 1);
1430     ENa_strand strand = loc.GetStrand();
1431     if (strand != eNa_strand_unknown) {
1432         src_loc.SetStrand(strand);
1433     }
1434     CSeq_loc_Mapper_Base mapper(src_loc, loc);
1435     return mapper.Map(align, row);
1436 }
1437 
1438 /// Get length of intersection between a range and a range collection
s_IntersectionLength(const CRangeCollection<TSeqPos> & ranges,const TSeqRange & range)1439 static inline TSeqPos s_IntersectionLength(const CRangeCollection<TSeqPos> &ranges,
1440                                            const TSeqRange &range)
1441 {
1442     TSeqPos length = 0;
1443     ITERATE (CRangeCollection<TSeqPos>, it, ranges) {
1444         length += it->IntersectionWith(range).GetLength();
1445     }
1446     return length;
1447 }
1448 
1449 /// Retrieves the number of gaps in an alignment
1450 /// @param align Alignment to examine [in]
1451 /// @param get_total_count if true, the total number of gaps is retrived,
1452 /// otherwise only the number of gap openings is retrieved
1453 /// @throws CSeqalignException if alignment type is not supported
1454 static TSeqPos
s_GetGapCount(const CSeq_align & align,CSeq_align::TDim row,const CRangeCollection<TSeqPos> & ranges,bool get_total_count)1455 s_GetGapCount(const CSeq_align& align, CSeq_align::TDim row,
1456               const CRangeCollection<TSeqPos> &ranges,
1457               bool get_total_count)
1458 {
1459     if (ranges.empty()) {
1460         return 0;
1461     }
1462 
1463     TSeqPos retval = 0;
1464     switch (align.GetSegs().Which()) {
1465     case CSeq_align::TSegs::e_Denseg:
1466         {{
1467             const CDense_seg& ds = align.GetSegs().GetDenseg();
1468             for (CDense_seg::TNumseg i = 0;  i < ds.GetNumseg();  ++i) {
1469                 bool is_gapped = false;
1470                 for (CDense_seg::TDim j = 0;  j < ds.GetDim();  ++j) {
1471                     if (ds.GetStarts()[i * ds.GetDim() + j] == -1 &&
1472                         (row < 0 || row == j))
1473                     {
1474                         is_gapped = true;
1475                         break;
1476                     }
1477                 }
1478                 if (is_gapped) {
1479                     TSeqPos gap_len = ds.GetLens()[i];
1480                     if (!ranges.begin()->IsWhole()) {
1481                         TSignedSeqPos gap_start = ds.GetStarts()[i * ds.GetDim()];
1482                         if (gap_start >= 0) {
1483                             gap_len = s_IntersectionLength(ranges,
1484                                 TSeqRange(gap_start, gap_start + gap_len - 1));
1485                         } else {
1486                             gap_start = ds.GetStarts()[(i-1) * ds.GetDim()]
1487                                       + ds.GetLens()[i-1];
1488                             if (s_IntersectionLength(ranges,
1489                                     TSeqRange(gap_start, gap_start)) == 0)
1490                             {
1491                                 gap_len = 0;
1492                             }
1493                         }
1494                     }
1495                     retval += (get_total_count ? gap_len : (gap_len ? 1 : 0));
1496                 }
1497             }
1498         }}
1499         break;
1500 
1501     case CSeq_align::TSegs::e_Disc:
1502         {{
1503             ITERATE(CSeq_align::TSegs::TDisc::Tdata, iter,
1504                     align.GetSegs().GetDisc().Get()) {
1505                 retval += s_GetGapCount(**iter, row, ranges, get_total_count);
1506             }
1507         }}
1508         break;
1509 
1510     case CSeq_align::TSegs::e_Spliced:
1511         {{
1512             ITERATE (CSpliced_seg::TExons, iter, align.GetSegs().GetSpliced().GetExons()) {
1513                 for (CSeq_align::TDim r = 0; r < 2; ++r) {
1514                     if (row != r) {
1515                         CRangeCollection<TSeqPos> insertions =
1516                              (*iter)->GetRowSeq_insertions(
1517                                   r, align.GetSegs().GetSpliced(), ranges);
1518                         retval += get_total_count ? insertions.GetCoveredLength()
1519                                                   : insertions.size();
1520                     }
1521                 }
1522             }
1523         }}
1524         break;
1525 
1526     case CSeq_align::TSegs::e_Std:
1527         if (row < 0 && !get_total_count && ranges.begin()->IsWhole()) {
1528             ITERATE (CSeq_align::TSegs::TStd, iter, align.GetSegs().GetStd()) {
1529                 const CStd_seg& seg = **iter;
1530                 ITERATE (CStd_seg::TLoc, it, seg.GetLoc()) {
1531                     const CSeq_loc& loc = **it;
1532                     if (loc.IsEmpty()) {
1533                         ++retval;
1534                         break;
1535                     }
1536                 }
1537             }
1538             break;
1539         }
1540 
1541     default:
1542         NCBI_THROW(CSeqalignException, eUnsupported,
1543                    "CSeq_align::GetGapCount() currently does not handle "
1544                    "this type of alignment.");
1545     }
1546 
1547     return retval;
1548 }
1549 
GetTotalGapCount(TDim row) const1550 TSeqPos CSeq_align::GetTotalGapCount(TDim row) const
1551 {
1552     return s_GetGapCount(*this, row,
1553                          CRangeCollection<TSeqPos>(TSeqRange::GetWhole()),
1554                          true);
1555 }
1556 
GetNumGapOpenings(TDim row) const1557 TSeqPos CSeq_align::GetNumGapOpenings(TDim row) const
1558 {
1559     return s_GetGapCount(*this, row,
1560                          CRangeCollection<TSeqPos>(TSeqRange::GetWhole()),
1561                          false);
1562 }
1563 
GetTotalGapCountWithinRange(const TSeqRange & range,TDim row) const1564 TSeqPos CSeq_align::GetTotalGapCountWithinRange(
1565     const TSeqRange &range, TDim row) const
1566 {
1567     return s_GetGapCount(*this, row, CRangeCollection<TSeqPos>(range), true);
1568 }
1569 
GetNumGapOpeningsWithinRange(const TSeqRange & range,TDim row) const1570 TSeqPos CSeq_align::GetNumGapOpeningsWithinRange(
1571     const TSeqRange &range, TDim row) const
1572 {
1573     return s_GetGapCount(*this, row, CRangeCollection<TSeqPos>(range), false);
1574 }
1575 
GetTotalGapCountWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1576 TSeqPos CSeq_align::GetTotalGapCountWithinRanges(
1577     const CRangeCollection<TSeqPos> &ranges, TDim row) const
1578 {
1579     return s_GetGapCount(*this, row, ranges, true);
1580 }
1581 
GetNumGapOpeningsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1582 TSeqPos CSeq_align::GetNumGapOpeningsWithinRanges(
1583     const CRangeCollection<TSeqPos> &ranges, TDim row) const
1584 {
1585     return s_GetGapCount(*this, row, ranges, false);
1586 }
1587 
AsString() const1588 string CSeq_align::SIndel::AsString() const
1589 {
1590     return NStr::NumericToString(genomic_pos)
1591          + (row == 0 ? 'I' : 'D')
1592          + NStr::NumericToString(length);
1593 }
1594 
1595 static vector<CSeq_align::SIndel>
s_GetIndels(const CSeq_align & align,CSeq_align::TDim row,const CRangeCollection<TSeqPos> & ranges,bool include_frameshifts,bool include_non_frameshifts)1596 s_GetIndels(const CSeq_align& align, CSeq_align::TDim row,
1597             const CRangeCollection<TSeqPos> &ranges,
1598             bool include_frameshifts, bool include_non_frameshifts)
1599 {
1600     vector<CSeq_align::SIndel> results;
1601     if (ranges.empty()) {
1602         return results;;
1603     }
1604 
1605     CRef<CSeq_align> generated_denseg;
1606     switch (align.GetSegs().Which()) {
1607     case CSeq_align::TSegs::e_Denseg:
1608         break;
1609 
1610     case CSeq_align::TSegs::e_Disc:
1611         generated_denseg = align.CreateDensegFromDisc();
1612         break;
1613 
1614     case CSeq_align::TSegs::e_Spliced:
1615         {{
1616             CRef<CSeq_align> as_disc_seg =
1617                 align.GetSegs().GetSpliced().AsDiscSeg();
1618             if (align.GetSeqStart(1) > align.GetSeqStop(1)) {
1619                 /// genomic sequence is circular; won't work on the entire
1620                 /// alignment, need to check separately on the parts before and
1621                 /// after the origin
1622                 CSeq_align before_origin, after_origin;
1623                 before_origin.SetType(CSeq_align::eType_disc);
1624                 after_origin.SetType(CSeq_align::eType_disc);
1625                 for (const CRef<CSeq_align> &segment
1626                      : as_disc_seg->GetSegs().GetDisc().Get())
1627                 {
1628                     (segment->GetSeqStart(1) < align.GetSeqStart(1)
1629                        ? after_origin : before_origin)
1630                     . SetSegs().SetDisc().Set().push_back(segment);
1631                 }
1632                 vector<CSeq_align::SIndel> before_origin_frameshifts =
1633                     s_GetIndels(before_origin, row, ranges, include_frameshifts, include_non_frameshifts),
1634                                            after_origin_frameshifts =
1635                     s_GetIndels(after_origin, row, ranges, include_frameshifts, include_non_frameshifts);
1636                 results = before_origin_frameshifts;
1637                 results.insert(results.end(), after_origin_frameshifts.begin(),
1638                                               after_origin_frameshifts.end());
1639                 return results;
1640             }
1641             generated_denseg = as_disc_seg->CreateDensegFromDisc();
1642         }}
1643         break;
1644 
1645     default:
1646         NCBI_THROW(CSeqalignException, eUnsupported,
1647                    "CSeq_align::GetNumFrameshifts() currently does not handle "
1648                    "this type of alignment.");
1649     }
1650 
1651     if (generated_denseg) {
1652         generated_denseg->SetSegs().SetDenseg().OrderAdjacentGaps();
1653         generated_denseg->SetSegs().SetDenseg().Compact();
1654         generated_denseg->SetSegs().SetDenseg().RemovePureGapSegs();
1655     }
1656 
1657     const CDense_seg &ds = (generated_denseg ? *generated_denseg : align)
1658                            . GetSegs().GetDenseg();
1659 
1660             for (CDense_seg::TNumseg i = 0;  i < ds.GetNumseg();  ++i) {
1661                 bool is_gapped = false;
1662                 for (CDense_seg::TDim j = 0;  j < ds.GetDim();  ++j) {
1663                     if (ds.GetStarts()[i * ds.GetDim() + j] == -1 && row != j)
1664                     {
1665                         is_gapped = true;
1666                         break;
1667                     }
1668                 }
1669                 if (is_gapped) {
1670                     TSeqPos gap_len = ds.GetLens()[i];
1671                     if (!ranges.begin()->IsWhole()) {
1672                         TSignedSeqPos gap_start = ds.GetStarts()[i * ds.GetDim()];
1673                         if (gap_start >= 0) {
1674                             gap_len = s_IntersectionLength(ranges,
1675                                 TSeqRange(gap_start, gap_start + gap_len - 1));
1676                         } else {
1677                             gap_start = ds.GetStarts()[(i-1) * ds.GetDim()]
1678                                       + ds.GetLens()[i-1];
1679                             if (s_IntersectionLength(ranges,
1680                                     TSeqRange(gap_start, gap_start)) == 0)
1681                             {
1682                                 gap_len = 0;
1683                             }
1684                         }
1685                     }
1686                     if (gap_len > 0 &&
1687                        ((include_frameshifts && gap_len % 3 != 0) ||
1688                         (include_non_frameshifts && gap_len % 3 == 0)))
1689                     {
1690                         TSignedSeqPos genomic_gap_start = ds.GetStarts()[i*ds.GetDim() + 1];
1691                         CSeq_align::TDim inserted_row = 1;
1692                         if (genomic_gap_start < 0) {
1693                             ENa_strand genomic_strand = ds.IsSetStrands() ? ds.GetStrands()[(i-1) * ds.GetDim() + 1] : eNa_strand_plus;
1694                             genomic_gap_start = genomic_strand == eNa_strand_minus
1695                                  ? ds.GetStarts()[(i+1) * ds.GetDim() + 1]
1696                                       + ds.GetLens()[i+1]
1697                                  : ds.GetStarts()[(i-1) * ds.GetDim() + 1]
1698                                       + ds.GetLens()[i-1];
1699                             inserted_row = 0;
1700                         }
1701                         results.push_back(CSeq_align::SIndel(genomic_gap_start, inserted_row, gap_len));
1702                     }
1703                 }
1704             }
1705 
1706     return results;
1707 }
1708 
GetNumFrameshifts(TDim row) const1709 TSeqPos CSeq_align::GetNumFrameshifts(TDim row) const
1710 {
1711     return s_GetIndels(*this, row,
1712                          CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), true, false)
1713            .size();
1714 }
1715 
GetNumFrameshiftsWithinRange(const TSeqRange & range,TDim row) const1716 TSeqPos CSeq_align::GetNumFrameshiftsWithinRange(
1717     const TSeqRange &range, TDim row) const
1718 {
1719     return s_GetIndels(*this, row, CRangeCollection<TSeqPos>(range), true, false)
1720            .size();
1721 }
1722 
GetNumFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1723 TSeqPos CSeq_align::GetNumFrameshiftsWithinRanges(
1724     const CRangeCollection<TSeqPos> &ranges, TDim row) const
1725 {
1726     return s_GetIndels(*this, row, ranges, true, false)
1727            .size();
1728 }
1729 
GetFrameshifts(TDim row) const1730 vector<CSeq_align::SIndel> CSeq_align::GetFrameshifts(TDim row) const
1731 {
1732     return s_GetIndels(*this, row,
1733                          CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), true, false);
1734 }
1735 
GetFrameshiftsWithinRange(const TSeqRange & range,TDim row) const1736 vector<CSeq_align::SIndel> CSeq_align::GetFrameshiftsWithinRange(
1737     const TSeqRange &range, TDim row) const
1738 {
1739     return s_GetIndels(*this, row, CRangeCollection<TSeqPos>(range), true, false);
1740 }
1741 
GetFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1742 vector<CSeq_align::SIndel> CSeq_align::GetFrameshiftsWithinRanges(
1743     const CRangeCollection<TSeqPos> &ranges, TDim row) const
1744 {
1745     return s_GetIndels(*this, row, ranges, true, false);
1746 }
1747 
GetNonFrameshifts(TDim row) const1748 vector<CSeq_align::SIndel> CSeq_align::GetNonFrameshifts(TDim row) const
1749 {
1750     return s_GetIndels(*this, row,
1751                          CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), false, true);
1752 }
1753 
GetNonFrameshiftsWithinRange(const TSeqRange & range,TDim row) const1754 vector<CSeq_align::SIndel> CSeq_align::GetNonFrameshiftsWithinRange(
1755     const TSeqRange &range, TDim row) const
1756 {
1757     return s_GetIndels(*this, row, CRangeCollection<TSeqPos>(range), false, true);
1758 }
1759 
GetNonFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1760 vector<CSeq_align::SIndel> CSeq_align::GetNonFrameshiftsWithinRanges(
1761     const CRangeCollection<TSeqPos> &ranges, TDim row) const
1762 {
1763     return s_GetIndels(*this, row, ranges, false, true);
1764 }
1765 
GetIndels(TDim row) const1766 vector<CSeq_align::SIndel> CSeq_align::GetIndels(TDim row) const
1767 {
1768     return s_GetIndels(*this, row,
1769                          CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), true, true);
1770 }
1771 
GetIndelsWithinRange(const TSeqRange & range,TDim row) const1772 vector<CSeq_align::SIndel> CSeq_align::GetIndelsWithinRange(
1773     const TSeqRange &range, TDim row) const
1774 {
1775     return s_GetIndels(*this, row, CRangeCollection<TSeqPos>(range), true, true);
1776 }
1777 
GetIndelsWithinRanges(const CRangeCollection<TSeqPos> & ranges,TDim row) const1778 vector<CSeq_align::SIndel> CSeq_align::GetIndelsWithinRanges(
1779     const CRangeCollection<TSeqPos> &ranges, TDim row) const
1780 {
1781     return s_GetIndels(*this, row, ranges, true, true);
1782 }
1783 
1784 
GetAlignedBases(TDim row) const1785 CRangeCollection<TSeqPos> CSeq_align::GetAlignedBases(TDim row) const
1786 {
1787     CRangeCollection<TSeqPos> ranges;
1788     switch (GetSegs().Which()) {
1789     case CSeq_align::TSegs::e_Denseg:
1790         {{
1791             const CDense_seg& ds = GetSegs().GetDenseg();
1792             for (CDense_seg::TNumseg i = 0;  i < ds.GetNumseg();  ++i) {
1793                 bool is_gapped = false;
1794                 for (CDense_seg::TDim j = 0;  j < ds.GetDim();  ++j) {
1795                     if (ds.GetStarts()[i * ds.GetDim() + j] == -1)
1796                     {
1797                         is_gapped = true;
1798                         break;
1799                     }
1800                 }
1801                 if (!is_gapped) {
1802                     TSignedSeqPos start = ds.GetStarts()[i * ds.GetDim() + row];
1803                     ranges += TSeqRange(start, start + ds.GetLens()[i] - 1);
1804                 }
1805             }
1806         }}
1807         break;
1808 
1809     case CSeq_align::TSegs::e_Disc:
1810         {{
1811             ITERATE(CSeq_align::TSegs::TDisc::Tdata, iter,
1812                     GetSegs().GetDisc().Get()) {
1813                 ranges += (*iter)->GetAlignedBases(row);
1814             }
1815         }}
1816         break;
1817 
1818     case CSeq_align::TSegs::e_Spliced:
1819         {{
1820             ITERATE (CSpliced_seg::TExons, iter, GetSegs().GetSpliced().GetExons()) {
1821                 ranges += (*iter)->GetRowSeq_range(row, true);
1822                 ranges -= (*iter)->GetRowSeq_insertions(row, GetSegs().GetSpliced());
1823             }
1824         }}
1825         break;
1826 
1827     default:
1828         NCBI_THROW(CSeqalignException, eUnsupported,
1829                    "CSeq_align::GetInsertedRanges() currently does not handle "
1830                    "this type of alignment.");
1831     }
1832 
1833     return ranges;
1834 }
1835 
1836 
1837 /// Get length of dense-seg alignment within range
s_DenseSegLength(const CDense_seg & ds,CDense_seg::TNumseg seg,const CRangeCollection<TSeqPos> & ranges)1838 static inline TSeqPos s_DenseSegLength(const CDense_seg& ds,
1839                                        CDense_seg::TNumseg seg,
1840                                        const CRangeCollection<TSeqPos> &ranges)
1841 {
1842     if(ranges.begin()->IsWhole())
1843         return ds.GetLens()[seg];
1844 
1845     TSignedSeqPos start = ds.GetStarts()[seg * ds.GetDim()];
1846     TSeqRange seg_range(start, start + ds.GetLens()[seg] - 1);
1847     return s_IntersectionLength(ranges, seg_range);
1848 }
1849 
1850 //////////////////////////////////////////////////////////////////////////////
1851 ///
1852 /// calculate the length of our alignment within given range
1853 ///
s_GetAlignmentLength(const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,bool ungapped)1854 static size_t s_GetAlignmentLength(const CSeq_align& align,
1855                                    const CRangeCollection<TSeqPos> &ranges,
1856                                    bool ungapped)
1857 {
1858     if (ranges.empty()) {
1859         return 0;
1860     }
1861 
1862     size_t len = 0;
1863     switch (align.GetSegs().Which()) {
1864     case CSeq_align::TSegs::e_Denseg:
1865         {{
1866             const CDense_seg& ds = align.GetSegs().GetDenseg();
1867             if (ungapped) {
1868                 for (CDense_seg::TNumseg i = 0;  i < ds.GetNumseg();  ++i) {
1869                     bool is_gap_seg = false;
1870                     for (CDense_seg::TDim j = 0;
1871                          !is_gap_seg  &&  j < ds.GetDim();  ++j) {
1872                         //int start = ds.GetStarts()[i * ds.GetDim() + j];
1873                         if (ds.GetStarts()[i * ds.GetDim() + j] == -1) {
1874                             is_gap_seg = true;
1875                         }
1876                     }
1877                     if ( !is_gap_seg ) {
1878                         len += s_DenseSegLength(ds, i, ranges);
1879                     }
1880                 }
1881             } else {
1882                 for (size_t i = 0;  i < ds.GetLens().size();  ++i) {
1883                     len += s_DenseSegLength(ds, i, ranges);
1884                }
1885             }
1886         }}
1887         break;
1888 
1889     case CSeq_align::TSegs::e_Disc:
1890         ITERATE (CSeq_align::TSegs::TDisc::Tdata, iter,
1891                  align.GetSegs().GetDisc().Get()) {
1892             len += s_GetAlignmentLength(**iter, ranges, ungapped);
1893         }
1894         break;
1895 
1896     case CSeq_align::TSegs::e_Std:
1897         {{
1898              if(!ranges.begin()->IsWhole())
1899                  NCBI_THROW(CSeqalignException, eUnsupported,
1900                             "Can't calculate alignment length within a range "
1901                             "for standard seg representation");
1902 
1903              /// pass 1:
1904              /// find total ranges
1905              vector<TSeqPos> sizes;
1906 
1907              size_t count = 0;
1908              ITERATE (CSeq_align::TSegs::TStd, iter, align.GetSegs().GetStd()) {
1909                  const CStd_seg& seg = **iter;
1910 
1911                  bool is_gap = false;
1912                  ITERATE (CStd_seg::TLoc, it, seg.GetLoc()) {
1913                      if ((*it)->IsEmpty()) {
1914                          is_gap = true;
1915                          break;
1916                      }
1917                  }
1918 
1919                  if ( !ungapped  ||  !is_gap) {
1920                      size_t i = 0;
1921                      ITERATE (CStd_seg::TLoc, it, seg.GetLoc()) {
1922                          if ( !(*it)->IsEmpty() ) {
1923                              if (sizes.empty()) {
1924                                  sizes.resize(seg.GetDim(), 0);
1925                              } else if (sizes.size() != (size_t)seg.GetDim()) {
1926                                  NCBI_THROW(CException, eUnknown,
1927                                             "invalid std-seg: "
1928                                             "inconsistent number of locs");
1929                              }
1930 
1931                              for (CSeq_loc_CI loc_it(**it);  loc_it;  ++loc_it) {
1932                                  sizes[i] += loc_it.GetRange().GetLength();
1933                              }
1934                          }
1935 
1936                          ++i;
1937                      }
1938                  }
1939 
1940                  ++count;
1941              }
1942 
1943              /// pass 2: determine shortest length
1944              if ( sizes.empty() ) return 0;
1945              vector<TSeqPos>::iterator iter = sizes.begin();
1946              vector<TSeqPos>::iterator smallest = iter;
1947              for (++iter;  iter != sizes.end();  ++iter) {
1948                  if (*iter < *smallest) {
1949                      smallest = iter;
1950                  }
1951              }
1952              return *smallest;
1953          }}
1954         break;
1955 
1956     case CSeq_align::TSegs::e_Spliced:
1957         ITERATE (CSpliced_seg::TExons, iter,
1958                  align.GetSegs().GetSpliced().GetExons()) {
1959             CRangeCollection<TSeqPos> exon_ranges = ranges;
1960             exon_ranges &= (*iter)->GetRowSeq_range(0, true);
1961             len += exon_ranges.GetCoveredLength();
1962             if (ungapped) {
1963                 len -= (*iter)->GetRowSeq_insertions(
1964                          0, align.GetSegs().GetSpliced(), ranges).GetCoveredLength();
1965             } else {
1966                 /// When computing length without gaps, genomic inserts
1967                 /// are ignored; when computing with gaps, we need to add them
1968                 /// to length if their starting point is within range
1969                 len += (*iter)->GetRowSeq_insertions(
1970                          1, align.GetSegs().GetSpliced(), ranges).GetCoveredLength();
1971             }
1972         }
1973         break;
1974 
1975     default:
1976         _ASSERT(false);
1977         break;
1978     }
1979 
1980     return len;
1981 }
1982 
1983 
GetAlignLength(bool include_gaps) const1984 TSeqPos CSeq_align::GetAlignLength(bool include_gaps) const
1985 {
1986     return s_GetAlignmentLength(*this,
1987                                 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()),
1988                                 !include_gaps );
1989 }
1990 
1991 
GetAlignLengthWithinRange(const TSeqRange & range,bool include_gaps) const1992 TSeqPos CSeq_align::GetAlignLengthWithinRange(const TSeqRange &range,
1993                                               bool include_gaps) const
1994 {
1995     return s_GetAlignmentLength(*this,
1996                                 CRangeCollection<TSeqPos>(range),
1997                                 !include_gaps );
1998 }
1999 
2000 
GetAlignLengthWithinRanges(const CRangeCollection<TSeqPos> & ranges,bool include_gaps) const2001 TSeqPos CSeq_align::GetAlignLengthWithinRanges(const CRangeCollection<TSeqPos> &ranges,
2002                                                bool include_gaps) const
2003 {
2004     return s_GetAlignmentLength(*this, ranges, !include_gaps );
2005 }
2006 
2007 
AlignLengthRatio() const2008 double CSeq_align::AlignLengthRatio() const {
2009     TSeqRange r0 = GetSeqRange(0);
2010     TSeqRange r1 = GetSeqRange(1);
2011     double r = 0;
2012     if (r0.GetLength()) {
2013         r = double(r1.GetLength()) / double(r0.GetLength());
2014     }
2015     return r;
2016 }
2017 
2018 
CreateRowSeq_loc(TDim row) const2019 CRef<CSeq_loc> CSeq_align::CreateRowSeq_loc(TDim row) const
2020 {
2021     CRef<CSeq_loc> loc(new CSeq_loc);
2022     switch (GetSegs().Which()) {
2023     case CSeq_align::TSegs::e_Dendiag:
2024         {
2025             ITERATE(TSegs::TDendiag, it, GetSegs().GetDendiag()) {
2026                 loc->SetPacked_int().Set().push_back(
2027                     (*it)->CreateRowSeq_interval(row));
2028             }
2029             break;
2030         }
2031     case CSeq_align::TSegs::e_Denseg:
2032         {
2033             loc->SetInt(*GetSegs().GetDenseg().CreateRowSeq_interval(row));
2034             break;
2035         }
2036     case CSeq_align::TSegs::e_Std:
2037         {
2038             ITERATE(TSegs::TStd, it, GetSegs().GetStd()) {
2039                 // Std-seg may contain empty locations, so
2040                 // we have to use mix rather than packed-int.
2041                 loc->SetMix().Set().push_back(
2042                     (*it)->CreateRowSeq_loc(row));
2043             }
2044             break;
2045         }
2046     case CSeq_align::TSegs::e_Disc:
2047         {
2048             ITERATE(TSegs::TDisc::Tdata, it, GetSegs().GetDisc().Get()) {
2049                 loc->SetMix().Set().push_back((*it)->CreateRowSeq_loc(row));
2050             }
2051             break;
2052         }
2053     case CSeq_align::TSegs::e_Spliced:
2054         {
2055             if (row > 1) {
2056                 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
2057                            "CSeq_align::CreateRowSeq_loc() - row number must "
2058                            "be 0 or 1 for spliced-segs.");
2059             }
2060             const CSpliced_seg& spl = GetSegs().GetSpliced();
2061             ITERATE(TSegs::TSpliced::TExons, ex, spl.GetExons()) {
2062                 // Spliced-seg may be required to get seq-id if
2063                 // it's not set in the exon.
2064                 loc->SetPacked_int().Set().push_back(
2065                     (*ex)->CreateRowSeq_interval(row, spl));
2066             }
2067             break;
2068         }
2069     case CSeq_align::TSegs::e_Packed:
2070     case CSeq_align::TSegs::e_Sparse:
2071     default:
2072         NCBI_THROW(CSeqalignException, eUnsupported,
2073                    "CSeq_align::CreateRowSeq_loc() currently does not handle "
2074                    "this type of alignment.");
2075         break;
2076     }
2077     return loc;
2078 }
2079 
2080 struct SLengthRange
2081 {
SLengthRangeSLengthRange2082     SLengthRange() : range(numeric_limits<TSeqPos>::max(), 0) {}
AddLengthSLengthRange2083     void AddLength(TSeqPos length)
2084     {
2085         if (range.first > length) {
2086             range.first = length;
2087         }
2088         if (range.second < length) {
2089             range.second = length;
2090         }
2091     }
AddRangeSLengthRange2092     void AddRange(CSeq_align::TLengthRange new_range)
2093     {
2094         if (range.first > new_range.first) {
2095             range.first = new_range.first;
2096         }
2097         if (range.second < new_range.second) {
2098             range.second = new_range.second;
2099         }
2100     }
2101 
2102     CSeq_align::TLengthRange range;
2103 };
2104 
GapLengthRange() const2105 CSeq_align::TLengthRange CSeq_align::GapLengthRange() const
2106 {
2107     SLengthRange length_range;
2108     switch (GetSegs().Which()) {
2109     case CSeq_align::TSegs::e_Denseg:
2110         {{
2111              const CDense_seg& ds = GetSegs().GetDenseg();
2112              for (CDense_seg::TNumseg i = 0;  i < ds.GetNumseg();  ++i) {
2113                  bool is_gapped = false;
2114                  for (CDense_seg::TDim j = 0;  j < ds.GetDim();  ++j) {
2115                      if (ds.GetStarts()[i * ds.GetDim() + j] == -1) {
2116                          is_gapped = true;
2117                          break;
2118                      }
2119                  }
2120                  if (is_gapped) {
2121                      length_range.AddLength(ds.GetLens()[i]);
2122                  }
2123              }
2124          }}
2125         break;
2126 
2127     case CSeq_align::TSegs::e_Disc:
2128         {{
2129              int num_rows = CheckNumRows();
2130              vector<TSeqRange> last_seg_ranges;
2131              ITERATE(CSeq_align::TSegs::TDisc::Tdata, iter,
2132                      GetSegs().GetDisc().Get()) {
2133                  length_range.AddRange((*iter)->GapLengthRange());
2134                  vector<TSeqRange> seg_ranges;
2135                  for (int i=0; i < num_rows; i++) {
2136                      seg_ranges.push_back((*iter)->GetSeqRange(i));
2137                      /// If this is not first segment, include gaps between last
2138                      /// segment and this one
2139                      if (!last_seg_ranges.empty()) {
2140                          TSeqPos gap = s_Distance(seg_ranges[i], last_seg_ranges[i]);
2141                          if (gap > 0) {
2142                              length_range.AddLength(gap);
2143                          }
2144                      }
2145                  }
2146                  last_seg_ranges = seg_ranges;
2147              }
2148          }}
2149         break;
2150 
2151     case CSeq_align::TSegs::e_Spliced:
2152         {{
2153              ITERATE (CSpliced_seg::TExons, iter,
2154                       GetSegs().GetSpliced().GetExons()) {
2155                  const CSpliced_exon& exon = **iter;
2156                  for (unsigned r = 0; r < 2; ++r) {
2157                      CRangeCollection<TSeqPos> insertions =
2158                           exon.GetRowSeq_insertions(
2159                                r, GetSegs().GetSpliced());
2160                      ITERATE (CRangeCollection<TSeqPos>, ins_it, insertions)
2161                      {
2162                          length_range.AddLength(ins_it->GetLength());
2163                      }
2164                  }
2165              }
2166          }}
2167 
2168         break;
2169 
2170     default:
2171         NCBI_THROW(CSeqalignException, eUnsupported,
2172                    "Can't get gap lengths for this type of alignment.");
2173     }
2174     return length_range.range;
2175 }
2176 
IntronLengthRange() const2177 CSeq_align::TLengthRange CSeq_align::IntronLengthRange() const
2178 {
2179     if (!GetSegs().IsSpliced()) {
2180         NCBI_THROW(CSeqalignException, eUnsupported,
2181                    "Requested exon lengths for a non-spliced alignment.");
2182     }
2183     SLengthRange length_range;
2184     const CSpliced_exon* previous_exon = NULL;
2185     bool minus_strand = GetSeqStrand(1) == eNa_strand_minus;
2186     ITERATE (CSpliced_seg::TExons, iter,
2187              GetSegs().GetSpliced().GetExons()) {
2188         const CSpliced_exon* exon = &**iter;
2189         if (previous_exon) {
2190             bool descending_pos = exon->GetGenomic_end() <
2191                                 previous_exon->GetGenomic_start();
2192             if (descending_pos != minus_strand) {
2193                 /// Crossed origins; don't count as intron size
2194                 continue;
2195             }
2196             TSeqRange intron((minus_strand ? exon : previous_exon)->GetGenomic_end() + 1,
2197                              (minus_strand ? previous_exon : exon)->GetGenomic_start() - 1);
2198             length_range.AddLength(intron.GetLength());
2199         }
2200         previous_exon = exon;
2201     }
2202     return length_range.range;
2203 }
2204 
ExonLengthRange() const2205 CSeq_align::TLengthRange CSeq_align::ExonLengthRange() const
2206 {
2207     if (!GetSegs().IsSpliced()) {
2208         NCBI_THROW(CSeqalignException, eUnsupported,
2209                    "Requested exon lengths for a non-spliced alignment.");
2210     }
2211     SLengthRange length_range;
2212     ITERATE (CSpliced_seg::TExons, iter,
2213              GetSegs().GetSpliced().GetExons()) {
2214         const CSpliced_exon* exon = &**iter;
2215         length_range.AddLength(
2216             exon->GetGenomic_end() - exon->GetGenomic_start() + 1);
2217     }
2218     return length_range.range;
2219 }
2220 
x_CreateSubsegAlignment(int from,int to) const2221 CRef<CSeq_align>  CSeq_align::x_CreateSubsegAlignment(int from, int to) const
2222 {
2223     // break at this segment
2224     CRef<CSeq_align> align(new CSeq_align);
2225     align->SetDim(2);
2226     align->SetType(CSeq_align::eType_partial);
2227     const CDense_seg& seg = GetSegs().GetDenseg();
2228     CDense_seg& subseg = align->SetSegs().SetDenseg();
2229 
2230     subseg.SetIds() = seg.GetIds();
2231     subseg.SetDim(2);
2232     subseg.SetNumseg(to - from + 1);
2233     subseg.SetStarts().reserve(subseg.GetNumseg()*2);
2234     subseg.SetLens().reserve(subseg.GetNumseg());
2235     if (seg.IsSetStrands()) {
2236         subseg.SetStrands().reserve(subseg.GetNumseg()*2);
2237     }
2238 
2239     int i;
2240 
2241     for (i = from;  i <= to;  ++i) {
2242         subseg.SetLens().push_back(seg.GetLens()[i]);
2243         subseg.SetStarts().push_back(seg.GetStarts()[i * 2]);
2244         subseg.SetStarts().push_back(seg.GetStarts()[i * 2 + 1]);
2245         if (seg.IsSetStrands()) {
2246             subseg.SetStrands().push_back(seg.GetStrands()[i * 2]);
2247             subseg.SetStrands().push_back(seg.GetStrands()[i * 2 + 1]);
2248         }
2249     }
2250 
2251     subseg.TrimEndGaps();
2252     try {
2253         align->Validate(true);
2254     }
2255     catch (CException& e) {
2256         ERR_POST(Error << e);
2257         cerr << MSerial_AsnText << seg;
2258         cerr << MSerial_AsnText << *align;
2259         throw;
2260     }
2261     return align;
2262 }
2263 
2264 
FindExt(const string & ext_type) const2265 CConstRef<CUser_object> CSeq_align::FindExt(const string& ext_type) const
2266 {
2267     CConstRef<CUser_object> ret;
2268     if ( IsSetExt() ) {
2269         ITERATE(TExt, it, GetExt()) {
2270             const CObject_id& obj_type = (*it)->GetType();
2271             if ( obj_type.IsStr()  &&  obj_type.GetStr() == ext_type ) {
2272                 ret.Reset(it->GetPointer());
2273                 break;
2274             }
2275         }
2276     }
2277     return ret;
2278 }
2279 
FindExt(const string & ext_type)2280 CRef<CUser_object> CSeq_align::FindExt(const string& ext_type)
2281 {
2282     CRef<CUser_object> ret;
2283     if ( IsSetExt() ) {
2284         NON_CONST_ITERATE(TExt, it, SetExt()) {
2285             const CObject_id& obj_type = (*it)->GetType();
2286             if ( obj_type.IsStr()  &&  obj_type.GetStr() == ext_type ) {
2287                 ret.Reset(it->GetPointer());
2288                 break;
2289             }
2290         }
2291     }
2292     return ret;
2293 }
2294 
2295 END_objects_SCOPE // namespace ncbi::objects::
2296 
2297 END_NCBI_SCOPE
2298