1 /* $Id: Dense_seg.cpp 464942 2015-04-15 15:46:17Z vasilche $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  .......
27  *
28  * File Description:
29  *   .......
30  *
31  * Remark:
32  *   This code was originally generated by application DATATOOL
33  *   using specifications from the data definition file
34  *   'seqalign.asn'.
35  */
36 
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <algorithm>
40 #include <objects/seqalign/seqalign_exception.hpp>
41 
42 // generated includes
43 #include <objects/seqalign/Dense_seg.hpp>
44 
45 #include <objects/seqloc/Seq_id.hpp>
46 #include <objects/seqloc/Seq_loc.hpp>
47 #include <serial/objistr.hpp>
48 #include <corelib/ncbi_param.hpp>
49 
50 // generated classes
51 
52 BEGIN_NCBI_SCOPE
53 
54 BEGIN_objects_SCOPE // namespace ncbi::objects::
55 
56 // destructor
~CDense_seg(void)57 CDense_seg::~CDense_seg(void)
58 {
59 }
60 
61 
Assign(const CSerialObject & obj,ESerialRecursionMode how)62 void CDense_seg::Assign(const CSerialObject& obj, ESerialRecursionMode how)
63 {
64     /// do the base copy
65     CSerialObject::Assign(obj, how);
66 
67     /// copy our specific items
68     if (GetTypeInfo() == obj.GetThisTypeInfo()) {
69         const CDense_seg& other = static_cast<const CDense_seg&>(obj);
70         m_set_State1[0] = other.m_set_State1[0];
71         m_Widths = other.m_Widths;
72     }
73 }
74 
75 
CheckNumRows() const76 CDense_seg::TDim CDense_seg::CheckNumRows() const
77 {
78     const size_t& dim = GetDim();
79     if (dim != GetIds().size()) {
80         NCBI_THROW(CSeqalignException, eInvalidAlignment,
81                    "CDense_seg::CheckNumRows()"
82                    " ids.size is inconsistent with dim");
83     }
84     return dim;
85 }
86 
87 
CheckNumSegs() const88 CDense_seg::TNumseg CDense_seg::CheckNumSegs() const
89 {
90     const CDense_seg::TStarts&  starts  = GetStarts();
91     const CDense_seg::TStrands& strands = GetStrands();
92     const CDense_seg::TLens&    lens    = GetLens();
93     const CDense_seg::TWidths&  widths  = GetWidths();
94 
95     const size_t& numrows = GetDim();
96     const size_t& numsegs = GetNumseg();
97     const size_t  num     = numrows * numsegs;
98 
99     if (starts.size() != num) {
100         string errstr = string("CDense_seg::CheckNumSegs():")
101             + " starts.size is inconsistent with dim * numseg";
102         NCBI_THROW(CSeqalignException, eInvalidAlignment, errstr);
103     }
104     if (lens.size() != numsegs) {
105         string errstr = string("CDense_seg::CheckNumSegs():")
106             + " lens.size is inconsistent with numseg";
107         NCBI_THROW(CSeqalignException, eInvalidAlignment, errstr);
108     }
109     if (strands.size()  &&  strands.size() != num) {
110         string errstr = string("CDense_seg::CheckNumSegs():")
111             + " strands.size is inconsistent with dim * numseg";
112         NCBI_THROW(CSeqalignException, eInvalidAlignment, errstr);
113     }
114     if (widths.size()  &&  widths.size() != numrows) {
115         string errstr = string("CDense_seg::CheckNumSegs():")
116             + " widths.size is inconsistent with dim";
117         NCBI_THROW(CSeqalignException, eInvalidAlignment, errstr);
118     }
119     return numsegs;
120 }
121 
122 
GetSeq_id(TDim row) const123 const CSeq_id& CDense_seg::GetSeq_id(TDim row) const
124 {
125     if ( IsSetIds()  && (size_t)row < GetIds().size()) {
126         return *GetIds()[row];
127     }
128     NCBI_THROW(CSeqalignException, eInvalidRowNumber,
129                "CDense_seg::GetSeq_id(): "
130                "can not get seq-id for the row requested.");
131 }
132 
133 
GetSeqStart(TDim row) const134 TSeqPos CDense_seg::GetSeqStart(TDim row) const
135 {
136     const TDim&    dim    = GetDim();
137     const TNumseg& numseg = CheckNumSegs();
138     const TStarts& starts = GetStarts();
139 
140     if (row < 0  ||  row >= dim) {
141         NCBI_THROW(CSeqalignException, eInvalidRowNumber,
142                    "CDense_seg::GetSeqStart():"
143                    " Invalid row number");
144     }
145 
146     TSignedSeqPos start;
147     if (CanGetStrands()  &&  GetStrands().size()  &&  GetStrands()[row] == eNa_strand_minus) {
148         TNumseg seg = numseg;
149         int pos = (seg - 1) * dim + row;
150         while (seg--) {
151             if ((start = starts[pos]) >= 0) {
152                 return start;
153             }
154             pos -= dim;
155         }
156     } else {
157         TNumseg seg = -1;
158         int pos = row;
159         while (++seg < numseg) {
160             if ((start = starts[pos]) >= 0) {
161                 return start;
162             }
163             pos += dim;
164         }
165     }
166     NCBI_THROW(CSeqalignException, eInvalidAlignment,
167                "CDense_seg::GetSeqStart(): Row is empty");
168 }
169 
170 
GetSeqStop(TDim row) const171 TSeqPos CDense_seg::GetSeqStop(TDim row) const
172 {
173     const TDim& dim       = GetDim();
174     const TNumseg& numseg = CheckNumSegs();
175     const TStarts& starts = GetStarts();
176 
177     if (row < 0  ||  row >= dim) {
178         NCBI_THROW(CSeqalignException, eInvalidRowNumber,
179                    "CDense_seg::GetSeqStop():"
180                    " Invalid row number");
181     }
182 
183     TSignedSeqPos start;
184     if (CanGetStrands()  &&  GetStrands().size()  &&  GetStrands()[row] == eNa_strand_minus) {
185         TNumseg seg = -1;
186         int pos = row;
187         while (++seg < numseg) {
188             if ((start = starts[pos]) >= 0) {
189                 return start + GetLens()[seg] - 1;
190             }
191             pos += dim;
192         }
193     } else {
194         TNumseg seg = numseg;
195         int pos = (seg - 1) * dim + row;
196         while (seg--) {
197             if ((start = starts[pos]) >= 0) {
198                 return start + GetLens()[seg] - 1;
199             }
200             pos -= dim;
201         }
202     }
203     NCBI_THROW(CSeqalignException, eInvalidAlignment,
204                "CDense_seg::GetSeqStop(): Row is empty");
205 }
206 
207 
GetSeqStrand(TDim row) const208 ENa_strand CDense_seg::GetSeqStrand(TDim row) const
209 {
210     const size_t& strands_size = GetStrands().size();
211 
212     if ( !strands_size ) {
213         return eNa_strand_plus;
214     } else {
215         TDim dim = CheckNumRows();
216 
217         if (strands_size < (size_t) dim) {
218             // The ASN.1 spec technically requires numrows x numsegs
219             // strands, however in practice NCBI assumes that
220             // Dense-seg's strands are fixed per row.  Since we will
221             // obtain the strand from the first segment (assuming they
222             // are fixed) and for efficiency (to eliminate unnecessary
223             // multiplication) we don't check for the full numrows x
224             // numsegs size here
225             NCBI_THROW(CSeqalignException, eInvalidAlignment,
226                        "Invalid strands size");
227         }
228 
229         if (row < 0  ||  row >= dim) {
230             NCBI_THROW(CSeqalignException, eInvalidRowNumber,
231                        "CDense_seg::GetSeqStrand():"
232                        " Invalid row number");
233         }
234 
235         return GetStrands()[row];
236     }
237 }
238 
239 
Validate(bool full_test) const240 void CDense_seg::Validate(bool full_test) const
241 {
242     const CDense_seg::TStarts&  starts  = GetStarts();
243     const CDense_seg::TStrands& strands = GetStrands();
244     const CDense_seg::TLens&    lens    = GetLens();
245     const CDense_seg::TWidths&  widths  = GetWidths();
246 
247     const size_t& numrows = CheckNumRows();
248     const size_t& numsegs = CheckNumSegs();
249 
250     if (numsegs  == 0) {
251         return;
252     }
253 
254     if (full_test) {
255         const size_t  max     = numrows * (numsegs -1);
256 
257         bool strands_exist = !strands.empty();
258 
259         size_t numseg = 0, numrow = 0, offset = 0;
260         for (numrow = 0;  numrow < numrows;  numrow++) {
261             TSignedSeqPos min_start = -1, start;
262             bool plus = strands_exist ?
263                 strands[numrow] != eNa_strand_minus:
264                 true;
265 
266             if (plus) {
267                 offset = 0;
268             } else {
269                 offset = max;
270             }
271 
272             for (numseg = 0;  numseg < numsegs;  numseg++) {
273                 start = starts[offset + numrow];
274                 if (start >= 0) {
275                     if (start < min_start) {
276                         string errstr = string("CDense_seg::Validate():")
277                             + " Starts are not consistent!"
278                             + " Row=" + NStr::SizetToString(numrow) +
279                             " Seg=" + NStr::SizetToString(plus ? numseg :
280                                                         numsegs - 1 - numseg) +
281                             " MinStart=" + NStr::NumericToString(min_start) +
282                             " Start=" + NStr::NumericToString(start);
283 
284                         NCBI_THROW(CSeqalignException, eInvalidAlignment,
285                                    errstr);
286                     }
287                     min_start = start +
288                         lens[plus ? numseg : numsegs - 1 - numseg] *
289                         (widths.size() == numrows ?
290                          widths[numrow] : 1);
291                 }
292                 if (plus) {
293                     offset += numrows;
294                 } else {
295                     offset -= numrows;
296                 }
297             }
298             if (min_start == -1) {
299                 string errstr = string("CDense_seg::Validate():")
300                     + " Row " + NStr::SizetToString(numrow) +
301                     " is empty.";
302                 NCBI_THROW(CSeqalignException, eInvalidAlignment,
303                            errstr);
304             }
305         }
306     }
307 }
308 
309 
TrimEndGaps()310 void CDense_seg::TrimEndGaps()
311 {
312     _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
313     _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
314     _ASSERT(!IsSetStrands()
315             ||  GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
316     _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
317 
318     list<TSignedSeqRange> delete_ranges;
319     int i;
320     int j;
321 
322     /// leading gap segments first
323     for (i = 0;  i < GetNumseg();  ++i) {
324         int count_gaps = 0;
325         for (j = 0;  j < GetDim();  ++j) {
326             TSignedSeqPos this_start = GetStarts()[i * GetDim() + j];
327             if (this_start == -1) {
328                 ++count_gaps;
329             }
330         }
331 
332         if (GetDim() - count_gaps > 1) {
333             /// no can do
334             break;
335         }
336     }
337 
338     if (i == GetNumseg() + 1) {
339         /// error case - all gapped, so don't bother
340         return;
341     }
342     if (i != 0) {
343         delete_ranges.push_back(TSignedSeqRange(0, i));
344     }
345 
346     /// trailing gap segments next
347     for (i = GetNumseg() - 1;  i >= 0;  --i) {
348         int count_gaps = 0;
349         for (j = 0;  j < GetDim();  ++j) {
350             TSignedSeqPos this_start = GetStarts()[i * GetDim() + j];
351             if (this_start == -1) {
352                 ++count_gaps;
353             }
354         }
355 
356         if (GetDim() - count_gaps > 1) {
357             /// no can do
358             break;
359         }
360     }
361 
362     if (i != GetNumseg() - 1) {
363         delete_ranges.push_back(TSignedSeqRange(i + 1, GetNumseg()));
364     }
365 
366     list<TSignedSeqRange>::reverse_iterator iter = delete_ranges.rbegin();
367     list<TSignedSeqRange>::reverse_iterator end  = delete_ranges.rend();
368     for ( ;  iter != end;  ++iter) {
369         TSignedSeqRange r = *iter;
370         if (r.GetLength() == 0) {
371             continue;
372         }
373 
374         /// we can trim the first i segments
375         if (IsSetStrands()) {
376             _ASSERT(static_cast<int>(GetStrands().size())
377                     >= r.GetTo() * GetDim());
378             SetStrands().erase(SetStrands().begin() + r.GetFrom() * GetDim(),
379                                SetStrands().begin() + r.GetTo()   * GetDim());
380         }
381         if (IsSetStarts()) {
382             _ASSERT(static_cast<int>(GetStarts().size())
383                     >= r.GetTo() * GetDim());
384             SetStarts().erase(SetStarts().begin() + r.GetFrom() * GetDim(),
385                               SetStarts().begin() + r.GetTo()   * GetDim());
386         }
387         if (IsSetLens()) {
388             _ASSERT(static_cast<int>(GetLens().size())
389                     >= r.GetTo());
390             SetLens().erase(SetLens().begin() + r.GetFrom(),
391                             SetLens().begin() + r.GetTo());
392         }
393     }
394 
395     /// fix our number of segments
396     SetNumseg(SetLens().size());
397 
398     _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
399     _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
400     _ASSERT(!IsSetStrands()
401             ||  GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
402     _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
403 }
404 
405 
Compact()406 void CDense_seg::Compact()
407 {
408     _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
409     _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
410     _ASSERT(!IsSetStrands()
411             ||  GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
412     _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
413 
414     int i;
415     int j;
416     vector<bool> can_merge(GetNumseg() - 1, true);  // start off with all true
417     unsigned int merge_count = 0;
418     for (i = 0;  i < GetNumseg() - 1;  ++i) {
419 
420         for (j = 0;  j < GetDim();  ++j) {
421             TSignedSeqPos this_start = GetStarts()[i * GetDim() + j];
422             TSignedSeqPos next_start = GetStarts()[(i + 1) * GetDim() + j];
423 
424             /// check to make sure there is not a gap mismatch
425             if ( (this_start == -1  &&  next_start != -1)
426                  ||  (this_start != -1  &&  next_start == -1) ) {
427                 can_merge[i] = false;
428                 break;
429             }
430 
431             /// check to make sure there is no unaligned space
432             /// between this segment and the next
433             if (this_start != -1  &&  next_start != -1) {
434                 TSignedSeqPos seg_len = GetLens()[i];
435                 if (IsSetStrands()  &&
436                     GetStrands()[i * GetDim() + j] == eNa_strand_minus) {
437                     seg_len = GetLens()[i + 1];
438                     seg_len = -seg_len;
439                 }
440 
441                 if (this_start + seg_len != next_start) {
442                     can_merge[i] = false;
443                     break;
444                 }
445             }
446 
447             /// check that the strands all agree between this segment
448             /// and the next (although it is rare, it is legal for them
449             /// to be different)
450             if (IsSetStrands()) {
451                 if (GetStrands()[i * GetDim() + j]
452                     != GetStrands()[(i + 1) * GetDim() + j]) {
453                     can_merge[i] = false;
454                     break;
455                 }
456             }
457         }
458         if (can_merge[i]) {
459             ++merge_count;
460         }
461     }
462 
463     if (merge_count == 0) {
464         // nothing needs to be done
465         return;
466     }
467 
468     CDense_seg::TStarts new_starts;
469     CDense_seg::TLens  new_lens;
470     CDense_seg::TStrands new_strands;
471     new_starts.reserve((GetNumseg() - merge_count) * GetDim());
472     new_lens.reserve(GetNumseg() - merge_count);
473     if (IsSetStrands()) {
474         new_strands.reserve((GetNumseg() - merge_count) * GetDim());
475     }
476     for (i = 0;  i < GetNumseg();  ++i) {
477         if (i > 0 && can_merge[i - 1]) {
478             // merge this segment into the last one
479             new_lens.back() += GetLens()[i];
480             if (IsSetStrands()) {
481                 for (j = 0;  j < GetDim();  ++j) {
482                     if (GetStrands()[i * GetDim() + j] == eNa_strand_minus) {
483                         new_starts[new_starts.size() - GetDim() + j] =
484                             GetStarts()[i * GetDim() + j];
485                     }
486                 }
487             }
488         } else {
489             // just copy the original segment i onto the end of the new stuff
490             new_lens.push_back(GetLens()[i]);
491             for (j = 0;  j < GetDim();  ++j) {
492                 new_starts.push_back(GetStarts()[i * GetDim() + j]);
493                 if (IsSetStrands()) {
494                     new_strands.push_back(GetStrands()[i * GetDim() + j]);
495                 }
496             }
497         }
498     }
499 
500     SetStarts().swap(new_starts);
501     SetLens().swap(new_lens);
502     if (IsSetStrands()) {
503         SetStrands().swap(new_strands);
504     }
505 
506     SetNumseg(SetLens().size());
507 
508     _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
509     _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
510     _ASSERT(!IsSetStrands()
511             ||  GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
512     _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
513 }
514 
515 
OrderAdjacentGaps()516 void CDense_seg::OrderAdjacentGaps()
517 {
518 	#define IDX(_x,_y) (((_x)*GetDim())+(_y))
519 	bool swaps_made = false;
520 	do {
521 		swaps_made = false;
522 		for(int i=0; i < GetNumseg()-1; ++i) {
523 
524 			bool curr_gap = false, next_gap = false;
525 			int curr_seq_row = GetDim()+1, next_seq_row = GetDim()+1;
526 			for (int j=0; j < GetDim(); ++j) {
527 				if (GetStarts()[IDX(i,j)] == -1)
528 					curr_gap = true;
529 				else
530 					curr_seq_row = min(curr_seq_row, j);
531 				if (GetStarts()[IDX(i+1,j)] == -1)
532 					next_gap = true;
533 				else
534 					next_seq_row = min(next_seq_row, j);
535 			}
536 			if(!(curr_gap & next_gap))
537 				continue;
538 
539 			// if this Seg and next Seg are both gaps,
540 			// and the first row with sequence is not curr
541 			// swap the two Segs
542 			if(next_seq_row < curr_seq_row) {
543 				for(int j=0; j < GetDim(); ++j) {
544 					swap(SetStarts()[IDX(i,j)], SetStarts()[IDX(i+1,j)]);
545 					if (GetStrands().size() > (size_t)IDX(i+1,j))
546 						swap(SetStrands()[IDX(i,j)], SetStrands()[IDX(i+1,j)]);
547 				}
548                 swap(SetLens()[i], SetLens()[i+1]);
549 				swaps_made = true;
550 			}
551 		}
552 	} while(swaps_made);
553 }
554 
555 
RemovePureGapSegs()556 void CDense_seg::RemovePureGapSegs()
557 {
558     _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
559     _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
560     _ASSERT(!IsSetStrands()
561             ||  GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
562     _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
563 
564 
565     // consistency checks
566     TDim dim = CheckNumRows();
567     TNumseg numseg = CheckNumSegs();
568 
569     int i;
570     int j;
571     vector<bool> remove(numseg, true);  // start out with all true
572     unsigned int remove_count = 0;
573     for (i = 0;  i < numseg;  ++i) {
574 
575         for (j = 0;  j < dim;  ++j) {
576             if (GetStarts()[i * dim + j] != -1) {
577                 // not a gap
578                 remove[i] = false;
579                 break;
580             }
581         }
582         if (remove[i]) {
583             ++remove_count;
584         }
585     }
586 
587     if (remove_count == 0) {
588         // nothing to remove; leave unchanged
589         return;
590     }
591 
592     CDense_seg::TStarts new_starts;
593     CDense_seg::TLens  new_lens;
594     CDense_seg::TStrands new_strands;
595     new_starts.reserve((numseg - remove_count) * dim);
596     new_lens.reserve(numseg - remove_count);
597     if (IsSetStrands()) {
598         new_strands.reserve((numseg - remove_count) * dim);
599     }
600     for (i = 0;  i < numseg;  ++i) {
601         if (!remove[i]) {
602             // copy the original segment i onto the end of the new stuff
603             new_lens.push_back(GetLens()[i]);
604             for (j = 0;  j < dim;  ++j) {
605                 new_starts.push_back(GetStarts()[i * dim + j]);
606                 if (IsSetStrands()) {
607                     new_strands.push_back(GetStrands()[i * dim + j]);
608                 }
609             }
610         }
611     }
612 
613     SetStarts().swap(new_starts);
614     SetLens().swap(new_lens);
615     if (IsSetStrands()) {
616         SetStrands().swap(new_strands);
617     }
618 
619     SetNumseg(SetLens().size());
620 
621 #ifdef _DEBUG
622     Validate(true);
623 #endif
624 
625 }
626 
627 
628 //-----------------------------------------------------------------------------
629 // PRE : none
630 // POST: same alignment, opposite orientation
Reverse(void)631 void CDense_seg::Reverse(void)
632 {
633     //flip strands
634     if (IsSetStrands()) {
635         NON_CONST_ITERATE (CDense_seg::TStrands, i, SetStrands()) {
636             switch (*i) {
637             case eNa_strand_plus:  *i = eNa_strand_minus; break;
638             case eNa_strand_minus: *i = eNa_strand_plus;  break;
639             default:                    break;//do nothing if not + or -
640             }
641         }
642     } else {
643         // Interpret unset strands as plus strands.
644         // Since we're reversing, set them all to minus.
645         SetStrands().resize(GetStarts().size(), eNa_strand_minus);
646     }
647 
648     //reverse list o' lengths
649     {
650         CDense_seg::TLens::iterator f = SetLens().begin();
651         CDense_seg::TLens::iterator r = SetLens().end();
652         while (f < r) {
653             swap(*(f++), *(--r));
654         }
655     }
656 
657     //reverse list o' starts
658     CDense_seg::TStarts &starts = SetStarts();
659     int f = 0;
660     int r = (GetNumseg() - 1) * GetDim();
661     while (f < r) {
662         for (int i = 0;  i < GetDim();  ++i) {
663             swap(starts[f+i], starts[r+i]);
664         }
665         f += GetDim();
666         r -= GetDim();
667     }
668 }
669 
670 //-----------------------------------------------------------------------------
671 // PRE : numbers of the rows to swap
672 // POST: alignment rearranged with row1 where row2 used to be & vice versa
SwapRows(TDim row1,TDim row2)673 void CDense_seg::SwapRows(TDim row1, TDim row2)
674 {
675     if (row1 >= GetDim()  ||  row1 < 0  ||
676         row2 >= GetDim()  ||  row2 < 0) {
677         NCBI_THROW(CSeqalignException, eOutOfRange,
678                    "Row numbers supplied to CDense_seg::SwapRows must be "
679                    "in the range [0, dim)");
680     }
681 
682     //swap ids
683     swap(SetIds()[row1], SetIds()[row2]);
684 
685     int idxStop = GetNumseg()*GetDim();
686 
687     //swap starts
688     for(int i = 0; i < idxStop; i += GetDim()) {
689         swap(SetStarts()[i+row1], SetStarts()[i+row2]);
690     }
691 
692     //swap strands
693     if (IsSetStrands()) {
694         for(int i = 0; i < idxStop; i += GetDim()) {
695             swap(SetStrands()[i+row1], SetStrands()[i+row2]);
696         }
697     }
698 }
699 
700 
701 /*---------------------------------------------------------------------------*/
702 // PRE : this is a validated dense seg; row & sequence position on row in
703 // alignment
704 // POST: the number of the segment in which this sequence position falls
705 CDense_seg::TNumseg CDense_seg::
x_FindSegment(CDense_seg::TDim row,TSignedSeqPos pos) const706 x_FindSegment(CDense_seg::TDim row, TSignedSeqPos pos) const
707 {
708     bool found = false;
709     CDense_seg::TNumseg seg;
710     for (seg = 0;  seg < GetNumseg()  &&  !found;  ++seg) {
711         TSignedSeqPos start = GetStarts()[seg * GetDim() + row];
712         TSignedSeqPos len   = GetLens()[seg];
713         if (start != -1) {
714             if (pos >= start  &&  pos < start + len) {
715                 found = true;
716             }
717         }
718     }
719     if (!found) {
720         NCBI_THROW(CSeqalignException, eInvalidAlignment,
721                    "CDense_seg::x_FindSegment(): "
722                    "Can't find a segment containing position " +
723                    NStr::NumericToString(pos));
724     }
725 
726     return seg - 1;
727 }
728 
729 
730 //-----------------------------------------------------------------------------
731 // PRE : range on a row in the alignment
732 // POST: dst Dense_seg reset &
733 CRef<CDense_seg> CDense_seg::
ExtractSlice(TDim row,TSeqPos from,TSeqPos to) const734 ExtractSlice(TDim row, TSeqPos from, TSeqPos to) const
735 {
736     if (row < 0  ||  row >= GetDim()) {
737         NCBI_THROW(CSeqalignException, eInvalidRowNumber,
738                    "CDense_seg::ExtractSlice():"
739                    " Invalid row number ("
740                    + NStr::NumericToString(row) + ")");
741     }
742 
743     if (from > to) {
744         swap(from, to);
745     }
746     if (from < GetSeqStart(row)) {
747         NCBI_THROW(CSeqalignException, eOutOfRange,
748                    "CDense_seg::ExtractSlice(): "
749                    "start position (" + NStr::NumericToString(from) +
750                    ") off end of alignment");
751     }
752     if (to > GetSeqStop(row)) {
753         NCBI_THROW(CSeqalignException, eOutOfRange,
754                    "CDense_seg::ExtractSlice(): "
755                    "stop position (" + NStr::NumericToString(to) +
756                    ") off end of alignment");
757     }
758 
759 
760     CRef<CDense_seg> ds(new CDense_seg);
761     ds->SetDim(GetDim());
762     ds->SetNumseg(0);
763     ITERATE(CDense_seg::TIds, idI, GetIds()) {
764         CSeq_id *si = new CSeq_id;
765         si->Assign(**idI);
766         ds->SetIds().push_back(CRef<CSeq_id>(si));
767     }
768 
769     //find start/stop segments
770     CDense_seg::TNumseg startSeg = x_FindSegment(row, from);
771     CDense_seg::TNumseg stopSeg  = x_FindSegment(row, to);
772 
773     TSeqPos startOffset = from - GetStarts()[startSeg * GetDim() + row];
774     TSeqPos stopOffset  = GetStarts()[stopSeg * GetDim() + row] +
775         GetLens()[stopSeg] - 1 - to;
776     if (IsSetStrands() && GetStrands()[row] == eNa_strand_minus) {
777         swap(startOffset, stopOffset);
778         swap(startSeg, stopSeg); // make sure startSeg is first
779     }
780 
781     for (CDense_seg::TNumseg seg = startSeg;  seg <= stopSeg;  ++seg) {
782         //starts
783         for (CDense_seg::TDim dim = 0;  dim < GetDim();  ++dim) {
784             TSignedSeqPos start = GetStarts()[seg * GetDim() + dim];
785             if (start != -1) {
786                 if (seg == startSeg  && (!IsSetStrands() ||
787                     GetStrands()[seg * GetDim() + dim] == eNa_strand_plus)) {
788                     start += startOffset;
789                 }
790                 if (seg == stopSeg  && IsSetStrands() &&
791                     GetStrands()[seg * GetDim() + dim] == eNa_strand_minus) {
792                     start += stopOffset;
793                 }
794             }
795             ds->SetStarts().push_back(start);
796         }
797 
798         //len
799         TSeqPos len = GetLens()[seg];
800         if (seg == startSeg) {
801             len -= startOffset;
802         }
803         if (seg == stopSeg) {
804             len -= stopOffset;
805         }
806         ds->SetLens().push_back(len);
807 
808         //strands
809         if (IsSetStrands()) {
810             for (CDense_seg::TDim dim = 0;  dim < GetDim();  ++dim) {
811                 ds->SetStrands().push_back(GetStrands()[seg * GetDim() + dim]);
812             }
813         }
814         ++ds->SetNumseg();
815     }
816 
817 #ifdef _DEBUG
818     ds->Validate(true);
819 #endif
820     return ds;
821 }
822 
823 
ExtractRows(const vector<TDim> & rows) const824 CRef<CDense_seg> CDense_seg::ExtractRows(const vector<TDim>& rows) const
825 {
826     // consistency checks
827     TDim dim = CheckNumRows();
828     TNumseg numseg = CheckNumSegs();
829 
830     CRef<CDense_seg> new_ds(new CDense_seg);
831     new_ds->SetDim(rows.size());
832     new_ds->SetNumseg(GetNumseg());
833 
834     // reserve for efficiency
835     new_ds->SetIds().reserve(rows.size());
836     new_ds->SetStarts().reserve(rows.size() * GetNumseg());
837     new_ds->SetLens().reserve(GetNumseg());
838     if (IsSetStrands()) {
839         new_ds->SetStrands().reserve(rows.size() * GetNumseg());
840     }
841 
842     ITERATE (vector<TDim>, row, rows) {
843         // sole check that rows are not out of range
844         if (*row < 0  ||  *row >= dim) {
845             NCBI_THROW(CSeqalignException, eInvalidRowNumber,
846                        "CDense_seg::ExtractRows():"
847                        " Invalid row number ("
848                        + NStr::NumericToString(*row) + ")");
849         }
850         // *copy* the ID (don't just make a reference to it)
851         CRef<CSeq_id> id_copy(new CSeq_id);
852         id_copy->Assign(*GetIds()[*row]);
853         new_ds->SetIds().push_back(id_copy);
854     }
855     for (TNumseg segnum = 0; segnum < numseg; ++segnum) {
856         new_ds->SetLens().push_back(GetLens()[segnum]);
857         ITERATE (vector<TDim>, row, rows) {
858             int idx = segnum * dim + *row;
859             new_ds->SetStarts().push_back(GetStarts()[idx]);
860             if (IsSetStrands()) {
861                 new_ds->SetStrands().push_back(GetStrands()[idx]);
862             }
863         }
864     }
865 
866     new_ds->Compact();  // even if original was compact, new one may not be
867 
868 #ifdef _DEBUG
869     new_ds->Validate(true);
870 #endif
871 
872     // Scores are not propagated
873     return new_ds;
874 }
875 
876 
OffsetRow(TDim row,TSignedSeqPos offset)877 void CDense_seg::OffsetRow(TDim row,
878                            TSignedSeqPos offset)
879 {
880     if (offset == 0) return;
881 
882     // Check for out-of-range negative offset
883     if (offset < 0) {
884         for (TNumseg seg = 0, pos = row;
885              seg < GetNumseg();
886              ++seg, pos += GetDim()) {
887 
888             if (GetStarts()[pos] >= 0) {
889                 if (GetStarts()[pos] < -offset) {
890                     NCBI_THROW(CSeqalignException, eOutOfRange,
891                                "Negative offset greater than seq position");
892                 }
893             }
894         }
895     }
896 
897     // Modify positions
898     for (TNumseg seg = 0, pos = row;
899          seg < GetNumseg();
900          ++seg, pos += GetDim()) {
901         if (GetStarts()[pos] >= 0) {
902             SetStarts()[pos] += offset;
903         }
904     }
905 }
906 
907 
908 /// @deprecated
RemapToLoc(TDim row,const CSeq_loc & loc,bool ignore_strand)909 void CDense_seg::RemapToLoc(TDim row, const CSeq_loc& loc,
910                             bool ignore_strand)
911 {
912     if (loc.IsWhole()) {
913         return;
914     }
915 
916     TSeqPos row_stop  = GetSeqStop(row);
917 
918     size_t  ttl_loc_len = 0;
919     {{
920         CSeq_loc_CI seq_loc_i(loc);
921         do {
922             ttl_loc_len += seq_loc_i.GetRange().GetLength();
923         } while (++seq_loc_i);
924     }}
925 
926     // check the validity of the seq-loc
927     if (ttl_loc_len < row_stop + 1) {
928         string errstr("CDense_seg::RemapToLoc():"
929                       " Seq-loc is not long enough to"
930                       " cover the alignment!"
931                       " Maximum row seq pos is ");
932         errstr += NStr::NumericToString(row_stop);
933         errstr += ". The total seq-loc len is only ";
934         errstr += NStr::SizetToString(ttl_loc_len);
935         errstr += ", it should be at least ";
936         errstr += NStr::NumericToString(row_stop+1);
937         errstr += " (= max seq pos + 1).";
938         NCBI_THROW(CSeqalignException, eOutOfRange, errstr);
939     }
940 
941     const CDense_seg::TStarts&  starts  = GetStarts();
942     const CDense_seg::TStrands& strands = GetStrands();
943     const CDense_seg::TLens&    lens    = GetLens();
944 
945     const size_t& numrows = CheckNumRows();
946     const size_t& numsegs = CheckNumSegs();
947 
948     CSeq_loc_CI seq_loc_i(loc);
949 
950     TSeqPos start, loc_len, len, len_so_far;
951     start = seq_loc_i.GetRange().GetFrom();
952     len = loc_len = seq_loc_i.GetRange().GetLength();
953     len_so_far = 0;
954 
955     bool row_plus = !strands.size() || strands[row] != eNa_strand_minus;
956     bool loc_plus = seq_loc_i.GetStrand() != eNa_strand_minus;
957 
958     // iterate through segments
959     size_t  idx = loc_plus ? row : (numsegs - 1) * numrows + row;
960     TNumseg seg = loc_plus ? 0 : numsegs - 1;
961     while (loc_plus ? seg < GetNumseg() : seg >= 0) {
962         if (starts[idx] == -1) {
963             // ignore gaps in our sequence
964             if (loc_plus) {
965                 idx += numrows; seg++;
966             } else {
967                 idx -= numrows; seg--;
968             }
969             continue;
970         }
971 
972         // iterate the seq-loc if needed
973         if ((loc_plus == row_plus ?
974              starts[idx] : ttl_loc_len - starts[idx] - lens[seg])
975             > len_so_far + loc_len) {
976 
977             if (++seq_loc_i) {
978                 len_so_far += len;
979                 len   = seq_loc_i.GetRange().GetLength();
980                 start = seq_loc_i.GetRange().GetFrom();
981             } else {
982                 NCBI_THROW(CSeqalignException, eInvalidInputData,
983                            "CDense_seg::RemapToLoc():"
984                            " Internal error");
985             }
986 
987             // assert the strand is the same
988             if (loc_plus != (seq_loc_i.GetStrand() != eNa_strand_minus)) {
989                 NCBI_THROW(CSeqalignException, eInvalidInputData,
990                            "CDense_seg::RemapToLoc():"
991                            " The strand should be the same accross"
992                            " the input seq-loc");
993             }
994         }
995 
996         // offset for the starting position
997         if (loc_plus == row_plus) {
998             SetStarts()[idx] += start - len_so_far;
999         } else {
1000             SetStarts()[idx] =
1001                 start - len_so_far + ttl_loc_len - starts[idx] - lens[seg];
1002         }
1003 
1004         if (lens[seg] > len) {
1005             TSignedSeqPos len_diff = lens[seg] - len;
1006             while (1) {
1007                 // move to the next loc part that extends beyond our length
1008                 ++seq_loc_i;
1009                 if (seq_loc_i) {
1010                     start = seq_loc_i.GetRange().GetFrom();
1011                 } else {
1012                     NCBI_THROW(CSeqalignException, eOutOfRange,
1013                                "CDense_seg::RemapToLoc():"
1014                                " Internal error");
1015                 }
1016 
1017                 // split our segment
1018                 SetLens().insert(SetLens().begin() +
1019                                  (loc_plus ? seg : seg + 1),
1020                                  len);
1021                 SetLens()[loc_plus ? seg + 1 : seg] = len_diff;
1022 
1023                 // insert new data to account for our split segment
1024                 TStarts temp_starts(numrows, -1);
1025                 for (size_t row_i = 0, tmp_idx = seg * numrows;
1026                      row_i < numrows;  ++row_i, ++tmp_idx) {
1027                     TSignedSeqPos& this_start = SetStarts()[tmp_idx];
1028                     if (this_start != -1) {
1029                         temp_starts[row_i] = this_start;
1030                         if (loc_plus == (strands[row_i] != eNa_strand_minus)) {
1031                             if ((size_t) row == row_i) {
1032                                 temp_starts[row_i] = start;
1033                             } else {
1034                                 temp_starts[row_i] += len;
1035                             }
1036                         } else {
1037                             this_start += len_diff;
1038                         }
1039                     }
1040                 }
1041 
1042                 len_so_far += loc_len;
1043                 len = loc_len = seq_loc_i.GetRange().GetLength();
1044 
1045                 SetStarts().insert(SetStarts().begin() +
1046                                    (loc_plus ? seg + 1 : seg) * numrows,
1047                                    temp_starts.begin(), temp_starts.end());
1048 
1049                 if (strands.size()) {
1050                     SetStrands().insert
1051                         (SetStrands().begin(),
1052                          strands.begin(), strands.begin() + numrows);
1053                 }
1054 
1055                 SetNumseg()++;
1056 
1057                 if ((len_diff = lens[seg] - len) > 0) {
1058                     if (loc_plus) {
1059                         idx += numrows; seg++;
1060                     } else {
1061                         idx -= numrows; seg--;
1062                     }
1063                 } else {
1064                     break;
1065                 }
1066             }
1067         } else {
1068             len -= lens[seg];
1069         }
1070 
1071         if (loc_plus) {
1072             idx += numrows; seg++;
1073         } else {
1074             idx -= numrows; seg--;
1075         }
1076     } // while iterating through segments
1077 
1078     // finally, modify the strands if different
1079     if ( !ignore_strand ) {
1080         if (loc_plus != row_plus) {
1081             if (!strands.size()) {
1082                 // strands do not exist, create them
1083                 SetStrands().resize(GetNumseg() * GetDim(), eNa_strand_plus);
1084             }
1085             for (seg = 0, idx = row;
1086                  seg < GetNumseg(); seg++, idx += numrows) {
1087                 SetStrands()[idx] = loc_plus ? eNa_strand_plus : eNa_strand_minus;
1088             }
1089         }
1090     }
1091 
1092 }
1093 
1094 
FillUnaligned() const1095 CRef<CDense_seg> CDense_seg::FillUnaligned() const
1096 {
1097     // this dense-seg
1098     const CDense_seg::TStarts&  starts  = GetStarts();
1099     const CDense_seg::TStrands& strands = GetStrands();
1100     const CDense_seg::TLens&    lens    = GetLens();
1101     const CDense_seg::TIds&     ids     = GetIds();
1102 
1103     const size_t& numrows = CheckNumRows();
1104     const size_t& numsegs = CheckNumSegs();
1105 
1106     bool strands_exist = !strands.empty();
1107 
1108     size_t seg = 0, row = 0;
1109 
1110 
1111     // extra segments
1112     CDense_seg::TStarts extra_starts;
1113     CDense_seg::TLens   extra_lens;
1114     size_t extra_numsegs = 0;
1115     size_t extra_seg = 0;
1116     vector<size_t> extra_segs;
1117 
1118 
1119     // new dense-seg
1120     CRef<CDense_seg> new_ds(new CDense_seg);
1121     CDense_seg::TStarts&  new_starts  = new_ds->SetStarts();
1122     CDense_seg::TStrands& new_strands = new_ds->SetStrands();
1123     CDense_seg::TLens&    new_lens    = new_ds->SetLens();
1124     CDense_seg::TIds&     new_ids     = new_ds->SetIds();
1125 
1126     // dimentions
1127     new_ds->SetDim(numrows);
1128     TNumseg& new_numsegs = new_ds->SetNumseg();
1129     new_numsegs = numsegs; // initialize
1130 
1131     // ids
1132     new_ids.resize(numrows);
1133     for (row = 0; row < numrows; row++) {
1134         CRef<CSeq_id> id(new CSeq_id);
1135         SerialAssign(*id, *ids[row]);
1136         new_ids[row] = id;
1137     }
1138 
1139     TNumseg new_seg = 0;
1140 
1141     // temporary data
1142     vector<TSignedSeqPos> expected_positions;
1143     expected_positions.resize(numrows, -1);
1144 
1145     vector<bool> plus;
1146     plus.resize(numrows, true);
1147     if (strands_exist) {
1148         for (row = 0; row < numrows; row++) {
1149             if (strands[row] == eNa_strand_minus) {
1150                 plus[row] = false;
1151             }
1152         }
1153     }
1154 
1155     TSignedSeqPos extra_len = 0;
1156     size_t idx = 0, new_idx = 0, extra_idx = 0;
1157 
1158     // main loop through segments
1159     for (seg = 0; seg < numsegs; seg++) {
1160         const TSeqPos& len = lens[seg];
1161         for (row = 0; row < numrows; row++) {
1162 
1163             const TSignedSeqPos& start = starts[idx++];
1164 
1165             TSignedSeqPos& expected_pos = expected_positions[row];
1166 
1167             if (start >= 0) {
1168 
1169                 if (expected_pos >= 0) {
1170                     // check if there's an unaligned insert
1171 
1172                     if (plus[row]) {
1173                         extra_len = start - expected_pos;
1174                     } else {
1175                         extra_len = expected_pos - start - len;
1176                     }
1177 
1178                     if (extra_len < 0) {
1179                         string errstr("CDense_seg::AddUnalignedSegments():"
1180                                       " Illegal overlap at Row ");
1181                         errstr += NStr::SizetToString(row);
1182                         errstr += " Segment ";
1183                         errstr += NStr::SizetToString(seg);
1184                         errstr += ".";
1185                         NCBI_THROW(CSeqalignException, eInvalidAlignment,
1186                                    errstr);
1187                     } else if (extra_len > 0) {
1188                         // insert new segment
1189                         extra_segs.push_back(seg);
1190                         extra_lens.push_back(extra_len);
1191                         extra_starts.resize(extra_idx + numrows, -1);
1192                         extra_starts[extra_idx + row] =
1193                             plus[row] ? expected_pos : start + len;
1194 
1195                         extra_idx += numrows;
1196                         ++extra_numsegs;
1197                     }
1198                 }
1199 
1200                 // set the new expected_pos
1201                 if (plus[row]) {
1202                     expected_pos = start + len;
1203                 } else {
1204                     expected_pos = start;
1205                 }
1206             }
1207         }
1208     }
1209 
1210     // lens & starts
1211     new_numsegs = numsegs + extra_numsegs;
1212     new_lens.resize(numsegs + extra_numsegs);
1213     new_starts.resize(numrows * new_numsegs);
1214     for (seg = 0, new_seg = 0, extra_seg = 0,
1215              idx = 0, new_idx = 0, extra_idx = 0;
1216          seg < numsegs;
1217          seg++, new_seg++) {
1218 
1219         // insert extra segments
1220         if (extra_numsegs) {
1221             for ( ;  extra_seg < extra_segs.size()  &&  extra_segs[extra_seg] == seg;  ++new_seg, ++extra_seg) {
1222                 new_lens[new_seg] = extra_lens[extra_seg];
1223                 for (row = 0;  row < numrows;  ++row, ++new_idx, ++extra_idx) {
1224                     new_starts[new_idx] = extra_starts[extra_idx];
1225                 }
1226             }
1227         }
1228 
1229         // add the existing segment
1230         new_lens[new_seg] = lens[seg];
1231         for (row = 0; row < numrows; row++) {
1232             new_starts[new_idx++] = starts[idx++];
1233         }
1234     }
1235 
1236 
1237     // strands
1238     new_strands.resize(numrows * new_numsegs, eNa_strand_plus);
1239     if (strands_exist) {
1240         new_idx = 0;
1241         for (new_seg = 0; new_seg < new_numsegs; new_seg++) {
1242             for (row = 0; row < numrows; row++, new_idx++) {
1243                 if ( !plus[row] ) {
1244                     new_strands[new_idx] = eNa_strand_minus;
1245                 }
1246             }
1247         }
1248     }
1249 
1250     return new_ds;
1251 }
1252 
1253 
1254 //-----------------------------------------------------------------------------
1255 // PRE : alignment transcript (RLE or not) and start coordinates
1256 // POST: Starts, lens and strands. Ids and scores not affected.
1257 
1258 // initialize from pairwise alignment transcript
FromTranscript(TSeqPos query_start,ENa_strand query_strand,TSeqPos subj_start,ENa_strand subj_strand,const string & transcript)1259 void CDense_seg::FromTranscript(TSeqPos query_start, ENa_strand query_strand,
1260                                 TSeqPos subj_start, ENa_strand subj_strand,
1261                                 const string& transcript )
1262 {
1263     // check that strands are specific
1264     bool query_strand_specific =
1265         query_strand == eNa_strand_plus || query_strand == eNa_strand_minus;
1266     bool subj_strand_specific =
1267         subj_strand == eNa_strand_plus || subj_strand == eNa_strand_minus;
1268 
1269     if(!query_strand_specific || !subj_strand_specific) {
1270         NCBI_THROW(CSeqalignException, eInvalidInputData, "Unknown strand");
1271     }
1272 
1273     TStarts &starts  = SetStarts();
1274     starts.clear();
1275     TLens &lens    = SetLens();
1276     lens.clear();
1277     TStrands &strands = SetStrands();
1278     strands.clear();
1279 
1280     SetDim(2);
1281 
1282     // iterate through the transcript
1283     size_t seg_count = 0;
1284 
1285     size_t start1 = 0, pos1 = 0; // relative to exon start in mrna
1286     size_t start2 = 0, pos2 = 0; // and genomic
1287     size_t seg_len = 0;
1288 
1289     string::const_iterator ib = transcript.begin();
1290     string::const_iterator ie = transcript.end();
1291     string::const_iterator ii = ib;
1292     unsigned char seg_type;
1293     static const char* badsymerr = "Unknown or unsupported transcript symbol";
1294     char c = ii != ie ? *ii++ : 0;
1295     if(c == 'M' || c == 'R') {
1296         seg_type = 0;
1297         ++pos1;
1298         ++pos2;
1299     }
1300     else if (c == 'I') {
1301         seg_type = 1;
1302         ++pos2;
1303     }
1304     else if (c == 'D') {
1305         seg_type = 2;
1306         ++pos1;
1307     }
1308     else {
1309         if (c == 0) {
1310             NCBI_THROW(CSeqalignException, eInvalidInputData,
1311                 "Empty transcript");
1312         }
1313         else {
1314             NCBI_THROW(CSeqalignException, eInvalidInputData, badsymerr);
1315         }
1316     }
1317 
1318     while(ii < ie) {
1319 
1320         c = *ii;
1321         if(isalpha((unsigned char) c)) {
1322 
1323             if(seg_type == 0 && (c == 'M' || c == 'R')) {
1324                 ++pos1;
1325                 ++pos2;
1326             }
1327             else if(seg_type == 1 && c == 'I') {
1328                 ++pos2;
1329             }
1330             else if(seg_type == 2 && c == 'D') {
1331                 ++pos1;
1332             }
1333             else {
1334 
1335                 // close current seg
1336                 TSeqPos query_close = query_strand == eNa_strand_plus?
1337                     start1: 1 - pos1;
1338                 starts.push_back(seg_type == 1? (TSeqPos)-1: query_start + query_close);
1339                 strands.push_back(query_strand);
1340 
1341                 TSeqPos subj_close = subj_strand == eNa_strand_plus?
1342                     start2: 1- pos2;
1343                 starts.push_back(seg_type == 2? (TSeqPos)-1: subj_start + subj_close);
1344                 strands.push_back(subj_strand);
1345 
1346                 switch(seg_type) {
1347                 case 0: seg_len = pos1 - start1; break;
1348                 case 1: seg_len = pos2 - start2; break;
1349                 case 2: seg_len = pos1 - start1; break;
1350                 }
1351                 lens.push_back(seg_len);
1352                 ++seg_count;
1353 
1354                 // start a new seg
1355                 start1 = pos1;
1356                 start2 = pos2;
1357 
1358                 if(c == 'M' || c == 'R'){
1359                     seg_type = 0; // matches and mismatches
1360                     ++pos1;
1361                     ++pos2;
1362                 }
1363                 else if (c == 'I') {
1364                     seg_type = 1;  // inserts
1365                     ++pos2;
1366                 }
1367                 else  if (c == 'D') {
1368                     seg_type = 2;  // dels
1369                     ++pos1;
1370                 }
1371                 else {
1372 
1373                     NCBI_THROW(CSeqalignException, eInvalidInputData,
1374                                badsymerr);
1375                 }
1376             }
1377             ++ii;
1378         }
1379         else {
1380 
1381             if(!isdigit((unsigned char) c)) {
1382 
1383                 NCBI_THROW(CSeqalignException, eInvalidInputData,
1384                            "Alignment transcript corrupt");
1385             }
1386 
1387             size_t len = 0;
1388             while(ii < ie && isdigit((unsigned char)(*ii))) {
1389                 len = 10*len + *ii - '0';
1390                 ++ii;
1391             }
1392             --len;
1393             switch(seg_type) {
1394             case 0: pos1 += len; pos2 += len; break;
1395             case 1: pos2 += len; break;
1396             case 2: pos1 += len; break;
1397             }
1398         }
1399     }
1400 
1401     TSeqPos query_close = query_strand == eNa_strand_plus? start1: 1 - pos1;
1402     starts.push_back(seg_type == 1? (TSeqPos)-1: query_start + query_close);
1403     strands.push_back(query_strand);
1404 
1405     TSeqPos subj_close = subj_strand == eNa_strand_plus? start2: 1 - pos2;
1406     starts.push_back(seg_type == 2? (TSeqPos)-1: subj_start + subj_close);
1407     strands.push_back(subj_strand);
1408 
1409     switch(seg_type) {
1410 
1411     case 0: seg_len = pos1 - start1; break;
1412     case 1: seg_len = pos2 - start2; break;
1413     case 2: seg_len = pos1 - start1; break;
1414     }
1415     lens.push_back(seg_len);
1416     ++seg_count;
1417 
1418     SetNumseg(seg_count);
1419 }
1420 
1421 
CreateRowSeq_interval(TDim row) const1422 CRef<CSeq_interval> CDense_seg::CreateRowSeq_interval(TDim row) const
1423 {
1424     if (GetDim() <= row) {
1425         NCBI_THROW(CSeqalignException, eInvalidRowNumber,
1426             "Invalid row number in CreateRowSeq_interval(): " +
1427             NStr::NumericToString(row));
1428     }
1429     CRef<CSeq_interval> ret(new CSeq_interval);
1430     TSeqPos from = kInvalidSeqPos;
1431     TSeqPos to = 0;
1432     TSeqPos plus_len = 0;
1433     TSeqPos minus_len = 0;
1434     ret->SetId().Assign(*GetIds()[row]);
1435     for (int seg = 0; seg < GetNumseg(); seg++) {
1436         int idx = seg*GetDim() + row;
1437         int start = GetStarts()[idx];
1438         if (start < 0) {
1439             continue;
1440         }
1441         if (TSeqPos(start) < from) {
1442             from = TSeqPos(start);
1443         }
1444         TSeqPos len = GetLens()[seg];
1445         if (start + len > to) {
1446             to = start + len;
1447         }
1448         if ( !IsSetStrands()  ||  !IsReverse(GetStrands()[idx]) ) {
1449             plus_len += len;
1450         }
1451         else {
1452             minus_len += len;
1453         }
1454     }
1455     if (from == kInvalidSeqPos  ||  to == 0) {
1456         NCBI_THROW(CSeqalignException, eOutOfRange,
1457             "Can not convert row to seq-interval - invalid from/to value");
1458     }
1459     ret->SetFrom(from);
1460     ret->SetTo(to - 1);
1461     if ( IsSetStrands() ) {
1462         // If more than 2/3 of the segment is on plus, use plus strand
1463         if (plus_len >= minus_len*2) {
1464             ret->SetStrand(eNa_strand_plus);
1465         }
1466         // If more than 2/3 is on minus, use minus strand
1467         else if (plus_len*2 < minus_len) {
1468             ret->SetStrand(eNa_strand_minus);
1469         }
1470         // Otherwise set strand to 'both'
1471         else {
1472             ret->SetStrand(eNa_strand_both);
1473         }
1474     }
1475     return ret;
1476 }
1477 
1478 
1479 NCBI_PARAM_DECL(bool, OBJECTS, DENSE_SEG_RESERVE);
1480 NCBI_PARAM_DEF_EX(bool, OBJECTS, DENSE_SEG_RESERVE, true,
1481                   eParam_NoThread, OBJECTS_DENSE_SEG_RESERVE);
1482 
PreReadClassMember(CObjectIStream & in,const CObjectInfoMI & member)1483 void CDense_seg::CReserveHook::PreReadClassMember(CObjectIStream& in,
1484                                                   const CObjectInfoMI& member)
1485 {
1486     static CSafeStatic<NCBI_PARAM_TYPE(OBJECTS, DENSE_SEG_RESERVE)> s_Reserve;
1487 
1488     if ( !s_Reserve->Get() ) {
1489         return;
1490     }
1491     CDense_seg& ds = *CType<CDense_seg>::Get(member.GetClassObject());
1492     size_t numseg = ds.GetNumseg();
1493     try {
1494         switch ( member.GetMemberIndex() ) {
1495         case 4: // "starts"
1496             ds.SetStarts().reserve(ds.GetDim()*numseg);
1497             break;
1498         case 5: // "lens"
1499             ds.SetLens().reserve(numseg);
1500             break;
1501         case 6: // "strands"
1502             ds.SetStrands().reserve(ds.GetDim()*numseg);
1503             break;
1504         default:
1505             break;
1506         }
1507     }
1508     catch ( bad_alloc& /*ignored*/ ) {
1509         // ignore insufficient memory exception from advisory reserve()
1510     }
1511 }
1512 
1513 
1514 /////////////////////////////////////////////////////////////////////////////
1515 // Read hooks to reserve memory of Dense-seg vector<> to estimated size.
1516 /////////////////////////////////////////////////////////////////////////////
1517 
1518 
SetReserveHooks(CObjectIStream & in)1519 void CDense_seg::SetReserveHooks(CObjectIStream& in)
1520 {
1521     CDenseSegReserveStartsHook::SetHook(in);
1522     CDenseSegReserveLensHook::SetHook(in);
1523     CDenseSegReserveStrandsHook::SetHook(in);
1524 }
1525 
1526 
SetGlobalReserveHooks(void)1527 void CDense_seg::SetGlobalReserveHooks(void)
1528 {
1529     CDenseSegReserveStartsHook::SetGlobalHook();
1530     CDenseSegReserveLensHook::SetGlobalHook();
1531     CDenseSegReserveStrandsHook::SetGlobalHook();
1532 }
1533 
1534 
ReadClassMember(CObjectIStream & in,const CObjectInfoMI & member)1535 void CDenseSegReserveStartsHook::ReadClassMember(CObjectIStream& in,
1536                                                  const CObjectInfoMI& member)
1537 {
1538     CDense_seg& ds = *CType<CDense_seg>::Get(member.GetClassObject());
1539     size_t size = ds.GetDim()*ds.GetNumseg();
1540     CDense_seg::TStarts& array = ds.SetStarts();
1541     try {
1542         array.reserve(size);
1543     }
1544     catch ( bad_alloc& /*ignored*/ ) {
1545         // ignore insufficient memory exception from advisory reserve()
1546     }
1547     DefaultRead(in, member);
1548 }
1549 
1550 
x_GetMember(void)1551 CObjectTypeInfoMI CDenseSegReserveStartsHook::x_GetMember(void)
1552 {
1553     CObjectTypeInfo type = CType<CDense_seg>();
1554     return type.FindMember("starts");
1555 }
1556 
1557 
SetHook(CObjectIStream & in)1558 void CDenseSegReserveStartsHook::SetHook(CObjectIStream& in)
1559 {
1560     CRef<CDenseSegReserveStartsHook> hook(new CDenseSegReserveStartsHook);
1561     x_GetMember().SetLocalReadHook(in, hook);
1562 }
1563 
1564 
SetGlobalHook(void)1565 void CDenseSegReserveStartsHook::SetGlobalHook(void)
1566 {
1567     CRef<CDenseSegReserveStartsHook> hook(new CDenseSegReserveStartsHook);
1568     x_GetMember().SetGlobalReadHook(hook);
1569 }
1570 
1571 
ReadClassMember(CObjectIStream & in,const CObjectInfoMI & member)1572 void CDenseSegReserveLensHook::ReadClassMember(CObjectIStream& in,
1573                                                const CObjectInfoMI& member)
1574 {
1575     CDense_seg& ds = *CType<CDense_seg>::Get(member.GetClassObject());
1576     size_t size = ds.GetNumseg();
1577     CDense_seg::TLens& array = ds.SetLens();
1578     try {
1579         array.reserve(size);
1580     }
1581     catch ( bad_alloc& /*ignored*/ ) {
1582         // ignore insufficient memory exception from advisory reserve()
1583     }
1584     DefaultRead(in, member);
1585 }
1586 
1587 
x_GetMember(void)1588 CObjectTypeInfoMI CDenseSegReserveLensHook::x_GetMember(void)
1589 {
1590     CObjectTypeInfo type = CType<CDense_seg>();
1591     return type.FindMember("lens");
1592 }
1593 
1594 
SetHook(CObjectIStream & in)1595 void CDenseSegReserveLensHook::SetHook(CObjectIStream& in)
1596 {
1597     CRef<CDenseSegReserveLensHook> hook(new CDenseSegReserveLensHook);
1598     x_GetMember().SetLocalReadHook(in, hook);
1599 }
1600 
1601 
SetGlobalHook(void)1602 void CDenseSegReserveLensHook::SetGlobalHook(void)
1603 {
1604     CRef<CDenseSegReserveLensHook> hook(new CDenseSegReserveLensHook);
1605     x_GetMember().SetGlobalReadHook(hook);
1606 }
1607 
1608 
ReadClassMember(CObjectIStream & in,const CObjectInfoMI & member)1609 void CDenseSegReserveStrandsHook::ReadClassMember(CObjectIStream& in,
1610                                                   const CObjectInfoMI& member)
1611 {
1612     CDense_seg& ds = *CType<CDense_seg>::Get(member.GetClassObject());
1613     size_t size = ds.GetDim()*ds.GetNumseg();
1614     CDense_seg::TStrands& array = ds.SetStrands();
1615     try {
1616         array.reserve(size);
1617     }
1618     catch ( bad_alloc& /*ignored*/ ) {
1619         // ignore insufficient memory exception from advisory reserve()
1620     }
1621     DefaultRead(in, member);
1622 }
1623 
1624 
x_GetMember(void)1625 CObjectTypeInfoMI CDenseSegReserveStrandsHook::x_GetMember(void)
1626 {
1627     CObjectTypeInfo type = CType<CDense_seg>();
1628     return type.FindMember("strands");
1629 }
1630 
1631 
SetHook(CObjectIStream & in)1632 void CDenseSegReserveStrandsHook::SetHook(CObjectIStream& in)
1633 {
1634     CRef<CDenseSegReserveStrandsHook> hook(new CDenseSegReserveStrandsHook);
1635     x_GetMember().SetLocalReadHook(in, hook);
1636 }
1637 
1638 
SetGlobalHook(void)1639 void CDenseSegReserveStrandsHook::SetGlobalHook(void)
1640 {
1641     CRef<CDenseSegReserveStrandsHook> hook(new CDenseSegReserveStrandsHook);
1642     x_GetMember().SetGlobalReadHook(hook);
1643 }
1644 
1645 
1646 END_objects_SCOPE // namespace ncbi::objects::
1647 
1648 END_NCBI_SCOPE
1649