1 /* $Id: Dense_seg.cpp 464942 2015-04-15 15:46:17Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: .......
27 *
28 * File Description:
29 * .......
30 *
31 * Remark:
32 * This code was originally generated by application DATATOOL
33 * using specifications from the data definition file
34 * 'seqalign.asn'.
35 */
36
37 // standard includes
38 #include <ncbi_pch.hpp>
39 #include <algorithm>
40 #include <objects/seqalign/seqalign_exception.hpp>
41
42 // generated includes
43 #include <objects/seqalign/Dense_seg.hpp>
44
45 #include <objects/seqloc/Seq_id.hpp>
46 #include <objects/seqloc/Seq_loc.hpp>
47 #include <serial/objistr.hpp>
48 #include <corelib/ncbi_param.hpp>
49
50 // generated classes
51
52 BEGIN_NCBI_SCOPE
53
54 BEGIN_objects_SCOPE // namespace ncbi::objects::
55
56 // destructor
~CDense_seg(void)57 CDense_seg::~CDense_seg(void)
58 {
59 }
60
61
Assign(const CSerialObject & obj,ESerialRecursionMode how)62 void CDense_seg::Assign(const CSerialObject& obj, ESerialRecursionMode how)
63 {
64 /// do the base copy
65 CSerialObject::Assign(obj, how);
66
67 /// copy our specific items
68 if (GetTypeInfo() == obj.GetThisTypeInfo()) {
69 const CDense_seg& other = static_cast<const CDense_seg&>(obj);
70 m_set_State1[0] = other.m_set_State1[0];
71 m_Widths = other.m_Widths;
72 }
73 }
74
75
CheckNumRows() const76 CDense_seg::TDim CDense_seg::CheckNumRows() const
77 {
78 const size_t& dim = GetDim();
79 if (dim != GetIds().size()) {
80 NCBI_THROW(CSeqalignException, eInvalidAlignment,
81 "CDense_seg::CheckNumRows()"
82 " ids.size is inconsistent with dim");
83 }
84 return dim;
85 }
86
87
CheckNumSegs() const88 CDense_seg::TNumseg CDense_seg::CheckNumSegs() const
89 {
90 const CDense_seg::TStarts& starts = GetStarts();
91 const CDense_seg::TStrands& strands = GetStrands();
92 const CDense_seg::TLens& lens = GetLens();
93 const CDense_seg::TWidths& widths = GetWidths();
94
95 const size_t& numrows = GetDim();
96 const size_t& numsegs = GetNumseg();
97 const size_t num = numrows * numsegs;
98
99 if (starts.size() != num) {
100 string errstr = string("CDense_seg::CheckNumSegs():")
101 + " starts.size is inconsistent with dim * numseg";
102 NCBI_THROW(CSeqalignException, eInvalidAlignment, errstr);
103 }
104 if (lens.size() != numsegs) {
105 string errstr = string("CDense_seg::CheckNumSegs():")
106 + " lens.size is inconsistent with numseg";
107 NCBI_THROW(CSeqalignException, eInvalidAlignment, errstr);
108 }
109 if (strands.size() && strands.size() != num) {
110 string errstr = string("CDense_seg::CheckNumSegs():")
111 + " strands.size is inconsistent with dim * numseg";
112 NCBI_THROW(CSeqalignException, eInvalidAlignment, errstr);
113 }
114 if (widths.size() && widths.size() != numrows) {
115 string errstr = string("CDense_seg::CheckNumSegs():")
116 + " widths.size is inconsistent with dim";
117 NCBI_THROW(CSeqalignException, eInvalidAlignment, errstr);
118 }
119 return numsegs;
120 }
121
122
GetSeq_id(TDim row) const123 const CSeq_id& CDense_seg::GetSeq_id(TDim row) const
124 {
125 if ( IsSetIds() && (size_t)row < GetIds().size()) {
126 return *GetIds()[row];
127 }
128 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
129 "CDense_seg::GetSeq_id(): "
130 "can not get seq-id for the row requested.");
131 }
132
133
GetSeqStart(TDim row) const134 TSeqPos CDense_seg::GetSeqStart(TDim row) const
135 {
136 const TDim& dim = GetDim();
137 const TNumseg& numseg = CheckNumSegs();
138 const TStarts& starts = GetStarts();
139
140 if (row < 0 || row >= dim) {
141 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
142 "CDense_seg::GetSeqStart():"
143 " Invalid row number");
144 }
145
146 TSignedSeqPos start;
147 if (CanGetStrands() && GetStrands().size() && GetStrands()[row] == eNa_strand_minus) {
148 TNumseg seg = numseg;
149 int pos = (seg - 1) * dim + row;
150 while (seg--) {
151 if ((start = starts[pos]) >= 0) {
152 return start;
153 }
154 pos -= dim;
155 }
156 } else {
157 TNumseg seg = -1;
158 int pos = row;
159 while (++seg < numseg) {
160 if ((start = starts[pos]) >= 0) {
161 return start;
162 }
163 pos += dim;
164 }
165 }
166 NCBI_THROW(CSeqalignException, eInvalidAlignment,
167 "CDense_seg::GetSeqStart(): Row is empty");
168 }
169
170
GetSeqStop(TDim row) const171 TSeqPos CDense_seg::GetSeqStop(TDim row) const
172 {
173 const TDim& dim = GetDim();
174 const TNumseg& numseg = CheckNumSegs();
175 const TStarts& starts = GetStarts();
176
177 if (row < 0 || row >= dim) {
178 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
179 "CDense_seg::GetSeqStop():"
180 " Invalid row number");
181 }
182
183 TSignedSeqPos start;
184 if (CanGetStrands() && GetStrands().size() && GetStrands()[row] == eNa_strand_minus) {
185 TNumseg seg = -1;
186 int pos = row;
187 while (++seg < numseg) {
188 if ((start = starts[pos]) >= 0) {
189 return start + GetLens()[seg] - 1;
190 }
191 pos += dim;
192 }
193 } else {
194 TNumseg seg = numseg;
195 int pos = (seg - 1) * dim + row;
196 while (seg--) {
197 if ((start = starts[pos]) >= 0) {
198 return start + GetLens()[seg] - 1;
199 }
200 pos -= dim;
201 }
202 }
203 NCBI_THROW(CSeqalignException, eInvalidAlignment,
204 "CDense_seg::GetSeqStop(): Row is empty");
205 }
206
207
GetSeqStrand(TDim row) const208 ENa_strand CDense_seg::GetSeqStrand(TDim row) const
209 {
210 const size_t& strands_size = GetStrands().size();
211
212 if ( !strands_size ) {
213 return eNa_strand_plus;
214 } else {
215 TDim dim = CheckNumRows();
216
217 if (strands_size < (size_t) dim) {
218 // The ASN.1 spec technically requires numrows x numsegs
219 // strands, however in practice NCBI assumes that
220 // Dense-seg's strands are fixed per row. Since we will
221 // obtain the strand from the first segment (assuming they
222 // are fixed) and for efficiency (to eliminate unnecessary
223 // multiplication) we don't check for the full numrows x
224 // numsegs size here
225 NCBI_THROW(CSeqalignException, eInvalidAlignment,
226 "Invalid strands size");
227 }
228
229 if (row < 0 || row >= dim) {
230 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
231 "CDense_seg::GetSeqStrand():"
232 " Invalid row number");
233 }
234
235 return GetStrands()[row];
236 }
237 }
238
239
Validate(bool full_test) const240 void CDense_seg::Validate(bool full_test) const
241 {
242 const CDense_seg::TStarts& starts = GetStarts();
243 const CDense_seg::TStrands& strands = GetStrands();
244 const CDense_seg::TLens& lens = GetLens();
245 const CDense_seg::TWidths& widths = GetWidths();
246
247 const size_t& numrows = CheckNumRows();
248 const size_t& numsegs = CheckNumSegs();
249
250 if (numsegs == 0) {
251 return;
252 }
253
254 if (full_test) {
255 const size_t max = numrows * (numsegs -1);
256
257 bool strands_exist = !strands.empty();
258
259 size_t numseg = 0, numrow = 0, offset = 0;
260 for (numrow = 0; numrow < numrows; numrow++) {
261 TSignedSeqPos min_start = -1, start;
262 bool plus = strands_exist ?
263 strands[numrow] != eNa_strand_minus:
264 true;
265
266 if (plus) {
267 offset = 0;
268 } else {
269 offset = max;
270 }
271
272 for (numseg = 0; numseg < numsegs; numseg++) {
273 start = starts[offset + numrow];
274 if (start >= 0) {
275 if (start < min_start) {
276 string errstr = string("CDense_seg::Validate():")
277 + " Starts are not consistent!"
278 + " Row=" + NStr::SizetToString(numrow) +
279 " Seg=" + NStr::SizetToString(plus ? numseg :
280 numsegs - 1 - numseg) +
281 " MinStart=" + NStr::NumericToString(min_start) +
282 " Start=" + NStr::NumericToString(start);
283
284 NCBI_THROW(CSeqalignException, eInvalidAlignment,
285 errstr);
286 }
287 min_start = start +
288 lens[plus ? numseg : numsegs - 1 - numseg] *
289 (widths.size() == numrows ?
290 widths[numrow] : 1);
291 }
292 if (plus) {
293 offset += numrows;
294 } else {
295 offset -= numrows;
296 }
297 }
298 if (min_start == -1) {
299 string errstr = string("CDense_seg::Validate():")
300 + " Row " + NStr::SizetToString(numrow) +
301 " is empty.";
302 NCBI_THROW(CSeqalignException, eInvalidAlignment,
303 errstr);
304 }
305 }
306 }
307 }
308
309
TrimEndGaps()310 void CDense_seg::TrimEndGaps()
311 {
312 _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
313 _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
314 _ASSERT(!IsSetStrands()
315 || GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
316 _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
317
318 list<TSignedSeqRange> delete_ranges;
319 int i;
320 int j;
321
322 /// leading gap segments first
323 for (i = 0; i < GetNumseg(); ++i) {
324 int count_gaps = 0;
325 for (j = 0; j < GetDim(); ++j) {
326 TSignedSeqPos this_start = GetStarts()[i * GetDim() + j];
327 if (this_start == -1) {
328 ++count_gaps;
329 }
330 }
331
332 if (GetDim() - count_gaps > 1) {
333 /// no can do
334 break;
335 }
336 }
337
338 if (i == GetNumseg() + 1) {
339 /// error case - all gapped, so don't bother
340 return;
341 }
342 if (i != 0) {
343 delete_ranges.push_back(TSignedSeqRange(0, i));
344 }
345
346 /// trailing gap segments next
347 for (i = GetNumseg() - 1; i >= 0; --i) {
348 int count_gaps = 0;
349 for (j = 0; j < GetDim(); ++j) {
350 TSignedSeqPos this_start = GetStarts()[i * GetDim() + j];
351 if (this_start == -1) {
352 ++count_gaps;
353 }
354 }
355
356 if (GetDim() - count_gaps > 1) {
357 /// no can do
358 break;
359 }
360 }
361
362 if (i != GetNumseg() - 1) {
363 delete_ranges.push_back(TSignedSeqRange(i + 1, GetNumseg()));
364 }
365
366 list<TSignedSeqRange>::reverse_iterator iter = delete_ranges.rbegin();
367 list<TSignedSeqRange>::reverse_iterator end = delete_ranges.rend();
368 for ( ; iter != end; ++iter) {
369 TSignedSeqRange r = *iter;
370 if (r.GetLength() == 0) {
371 continue;
372 }
373
374 /// we can trim the first i segments
375 if (IsSetStrands()) {
376 _ASSERT(static_cast<int>(GetStrands().size())
377 >= r.GetTo() * GetDim());
378 SetStrands().erase(SetStrands().begin() + r.GetFrom() * GetDim(),
379 SetStrands().begin() + r.GetTo() * GetDim());
380 }
381 if (IsSetStarts()) {
382 _ASSERT(static_cast<int>(GetStarts().size())
383 >= r.GetTo() * GetDim());
384 SetStarts().erase(SetStarts().begin() + r.GetFrom() * GetDim(),
385 SetStarts().begin() + r.GetTo() * GetDim());
386 }
387 if (IsSetLens()) {
388 _ASSERT(static_cast<int>(GetLens().size())
389 >= r.GetTo());
390 SetLens().erase(SetLens().begin() + r.GetFrom(),
391 SetLens().begin() + r.GetTo());
392 }
393 }
394
395 /// fix our number of segments
396 SetNumseg(SetLens().size());
397
398 _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
399 _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
400 _ASSERT(!IsSetStrands()
401 || GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
402 _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
403 }
404
405
Compact()406 void CDense_seg::Compact()
407 {
408 _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
409 _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
410 _ASSERT(!IsSetStrands()
411 || GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
412 _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
413
414 int i;
415 int j;
416 vector<bool> can_merge(GetNumseg() - 1, true); // start off with all true
417 unsigned int merge_count = 0;
418 for (i = 0; i < GetNumseg() - 1; ++i) {
419
420 for (j = 0; j < GetDim(); ++j) {
421 TSignedSeqPos this_start = GetStarts()[i * GetDim() + j];
422 TSignedSeqPos next_start = GetStarts()[(i + 1) * GetDim() + j];
423
424 /// check to make sure there is not a gap mismatch
425 if ( (this_start == -1 && next_start != -1)
426 || (this_start != -1 && next_start == -1) ) {
427 can_merge[i] = false;
428 break;
429 }
430
431 /// check to make sure there is no unaligned space
432 /// between this segment and the next
433 if (this_start != -1 && next_start != -1) {
434 TSignedSeqPos seg_len = GetLens()[i];
435 if (IsSetStrands() &&
436 GetStrands()[i * GetDim() + j] == eNa_strand_minus) {
437 seg_len = GetLens()[i + 1];
438 seg_len = -seg_len;
439 }
440
441 if (this_start + seg_len != next_start) {
442 can_merge[i] = false;
443 break;
444 }
445 }
446
447 /// check that the strands all agree between this segment
448 /// and the next (although it is rare, it is legal for them
449 /// to be different)
450 if (IsSetStrands()) {
451 if (GetStrands()[i * GetDim() + j]
452 != GetStrands()[(i + 1) * GetDim() + j]) {
453 can_merge[i] = false;
454 break;
455 }
456 }
457 }
458 if (can_merge[i]) {
459 ++merge_count;
460 }
461 }
462
463 if (merge_count == 0) {
464 // nothing needs to be done
465 return;
466 }
467
468 CDense_seg::TStarts new_starts;
469 CDense_seg::TLens new_lens;
470 CDense_seg::TStrands new_strands;
471 new_starts.reserve((GetNumseg() - merge_count) * GetDim());
472 new_lens.reserve(GetNumseg() - merge_count);
473 if (IsSetStrands()) {
474 new_strands.reserve((GetNumseg() - merge_count) * GetDim());
475 }
476 for (i = 0; i < GetNumseg(); ++i) {
477 if (i > 0 && can_merge[i - 1]) {
478 // merge this segment into the last one
479 new_lens.back() += GetLens()[i];
480 if (IsSetStrands()) {
481 for (j = 0; j < GetDim(); ++j) {
482 if (GetStrands()[i * GetDim() + j] == eNa_strand_minus) {
483 new_starts[new_starts.size() - GetDim() + j] =
484 GetStarts()[i * GetDim() + j];
485 }
486 }
487 }
488 } else {
489 // just copy the original segment i onto the end of the new stuff
490 new_lens.push_back(GetLens()[i]);
491 for (j = 0; j < GetDim(); ++j) {
492 new_starts.push_back(GetStarts()[i * GetDim() + j]);
493 if (IsSetStrands()) {
494 new_strands.push_back(GetStrands()[i * GetDim() + j]);
495 }
496 }
497 }
498 }
499
500 SetStarts().swap(new_starts);
501 SetLens().swap(new_lens);
502 if (IsSetStrands()) {
503 SetStrands().swap(new_strands);
504 }
505
506 SetNumseg(SetLens().size());
507
508 _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
509 _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
510 _ASSERT(!IsSetStrands()
511 || GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
512 _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
513 }
514
515
OrderAdjacentGaps()516 void CDense_seg::OrderAdjacentGaps()
517 {
518 #define IDX(_x,_y) (((_x)*GetDim())+(_y))
519 bool swaps_made = false;
520 do {
521 swaps_made = false;
522 for(int i=0; i < GetNumseg()-1; ++i) {
523
524 bool curr_gap = false, next_gap = false;
525 int curr_seq_row = GetDim()+1, next_seq_row = GetDim()+1;
526 for (int j=0; j < GetDim(); ++j) {
527 if (GetStarts()[IDX(i,j)] == -1)
528 curr_gap = true;
529 else
530 curr_seq_row = min(curr_seq_row, j);
531 if (GetStarts()[IDX(i+1,j)] == -1)
532 next_gap = true;
533 else
534 next_seq_row = min(next_seq_row, j);
535 }
536 if(!(curr_gap & next_gap))
537 continue;
538
539 // if this Seg and next Seg are both gaps,
540 // and the first row with sequence is not curr
541 // swap the two Segs
542 if(next_seq_row < curr_seq_row) {
543 for(int j=0; j < GetDim(); ++j) {
544 swap(SetStarts()[IDX(i,j)], SetStarts()[IDX(i+1,j)]);
545 if (GetStrands().size() > (size_t)IDX(i+1,j))
546 swap(SetStrands()[IDX(i,j)], SetStrands()[IDX(i+1,j)]);
547 }
548 swap(SetLens()[i], SetLens()[i+1]);
549 swaps_made = true;
550 }
551 }
552 } while(swaps_made);
553 }
554
555
RemovePureGapSegs()556 void CDense_seg::RemovePureGapSegs()
557 {
558 _ASSERT(GetNumseg() == static_cast<TNumseg>(GetLens().size()));
559 _ASSERT(GetNumseg() * GetDim() == static_cast<int>(GetStarts().size()));
560 _ASSERT(!IsSetStrands()
561 || GetNumseg() * GetDim() == static_cast<int>(GetStrands().size()));
562 _ASSERT(GetDim() == static_cast<TDim>(GetIds().size()));
563
564
565 // consistency checks
566 TDim dim = CheckNumRows();
567 TNumseg numseg = CheckNumSegs();
568
569 int i;
570 int j;
571 vector<bool> remove(numseg, true); // start out with all true
572 unsigned int remove_count = 0;
573 for (i = 0; i < numseg; ++i) {
574
575 for (j = 0; j < dim; ++j) {
576 if (GetStarts()[i * dim + j] != -1) {
577 // not a gap
578 remove[i] = false;
579 break;
580 }
581 }
582 if (remove[i]) {
583 ++remove_count;
584 }
585 }
586
587 if (remove_count == 0) {
588 // nothing to remove; leave unchanged
589 return;
590 }
591
592 CDense_seg::TStarts new_starts;
593 CDense_seg::TLens new_lens;
594 CDense_seg::TStrands new_strands;
595 new_starts.reserve((numseg - remove_count) * dim);
596 new_lens.reserve(numseg - remove_count);
597 if (IsSetStrands()) {
598 new_strands.reserve((numseg - remove_count) * dim);
599 }
600 for (i = 0; i < numseg; ++i) {
601 if (!remove[i]) {
602 // copy the original segment i onto the end of the new stuff
603 new_lens.push_back(GetLens()[i]);
604 for (j = 0; j < dim; ++j) {
605 new_starts.push_back(GetStarts()[i * dim + j]);
606 if (IsSetStrands()) {
607 new_strands.push_back(GetStrands()[i * dim + j]);
608 }
609 }
610 }
611 }
612
613 SetStarts().swap(new_starts);
614 SetLens().swap(new_lens);
615 if (IsSetStrands()) {
616 SetStrands().swap(new_strands);
617 }
618
619 SetNumseg(SetLens().size());
620
621 #ifdef _DEBUG
622 Validate(true);
623 #endif
624
625 }
626
627
628 //-----------------------------------------------------------------------------
629 // PRE : none
630 // POST: same alignment, opposite orientation
Reverse(void)631 void CDense_seg::Reverse(void)
632 {
633 //flip strands
634 if (IsSetStrands()) {
635 NON_CONST_ITERATE (CDense_seg::TStrands, i, SetStrands()) {
636 switch (*i) {
637 case eNa_strand_plus: *i = eNa_strand_minus; break;
638 case eNa_strand_minus: *i = eNa_strand_plus; break;
639 default: break;//do nothing if not + or -
640 }
641 }
642 } else {
643 // Interpret unset strands as plus strands.
644 // Since we're reversing, set them all to minus.
645 SetStrands().resize(GetStarts().size(), eNa_strand_minus);
646 }
647
648 //reverse list o' lengths
649 {
650 CDense_seg::TLens::iterator f = SetLens().begin();
651 CDense_seg::TLens::iterator r = SetLens().end();
652 while (f < r) {
653 swap(*(f++), *(--r));
654 }
655 }
656
657 //reverse list o' starts
658 CDense_seg::TStarts &starts = SetStarts();
659 int f = 0;
660 int r = (GetNumseg() - 1) * GetDim();
661 while (f < r) {
662 for (int i = 0; i < GetDim(); ++i) {
663 swap(starts[f+i], starts[r+i]);
664 }
665 f += GetDim();
666 r -= GetDim();
667 }
668 }
669
670 //-----------------------------------------------------------------------------
671 // PRE : numbers of the rows to swap
672 // POST: alignment rearranged with row1 where row2 used to be & vice versa
SwapRows(TDim row1,TDim row2)673 void CDense_seg::SwapRows(TDim row1, TDim row2)
674 {
675 if (row1 >= GetDim() || row1 < 0 ||
676 row2 >= GetDim() || row2 < 0) {
677 NCBI_THROW(CSeqalignException, eOutOfRange,
678 "Row numbers supplied to CDense_seg::SwapRows must be "
679 "in the range [0, dim)");
680 }
681
682 //swap ids
683 swap(SetIds()[row1], SetIds()[row2]);
684
685 int idxStop = GetNumseg()*GetDim();
686
687 //swap starts
688 for(int i = 0; i < idxStop; i += GetDim()) {
689 swap(SetStarts()[i+row1], SetStarts()[i+row2]);
690 }
691
692 //swap strands
693 if (IsSetStrands()) {
694 for(int i = 0; i < idxStop; i += GetDim()) {
695 swap(SetStrands()[i+row1], SetStrands()[i+row2]);
696 }
697 }
698 }
699
700
701 /*---------------------------------------------------------------------------*/
702 // PRE : this is a validated dense seg; row & sequence position on row in
703 // alignment
704 // POST: the number of the segment in which this sequence position falls
705 CDense_seg::TNumseg CDense_seg::
x_FindSegment(CDense_seg::TDim row,TSignedSeqPos pos) const706 x_FindSegment(CDense_seg::TDim row, TSignedSeqPos pos) const
707 {
708 bool found = false;
709 CDense_seg::TNumseg seg;
710 for (seg = 0; seg < GetNumseg() && !found; ++seg) {
711 TSignedSeqPos start = GetStarts()[seg * GetDim() + row];
712 TSignedSeqPos len = GetLens()[seg];
713 if (start != -1) {
714 if (pos >= start && pos < start + len) {
715 found = true;
716 }
717 }
718 }
719 if (!found) {
720 NCBI_THROW(CSeqalignException, eInvalidAlignment,
721 "CDense_seg::x_FindSegment(): "
722 "Can't find a segment containing position " +
723 NStr::NumericToString(pos));
724 }
725
726 return seg - 1;
727 }
728
729
730 //-----------------------------------------------------------------------------
731 // PRE : range on a row in the alignment
732 // POST: dst Dense_seg reset &
733 CRef<CDense_seg> CDense_seg::
ExtractSlice(TDim row,TSeqPos from,TSeqPos to) const734 ExtractSlice(TDim row, TSeqPos from, TSeqPos to) const
735 {
736 if (row < 0 || row >= GetDim()) {
737 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
738 "CDense_seg::ExtractSlice():"
739 " Invalid row number ("
740 + NStr::NumericToString(row) + ")");
741 }
742
743 if (from > to) {
744 swap(from, to);
745 }
746 if (from < GetSeqStart(row)) {
747 NCBI_THROW(CSeqalignException, eOutOfRange,
748 "CDense_seg::ExtractSlice(): "
749 "start position (" + NStr::NumericToString(from) +
750 ") off end of alignment");
751 }
752 if (to > GetSeqStop(row)) {
753 NCBI_THROW(CSeqalignException, eOutOfRange,
754 "CDense_seg::ExtractSlice(): "
755 "stop position (" + NStr::NumericToString(to) +
756 ") off end of alignment");
757 }
758
759
760 CRef<CDense_seg> ds(new CDense_seg);
761 ds->SetDim(GetDim());
762 ds->SetNumseg(0);
763 ITERATE(CDense_seg::TIds, idI, GetIds()) {
764 CSeq_id *si = new CSeq_id;
765 si->Assign(**idI);
766 ds->SetIds().push_back(CRef<CSeq_id>(si));
767 }
768
769 //find start/stop segments
770 CDense_seg::TNumseg startSeg = x_FindSegment(row, from);
771 CDense_seg::TNumseg stopSeg = x_FindSegment(row, to);
772
773 TSeqPos startOffset = from - GetStarts()[startSeg * GetDim() + row];
774 TSeqPos stopOffset = GetStarts()[stopSeg * GetDim() + row] +
775 GetLens()[stopSeg] - 1 - to;
776 if (IsSetStrands() && GetStrands()[row] == eNa_strand_minus) {
777 swap(startOffset, stopOffset);
778 swap(startSeg, stopSeg); // make sure startSeg is first
779 }
780
781 for (CDense_seg::TNumseg seg = startSeg; seg <= stopSeg; ++seg) {
782 //starts
783 for (CDense_seg::TDim dim = 0; dim < GetDim(); ++dim) {
784 TSignedSeqPos start = GetStarts()[seg * GetDim() + dim];
785 if (start != -1) {
786 if (seg == startSeg && (!IsSetStrands() ||
787 GetStrands()[seg * GetDim() + dim] == eNa_strand_plus)) {
788 start += startOffset;
789 }
790 if (seg == stopSeg && IsSetStrands() &&
791 GetStrands()[seg * GetDim() + dim] == eNa_strand_minus) {
792 start += stopOffset;
793 }
794 }
795 ds->SetStarts().push_back(start);
796 }
797
798 //len
799 TSeqPos len = GetLens()[seg];
800 if (seg == startSeg) {
801 len -= startOffset;
802 }
803 if (seg == stopSeg) {
804 len -= stopOffset;
805 }
806 ds->SetLens().push_back(len);
807
808 //strands
809 if (IsSetStrands()) {
810 for (CDense_seg::TDim dim = 0; dim < GetDim(); ++dim) {
811 ds->SetStrands().push_back(GetStrands()[seg * GetDim() + dim]);
812 }
813 }
814 ++ds->SetNumseg();
815 }
816
817 #ifdef _DEBUG
818 ds->Validate(true);
819 #endif
820 return ds;
821 }
822
823
ExtractRows(const vector<TDim> & rows) const824 CRef<CDense_seg> CDense_seg::ExtractRows(const vector<TDim>& rows) const
825 {
826 // consistency checks
827 TDim dim = CheckNumRows();
828 TNumseg numseg = CheckNumSegs();
829
830 CRef<CDense_seg> new_ds(new CDense_seg);
831 new_ds->SetDim(rows.size());
832 new_ds->SetNumseg(GetNumseg());
833
834 // reserve for efficiency
835 new_ds->SetIds().reserve(rows.size());
836 new_ds->SetStarts().reserve(rows.size() * GetNumseg());
837 new_ds->SetLens().reserve(GetNumseg());
838 if (IsSetStrands()) {
839 new_ds->SetStrands().reserve(rows.size() * GetNumseg());
840 }
841
842 ITERATE (vector<TDim>, row, rows) {
843 // sole check that rows are not out of range
844 if (*row < 0 || *row >= dim) {
845 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
846 "CDense_seg::ExtractRows():"
847 " Invalid row number ("
848 + NStr::NumericToString(*row) + ")");
849 }
850 // *copy* the ID (don't just make a reference to it)
851 CRef<CSeq_id> id_copy(new CSeq_id);
852 id_copy->Assign(*GetIds()[*row]);
853 new_ds->SetIds().push_back(id_copy);
854 }
855 for (TNumseg segnum = 0; segnum < numseg; ++segnum) {
856 new_ds->SetLens().push_back(GetLens()[segnum]);
857 ITERATE (vector<TDim>, row, rows) {
858 int idx = segnum * dim + *row;
859 new_ds->SetStarts().push_back(GetStarts()[idx]);
860 if (IsSetStrands()) {
861 new_ds->SetStrands().push_back(GetStrands()[idx]);
862 }
863 }
864 }
865
866 new_ds->Compact(); // even if original was compact, new one may not be
867
868 #ifdef _DEBUG
869 new_ds->Validate(true);
870 #endif
871
872 // Scores are not propagated
873 return new_ds;
874 }
875
876
OffsetRow(TDim row,TSignedSeqPos offset)877 void CDense_seg::OffsetRow(TDim row,
878 TSignedSeqPos offset)
879 {
880 if (offset == 0) return;
881
882 // Check for out-of-range negative offset
883 if (offset < 0) {
884 for (TNumseg seg = 0, pos = row;
885 seg < GetNumseg();
886 ++seg, pos += GetDim()) {
887
888 if (GetStarts()[pos] >= 0) {
889 if (GetStarts()[pos] < -offset) {
890 NCBI_THROW(CSeqalignException, eOutOfRange,
891 "Negative offset greater than seq position");
892 }
893 }
894 }
895 }
896
897 // Modify positions
898 for (TNumseg seg = 0, pos = row;
899 seg < GetNumseg();
900 ++seg, pos += GetDim()) {
901 if (GetStarts()[pos] >= 0) {
902 SetStarts()[pos] += offset;
903 }
904 }
905 }
906
907
908 /// @deprecated
RemapToLoc(TDim row,const CSeq_loc & loc,bool ignore_strand)909 void CDense_seg::RemapToLoc(TDim row, const CSeq_loc& loc,
910 bool ignore_strand)
911 {
912 if (loc.IsWhole()) {
913 return;
914 }
915
916 TSeqPos row_stop = GetSeqStop(row);
917
918 size_t ttl_loc_len = 0;
919 {{
920 CSeq_loc_CI seq_loc_i(loc);
921 do {
922 ttl_loc_len += seq_loc_i.GetRange().GetLength();
923 } while (++seq_loc_i);
924 }}
925
926 // check the validity of the seq-loc
927 if (ttl_loc_len < row_stop + 1) {
928 string errstr("CDense_seg::RemapToLoc():"
929 " Seq-loc is not long enough to"
930 " cover the alignment!"
931 " Maximum row seq pos is ");
932 errstr += NStr::NumericToString(row_stop);
933 errstr += ". The total seq-loc len is only ";
934 errstr += NStr::SizetToString(ttl_loc_len);
935 errstr += ", it should be at least ";
936 errstr += NStr::NumericToString(row_stop+1);
937 errstr += " (= max seq pos + 1).";
938 NCBI_THROW(CSeqalignException, eOutOfRange, errstr);
939 }
940
941 const CDense_seg::TStarts& starts = GetStarts();
942 const CDense_seg::TStrands& strands = GetStrands();
943 const CDense_seg::TLens& lens = GetLens();
944
945 const size_t& numrows = CheckNumRows();
946 const size_t& numsegs = CheckNumSegs();
947
948 CSeq_loc_CI seq_loc_i(loc);
949
950 TSeqPos start, loc_len, len, len_so_far;
951 start = seq_loc_i.GetRange().GetFrom();
952 len = loc_len = seq_loc_i.GetRange().GetLength();
953 len_so_far = 0;
954
955 bool row_plus = !strands.size() || strands[row] != eNa_strand_minus;
956 bool loc_plus = seq_loc_i.GetStrand() != eNa_strand_minus;
957
958 // iterate through segments
959 size_t idx = loc_plus ? row : (numsegs - 1) * numrows + row;
960 TNumseg seg = loc_plus ? 0 : numsegs - 1;
961 while (loc_plus ? seg < GetNumseg() : seg >= 0) {
962 if (starts[idx] == -1) {
963 // ignore gaps in our sequence
964 if (loc_plus) {
965 idx += numrows; seg++;
966 } else {
967 idx -= numrows; seg--;
968 }
969 continue;
970 }
971
972 // iterate the seq-loc if needed
973 if ((loc_plus == row_plus ?
974 starts[idx] : ttl_loc_len - starts[idx] - lens[seg])
975 > len_so_far + loc_len) {
976
977 if (++seq_loc_i) {
978 len_so_far += len;
979 len = seq_loc_i.GetRange().GetLength();
980 start = seq_loc_i.GetRange().GetFrom();
981 } else {
982 NCBI_THROW(CSeqalignException, eInvalidInputData,
983 "CDense_seg::RemapToLoc():"
984 " Internal error");
985 }
986
987 // assert the strand is the same
988 if (loc_plus != (seq_loc_i.GetStrand() != eNa_strand_minus)) {
989 NCBI_THROW(CSeqalignException, eInvalidInputData,
990 "CDense_seg::RemapToLoc():"
991 " The strand should be the same accross"
992 " the input seq-loc");
993 }
994 }
995
996 // offset for the starting position
997 if (loc_plus == row_plus) {
998 SetStarts()[idx] += start - len_so_far;
999 } else {
1000 SetStarts()[idx] =
1001 start - len_so_far + ttl_loc_len - starts[idx] - lens[seg];
1002 }
1003
1004 if (lens[seg] > len) {
1005 TSignedSeqPos len_diff = lens[seg] - len;
1006 while (1) {
1007 // move to the next loc part that extends beyond our length
1008 ++seq_loc_i;
1009 if (seq_loc_i) {
1010 start = seq_loc_i.GetRange().GetFrom();
1011 } else {
1012 NCBI_THROW(CSeqalignException, eOutOfRange,
1013 "CDense_seg::RemapToLoc():"
1014 " Internal error");
1015 }
1016
1017 // split our segment
1018 SetLens().insert(SetLens().begin() +
1019 (loc_plus ? seg : seg + 1),
1020 len);
1021 SetLens()[loc_plus ? seg + 1 : seg] = len_diff;
1022
1023 // insert new data to account for our split segment
1024 TStarts temp_starts(numrows, -1);
1025 for (size_t row_i = 0, tmp_idx = seg * numrows;
1026 row_i < numrows; ++row_i, ++tmp_idx) {
1027 TSignedSeqPos& this_start = SetStarts()[tmp_idx];
1028 if (this_start != -1) {
1029 temp_starts[row_i] = this_start;
1030 if (loc_plus == (strands[row_i] != eNa_strand_minus)) {
1031 if ((size_t) row == row_i) {
1032 temp_starts[row_i] = start;
1033 } else {
1034 temp_starts[row_i] += len;
1035 }
1036 } else {
1037 this_start += len_diff;
1038 }
1039 }
1040 }
1041
1042 len_so_far += loc_len;
1043 len = loc_len = seq_loc_i.GetRange().GetLength();
1044
1045 SetStarts().insert(SetStarts().begin() +
1046 (loc_plus ? seg + 1 : seg) * numrows,
1047 temp_starts.begin(), temp_starts.end());
1048
1049 if (strands.size()) {
1050 SetStrands().insert
1051 (SetStrands().begin(),
1052 strands.begin(), strands.begin() + numrows);
1053 }
1054
1055 SetNumseg()++;
1056
1057 if ((len_diff = lens[seg] - len) > 0) {
1058 if (loc_plus) {
1059 idx += numrows; seg++;
1060 } else {
1061 idx -= numrows; seg--;
1062 }
1063 } else {
1064 break;
1065 }
1066 }
1067 } else {
1068 len -= lens[seg];
1069 }
1070
1071 if (loc_plus) {
1072 idx += numrows; seg++;
1073 } else {
1074 idx -= numrows; seg--;
1075 }
1076 } // while iterating through segments
1077
1078 // finally, modify the strands if different
1079 if ( !ignore_strand ) {
1080 if (loc_plus != row_plus) {
1081 if (!strands.size()) {
1082 // strands do not exist, create them
1083 SetStrands().resize(GetNumseg() * GetDim(), eNa_strand_plus);
1084 }
1085 for (seg = 0, idx = row;
1086 seg < GetNumseg(); seg++, idx += numrows) {
1087 SetStrands()[idx] = loc_plus ? eNa_strand_plus : eNa_strand_minus;
1088 }
1089 }
1090 }
1091
1092 }
1093
1094
FillUnaligned() const1095 CRef<CDense_seg> CDense_seg::FillUnaligned() const
1096 {
1097 // this dense-seg
1098 const CDense_seg::TStarts& starts = GetStarts();
1099 const CDense_seg::TStrands& strands = GetStrands();
1100 const CDense_seg::TLens& lens = GetLens();
1101 const CDense_seg::TIds& ids = GetIds();
1102
1103 const size_t& numrows = CheckNumRows();
1104 const size_t& numsegs = CheckNumSegs();
1105
1106 bool strands_exist = !strands.empty();
1107
1108 size_t seg = 0, row = 0;
1109
1110
1111 // extra segments
1112 CDense_seg::TStarts extra_starts;
1113 CDense_seg::TLens extra_lens;
1114 size_t extra_numsegs = 0;
1115 size_t extra_seg = 0;
1116 vector<size_t> extra_segs;
1117
1118
1119 // new dense-seg
1120 CRef<CDense_seg> new_ds(new CDense_seg);
1121 CDense_seg::TStarts& new_starts = new_ds->SetStarts();
1122 CDense_seg::TStrands& new_strands = new_ds->SetStrands();
1123 CDense_seg::TLens& new_lens = new_ds->SetLens();
1124 CDense_seg::TIds& new_ids = new_ds->SetIds();
1125
1126 // dimentions
1127 new_ds->SetDim(numrows);
1128 TNumseg& new_numsegs = new_ds->SetNumseg();
1129 new_numsegs = numsegs; // initialize
1130
1131 // ids
1132 new_ids.resize(numrows);
1133 for (row = 0; row < numrows; row++) {
1134 CRef<CSeq_id> id(new CSeq_id);
1135 SerialAssign(*id, *ids[row]);
1136 new_ids[row] = id;
1137 }
1138
1139 TNumseg new_seg = 0;
1140
1141 // temporary data
1142 vector<TSignedSeqPos> expected_positions;
1143 expected_positions.resize(numrows, -1);
1144
1145 vector<bool> plus;
1146 plus.resize(numrows, true);
1147 if (strands_exist) {
1148 for (row = 0; row < numrows; row++) {
1149 if (strands[row] == eNa_strand_minus) {
1150 plus[row] = false;
1151 }
1152 }
1153 }
1154
1155 TSignedSeqPos extra_len = 0;
1156 size_t idx = 0, new_idx = 0, extra_idx = 0;
1157
1158 // main loop through segments
1159 for (seg = 0; seg < numsegs; seg++) {
1160 const TSeqPos& len = lens[seg];
1161 for (row = 0; row < numrows; row++) {
1162
1163 const TSignedSeqPos& start = starts[idx++];
1164
1165 TSignedSeqPos& expected_pos = expected_positions[row];
1166
1167 if (start >= 0) {
1168
1169 if (expected_pos >= 0) {
1170 // check if there's an unaligned insert
1171
1172 if (plus[row]) {
1173 extra_len = start - expected_pos;
1174 } else {
1175 extra_len = expected_pos - start - len;
1176 }
1177
1178 if (extra_len < 0) {
1179 string errstr("CDense_seg::AddUnalignedSegments():"
1180 " Illegal overlap at Row ");
1181 errstr += NStr::SizetToString(row);
1182 errstr += " Segment ";
1183 errstr += NStr::SizetToString(seg);
1184 errstr += ".";
1185 NCBI_THROW(CSeqalignException, eInvalidAlignment,
1186 errstr);
1187 } else if (extra_len > 0) {
1188 // insert new segment
1189 extra_segs.push_back(seg);
1190 extra_lens.push_back(extra_len);
1191 extra_starts.resize(extra_idx + numrows, -1);
1192 extra_starts[extra_idx + row] =
1193 plus[row] ? expected_pos : start + len;
1194
1195 extra_idx += numrows;
1196 ++extra_numsegs;
1197 }
1198 }
1199
1200 // set the new expected_pos
1201 if (plus[row]) {
1202 expected_pos = start + len;
1203 } else {
1204 expected_pos = start;
1205 }
1206 }
1207 }
1208 }
1209
1210 // lens & starts
1211 new_numsegs = numsegs + extra_numsegs;
1212 new_lens.resize(numsegs + extra_numsegs);
1213 new_starts.resize(numrows * new_numsegs);
1214 for (seg = 0, new_seg = 0, extra_seg = 0,
1215 idx = 0, new_idx = 0, extra_idx = 0;
1216 seg < numsegs;
1217 seg++, new_seg++) {
1218
1219 // insert extra segments
1220 if (extra_numsegs) {
1221 for ( ; extra_seg < extra_segs.size() && extra_segs[extra_seg] == seg; ++new_seg, ++extra_seg) {
1222 new_lens[new_seg] = extra_lens[extra_seg];
1223 for (row = 0; row < numrows; ++row, ++new_idx, ++extra_idx) {
1224 new_starts[new_idx] = extra_starts[extra_idx];
1225 }
1226 }
1227 }
1228
1229 // add the existing segment
1230 new_lens[new_seg] = lens[seg];
1231 for (row = 0; row < numrows; row++) {
1232 new_starts[new_idx++] = starts[idx++];
1233 }
1234 }
1235
1236
1237 // strands
1238 new_strands.resize(numrows * new_numsegs, eNa_strand_plus);
1239 if (strands_exist) {
1240 new_idx = 0;
1241 for (new_seg = 0; new_seg < new_numsegs; new_seg++) {
1242 for (row = 0; row < numrows; row++, new_idx++) {
1243 if ( !plus[row] ) {
1244 new_strands[new_idx] = eNa_strand_minus;
1245 }
1246 }
1247 }
1248 }
1249
1250 return new_ds;
1251 }
1252
1253
1254 //-----------------------------------------------------------------------------
1255 // PRE : alignment transcript (RLE or not) and start coordinates
1256 // POST: Starts, lens and strands. Ids and scores not affected.
1257
1258 // initialize from pairwise alignment transcript
FromTranscript(TSeqPos query_start,ENa_strand query_strand,TSeqPos subj_start,ENa_strand subj_strand,const string & transcript)1259 void CDense_seg::FromTranscript(TSeqPos query_start, ENa_strand query_strand,
1260 TSeqPos subj_start, ENa_strand subj_strand,
1261 const string& transcript )
1262 {
1263 // check that strands are specific
1264 bool query_strand_specific =
1265 query_strand == eNa_strand_plus || query_strand == eNa_strand_minus;
1266 bool subj_strand_specific =
1267 subj_strand == eNa_strand_plus || subj_strand == eNa_strand_minus;
1268
1269 if(!query_strand_specific || !subj_strand_specific) {
1270 NCBI_THROW(CSeqalignException, eInvalidInputData, "Unknown strand");
1271 }
1272
1273 TStarts &starts = SetStarts();
1274 starts.clear();
1275 TLens &lens = SetLens();
1276 lens.clear();
1277 TStrands &strands = SetStrands();
1278 strands.clear();
1279
1280 SetDim(2);
1281
1282 // iterate through the transcript
1283 size_t seg_count = 0;
1284
1285 size_t start1 = 0, pos1 = 0; // relative to exon start in mrna
1286 size_t start2 = 0, pos2 = 0; // and genomic
1287 size_t seg_len = 0;
1288
1289 string::const_iterator ib = transcript.begin();
1290 string::const_iterator ie = transcript.end();
1291 string::const_iterator ii = ib;
1292 unsigned char seg_type;
1293 static const char* badsymerr = "Unknown or unsupported transcript symbol";
1294 char c = ii != ie ? *ii++ : 0;
1295 if(c == 'M' || c == 'R') {
1296 seg_type = 0;
1297 ++pos1;
1298 ++pos2;
1299 }
1300 else if (c == 'I') {
1301 seg_type = 1;
1302 ++pos2;
1303 }
1304 else if (c == 'D') {
1305 seg_type = 2;
1306 ++pos1;
1307 }
1308 else {
1309 if (c == 0) {
1310 NCBI_THROW(CSeqalignException, eInvalidInputData,
1311 "Empty transcript");
1312 }
1313 else {
1314 NCBI_THROW(CSeqalignException, eInvalidInputData, badsymerr);
1315 }
1316 }
1317
1318 while(ii < ie) {
1319
1320 c = *ii;
1321 if(isalpha((unsigned char) c)) {
1322
1323 if(seg_type == 0 && (c == 'M' || c == 'R')) {
1324 ++pos1;
1325 ++pos2;
1326 }
1327 else if(seg_type == 1 && c == 'I') {
1328 ++pos2;
1329 }
1330 else if(seg_type == 2 && c == 'D') {
1331 ++pos1;
1332 }
1333 else {
1334
1335 // close current seg
1336 TSeqPos query_close = query_strand == eNa_strand_plus?
1337 start1: 1 - pos1;
1338 starts.push_back(seg_type == 1? (TSeqPos)-1: query_start + query_close);
1339 strands.push_back(query_strand);
1340
1341 TSeqPos subj_close = subj_strand == eNa_strand_plus?
1342 start2: 1- pos2;
1343 starts.push_back(seg_type == 2? (TSeqPos)-1: subj_start + subj_close);
1344 strands.push_back(subj_strand);
1345
1346 switch(seg_type) {
1347 case 0: seg_len = pos1 - start1; break;
1348 case 1: seg_len = pos2 - start2; break;
1349 case 2: seg_len = pos1 - start1; break;
1350 }
1351 lens.push_back(seg_len);
1352 ++seg_count;
1353
1354 // start a new seg
1355 start1 = pos1;
1356 start2 = pos2;
1357
1358 if(c == 'M' || c == 'R'){
1359 seg_type = 0; // matches and mismatches
1360 ++pos1;
1361 ++pos2;
1362 }
1363 else if (c == 'I') {
1364 seg_type = 1; // inserts
1365 ++pos2;
1366 }
1367 else if (c == 'D') {
1368 seg_type = 2; // dels
1369 ++pos1;
1370 }
1371 else {
1372
1373 NCBI_THROW(CSeqalignException, eInvalidInputData,
1374 badsymerr);
1375 }
1376 }
1377 ++ii;
1378 }
1379 else {
1380
1381 if(!isdigit((unsigned char) c)) {
1382
1383 NCBI_THROW(CSeqalignException, eInvalidInputData,
1384 "Alignment transcript corrupt");
1385 }
1386
1387 size_t len = 0;
1388 while(ii < ie && isdigit((unsigned char)(*ii))) {
1389 len = 10*len + *ii - '0';
1390 ++ii;
1391 }
1392 --len;
1393 switch(seg_type) {
1394 case 0: pos1 += len; pos2 += len; break;
1395 case 1: pos2 += len; break;
1396 case 2: pos1 += len; break;
1397 }
1398 }
1399 }
1400
1401 TSeqPos query_close = query_strand == eNa_strand_plus? start1: 1 - pos1;
1402 starts.push_back(seg_type == 1? (TSeqPos)-1: query_start + query_close);
1403 strands.push_back(query_strand);
1404
1405 TSeqPos subj_close = subj_strand == eNa_strand_plus? start2: 1 - pos2;
1406 starts.push_back(seg_type == 2? (TSeqPos)-1: subj_start + subj_close);
1407 strands.push_back(subj_strand);
1408
1409 switch(seg_type) {
1410
1411 case 0: seg_len = pos1 - start1; break;
1412 case 1: seg_len = pos2 - start2; break;
1413 case 2: seg_len = pos1 - start1; break;
1414 }
1415 lens.push_back(seg_len);
1416 ++seg_count;
1417
1418 SetNumseg(seg_count);
1419 }
1420
1421
CreateRowSeq_interval(TDim row) const1422 CRef<CSeq_interval> CDense_seg::CreateRowSeq_interval(TDim row) const
1423 {
1424 if (GetDim() <= row) {
1425 NCBI_THROW(CSeqalignException, eInvalidRowNumber,
1426 "Invalid row number in CreateRowSeq_interval(): " +
1427 NStr::NumericToString(row));
1428 }
1429 CRef<CSeq_interval> ret(new CSeq_interval);
1430 TSeqPos from = kInvalidSeqPos;
1431 TSeqPos to = 0;
1432 TSeqPos plus_len = 0;
1433 TSeqPos minus_len = 0;
1434 ret->SetId().Assign(*GetIds()[row]);
1435 for (int seg = 0; seg < GetNumseg(); seg++) {
1436 int idx = seg*GetDim() + row;
1437 int start = GetStarts()[idx];
1438 if (start < 0) {
1439 continue;
1440 }
1441 if (TSeqPos(start) < from) {
1442 from = TSeqPos(start);
1443 }
1444 TSeqPos len = GetLens()[seg];
1445 if (start + len > to) {
1446 to = start + len;
1447 }
1448 if ( !IsSetStrands() || !IsReverse(GetStrands()[idx]) ) {
1449 plus_len += len;
1450 }
1451 else {
1452 minus_len += len;
1453 }
1454 }
1455 if (from == kInvalidSeqPos || to == 0) {
1456 NCBI_THROW(CSeqalignException, eOutOfRange,
1457 "Can not convert row to seq-interval - invalid from/to value");
1458 }
1459 ret->SetFrom(from);
1460 ret->SetTo(to - 1);
1461 if ( IsSetStrands() ) {
1462 // If more than 2/3 of the segment is on plus, use plus strand
1463 if (plus_len >= minus_len*2) {
1464 ret->SetStrand(eNa_strand_plus);
1465 }
1466 // If more than 2/3 is on minus, use minus strand
1467 else if (plus_len*2 < minus_len) {
1468 ret->SetStrand(eNa_strand_minus);
1469 }
1470 // Otherwise set strand to 'both'
1471 else {
1472 ret->SetStrand(eNa_strand_both);
1473 }
1474 }
1475 return ret;
1476 }
1477
1478
1479 NCBI_PARAM_DECL(bool, OBJECTS, DENSE_SEG_RESERVE);
1480 NCBI_PARAM_DEF_EX(bool, OBJECTS, DENSE_SEG_RESERVE, true,
1481 eParam_NoThread, OBJECTS_DENSE_SEG_RESERVE);
1482
PreReadClassMember(CObjectIStream & in,const CObjectInfoMI & member)1483 void CDense_seg::CReserveHook::PreReadClassMember(CObjectIStream& in,
1484 const CObjectInfoMI& member)
1485 {
1486 static CSafeStatic<NCBI_PARAM_TYPE(OBJECTS, DENSE_SEG_RESERVE)> s_Reserve;
1487
1488 if ( !s_Reserve->Get() ) {
1489 return;
1490 }
1491 CDense_seg& ds = *CType<CDense_seg>::Get(member.GetClassObject());
1492 size_t numseg = ds.GetNumseg();
1493 try {
1494 switch ( member.GetMemberIndex() ) {
1495 case 4: // "starts"
1496 ds.SetStarts().reserve(ds.GetDim()*numseg);
1497 break;
1498 case 5: // "lens"
1499 ds.SetLens().reserve(numseg);
1500 break;
1501 case 6: // "strands"
1502 ds.SetStrands().reserve(ds.GetDim()*numseg);
1503 break;
1504 default:
1505 break;
1506 }
1507 }
1508 catch ( bad_alloc& /*ignored*/ ) {
1509 // ignore insufficient memory exception from advisory reserve()
1510 }
1511 }
1512
1513
1514 /////////////////////////////////////////////////////////////////////////////
1515 // Read hooks to reserve memory of Dense-seg vector<> to estimated size.
1516 /////////////////////////////////////////////////////////////////////////////
1517
1518
SetReserveHooks(CObjectIStream & in)1519 void CDense_seg::SetReserveHooks(CObjectIStream& in)
1520 {
1521 CDenseSegReserveStartsHook::SetHook(in);
1522 CDenseSegReserveLensHook::SetHook(in);
1523 CDenseSegReserveStrandsHook::SetHook(in);
1524 }
1525
1526
SetGlobalReserveHooks(void)1527 void CDense_seg::SetGlobalReserveHooks(void)
1528 {
1529 CDenseSegReserveStartsHook::SetGlobalHook();
1530 CDenseSegReserveLensHook::SetGlobalHook();
1531 CDenseSegReserveStrandsHook::SetGlobalHook();
1532 }
1533
1534
ReadClassMember(CObjectIStream & in,const CObjectInfoMI & member)1535 void CDenseSegReserveStartsHook::ReadClassMember(CObjectIStream& in,
1536 const CObjectInfoMI& member)
1537 {
1538 CDense_seg& ds = *CType<CDense_seg>::Get(member.GetClassObject());
1539 size_t size = ds.GetDim()*ds.GetNumseg();
1540 CDense_seg::TStarts& array = ds.SetStarts();
1541 try {
1542 array.reserve(size);
1543 }
1544 catch ( bad_alloc& /*ignored*/ ) {
1545 // ignore insufficient memory exception from advisory reserve()
1546 }
1547 DefaultRead(in, member);
1548 }
1549
1550
x_GetMember(void)1551 CObjectTypeInfoMI CDenseSegReserveStartsHook::x_GetMember(void)
1552 {
1553 CObjectTypeInfo type = CType<CDense_seg>();
1554 return type.FindMember("starts");
1555 }
1556
1557
SetHook(CObjectIStream & in)1558 void CDenseSegReserveStartsHook::SetHook(CObjectIStream& in)
1559 {
1560 CRef<CDenseSegReserveStartsHook> hook(new CDenseSegReserveStartsHook);
1561 x_GetMember().SetLocalReadHook(in, hook);
1562 }
1563
1564
SetGlobalHook(void)1565 void CDenseSegReserveStartsHook::SetGlobalHook(void)
1566 {
1567 CRef<CDenseSegReserveStartsHook> hook(new CDenseSegReserveStartsHook);
1568 x_GetMember().SetGlobalReadHook(hook);
1569 }
1570
1571
ReadClassMember(CObjectIStream & in,const CObjectInfoMI & member)1572 void CDenseSegReserveLensHook::ReadClassMember(CObjectIStream& in,
1573 const CObjectInfoMI& member)
1574 {
1575 CDense_seg& ds = *CType<CDense_seg>::Get(member.GetClassObject());
1576 size_t size = ds.GetNumseg();
1577 CDense_seg::TLens& array = ds.SetLens();
1578 try {
1579 array.reserve(size);
1580 }
1581 catch ( bad_alloc& /*ignored*/ ) {
1582 // ignore insufficient memory exception from advisory reserve()
1583 }
1584 DefaultRead(in, member);
1585 }
1586
1587
x_GetMember(void)1588 CObjectTypeInfoMI CDenseSegReserveLensHook::x_GetMember(void)
1589 {
1590 CObjectTypeInfo type = CType<CDense_seg>();
1591 return type.FindMember("lens");
1592 }
1593
1594
SetHook(CObjectIStream & in)1595 void CDenseSegReserveLensHook::SetHook(CObjectIStream& in)
1596 {
1597 CRef<CDenseSegReserveLensHook> hook(new CDenseSegReserveLensHook);
1598 x_GetMember().SetLocalReadHook(in, hook);
1599 }
1600
1601
SetGlobalHook(void)1602 void CDenseSegReserveLensHook::SetGlobalHook(void)
1603 {
1604 CRef<CDenseSegReserveLensHook> hook(new CDenseSegReserveLensHook);
1605 x_GetMember().SetGlobalReadHook(hook);
1606 }
1607
1608
ReadClassMember(CObjectIStream & in,const CObjectInfoMI & member)1609 void CDenseSegReserveStrandsHook::ReadClassMember(CObjectIStream& in,
1610 const CObjectInfoMI& member)
1611 {
1612 CDense_seg& ds = *CType<CDense_seg>::Get(member.GetClassObject());
1613 size_t size = ds.GetDim()*ds.GetNumseg();
1614 CDense_seg::TStrands& array = ds.SetStrands();
1615 try {
1616 array.reserve(size);
1617 }
1618 catch ( bad_alloc& /*ignored*/ ) {
1619 // ignore insufficient memory exception from advisory reserve()
1620 }
1621 DefaultRead(in, member);
1622 }
1623
1624
x_GetMember(void)1625 CObjectTypeInfoMI CDenseSegReserveStrandsHook::x_GetMember(void)
1626 {
1627 CObjectTypeInfo type = CType<CDense_seg>();
1628 return type.FindMember("strands");
1629 }
1630
1631
SetHook(CObjectIStream & in)1632 void CDenseSegReserveStrandsHook::SetHook(CObjectIStream& in)
1633 {
1634 CRef<CDenseSegReserveStrandsHook> hook(new CDenseSegReserveStrandsHook);
1635 x_GetMember().SetLocalReadHook(in, hook);
1636 }
1637
1638
SetGlobalHook(void)1639 void CDenseSegReserveStrandsHook::SetGlobalHook(void)
1640 {
1641 CRef<CDenseSegReserveStrandsHook> hook(new CDenseSegReserveStrandsHook);
1642 x_GetMember().SetGlobalReadHook(hook);
1643 }
1644
1645
1646 END_objects_SCOPE // namespace ncbi::objects::
1647
1648 END_NCBI_SCOPE
1649