1 /* $Id: alnsegments.cpp 488694 2016-01-04 20:27:43Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Alignment segments
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <objtools/alnmgr/alnsegments.hpp>
36 #include <objtools/alnmgr/alnseq.hpp>
37 #include <objtools/alnmgr/alnexception.hpp>
38 #include <objtools/alnmgr/alnmix.hpp>
39 #include <objtools/alnmgr/alnmap.hpp>
40
41 #include <stack>
42
43
44 BEGIN_NCBI_SCOPE
45 BEGIN_objects_SCOPE // namespace ncbi::objects::
46
47
CAlnMixSegments(CRef<CAlnMixSequences> & aln_mix_sequences,TCalcScoreMethod calc_score)48 CAlnMixSegments::CAlnMixSegments(CRef<CAlnMixSequences>& aln_mix_sequences,
49 TCalcScoreMethod calc_score)
50 : m_AlnMixSequences(aln_mix_sequences),
51 m_Rows(m_AlnMixSequences->m_Rows),
52 m_ExtraRows(m_AlnMixSequences->m_ExtraRows),
53 x_CalculateScore(calc_score)
54 {}
55
56
57 void
Build(bool gap_join,bool min_gap,bool remove_leading_and_trailing_gaps)58 CAlnMixSegments::Build(bool gap_join,
59 bool min_gap,
60 bool remove_leading_and_trailing_gaps)
61 {
62 m_AlnMixSequences->InitRowsStartIts();
63 m_AlnMixSequences->InitExtraRowsStartIts();
64 #if _DEBUG && _ALNMGR_DEBUG
65 m_AlnMixSequences->RowsStartItsContsistencyCheck(0);
66 #endif
67
68
69 TSegmentsContainer gapped_segs;
70
71 CAlnMixSequences::TSeqs::iterator refseq_it = m_Rows.begin();
72 bool orig_refseq = true;
73 while (true) {
74 CAlnMixSeq * refseq = 0;
75 while (refseq_it != m_Rows.end()) {
76 refseq = *(refseq_it++);
77 if (refseq->GetStarts().current != refseq->GetStarts().end()) {
78 break;
79 } else {
80 refseq = 0;
81 }
82 }
83 if ( !refseq ) {
84 // Done
85
86 // add the gapped segments if any
87 if (gapped_segs.size()) {
88 if (gap_join) {
89 // request to try to align
90 // gapped segments w/ equal len
91 x_ConsolidateGaps(gapped_segs);
92 } else if (min_gap) {
93 // request to try to align
94 // all gapped segments
95 x_MinimizeGaps(gapped_segs);
96 }
97 NON_CONST_ITERATE (TSegmentsContainer,
98 seg_i, gapped_segs) {
99 m_Segments.push_back(&**seg_i);
100 }
101 gapped_segs.clear();
102 }
103 break; // from the main loop
104 }
105 #if _DEBUG && _ALNMGR_TRACE
106 cerr << "refseq is on row " << refseq->m_RowIdx
107 << " seq " << refseq->m_SeqIdx << "\n";
108 #endif
109 // for each refseq segment
110 while (refseq->GetStarts().current != refseq->GetStarts().end()) {
111 stack< CRef<CAlnMixSegment> > seg_stack;
112 // To prevent infinite loop, remember all segments already on the stack.
113 set< CRef<CAlnMixSegment> > seg_set;
114 seg_stack.push(refseq->GetStarts().current->second);
115 seg_set.insert(refseq->GetStarts().current->second);
116 #if _DEBUG
117 const TSeqPos& refseq_start = refseq->GetStarts().current->first;
118 #if _DEBUG && _ALNMGR_TRACE
119 cerr << " [row " << refseq->m_RowIdx
120 << " seq " << refseq->m_SeqIdx
121 << " start " << refseq_start
122 << " was pushed into stack\n";
123 #endif
124 #endif
125
126 while ( !seg_stack.empty() ) {
127
128 bool pop_seg = true;
129 // check the gapped segments on the left
130 ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
131 seg_stack.top()->m_StartIts) {
132
133 CAlnMixSeq * row = start_its_i->first;
134
135 if (row->GetStarts().current != start_its_i->second) {
136 #if _DEBUG
137 const TSeqPos& curr_row_start = row->GetStarts().current->first;
138 const TSeqPos& row_start = start_its_i->second->first;
139
140 if (row->m_PositiveStrand ?
141 curr_row_start >
142 row_start :
143 curr_row_start <
144 row_start) {
145 string errstr =
146 string("CAlnMixSegments::Build():")
147 + " Internal error: Integrity broken" +
148 " row=" + NStr::IntToString(row->m_RowIdx) +
149 " seq=" + NStr::IntToString(row->m_SeqIdx)
150 + " curr_row_start="
151 + NStr::IntToString(curr_row_start)
152 + " row_start=" +
153 NStr::IntToString(row_start)
154 + " refseq_start=" +
155 NStr::IntToString(refseq_start)
156 + " strand=" +
157 (row->m_PositiveStrand ? "plus" : "minus");
158 NCBI_THROW(CAlnException, eMergeFailure, errstr);
159 }
160 #endif
161 seg_stack.push(row->GetStarts().current->second);
162 // If the segment is already in the set, we have infinite loop
163 // and must terminate it.
164 if (!seg_set.insert(row->GetStarts().current->second).second) {
165 string errstr =
166 string("CAlnMixSegments::Build():")
167 + " Internal error: Infinite loop detected.";
168 NCBI_THROW(CAlnException, eMergeFailure, errstr);
169 }
170 #if _DEBUG && _ALNMGR_TRACE
171 cerr << " [row " << row->m_RowIdx
172 << " seq " << row->m_SeqIdx
173 << " start " << curr_row_start
174 << " (left of start " << row_start << ") "
175 << "was pushed into stack\n";
176 #endif
177 pop_seg = false;
178 break;
179 }
180 }
181
182 if (pop_seg) {
183
184 // inc/dec iterators for each row of the seg
185 ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
186 seg_stack.top()->m_StartIts) {
187 CAlnMixSeq * row = start_its_i->first;
188
189 #if _DEBUG
190 const TSeqPos& curr_row_start = row->GetStarts().current->first;
191 const TSeqPos& row_start = start_its_i->second->first;
192
193 if (row->m_PositiveStrand &&
194 curr_row_start >
195 row_start ||
196 !row->m_PositiveStrand &&
197 curr_row_start <
198 row_start) {
199 string errstr =
200 string("CAlnMixSegments::Build():")
201 + " Internal error: Integrity broken" +
202 " row=" + NStr::IntToString(row->m_RowIdx) +
203 " seq=" + NStr::IntToString(row->m_SeqIdx)
204 + " curr_row_start="
205 + NStr::IntToString(curr_row_start)
206 + " row_start=" +
207 NStr::IntToString(row_start)
208 + " refseq_start=" +
209 NStr::IntToString(refseq_start)
210 + " strand=" +
211 (row->m_PositiveStrand ? "plus" : "minus");
212 NCBI_THROW(CAlnException, eMergeFailure, errstr);
213 }
214 #endif
215
216 if (row->m_PositiveStrand) {
217 row->SetStarts().current++;
218 } else {
219 if (row->SetStarts().current == row->GetStarts().begin()) {
220 row->SetStarts().current = row->GetStarts().end();
221 } else {
222 row->SetStarts().current--;
223 }
224 }
225 }
226
227 if (seg_stack.size() > 1) {
228 // add to the gapped segments
229 gapped_segs.push_back(seg_stack.top());
230 seg_stack.pop();
231 #if _DEBUG && _ALNMGR_TRACE
232 cerr << " seg popped].\n";
233 #endif
234 } else {
235 // add the gapped segments if any
236 if (gapped_segs.size()) {
237 if (gap_join) {
238 // request to try to align
239 // gapped segments w/ equal len
240 x_ConsolidateGaps(gapped_segs);
241 } else if (min_gap) {
242 // request to try to align
243 // all gapped segments
244 x_MinimizeGaps(gapped_segs);
245 }
246 if (orig_refseq) {
247 NON_CONST_ITERATE (TSegmentsContainer,
248 seg_i, gapped_segs) {
249 m_Segments.push_back(&**seg_i);
250 }
251 gapped_segs.clear();
252 }
253 }
254 // add the refseq segment
255 if (orig_refseq) {
256 m_Segments.push_back(seg_stack.top());
257 } else {
258 gapped_segs.push_back(seg_stack.top());
259 }
260 seg_stack.pop();
261 #if _DEBUG && _ALNMGR_TRACE
262 cerr << " refseq seg popped].\n";
263 #endif
264 } // if (seg_stack.size() > 1)
265 } // if (popseg)
266 } // while ( !seg_stack.empty() )
267 } // while (refseq->GetStarts().current != refseq->GetStarts().end())
268 orig_refseq = false;
269 } // while (true)
270
271
272 if (remove_leading_and_trailing_gaps) {
273 while (m_Segments.size() && m_Segments.front()->m_StartIts.size() < 2) {
274 m_Segments.pop_front();
275 }
276 while (m_Segments.size() && m_Segments.back()->m_StartIts.size() < 2) {
277 m_Segments.pop_back();
278 }
279 }
280
281 }
282
283
284
285 void
FillUnalignedRegions()286 CAlnMixSegments::FillUnalignedRegions()
287 {
288 vector<TSignedSeqPos> starts;
289 vector<TSeqPos> lens;
290 starts.resize(m_Rows.size(), -1);
291 lens.resize(m_Rows.size(), 0);
292
293 TSeqPos len = 0;
294 CAlnMap::TNumrow rowidx;
295
296 TSegments::iterator seg_i = m_Segments.begin();
297 while (seg_i != m_Segments.end()) {
298 len = (*seg_i)->m_Len;
299 ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
300 (*seg_i)->m_StartIts) {
301 CAlnMixSeq * row = start_its_i->first;
302 rowidx = row->m_RowIdx;
303 TSignedSeqPos& prev_start = starts[rowidx];
304 TSeqPos& prev_len = lens[rowidx];
305 TSeqPos start = start_its_i->second->first;
306 const bool plus = row->m_PositiveStrand;
307 const int& width = row->m_Width;
308 TSeqPos prev_start_plus_len = prev_start + prev_len * width;
309 TSeqPos start_plus_len = start + len * width;
310 if (prev_start >= 0) {
311 if (plus && prev_start_plus_len < start ||
312 !plus && start_plus_len < (TSeqPos) prev_start) {
313 // create a new seg
314 CRef<CAlnMixSegment> seg (new CAlnMixSegment);
315 TSeqPos new_start;
316 if (row->m_PositiveStrand) {
317 new_start = prev_start + prev_len * width;
318 seg->m_Len = (start - new_start) / width;
319 } else {
320 new_start = start_plus_len;
321 seg->m_Len = (prev_start - new_start) / width;
322 }
323 row->SetStarts()[new_start] = seg;
324 CAlnMixStarts::iterator start_i =
325 start_its_i->second;
326 seg->SetStartIterator(row,
327 row->m_PositiveStrand ?
328 --start_i :
329 ++start_i);
330
331 seg_i = m_Segments.insert(seg_i, seg);
332 seg_i++;
333 }
334 }
335 prev_start = start;
336 prev_len = len;
337 }
338 seg_i++;
339 }
340 }
341
342
343 void
x_ConsolidateGaps(TSegmentsContainer & gapped_segs)344 CAlnMixSegments::x_ConsolidateGaps(TSegmentsContainer& gapped_segs)
345 {
346 TSegmentsContainer::iterator seg1_i, seg2_i;
347
348 seg2_i = seg1_i = gapped_segs.begin();
349 if (seg2_i != gapped_segs.end()) {
350 seg2_i++;
351 }
352
353 bool cache = false;
354 string s1;
355 int score1;
356 CAlnMixSeq * seq1;
357 CAlnMixSeq * seq2;
358
359 while (seg2_i != gapped_segs.end()) {
360
361 CAlnMixSegment * seg1 = *seg1_i;
362 CAlnMixSegment * seg2 = *seg2_i;
363
364 // check if this seg possibly aligns with the previous one
365 bool possible = true;
366
367 if (seg2->m_Len == seg1->m_Len &&
368 seg2->m_StartIts.size() == 1) {
369
370 seq2 = seg2->m_StartIts.begin()->first;
371
372 // check if this seq was already used
373 ITERATE (CAlnMixSegment::TStartIterators,
374 st_it,
375 (*seg1_i)->m_StartIts) {
376 if (st_it->first == seq2) {
377 possible = false;
378 break;
379 }
380 }
381
382 // check if score is sufficient
383 if (possible && x_CalculateScore) {
384 if (!cache) {
385
386 seq1 = seg1->m_StartIts.begin()->first;
387
388 seq2->GetSeqString(s1,
389 seg1->m_StartIts[seq1]->first,
390 seg1->m_Len * seq1->m_Width,
391 seq1->m_PositiveStrand);
392
393 score1 = x_CalculateScore(s1,
394 s1,
395 seq1->m_IsAA,
396 seq1->m_IsAA);
397 cache = true;
398 }
399
400 string s2;
401 seq2->GetSeqString(s2,
402 seg2->m_StartIts[seq2]->first,
403 seg2->m_Len * seq2->m_Width,
404 seq2->m_PositiveStrand);
405
406 int score2 =
407 x_CalculateScore(s1, s2, seq1->m_IsAA, seq2->m_IsAA);
408
409 if (score2 < 75 * score1 / 100) {
410 possible = false;
411 }
412 }
413
414 } else {
415 possible = false;
416 }
417
418 if (possible) {
419 // consolidate the ones so far
420
421 // add the new row
422 seg1->SetStartIterator(seq2, seg2->m_StartIts.begin()->second);
423
424 // point the row's start position to the beginning seg
425 seg2->m_StartIts.begin()->second->second = seg1;
426
427 seg2_i = gapped_segs.erase(seg2_i);
428 } else {
429 cache = false;
430 seg1_i++;
431 seg2_i++;
432 }
433 }
434 }
435
436
437 void
x_MinimizeGaps(TSegmentsContainer & gapped_segs)438 CAlnMixSegments::x_MinimizeGaps(TSegmentsContainer& gapped_segs)
439 {
440 TSegmentsContainer::iterator seg_i, seg_i_end, seg_i_begin;
441 CAlnMixSegment * seg1, * seg2;
442 CRef<CAlnMixSegment> seg;
443 CAlnMixSeq * seq;
444 TSegmentsContainer new_segs;
445
446 seg_i_begin = seg_i_end = seg_i = gapped_segs.begin();
447
448 typedef map<TSeqPos, CRef<CAlnMixSegment> > TLenMap;
449 TLenMap len_map;
450
451 while (seg_i_end != gapped_segs.end()) {
452
453 len_map[(*seg_i_end)->m_Len];
454
455 // check if this seg can possibly be minimized
456 bool possible = true;
457
458 seg_i = seg_i_begin;
459 while (seg_i != seg_i_end) {
460 seg1 = *seg_i;
461 seg2 = *seg_i_end;
462
463 ITERATE (CAlnMixSegment::TStartIterators,
464 st_it,
465 seg2->m_StartIts) {
466 seq = st_it->first;
467 // check if this seq was already used
468 if (seg1->m_StartIts.find(seq) != seg1->m_StartIts.end()) {
469 possible = false;
470 break;
471 }
472 }
473 if ( !possible ) {
474 break;
475 }
476 seg_i++;
477 }
478 seg_i_end++;
479
480 if ( !possible || seg_i_end == gapped_segs.end()) {
481 // use the accumulated len info to create the new segs
482
483 // create new segs with appropriate len
484 TSeqPos len_so_far = 0;
485 TLenMap::iterator len_i = len_map.begin();
486 while (len_i != len_map.end()) {
487 len_i->second = new CAlnMixSegment();
488 len_i->second->m_Len = len_i->first - len_so_far;
489 len_so_far += len_i->second->m_Len;
490 len_i++;
491 }
492
493 // loop through the accumulated orig segs.
494 // copy info from them into the new segs
495 TLenMap::iterator len_i_end;
496 seg_i = seg_i_begin;
497 while (seg_i != seg_i_end) {
498 TSeqPos orig_len = (*seg_i)->m_Len;
499
500 // determine the span of the current seg
501 len_i_end = len_map.find(orig_len);
502 len_i_end++;
503
504 // loop through its sequences
505 NON_CONST_ITERATE (CAlnMixSegment::TStartIterators,
506 st_it,
507 (*seg_i)->m_StartIts) {
508
509 seq = st_it->first;
510 TSeqPos orig_start = st_it->second->first;
511
512 len_i = len_map.begin();
513 len_so_far = 0;
514 // loop through the new segs
515 while (len_i != len_i_end) {
516 seg = len_i->second;
517
518 // calc the start
519 TSeqPos this_start = orig_start +
520 (seq->m_PositiveStrand ?
521 len_so_far :
522 orig_len - len_so_far - seg->m_Len) *
523 seq->m_Width;
524
525 // create the bindings:
526 seq->SetStarts()[this_start] = seg;
527 seg->SetStartIterator(seq, seq->SetStarts().find(this_start));
528 len_i++;
529 len_so_far += seg->m_Len;
530 }
531 }
532 seg_i++;
533 }
534 NON_CONST_ITERATE (TLenMap, len_it, len_map) {
535 new_segs.push_back(len_it->second);
536 }
537 len_map.clear();
538 seg_i_begin = seg_i_end;
539 }
540 }
541 gapped_segs.clear();
542 ITERATE (TSegmentsContainer, new_seg_i, new_segs) {
543 gapped_segs.push_back(*new_seg_i);
544 }
545 }
546
547
548 void
StartItsConsistencyCheck(const CAlnMixSeq & seq,const TSeqPos & start,size_t match_idx) const549 CAlnMixSegment::StartItsConsistencyCheck(const CAlnMixSeq& seq,
550 const TSeqPos& start,
551 size_t match_idx) const
552 {
553 ITERATE(TStartIterators, st_it_i, m_StartIts) {
554 // both should point to the same seg
555 if ((*st_it_i).second->second != this) {
556 string errstr =
557 string("CAlnMixSegment::StartItsConsistencyCheck")
558 + " [match_idx=" + NStr::NumericToString(match_idx) + "]"
559 + " The internal consistency check failed for"
560 + " the segment containing ["
561 + " row=" + NStr::NumericToString((*st_it_i).first->m_RowIdx)
562 + " seq=" + NStr::NumericToString((*st_it_i).first->m_SeqIdx)
563 + " strand=" +
564 ((*st_it_i).first->m_PositiveStrand ? "plus" : "minus")
565 + " start=" + NStr::NumericToString((*st_it_i).second->first)
566 + "] aligned to: ["
567 + " row=" + NStr::NumericToString(seq.m_RowIdx)
568 + " seq=" + NStr::NumericToString(seq.m_SeqIdx)
569 + " strand=" +
570 (seq.m_PositiveStrand ? "plus" : "minus")
571 + " start=" + NStr::NumericToString(start)
572 + "].";
573 NCBI_THROW(CAlnException, eMergeFailure, errstr);
574 }
575 }
576 }
577
578
579 END_objects_SCOPE // namespace ncbi::objects::
580 END_NCBI_SCOPE
581