1 /* $Id: alnmerger.cpp 601341 2020-02-05 20:27:33Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Alignment merger
30 *
31 * ===========================================================================
32 */
33
34
35 #include <ncbi_pch.hpp>
36 #include <objtools/alnmgr/alnmerger.hpp>
37 #include <objtools/alnmgr/alnseq.hpp>
38 #include <objtools/alnmgr/alnmatch.hpp>
39 #include <objtools/alnmgr/alnmap.hpp>
40
41 #include <algorithm>
42
43
44 BEGIN_NCBI_SCOPE
45 BEGIN_objects_SCOPE // namespace ncbi::objects::
46
47
CAlnMixMerger(CRef<CAlnMixMatches> & aln_mix_matches,TCalcScoreMethod calc_score)48 CAlnMixMerger::CAlnMixMerger(CRef<CAlnMixMatches>& aln_mix_matches,
49 TCalcScoreMethod calc_score)
50 : m_DsCnt(aln_mix_matches->m_DsCnt),
51 m_AlnMixMatches(aln_mix_matches),
52 m_Matches(aln_mix_matches->m_Matches),
53 m_AlnMixSequences(aln_mix_matches->m_AlnMixSequences),
54 m_Seqs(aln_mix_matches->m_Seqs),
55 m_Rows(m_AlnMixSequences->m_Rows),
56 m_ExtraRows(m_AlnMixSequences->m_ExtraRows),
57 m_AlnMixSegments(new CAlnMixSegments(m_AlnMixSequences)),
58 m_SingleRefseq(false),
59 x_CalculateScore(calc_score)
60 {
61 }
62
63
64 void
Reset()65 CAlnMixMerger::Reset()
66 {
67 m_SingleRefseq = false;
68 if (m_DS) {
69 m_DS.Reset();
70 }
71 if (m_Aln) {
72 m_Aln.Reset();
73 }
74 m_AlnMixSegments->m_Segments.clear();
75 m_Rows.clear();
76 m_ExtraRows.clear();
77 NON_CONST_ITERATE (TSeqs, seq_i, m_Seqs) {
78 (*seq_i)->SetStarts().clear();
79 (*seq_i)->m_ExtraRow = 0;
80 }
81 }
82
83
84 void
Merge(TMergeFlags flags)85 CAlnMixMerger::Merge(TMergeFlags flags)
86 {
87 if ( !m_DsCnt ) {
88 NCBI_THROW(CAlnException, eMergeFailure,
89 "CAlnMixMerger::Merge(): "
90 "No alignments were added for merging.");
91 }
92 if ( !m_DS || m_MergeFlags != flags) {
93 Reset();
94 m_MergeFlags = flags;
95 x_Merge();
96 }
97 }
98
99
100 void
x_Merge()101 CAlnMixMerger::x_Merge()
102 {
103 bool first_refseq = true; // mark the first loop
104
105 // Find the refseq (if such exists)
106 {{
107 m_SingleRefseq = false;
108 m_IndependentDSs = m_DsCnt > 1;
109
110 unsigned int ds_cnt;
111 NON_CONST_ITERATE (TSeqs, it, m_Seqs){
112 ds_cnt = (*it)->m_DsCnt;
113 if (ds_cnt > 1) {
114 m_IndependentDSs = false;
115 }
116 if (ds_cnt == m_DsCnt) {
117 m_SingleRefseq = true;
118 if ( !first_refseq ) {
119 CRef<CAlnMixSeq> refseq = *it;
120 m_Seqs.erase(it);
121 m_Seqs.insert(m_Seqs.begin(), refseq);
122 }
123 break;
124 }
125 first_refseq = false;
126 }
127 }}
128
129 // Index the sequences
130 {{
131 int seq_idx=0;
132 NON_CONST_ITERATE (TSeqs, seq_i, m_Seqs) {
133 (*seq_i)->m_SeqIdx = seq_idx++;
134 }
135 }}
136
137
138 // Set the widths if the mix contains both AA & NA
139 // or in case we force translation
140 if ((m_AlnMixMatches->m_ContainsNA && m_AlnMixMatches->m_ContainsAA) ||
141 (m_AlnMixMatches->m_AddFlags & CAlnMixMatches::fForceTranslation)) {
142 NON_CONST_ITERATE (TSeqs, seq_i, m_Seqs) {
143 (*seq_i)->m_Width = (*seq_i)->m_IsAA ? 1 : 3;
144 }
145 }
146
147
148
149 CAlnMixSeq * refseq = 0, * seq1 = 0, * seq2 = 0;
150 TSeqPos start, start1, start2, len, curr_len;
151 int width1 = 0, width2 = 0;
152 CAlnMixMatch * match;
153 CAlnMixSeq::TMatchList::iterator match_list_iter1, match_list_iter2;
154 CAlnMixSeq::TMatchList::iterator match_list_i;
155 TSecondRowFits second_row_fits;
156
157 refseq = *(m_Seqs.begin());
158 TMatches::iterator match_i = m_Matches.begin();
159 m_MatchIdx = 0;
160
161 CRef<CAlnMixSegment> seg;
162 CAlnMixStarts::iterator start_i, lo_start_i, hi_start_i, tmp_start_i;
163
164 first_refseq = true; // mark the first loop
165
166 x_SetTaskTotal(m_Matches.size());
167
168 while (true) {
169
170 // need to cancel?
171 if (x_InterruptTask()) {
172 Reset();
173 return;
174 }
175
176 // reached the end?
177 if (first_refseq ?
178 match_i == m_Matches.end() && m_Matches.size() :
179 refseq->m_MatchList.empty() ||
180 match_list_i == refseq->m_MatchList.end()) {
181
182 // move on to the next refseq
183 refseq->m_RefBy = 0;
184
185 // try to find the best scoring 'connected' candidate
186 NON_CONST_ITERATE (TSeqs, it, m_Seqs){
187 if ( !((*it)->m_MatchList.empty()) &&
188 (*it)->m_RefBy == refseq) {
189 if (refseq == *it) {
190 NCBI_THROW(CAlnException, eMergeFailure,
191 "CAlnMixMerger::x_Merge(): "
192 "Infinitue loop detected "
193 "while searching for a connected candidate.");
194 }
195 refseq = *it;
196 break;
197 }
198 }
199 if (refseq->m_RefBy == 0) {
200 // no candidate was found 'connected' to the refseq
201 // continue with the highest scoring candidate
202 NON_CONST_ITERATE (TSeqs, it, m_Seqs){
203 if ( !((*it)->m_MatchList.empty()) ) {
204 if (refseq == *it) {
205 NCBI_THROW(CAlnException, eMergeFailure,
206 "CAlnMixMerger::x_Merge(): "
207 "Infinitue loop detected "
208 "while searching for a new candidate.");
209 }
210 refseq = *it;
211 break;
212 }
213 }
214 }
215
216 if (refseq->m_MatchList.empty()) {
217 break; // done
218 } else {
219 first_refseq = false;
220 match_list_i = refseq->m_MatchList.begin();
221 }
222 continue;
223 } else {
224 // iterate
225 if (first_refseq) {
226 match = *match_i;
227 ++match_i;
228 } else {
229 match = *match_list_i;
230 ++match_list_i;
231 if (refseq == match->m_AlnSeq2 && refseq == match->m_AlnSeq1) {
232 // This is the rare case of a match b/n a seq and
233 // itself. We need one more incrementation since
234 // the match would be recorded twice
235 _ASSERT(refseq->m_MatchList.size() >= 2);
236 ++match_list_i;
237 }
238 }
239 }
240
241 curr_len = len = match->m_Len;
242
243 // is it a match with this refseq?
244 if (match->m_AlnSeq1 == refseq) {
245 seq1 = match->m_AlnSeq1;
246 start1 = match->m_Start1;
247 _ASSERT(seq1);
248 match_list_iter1 = match->m_MatchIter1;
249 seq2 = match->m_AlnSeq2;
250 start2 = match->m_Start2;
251 if ( seq2 )
252 match_list_iter2 = match->m_MatchIter2;
253 } else if (match->m_AlnSeq2 == refseq) {
254 seq1 = match->m_AlnSeq2;
255 start1 = match->m_Start2;
256 _ASSERT(seq1);
257 match_list_iter1 = match->m_MatchIter2;
258 seq2 = match->m_AlnSeq1;
259 start2 = match->m_Start1;
260 if ( seq2 )
261 match_list_iter2 = match->m_MatchIter1;
262 } else {
263 seq1 = match->m_AlnSeq1;
264 seq2 = match->m_AlnSeq2;
265
266 // mark the two refseqs, they are candidates for next refseq(s)
267 _ASSERT(seq1);
268 match->m_MatchIter1 =
269 seq1->m_MatchList.insert(seq1->m_MatchList.end(), match);
270 if (seq2) {
271 match->m_MatchIter2 =
272 seq2->m_MatchList.insert(seq2->m_MatchList.end(), match);
273 }
274 _ASSERT(match->IsGood());
275
276 // mark that there's no match with this refseq
277 seq1 = 0;
278 }
279
280 // save the match info into the segments map
281 if (seq1) {
282 x_SetTaskCompleted(m_MatchIdx++);
283
284 // this match is used, erase from seq1 list
285 if ( !first_refseq ) {
286 if ( !seq1->m_MatchList.empty() ) {
287 seq1->m_MatchList.erase(match_list_iter1);
288 match_list_iter1 = seq1->m_MatchList.end();
289 }
290 }
291
292 // order the match
293 match->m_AlnSeq1 = seq1;
294 match->m_Start1 = start1;
295 match->m_AlnSeq2 = seq2;
296 match->m_Start2 = start2;
297 match->m_MatchIter1 = match_list_iter1;
298 if ( seq2 )
299 match->m_MatchIter2 = match_list_iter2;
300
301 if (m_MergeFlags & fTruncateOverlaps && seq2) {
302 CDiagRangeCollection::TAlnRng rng(match->m_Start1,
303 match->m_Start2,
304 match->m_Len,
305 !match->m_StrandsDiffer);
306 CDiagRangeCollection::TAlnRngColl substrahend, diff;
307 substrahend.insert(rng);
308 CDiagRangeCollection* plane;
309 TPlanes::iterator plane_it;
310 if ((plane_it = m_Planes.find(make_pair(seq1, seq2))) !=
311 m_Planes.end()) {
312 plane = &(plane_it->second);
313 } else {
314 CDiagRangeCollection new_plane(match->m_AlnSeq1->m_Width,
315 match->m_AlnSeq2->m_Width);
316 plane = &(m_Planes[make_pair(seq1, seq2)] = new_plane);
317 }
318 plane->Diff(substrahend, diff);
319
320 if (diff.empty()) {
321 continue;
322 }
323 rng = *diff.begin();
324 plane->insert(rng);
325
326 // reset the ones below,
327 // since match may have been modified
328 start1 = match->m_Start1 = rng.GetFirstFrom();
329 start2 = match->m_Start2 = rng.GetSecondFrom();
330 curr_len = len = match->m_Len = rng.GetLength();
331 }
332
333 width1 = seq1->m_Width;
334 if (seq2) {
335 width2 = seq2->m_Width;
336 }
337
338 // in case of translated refseq
339 if (width1 == 3) {
340 x_SetSeqFrame(match, match->m_AlnSeq1);
341 {{
342 // reset the ones below,
343 // since match may have been modified
344 seq1 = match->m_AlnSeq1;
345 start1 = match->m_Start1;
346 match_list_iter1 = match->m_MatchIter1;
347 seq2 = match->m_AlnSeq2;
348 start2 = match->m_Start2;
349 if ( seq2 )
350 match_list_iter2 = match->m_MatchIter2;
351 curr_len = len = match->m_Len;
352 }}
353 }
354
355 // if a subject sequence place it in the proper row
356 if ( !first_refseq && m_MergeFlags & fQuerySeqMergeOnly) {
357 bool proper_row_found = false;
358 while (true) {
359 if (seq1->m_DsIdx == match->m_DsIdx) {
360 proper_row_found = true;
361 break;
362 } else {
363 if (seq1->m_ExtraRow) {
364 seq1 = match->m_AlnSeq1 = seq1->m_ExtraRow;
365 } else {
366 break;
367 }
368 }
369 }
370 if ( !proper_row_found &&
371 !(m_MergeFlags & fTruncateOverlaps)) {
372 NCBI_THROW(CAlnException, eMergeFailure,
373 "CAlnMixMerger::x_Merge(): "
374 "Proper row not found for the match. "
375 "Cannot use fQuerySeqMergeOnly?");
376 }
377 }
378
379 CAlnMixStarts& starts = seq1->SetStarts();
380 if (seq2) {
381 // mark it, it is linked to the refseq
382 seq2->m_RefBy = refseq;
383
384 // this match is used erase from seq2 list
385 if ( !first_refseq ) {
386 if ( !seq2->m_MatchList.empty() ) {
387 seq2->m_MatchList.erase(match_list_iter2);
388 match_list_iter2 = seq2->m_MatchList.end();
389 match->m_MatchIter2 = match_list_iter2;
390 }
391 }
392 }
393
394 start_i = starts.end();
395 lo_start_i = starts.end();
396 hi_start_i = starts.end();
397
398
399 // create a copy of the match,
400 // which we could work with temporarily
401 // without modifying the original
402 CAlnMixMatch tmp_match = *match;
403 match = &tmp_match; // point to the new tmp_match
404
405
406 if (seq2) {
407 if (width2 == 3) {
408 // Set the frame if necessary
409 x_SetSeqFrame(match, match->m_AlnSeq2);
410 }
411 // check if the second row fits
412 // this will truncate the match if
413 // there's an inconsistent overlap
414 // and truncation was requested
415 second_row_fits = x_SecondRowFits(match);
416 if (second_row_fits == eIgnoreMatch) {
417 continue;
418 }
419 {{
420 // reset the ones below,
421 // since match may have been modified
422 seq1 = match->m_AlnSeq1;
423 start1 = match->m_Start1;
424 match_list_iter1 = match->m_MatchIter1;
425 seq2 = match->m_AlnSeq2;
426 start2 = match->m_Start2;
427 if ( seq2 )
428 match_list_iter2 = match->m_MatchIter2;
429 curr_len = len = match->m_Len;
430 }}
431 while (second_row_fits == eTranslocation) {
432 if (!seq2->m_ExtraRow) {
433 // create an extra row
434 CRef<CAlnMixSeq> row (new CAlnMixSeq);
435 row->m_BioseqHandle = seq2->m_BioseqHandle;
436 row->m_SeqId = seq2->m_SeqId;
437 row->m_Width = seq2->m_Width;
438 row->m_Frame = start2 % 3;
439 row->m_SeqIdx = seq2->m_SeqIdx;
440 row->m_ChildIdx = seq2->m_ChildIdx + 1;
441 if (m_MergeFlags & fQuerySeqMergeOnly) {
442 row->m_DsIdx = match->m_DsIdx;
443 }
444 m_ExtraRows.push_back(row);
445 row->m_ExtraRowIdx = seq2->m_ExtraRowIdx + 1;
446 seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow = row;
447 break;
448 }
449 seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow;
450
451 second_row_fits = x_SecondRowFits(match);
452 {{
453 // reset the ones below,
454 // since match may have been modified
455 seq1 = match->m_AlnSeq1;
456 start1 = match->m_Start1;
457 match_list_iter1 = match->m_MatchIter1;
458 seq2 = match->m_AlnSeq2;
459 start2 = match->m_Start2;
460 if ( seq2 )
461 match_list_iter2 = match->m_MatchIter2;
462 curr_len = len = match->m_Len;
463 }}
464 }
465 }
466
467
468 if (!starts.size()) {
469 // no starts exist yet
470
471 if ( !m_IndependentDSs ) {
472 // Stop the algorithm after merging with respect to the first refseq.
473 // Clear all MatchLists and exit
474 if ( !(m_SingleRefseq || first_refseq) ) {
475 NON_CONST_ITERATE (TSeqs, it, m_Seqs){
476 if ( !((*it)->m_MatchList.empty()) ) {
477 (*it)->m_MatchList.clear();
478 }
479 }
480 break;
481 }
482 }
483
484 // this seq has not yet been used, set the strand
485 if (m_AlnMixMatches->m_AddFlags & CAlnMixMatches::fForceTranslation) {
486 seq1->m_PositiveStrand = (seq1->m_StrandScore >= 0);
487 } else {
488 seq1->m_PositiveStrand = ! (m_MergeFlags & fNegativeStrand);
489 }
490
491 //create the first one
492 seg = new CAlnMixSegment();
493 seg->m_Len = len;
494 seg->m_DsIdx = match->m_DsIdx;
495 starts[start1] = seg;
496 seg->SetStartIterator
497 (seq1, lo_start_i = hi_start_i = starts.begin());
498
499 if (m_MergeFlags & fQuerySeqMergeOnly) {
500 seq2->m_DsIdx = match->m_DsIdx;
501 }
502 // DONE!
503 } else {
504 // some starts exist already
505
506 // look ahead
507 if ((lo_start_i = start_i = starts.lower_bound(start1))
508 == starts.end() ||
509 start1 < start_i->first) {
510 // the start position does not exist
511 if (lo_start_i != starts.begin()) {
512 --lo_start_i;
513 }
514 }
515
516 // look back
517 if (hi_start_i == starts.end() && start_i != lo_start_i) {
518 CAlnMixSegment * prev_seg = lo_start_i->second;
519 if (lo_start_i->first + prev_seg->m_Len * width1 >
520 start1) {
521 // x----.. becomes x-x--..
522 // x--..
523
524 TSeqPos len1 = (start1 - lo_start_i->first) / width1;
525 TSeqPos len2 = prev_seg->m_Len - len1;
526
527 // create the second seg
528 seg = new CAlnMixSegment();
529 seg->m_Len = len2;
530 seg->m_DsIdx = match->m_DsIdx;
531 starts[start1] = seg;
532
533 // create rows info
534 ITERATE (CAlnMixSegment::TStartIterators, it,
535 prev_seg->m_StartIts) {
536 CAlnMixSeq * seq = it->first;
537 tmp_start_i = it->second;
538 if (seq->m_PositiveStrand ==
539 seq1->m_PositiveStrand) {
540 seq->SetStarts()
541 [tmp_start_i->first + len1 * seq->m_Width]
542 = seg;
543 if (tmp_start_i != seq->SetStarts().end()) {
544 seg->SetStartIterator(seq, ++tmp_start_i);
545 } else {
546 NCBI_THROW(CAlnException, eMergeFailure,
547 "CAlnMixMerger::x_Merge(): "
548 "Internal error: tmp_start_i == seq->GetStarts().end()");
549 }
550 } else {
551 seq->SetStarts()
552 [tmp_start_i->first + len2 * seq->m_Width]
553 = prev_seg;
554 seq->SetStarts()[tmp_start_i->first] = seg;
555 seg->SetStartIterator(seq, tmp_start_i);
556 if (tmp_start_i != seq->SetStarts().end()) {
557 prev_seg->SetStartIterator(seq, ++tmp_start_i);
558 } else {
559 NCBI_THROW(CAlnException, eMergeFailure,
560 "CAlnMixMerger::x_Merge(): "
561 "Internal error: tmp_start_i == seq->GetStarts().end()");
562 }
563 }
564 }
565
566 // truncate the first seg
567 prev_seg->m_Len = len1;
568
569 if (start_i != starts.begin()) {
570 start_i--; // point to the newly created start
571 }
572 }
573 if (lo_start_i != starts.end()) {
574 lo_start_i++;
575 }
576 }
577 }
578
579 // loop through overlapping segments
580 start = start1;
581 while (hi_start_i == starts.end()) {
582 if (start_i != starts.end() && start_i->first == start) {
583 CAlnMixSegment * prev_seg = start_i->second;
584 if (prev_seg->m_Len > curr_len) {
585 // x-------) becomes x----)x--)
586 // x----)
587
588 // create the second seg
589 seg = new CAlnMixSegment();
590 TSeqPos len1 =
591 seg->m_Len = prev_seg->m_Len - curr_len;
592 start += curr_len * width1;
593
594 // truncate the first seg
595 prev_seg->m_Len = curr_len;
596
597 // create rows info
598 ITERATE (CAlnMixSegment::TStartIterators, it,
599 prev_seg->m_StartIts) {
600 CAlnMixSeq * seq = it->first;
601 tmp_start_i = it->second;
602 if (seq->m_PositiveStrand ==
603 seq1->m_PositiveStrand) {
604 seq->SetStarts()[tmp_start_i->first +
605 curr_len * seq->m_Width]
606 = seg;
607 if (tmp_start_i != seq->SetStarts().end()) {
608 seg->SetStartIterator(seq, ++tmp_start_i);
609 } else {
610 NCBI_THROW(CAlnException, eMergeFailure,
611 "CAlnMixMerger::x_Merge(): "
612 "Internal error: tmp_start_i == seq->GetStarts().end()");
613 }
614 } else{
615 seq->SetStarts()[tmp_start_i->first +
616 len1 * seq->m_Width]
617 = prev_seg;
618 seq->SetStarts()[tmp_start_i->first] = seg;
619 seg->SetStartIterator(seq, tmp_start_i);
620 if (tmp_start_i != seq->SetStarts().end()) {
621 prev_seg->SetStartIterator(seq, ++tmp_start_i);
622 } else {
623 NCBI_THROW(CAlnException, eMergeFailure,
624 "CAlnMixMerger::x_Merge(): "
625 "Internal error: tmp_start_i == seq->GetStarts().end()");
626 }
627 }
628 }
629 #if _DEBUG && _ALNMGR_DEBUG
630 seg->StartItsConsistencyCheck(*seq1,
631 start,
632 m_MatchIdx);
633 prev_seg->StartItsConsistencyCheck(*seq1,
634 start,
635 m_MatchIdx);
636 #endif
637
638
639 hi_start_i = start_i; // DONE!
640 } else if (curr_len == prev_seg->m_Len) {
641 // x----)
642 // x----)
643 hi_start_i = start_i; // DONE!
644 } else {
645 // x----) becomes x----)x--)
646 // x-------)
647 start += prev_seg->m_Len * width1;
648 curr_len -= prev_seg->m_Len;
649 if (start_i != starts.end()) {
650 start_i++;
651 }
652 }
653 } else {
654 seg = new CAlnMixSegment();
655 starts[start] = seg;
656 tmp_start_i = start_i;
657 if (tmp_start_i != starts.begin()) {
658 tmp_start_i--;
659 }
660 seg->SetStartIterator(seq1, tmp_start_i);
661 if (start_i != starts.end() &&
662 start + curr_len * width1 > start_i->first) {
663 // x--..
664 // x--------..
665 seg->m_Len = (start_i->first - start) / width1;
666 seg->m_DsIdx = match->m_DsIdx;
667 } else {
668 // x-----)
669 // x---)
670 seg->m_Len = curr_len;
671 seg->m_DsIdx = match->m_DsIdx;
672 hi_start_i = start_i;
673 if (hi_start_i != starts.begin()) {
674 hi_start_i--; // DONE!
675 }
676 }
677 start += seg->m_Len * width1;
678 curr_len -= seg->m_Len;
679 if (lo_start_i == start_i) {
680 if (lo_start_i != starts.begin()) {
681 lo_start_i--;
682 }
683 }
684 }
685 }
686
687 // try to resolve the second row
688 if (seq2) {
689
690 while (second_row_fits != eSecondRowFitsOk &&
691 second_row_fits != eIgnoreMatch) {
692 if (!seq2->m_ExtraRow) {
693 // create an extra row
694 CRef<CAlnMixSeq> row (new CAlnMixSeq);
695 row->m_BioseqHandle = seq2->m_BioseqHandle;
696 row->m_SeqId = seq2->m_SeqId;
697 row->m_Width = seq2->m_Width;
698 row->m_Frame = start2 % 3;
699 row->m_SeqIdx = seq2->m_SeqIdx;
700 row->m_ChildIdx = seq2->m_ChildIdx + 1;
701 if (m_MergeFlags & fQuerySeqMergeOnly) {
702 row->m_DsIdx = match->m_DsIdx;
703 }
704 m_ExtraRows.push_back(row);
705 row->m_ExtraRowIdx = seq2->m_ExtraRowIdx + 1;
706 seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow = row;
707 break;
708 }
709 seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow;
710
711 second_row_fits = x_SecondRowFits(match);
712 {{
713 // reset the ones below,
714 // since match may have been modified
715 seq1 = match->m_AlnSeq1;
716 start1 = match->m_Start1;
717 match_list_iter1 = match->m_MatchIter1;
718 seq2 = match->m_AlnSeq2;
719 start2 = match->m_Start2;
720 if ( seq2 )
721 match_list_iter2 = match->m_MatchIter2;
722 curr_len = len = match->m_Len;
723 }}
724 }
725 if (second_row_fits == eIgnoreMatch) {
726 continue;
727 }
728
729 if (m_MergeFlags & fTruncateOverlaps) {
730 // we need to reset these shtorcuts
731 // in case the match was truncated
732 start1 = match->m_Start1;
733 start2 = match->m_Start2;
734 len = match->m_Len;
735 }
736
737 // set the strand if first time
738 if (seq2->GetStarts().empty()) {
739 seq2->m_PositiveStrand =
740 (seq1->m_PositiveStrand ?
741 !match->m_StrandsDiffer :
742 match->m_StrandsDiffer);
743 }
744
745 // create row info
746 CAlnMixStarts& starts2 = match->m_AlnSeq2->SetStarts();
747 start = start2;
748 CAlnMixStarts::iterator start2_i
749 = starts2.lower_bound(start2);
750 start_i = match->m_StrandsDiffer ? hi_start_i : lo_start_i;
751
752 while(start < start2 + len * width2) {
753 if (start2_i != starts2.end() &&
754 start2_i->first == start) {
755 // this position already exists
756
757 if (start2_i->second != start_i->second) {
758 // merge the two segments
759
760 // store the seg in a CRef to delay its deletion
761 // until after the iteration on it is finished
762 CRef<CAlnMixSegment> tmp_seg(start2_i->second);
763
764 ITERATE (CAlnMixSegment::TStartIterators,
765 it,
766 tmp_seg->m_StartIts) {
767 CAlnMixSeq * tmp_seq = it->first;
768 tmp_start_i = it->second;
769 tmp_start_i->second = start_i->second;
770 start_i->second->SetStartIterator(tmp_seq, tmp_start_i);
771 }
772 #if _DEBUG && _ALNMGR_DEBUG
773 start_i->second->StartItsConsistencyCheck(*seq2,
774 start,
775 m_MatchIdx);
776 #endif
777 }
778 } else {
779 // this position does not exist, create it
780 seq2->SetStarts()[start] = start_i->second;
781
782 // start2_i != starts.begin() because we just
783 // made an insertion, so decrement is ok
784 start2_i--;
785
786 // point this segment's row start iterator
787 if (start_i->second->m_StartIts.find(seq2) !=
788 start_i->second->m_StartIts.end()) {
789 // consistency check fails
790 NCBI_THROW(CAlnException, eMergeFailure,
791 "CAlnMixMerger::x_Merge(): "
792 "Internal error: "
793 "Start iterator already exists for seq2.");
794 }
795 start_i->second->SetStartIterator(seq2, start2_i);
796 #if _DEBUG && _ALNMGR_DEBUG
797 start_i->second->StartItsConsistencyCheck(*seq2,
798 start,
799 m_MatchIdx);
800 #endif
801 }
802
803 // increment values
804 start += start_i->second->m_Len * width2;
805 if (start2_i != starts2.end()) {
806 start2_i++;
807 }
808 if (match->m_StrandsDiffer) {
809 if (start_i != starts.begin()) {
810 start_i--;
811 }
812 } else {
813 if (start_i != starts.end()) {
814 start_i++;
815 }
816 }
817 }
818 }
819 }
820 }
821 x_SetTaskCompleted(m_MatchIdx);
822
823 m_AlnMixSequences->BuildRows();
824 m_AlnMixSegments->Build(m_MergeFlags & fGapJoin,
825 m_MergeFlags & fMinGap,
826 m_MergeFlags & fRemoveLeadTrailGaps);
827 if (m_MergeFlags & fFillUnalignedRegions) {
828 m_AlnMixSegments->FillUnalignedRegions();
829 }
830 x_CreateDenseg();
831 }
832
833
834 CAlnMixMerger::TSecondRowFits
x_SecondRowFits(CAlnMixMatch * match) const835 CAlnMixMerger::x_SecondRowFits(CAlnMixMatch * match) const
836 {
837 CAlnMixSeq*& seq1 = match->m_AlnSeq1;
838 CAlnMixSeq*& seq2 = match->m_AlnSeq2;
839 TSeqPos& start1 = match->m_Start1;
840 TSeqPos& start2 = match->m_Start2;
841 TSeqPos& len = match->m_Len;
842 const int& width1 = seq1->m_Width;
843 const int& width2 = seq2->m_Width;
844 CAlnMixStarts::const_iterator starts2_i;
845 TSignedSeqPos delta, delta1, delta2;
846 TSecondRowFits result = eSecondRowFitsOk;
847
848 // subject sequences go on separate rows if requested
849 if (m_MergeFlags & fQuerySeqMergeOnly) {
850 if (seq2->m_DsIdx) {
851 if ( !(m_MergeFlags & fTruncateOverlaps) ) {
852 if (seq2->m_DsIdx == match->m_DsIdx) {
853 return result;
854 } else {
855 return eForceSeparateRow;
856 }
857 }
858 } else {
859 seq2->m_DsIdx = match->m_DsIdx;
860 if ( !(m_MergeFlags & fTruncateOverlaps) ) {
861 return result;
862 }
863 }
864 }
865
866 if ( !seq2->GetStarts().empty() ) {
867
868 // check strand
869 while (true) {
870 if (seq2->m_PositiveStrand !=
871 (seq1->m_PositiveStrand ?
872 !match->m_StrandsDiffer :
873 match->m_StrandsDiffer)) {
874 if (seq2->m_ExtraRow) {
875 seq2 = seq2->m_ExtraRow;
876 } else {
877 return eInconsistentStrand;
878 }
879 } else {
880 break;
881 }
882 }
883
884 // check frame
885 if (seq2->m_Width == 3 && seq2->m_Frame != (int)(start2 % 3)) {
886 return eInconsistentFrame;
887 }
888
889 starts2_i = seq2->GetStarts().lower_bound(start2);
890
891 // check below
892 if (starts2_i != seq2->GetStarts().begin()) {
893 starts2_i--;
894
895 // check for inconsistency on the first row
896 if ( !m_IndependentDSs ) {
897 CAlnMixSegment::TStartIterators::const_iterator seq1_start_it_i =
898 starts2_i->second->m_StartIts.find(seq1);
899 if (seq1_start_it_i != starts2_i->second->m_StartIts.end()) {
900 const TSeqPos& existing_start1 = seq1_start_it_i->second->first;
901 if (match->m_StrandsDiffer) {
902 delta = start1 + len * width1 - existing_start1;
903 if (delta > 0) {
904 if (m_MergeFlags & fTruncateOverlaps) {
905 delta /= width1;
906 if (delta < (TSignedSeqPos)len) {
907 // target above
908 // x----- x-)-)
909 // (----x (---x
910 // target below
911 len -= delta;
912 start2 += delta * width2;
913 } else if (delta > (TSignedSeqPos)len &&
914 m_MergeFlags & fAllowTranslocation) {
915 // Translocation
916 // below target above
917 // x---- x-)---
918 // (----x (------x
919 // target below
920 delta = (existing_start1 +
921 starts2_i->second->m_Len - start1) /
922 width1;
923 if (delta > 0) {
924 if (delta < (TSignedSeqPos)len) {
925 start1 += delta * width1;
926 len -= delta;
927 } else {
928 return eIgnoreMatch;
929 }
930 }
931 return eTranslocation;
932 } else {
933 return eIgnoreMatch;
934 }
935 } else {
936 return eFirstRowOverlapAbove;
937 }
938 }
939 } else {
940 delta = existing_start1
941 + starts2_i->second->m_Len * width1
942 - start1;
943 if (delta > 0) {
944 if (m_MergeFlags & fTruncateOverlaps) {
945 delta /= width1;
946 if (len > (TSeqPos)delta) {
947 // below target
948 // x---- x-)--)
949 // x---) x----)
950 // below target
951 len -= delta;
952 start1 += delta * width1;
953 start2 += delta * width2;
954 } else if (delta > (TSignedSeqPos)len &&
955 m_MergeFlags & fAllowTranslocation) {
956 // Translocation
957 // target above
958 // x--x-) ----)
959 // x---) x----)
960 // below target
961 delta = (existing_start1 - start1) / width1;
962 if (delta < (TSignedSeqPos)len) {
963 len = delta;
964 if ( !len ) {
965 return eIgnoreMatch;
966 }
967 }
968 return eTranslocation;
969 } else {
970 return eIgnoreMatch;
971 }
972 } else {
973 return eFirstRowOverlapBelow;
974 }
975 }
976 }
977 }
978 }
979
980 // check for overlap with the segment below on second row
981 delta = starts2_i->first + starts2_i->second->m_Len * width2
982 - start2;
983 if (delta > 0) {
984 // target
985 // ----- ------
986 // x---- x-)--)
987 // below target
988 if (m_MergeFlags & fTruncateOverlaps) {
989 delta /= width2;
990 if (len > (TSeqPos)delta) {
991 len -= delta;
992 start2 += delta * width2;
993 if ( !match->m_StrandsDiffer ) {
994 start1 += delta * width1;
995 }
996 } else {
997 return eIgnoreMatch;
998 }
999 } else {
1000 return eSecondRowOverlap;
1001 }
1002 }
1003 if (starts2_i != seq2->GetStarts().end()) {
1004 starts2_i++;
1005 }
1006 }
1007
1008 // check the overlapping area for consistency
1009 while (starts2_i != seq2->GetStarts().end() &&
1010 starts2_i->first < start2 + len * width2) {
1011 if ( !m_IndependentDSs ) {
1012 CAlnMixSegment::TStartIterators::const_iterator seq1_start_it_i =
1013 starts2_i->second->m_StartIts.find(seq1);
1014 if (seq1_start_it_i != starts2_i->second->m_StartIts.end()) {
1015 const TSeqPos& existing_start1 = seq1_start_it_i->second->first;
1016 if (match->m_StrandsDiffer) {
1017 // x---..- x---..--)
1018 // (---..- (--x..--x
1019 delta1 = (start1 - existing_start1) / width1 +
1020 len - starts2_i->second->m_Len;
1021 } else {
1022 // x--..- x---...-)
1023 // x--..- x---...-)
1024 delta1 = (existing_start1 - start1) / width1;
1025 }
1026 delta2 = (starts2_i->first - start2) / width2;
1027 if (delta1 != delta2) {
1028 if (m_MergeFlags & fTruncateOverlaps) {
1029 delta = (delta1 < delta2 ? delta1 : delta2);
1030 if (delta > 0) {
1031 if (match->m_StrandsDiffer) {
1032 start1 += (len - delta) * width1;
1033 }
1034 len = delta;
1035 } else if (m_MergeFlags & fAllowTranslocation) {
1036 return eTranslocation;
1037 } else {
1038 return eInconsistentOverlap;
1039 }
1040 } else {
1041 return eInconsistentOverlap;
1042 }
1043 }
1044 }
1045 }
1046 starts2_i++;
1047 }
1048
1049 // check above for consistency
1050 if (starts2_i != seq2->GetStarts().end()) {
1051 if ( !m_IndependentDSs ) {
1052 CAlnMixSegment::TStartIterators::const_iterator seq1_start_it_i =
1053 starts2_i->second->m_StartIts.find(seq1);
1054 if (seq1_start_it_i != starts2_i->second->m_StartIts.end()) {
1055 const TSeqPos& existing_start1 = seq1_start_it_i->second->first;
1056 if (match->m_StrandsDiffer) {
1057 delta = existing_start1 +
1058 starts2_i->second->m_Len * width1 - start1;
1059 if (delta > 0) {
1060 // below target
1061 // x---- x-)--)
1062 // (---x (----x
1063 // above target
1064 if (m_MergeFlags & fTruncateOverlaps) {
1065 if (len > (TSeqPos)(delta / width1)) {
1066 len -= delta / width1;
1067 start1 += delta;
1068 } else {
1069 if (m_MergeFlags & fAllowTranslocation) {
1070 return eFirstRowOverlapBelow;
1071 } else {
1072 return eIgnoreMatch;
1073 }
1074 }
1075 } else {
1076 return eFirstRowOverlapBelow;
1077 }
1078 }
1079 } else {
1080 delta = start1 + len * width1 - existing_start1;
1081 if (delta > 0) {
1082 // target above
1083 // x--x-) ----)
1084 // x----) x---)
1085 // target above
1086 if (m_MergeFlags & fTruncateOverlaps) {
1087 if (len <= (TSeqPos)delta / width1) {
1088 if (m_MergeFlags & fAllowTranslocation) {
1089 return eFirstRowOverlapAbove;
1090 } else {
1091 return eIgnoreMatch;
1092 }
1093 } else {
1094 len -= delta / width1;
1095 }
1096 } else {
1097 return eFirstRowOverlapAbove;
1098 }
1099 }
1100 }
1101 }
1102 }
1103 }
1104
1105 // check for inconsistent matches
1106 CAlnMixStarts::const_iterator starts1_i = seq1->GetStarts().find(start1);
1107 if (starts1_i == seq1->GetStarts().end() ||
1108 starts1_i->first != start1) {
1109 // commenting out for now, since moved the function call ahead
1110 // NCBI_THROW(CAlnException, eMergeFailure,
1111 // "CAlnMixMerger::x_SecondRowFits(): "
1112 // "Internal error: seq1->m_Starts do not match");
1113 } else {
1114 CAlnMixSegment::TStartIterators::const_iterator seq2_start_it_i;
1115 TSeqPos tmp_start =
1116 match->m_StrandsDiffer ? start2 + len * width2 : start2;
1117 while (starts1_i != seq1->GetStarts().end() &&
1118 starts1_i->first < start1 + len * width1) {
1119
1120 const CAlnMixSegment::TStartIterators& seg_start_its =
1121 starts1_i->second->m_StartIts;
1122
1123 if (match->m_StrandsDiffer) {
1124 tmp_start -= starts1_i->second->m_Len * width2;
1125 }
1126
1127 if ((seq2_start_it_i = seg_start_its.find(seq2)) !=
1128 seg_start_its.end()) {
1129 if (seq2_start_it_i->second->first != tmp_start) {
1130 // found an inconsistent prev match
1131 return eSecondRowInconsistency;
1132 }
1133 }
1134
1135 if ( !match->m_StrandsDiffer ) {
1136 tmp_start += starts1_i->second->m_Len * width2;
1137 }
1138
1139 starts1_i++;
1140 }
1141 }
1142 }
1143 if (m_MergeFlags & fQuerySeqMergeOnly) {
1144 _ASSERT(m_MergeFlags & fTruncateOverlaps);
1145 if (seq2->m_DsIdx == match->m_DsIdx) {
1146 return result;
1147 } else {
1148 return eForceSeparateRow;
1149 }
1150 }
1151 return result;
1152 }
1153
1154
1155 void
x_CreateDenseg()1156 CAlnMixMerger::x_CreateDenseg()
1157 {
1158 int numrow = 0,
1159 numrows = m_Rows.size();
1160 int numseg = 0,
1161 numsegs = m_AlnMixSegments->m_Segments.size();
1162 int num = numrows * numsegs;
1163
1164 m_DS = new CDense_seg();
1165 m_DS->SetDim(numrows);
1166 m_DS->SetNumseg(numsegs);
1167
1168 m_Aln = new CSeq_align();
1169 m_Aln->SetType(CSeq_align::eType_not_set);
1170 m_Aln->SetSegs().SetDenseg(*m_DS);
1171 m_Aln->SetDim(numrows);
1172
1173 CDense_seg::TIds& ids = m_DS->SetIds();
1174 CDense_seg::TStarts& starts = m_DS->SetStarts();
1175 CDense_seg::TStrands& strands = m_DS->SetStrands();
1176 CDense_seg::TLens& lens = m_DS->SetLens();
1177
1178 x_SetTaskName("Building");
1179 x_SetTaskTotal(numsegs);
1180
1181 ids.resize(numrows);
1182 lens.resize(numsegs);
1183 starts.resize(num, -1);
1184 strands.resize(num, eNa_strand_minus);
1185
1186 vector<bool> row_empty(numrows, true);
1187
1188 // ids
1189 numrow = 0;
1190 ITERATE(TSeqs, row_i, m_Rows) {
1191 ids[numrow++] = (*row_i)->m_SeqId;
1192 }
1193
1194 int offset = 0;
1195 numseg = 0;
1196
1197 ITERATE(CAlnMixSegments::TSegments,
1198 seg_i,
1199 m_AlnMixSegments->m_Segments) {
1200
1201 // lens
1202 lens[numseg] = (*seg_i)->m_Len;
1203
1204 // starts
1205 ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
1206 (*seg_i)->m_StartIts) {
1207 starts[offset + start_its_i->first->m_RowIdx] =
1208 start_its_i->second->first;
1209
1210 if (start_its_i->second->first != -1) {
1211 row_empty[start_its_i->first->m_RowIdx] = false;
1212 }
1213 }
1214
1215 // strands
1216 numrow = 0;
1217 ITERATE(TSeqs, row_i, m_Rows) {
1218 if ((*row_i)->m_PositiveStrand) {
1219 strands[offset + numrow] = eNa_strand_plus;
1220 }
1221 numrow++;
1222 }
1223
1224 // next segment
1225 numseg++; offset += numrows;
1226 x_SetTaskCompleted(numseg);
1227 }
1228
1229
1230 // widths
1231 CDense_seg::TWidths* widths = NULL;
1232 if ((m_AlnMixMatches->m_ContainsNA && m_AlnMixMatches->m_ContainsAA) ||
1233 (m_AlnMixMatches->m_AddFlags & CAlnMixMatches::fForceTranslation)) {
1234 widths = &m_DS->SetWidths();
1235 widths->resize(numrows);
1236 numrow = 0;
1237 ITERATE (TSeqs, row_i, m_Rows) {
1238 (*widths)[numrow++] = (*row_i)->m_Width;
1239 }
1240 }
1241
1242 //remove empty rows
1243 for(int row = numrows-1; row >=0; --row) {
1244 if (row_empty[row]) {
1245 ids.erase(ids.begin()+row);
1246 if (widths) {
1247 widths->erase(widths->begin()+row);
1248 }
1249 for (int i = (numsegs-1)*numrows + row; i > 0; i -= numrows) {
1250 starts.erase(starts.begin()+i);
1251 strands.erase(strands.begin()+i);
1252 }
1253 --numrows;
1254 }
1255 }
1256 m_DS->SetDim(numrows);
1257
1258 #if _DEBUG
1259 try {
1260 m_DS->Validate(true);
1261 } catch (...) {
1262 _ASSERT(false);
1263 }
1264 #endif
1265 }
1266
1267
1268 void
x_SetSeqFrame(CAlnMixMatch * match,CAlnMixSeq * & seq)1269 CAlnMixMerger::x_SetSeqFrame(CAlnMixMatch* match, CAlnMixSeq*& seq)
1270 {
1271 unsigned frame;
1272 if (seq == match->m_AlnSeq1) {
1273 frame = match->m_Start1 % 3;
1274 } else {
1275 frame = match->m_Start2 % 3;
1276 }
1277 if (seq->GetStarts().empty()) {
1278 seq->m_Frame = frame;
1279 } else {
1280 while ((unsigned)seq->m_Frame != frame) {
1281 if (!seq->m_ExtraRow) {
1282 // create an extra frame
1283 CRef<CAlnMixSeq> new_seq (new CAlnMixSeq);
1284 new_seq->m_BioseqHandle = seq->m_BioseqHandle;
1285 new_seq->m_SeqId = seq->m_SeqId;
1286 new_seq->m_PositiveStrand = seq->m_PositiveStrand;
1287 new_seq->m_Width = seq->m_Width;
1288 new_seq->m_Frame = frame;
1289 new_seq->m_SeqIdx = seq->m_SeqIdx;
1290 new_seq->m_ChildIdx = seq->m_ChildIdx + 1;
1291 if (m_MergeFlags & fQuerySeqMergeOnly) {
1292 new_seq->m_DsIdx = match->m_DsIdx;
1293 }
1294 m_ExtraRows.push_back(new_seq);
1295 new_seq->m_ExtraRowIdx = seq->m_ExtraRowIdx + 1;
1296 seq = seq->m_ExtraRow = new_seq;
1297 break;
1298 }
1299 seq = seq->m_ExtraRow;
1300 }
1301 }
1302 }
1303
1304
1305 END_objects_SCOPE // namespace ncbi::objects::
1306 END_NCBI_SCOPE
1307