1 /*  $Id: alnmerger.cpp 601341 2020-02-05 20:27:33Z vasilche $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Kamen Todorov, NCBI
27 *
28 * File Description:
29 *   Alignment merger
30 *
31 * ===========================================================================
32 */
33 
34 
35 #include <ncbi_pch.hpp>
36 #include <objtools/alnmgr/alnmerger.hpp>
37 #include <objtools/alnmgr/alnseq.hpp>
38 #include <objtools/alnmgr/alnmatch.hpp>
39 #include <objtools/alnmgr/alnmap.hpp>
40 
41 #include <algorithm>
42 
43 
44 BEGIN_NCBI_SCOPE
45 BEGIN_objects_SCOPE // namespace ncbi::objects::
46 
47 
CAlnMixMerger(CRef<CAlnMixMatches> & aln_mix_matches,TCalcScoreMethod calc_score)48 CAlnMixMerger::CAlnMixMerger(CRef<CAlnMixMatches>& aln_mix_matches,
49                              TCalcScoreMethod calc_score)
50     : m_DsCnt(aln_mix_matches->m_DsCnt),
51       m_AlnMixMatches(aln_mix_matches),
52       m_Matches(aln_mix_matches->m_Matches),
53       m_AlnMixSequences(aln_mix_matches->m_AlnMixSequences),
54       m_Seqs(aln_mix_matches->m_Seqs),
55       m_Rows(m_AlnMixSequences->m_Rows),
56       m_ExtraRows(m_AlnMixSequences->m_ExtraRows),
57       m_AlnMixSegments(new CAlnMixSegments(m_AlnMixSequences)),
58       m_SingleRefseq(false),
59       x_CalculateScore(calc_score)
60 {
61 }
62 
63 
64 void
Reset()65 CAlnMixMerger::Reset()
66 {
67     m_SingleRefseq = false;
68     if (m_DS) {
69         m_DS.Reset();
70     }
71     if (m_Aln) {
72         m_Aln.Reset();
73     }
74     m_AlnMixSegments->m_Segments.clear();
75     m_Rows.clear();
76     m_ExtraRows.clear();
77     NON_CONST_ITERATE (TSeqs, seq_i, m_Seqs) {
78         (*seq_i)->SetStarts().clear();
79         (*seq_i)->m_ExtraRow = 0;
80     }
81 }
82 
83 
84 void
Merge(TMergeFlags flags)85 CAlnMixMerger::Merge(TMergeFlags flags)
86 {
87     if ( !m_DsCnt ) {
88         NCBI_THROW(CAlnException, eMergeFailure,
89                    "CAlnMixMerger::Merge(): "
90                    "No alignments were added for merging.");
91     }
92     if ( !m_DS  ||  m_MergeFlags != flags) {
93         Reset();
94         m_MergeFlags = flags;
95         x_Merge();
96     }
97 }
98 
99 
100 void
x_Merge()101 CAlnMixMerger::x_Merge()
102 {
103     bool first_refseq = true; // mark the first loop
104 
105     // Find the refseq (if such exists)
106     {{
107         m_SingleRefseq = false;
108         m_IndependentDSs = m_DsCnt > 1;
109 
110         unsigned int ds_cnt;
111         NON_CONST_ITERATE (TSeqs, it, m_Seqs){
112             ds_cnt = (*it)->m_DsCnt;
113             if (ds_cnt > 1) {
114                 m_IndependentDSs = false;
115             }
116             if (ds_cnt == m_DsCnt) {
117                 m_SingleRefseq = true;
118                 if ( !first_refseq ) {
119                     CRef<CAlnMixSeq> refseq = *it;
120                     m_Seqs.erase(it);
121                     m_Seqs.insert(m_Seqs.begin(), refseq);
122                 }
123                 break;
124             }
125             first_refseq = false;
126         }
127     }}
128 
129     // Index the sequences
130     {{
131         int seq_idx=0;
132         NON_CONST_ITERATE (TSeqs, seq_i, m_Seqs) {
133             (*seq_i)->m_SeqIdx = seq_idx++;
134         }
135     }}
136 
137 
138     // Set the widths if the mix contains both AA & NA
139     // or in case we force translation
140     if ((m_AlnMixMatches->m_ContainsNA  &&  m_AlnMixMatches->m_ContainsAA)  ||
141         (m_AlnMixMatches->m_AddFlags & CAlnMixMatches::fForceTranslation)) {
142         NON_CONST_ITERATE (TSeqs, seq_i, m_Seqs) {
143             (*seq_i)->m_Width = (*seq_i)->m_IsAA ? 1 : 3;
144         }
145     }
146 
147 
148 
149     CAlnMixSeq * refseq = 0, * seq1 = 0, * seq2 = 0;
150     TSeqPos start, start1, start2, len, curr_len;
151     int width1 = 0, width2 = 0;
152     CAlnMixMatch * match;
153     CAlnMixSeq::TMatchList::iterator match_list_iter1, match_list_iter2;
154     CAlnMixSeq::TMatchList::iterator match_list_i;
155     TSecondRowFits second_row_fits;
156 
157     refseq = *(m_Seqs.begin());
158     TMatches::iterator match_i = m_Matches.begin();
159     m_MatchIdx = 0;
160 
161     CRef<CAlnMixSegment> seg;
162     CAlnMixStarts::iterator start_i, lo_start_i, hi_start_i, tmp_start_i;
163 
164     first_refseq = true; // mark the first loop
165 
166     x_SetTaskTotal(m_Matches.size());
167 
168     while (true) {
169 
170         // need to cancel?
171         if (x_InterruptTask()) {
172             Reset();
173             return;
174         }
175 
176         // reached the end?
177         if (first_refseq ?
178             match_i == m_Matches.end()  &&  m_Matches.size() :
179             refseq->m_MatchList.empty()  ||
180             match_list_i == refseq->m_MatchList.end()) {
181 
182             // move on to the next refseq
183             refseq->m_RefBy = 0;
184 
185             // try to find the best scoring 'connected' candidate
186             NON_CONST_ITERATE (TSeqs, it, m_Seqs){
187                 if ( !((*it)->m_MatchList.empty())  &&
188                      (*it)->m_RefBy == refseq) {
189                     if (refseq == *it) {
190                         NCBI_THROW(CAlnException, eMergeFailure,
191                                    "CAlnMixMerger::x_Merge(): "
192                                    "Infinitue loop detected "
193                                    "while searching for a connected candidate.");
194                     }
195                     refseq = *it;
196                     break;
197                 }
198             }
199             if (refseq->m_RefBy == 0) {
200                 // no candidate was found 'connected' to the refseq
201                 // continue with the highest scoring candidate
202                 NON_CONST_ITERATE (TSeqs, it, m_Seqs){
203                     if ( !((*it)->m_MatchList.empty()) ) {
204                         if (refseq == *it) {
205                             NCBI_THROW(CAlnException, eMergeFailure,
206                                        "CAlnMixMerger::x_Merge(): "
207                                        "Infinitue loop detected "
208                                        "while searching for a new candidate.");
209                         }
210                         refseq = *it;
211                         break;
212                     }
213                 }
214             }
215 
216             if (refseq->m_MatchList.empty()) {
217                 break; // done
218             } else {
219                 first_refseq = false;
220                 match_list_i = refseq->m_MatchList.begin();
221             }
222             continue;
223         } else {
224             // iterate
225             if (first_refseq) {
226                 match = *match_i;
227                 ++match_i;
228             } else {
229                 match = *match_list_i;
230                 ++match_list_i;
231                 if (refseq == match->m_AlnSeq2  &&  refseq == match->m_AlnSeq1) {
232                     // This is the rare case of a match b/n a seq and
233                     // itself.  We need one more incrementation since
234                     // the match would be recorded twice
235                     _ASSERT(refseq->m_MatchList.size() >= 2);
236                     ++match_list_i;
237                 }
238             }
239         }
240 
241         curr_len = len = match->m_Len;
242 
243         // is it a match with this refseq?
244         if (match->m_AlnSeq1 == refseq) {
245             seq1 = match->m_AlnSeq1;
246             start1 = match->m_Start1;
247             _ASSERT(seq1);
248             match_list_iter1 = match->m_MatchIter1;
249             seq2 = match->m_AlnSeq2;
250             start2 = match->m_Start2;
251             if ( seq2 )
252                 match_list_iter2 = match->m_MatchIter2;
253         } else if (match->m_AlnSeq2 == refseq) {
254             seq1 = match->m_AlnSeq2;
255             start1 = match->m_Start2;
256             _ASSERT(seq1);
257             match_list_iter1 = match->m_MatchIter2;
258             seq2 = match->m_AlnSeq1;
259             start2 = match->m_Start1;
260             if ( seq2 )
261                 match_list_iter2 = match->m_MatchIter1;
262         } else {
263             seq1 = match->m_AlnSeq1;
264             seq2 = match->m_AlnSeq2;
265 
266             // mark the two refseqs, they are candidates for next refseq(s)
267             _ASSERT(seq1);
268             match->m_MatchIter1 =
269                 seq1->m_MatchList.insert(seq1->m_MatchList.end(), match);
270             if (seq2) {
271                 match->m_MatchIter2 =
272                     seq2->m_MatchList.insert(seq2->m_MatchList.end(), match);
273             }
274             _ASSERT(match->IsGood());
275 
276             // mark that there's no match with this refseq
277             seq1 = 0;
278         }
279 
280         // save the match info into the segments map
281         if (seq1) {
282             x_SetTaskCompleted(m_MatchIdx++);
283 
284             // this match is used, erase from seq1 list
285             if ( !first_refseq ) {
286                 if ( !seq1->m_MatchList.empty() ) {
287                     seq1->m_MatchList.erase(match_list_iter1);
288                     match_list_iter1 = seq1->m_MatchList.end();
289                 }
290             }
291 
292             // order the match
293             match->m_AlnSeq1 = seq1;
294             match->m_Start1 = start1;
295             match->m_AlnSeq2 = seq2;
296             match->m_Start2 = start2;
297             match->m_MatchIter1 = match_list_iter1;
298             if ( seq2 )
299                 match->m_MatchIter2 = match_list_iter2;
300 
301             if (m_MergeFlags & fTruncateOverlaps  &&  seq2) {
302                 CDiagRangeCollection::TAlnRng rng(match->m_Start1,
303                                                   match->m_Start2,
304                                                   match->m_Len,
305                                                   !match->m_StrandsDiffer);
306                 CDiagRangeCollection::TAlnRngColl substrahend, diff;
307                 substrahend.insert(rng);
308                 CDiagRangeCollection* plane;
309                 TPlanes::iterator plane_it;
310                 if ((plane_it = m_Planes.find(make_pair(seq1, seq2))) !=
311                     m_Planes.end()) {
312                     plane = &(plane_it->second);
313                 } else {
314                     CDiagRangeCollection new_plane(match->m_AlnSeq1->m_Width,
315                                                    match->m_AlnSeq2->m_Width);
316                     plane = &(m_Planes[make_pair(seq1, seq2)] = new_plane);
317                 }
318                 plane->Diff(substrahend, diff);
319 
320                 if (diff.empty()) {
321                     continue;
322                 }
323                 rng = *diff.begin();
324                 plane->insert(rng);
325 
326                 // reset the ones below,
327                 // since match may have been modified
328                 start1 = match->m_Start1 = rng.GetFirstFrom();
329                 start2 = match->m_Start2 = rng.GetSecondFrom();
330                 curr_len = len = match->m_Len = rng.GetLength();
331             }
332 
333             width1 = seq1->m_Width;
334             if (seq2) {
335                 width2 = seq2->m_Width;
336             }
337 
338             // in case of translated refseq
339             if (width1 == 3) {
340                 x_SetSeqFrame(match, match->m_AlnSeq1);
341                 {{
342                     // reset the ones below,
343                     // since match may have been modified
344                     seq1 = match->m_AlnSeq1;
345                     start1 = match->m_Start1;
346                     match_list_iter1 = match->m_MatchIter1;
347                     seq2 = match->m_AlnSeq2;
348                     start2 = match->m_Start2;
349                     if ( seq2 )
350                         match_list_iter2 = match->m_MatchIter2;
351                     curr_len = len = match->m_Len;
352                 }}
353             }
354 
355             // if a subject sequence place it in the proper row
356             if ( !first_refseq  &&  m_MergeFlags & fQuerySeqMergeOnly) {
357                 bool proper_row_found = false;
358                 while (true) {
359                     if (seq1->m_DsIdx == match->m_DsIdx) {
360                         proper_row_found = true;
361                         break;
362                     } else {
363                         if (seq1->m_ExtraRow) {
364                             seq1 = match->m_AlnSeq1 = seq1->m_ExtraRow;
365                         } else {
366                             break;
367                         }
368                     }
369                 }
370                 if ( !proper_row_found   &&
371                      !(m_MergeFlags & fTruncateOverlaps)) {
372                     NCBI_THROW(CAlnException, eMergeFailure,
373                                "CAlnMixMerger::x_Merge(): "
374                                "Proper row not found for the match. "
375                                "Cannot use fQuerySeqMergeOnly?");
376                 }
377             }
378 
379             CAlnMixStarts& starts = seq1->SetStarts();
380             if (seq2) {
381                 // mark it, it is linked to the refseq
382                 seq2->m_RefBy = refseq;
383 
384                 // this match is used erase from seq2 list
385                 if ( !first_refseq ) {
386                     if ( !seq2->m_MatchList.empty() ) {
387                         seq2->m_MatchList.erase(match_list_iter2);
388                         match_list_iter2 = seq2->m_MatchList.end();
389                         match->m_MatchIter2 = match_list_iter2;
390                     }
391                 }
392             }
393 
394             start_i = starts.end();
395             lo_start_i = starts.end();
396             hi_start_i = starts.end();
397 
398 
399             // create a copy of the match,
400             // which we could work with temporarily
401             // without modifying the original
402             CAlnMixMatch tmp_match = *match;
403             match = &tmp_match; // point to the new tmp_match
404 
405 
406             if (seq2) {
407                 if (width2 == 3) {
408                     // Set the frame if necessary
409                     x_SetSeqFrame(match, match->m_AlnSeq2);
410                 }
411                 // check if the second row fits
412                 // this will truncate the match if
413                 // there's an inconsistent overlap
414                 // and truncation was requested
415                 second_row_fits = x_SecondRowFits(match);
416                 if (second_row_fits == eIgnoreMatch) {
417                     continue;
418                 }
419                 {{
420                     // reset the ones below,
421                     // since match may have been modified
422                     seq1 = match->m_AlnSeq1;
423                     start1 = match->m_Start1;
424                     match_list_iter1 = match->m_MatchIter1;
425                     seq2 = match->m_AlnSeq2;
426                     start2 = match->m_Start2;
427                     if ( seq2 )
428                         match_list_iter2 = match->m_MatchIter2;
429                     curr_len = len = match->m_Len;
430                 }}
431                 while (second_row_fits == eTranslocation) {
432                     if (!seq2->m_ExtraRow) {
433                         // create an extra row
434                         CRef<CAlnMixSeq> row (new CAlnMixSeq);
435                         row->m_BioseqHandle = seq2->m_BioseqHandle;
436                         row->m_SeqId = seq2->m_SeqId;
437                         row->m_Width = seq2->m_Width;
438                         row->m_Frame = start2 % 3;
439                         row->m_SeqIdx = seq2->m_SeqIdx;
440                         row->m_ChildIdx = seq2->m_ChildIdx + 1;
441                         if (m_MergeFlags & fQuerySeqMergeOnly) {
442                             row->m_DsIdx = match->m_DsIdx;
443                         }
444                         m_ExtraRows.push_back(row);
445                         row->m_ExtraRowIdx = seq2->m_ExtraRowIdx + 1;
446                         seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow = row;
447                         break;
448                     }
449                     seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow;
450 
451                     second_row_fits = x_SecondRowFits(match);
452                     {{
453                         // reset the ones below,
454                         // since match may have been modified
455                         seq1 = match->m_AlnSeq1;
456                         start1 = match->m_Start1;
457                         match_list_iter1 = match->m_MatchIter1;
458                         seq2 = match->m_AlnSeq2;
459                         start2 = match->m_Start2;
460                         if ( seq2 )
461                             match_list_iter2 = match->m_MatchIter2;
462                         curr_len = len = match->m_Len;
463                     }}
464                 }
465             }
466 
467 
468             if (!starts.size()) {
469                 // no starts exist yet
470 
471                 if ( !m_IndependentDSs ) {
472                     // Stop the algorithm after merging with respect to the first refseq.
473                     // Clear all MatchLists and exit
474                     if ( !(m_SingleRefseq  ||  first_refseq) ) {
475                         NON_CONST_ITERATE (TSeqs, it, m_Seqs){
476                             if ( !((*it)->m_MatchList.empty()) ) {
477                                 (*it)->m_MatchList.clear();
478                             }
479                         }
480                         break;
481                     }
482                 }
483 
484                 // this seq has not yet been used, set the strand
485                 if (m_AlnMixMatches->m_AddFlags & CAlnMixMatches::fForceTranslation) {
486                     seq1->m_PositiveStrand = (seq1->m_StrandScore >= 0);
487                 } else {
488                     seq1->m_PositiveStrand = ! (m_MergeFlags & fNegativeStrand);
489                 }
490 
491                 //create the first one
492                 seg = new CAlnMixSegment();
493                 seg->m_Len = len;
494                 seg->m_DsIdx = match->m_DsIdx;
495                 starts[start1] = seg;
496                 seg->SetStartIterator
497                     (seq1, lo_start_i = hi_start_i = starts.begin());
498 
499                 if (m_MergeFlags & fQuerySeqMergeOnly) {
500                     seq2->m_DsIdx = match->m_DsIdx;
501                 }
502                 // DONE!
503             } else {
504                 // some starts exist already
505 
506                 // look ahead
507                 if ((lo_start_i = start_i = starts.lower_bound(start1))
508                     == starts.end()  ||
509                     start1 < start_i->first) {
510                     // the start position does not exist
511                     if (lo_start_i != starts.begin()) {
512                         --lo_start_i;
513                     }
514                 }
515 
516                 // look back
517                 if (hi_start_i == starts.end()  &&  start_i != lo_start_i) {
518                     CAlnMixSegment * prev_seg = lo_start_i->second;
519                     if (lo_start_i->first + prev_seg->m_Len * width1 >
520                         start1) {
521                         // x----..   becomes  x-x--..
522                         //   x--..
523 
524                         TSeqPos len1 = (start1 - lo_start_i->first) / width1;
525                         TSeqPos len2 = prev_seg->m_Len - len1;
526 
527                         // create the second seg
528                         seg = new CAlnMixSegment();
529                         seg->m_Len = len2;
530                         seg->m_DsIdx = match->m_DsIdx;
531                         starts[start1] = seg;
532 
533                         // create rows info
534                         ITERATE (CAlnMixSegment::TStartIterators, it,
535                                  prev_seg->m_StartIts) {
536                             CAlnMixSeq * seq = it->first;
537                             tmp_start_i = it->second;
538                             if (seq->m_PositiveStrand ==
539                                 seq1->m_PositiveStrand) {
540                                 seq->SetStarts()
541                                     [tmp_start_i->first + len1 * seq->m_Width]
542                                     = seg;
543                                 if (tmp_start_i != seq->SetStarts().end()) {
544                                     seg->SetStartIterator(seq, ++tmp_start_i);
545                                 } else {
546                                     NCBI_THROW(CAlnException, eMergeFailure,
547                                                "CAlnMixMerger::x_Merge(): "
548                                                "Internal error: tmp_start_i == seq->GetStarts().end()");
549                                 }
550                             } else {
551                                 seq->SetStarts()
552                                     [tmp_start_i->first + len2 * seq->m_Width]
553                                     = prev_seg;
554                                 seq->SetStarts()[tmp_start_i->first] = seg;
555                                 seg->SetStartIterator(seq, tmp_start_i);
556                                 if (tmp_start_i != seq->SetStarts().end()) {
557                                     prev_seg->SetStartIterator(seq, ++tmp_start_i);
558                                 } else {
559                                     NCBI_THROW(CAlnException, eMergeFailure,
560                                                "CAlnMixMerger::x_Merge(): "
561                                                "Internal error: tmp_start_i == seq->GetStarts().end()");
562                                 }
563                             }
564                         }
565 
566                         // truncate the first seg
567                         prev_seg->m_Len = len1;
568 
569                         if (start_i != starts.begin()) {
570                             start_i--; // point to the newly created start
571                         }
572                     }
573                     if (lo_start_i != starts.end()) {
574                         lo_start_i++;
575                     }
576                 }
577             }
578 
579             // loop through overlapping segments
580             start = start1;
581             while (hi_start_i == starts.end()) {
582                 if (start_i != starts.end()  &&  start_i->first == start) {
583                     CAlnMixSegment * prev_seg = start_i->second;
584                     if (prev_seg->m_Len > curr_len) {
585                         // x-------)  becomes  x----)x--)
586                         // x----)
587 
588                         // create the second seg
589                         seg = new CAlnMixSegment();
590                         TSeqPos len1 =
591                             seg->m_Len = prev_seg->m_Len - curr_len;
592                         start += curr_len * width1;
593 
594                         // truncate the first seg
595                         prev_seg->m_Len = curr_len;
596 
597                         // create rows info
598                         ITERATE (CAlnMixSegment::TStartIterators, it,
599                                 prev_seg->m_StartIts) {
600                             CAlnMixSeq * seq = it->first;
601                             tmp_start_i = it->second;
602                             if (seq->m_PositiveStrand ==
603                                 seq1->m_PositiveStrand) {
604                                 seq->SetStarts()[tmp_start_i->first +
605                                              curr_len * seq->m_Width]
606                                     = seg;
607                                 if (tmp_start_i != seq->SetStarts().end()) {
608                                     seg->SetStartIterator(seq, ++tmp_start_i);
609                                 } else {
610                                     NCBI_THROW(CAlnException, eMergeFailure,
611                                                "CAlnMixMerger::x_Merge(): "
612                                                "Internal error: tmp_start_i == seq->GetStarts().end()");
613                                 }
614                             } else{
615                                 seq->SetStarts()[tmp_start_i->first +
616                                              len1 * seq->m_Width]
617                                     = prev_seg;
618                                 seq->SetStarts()[tmp_start_i->first] = seg;
619                                 seg->SetStartIterator(seq, tmp_start_i);
620                                 if (tmp_start_i != seq->SetStarts().end()) {
621                                     prev_seg->SetStartIterator(seq, ++tmp_start_i);
622                                 } else {
623                                     NCBI_THROW(CAlnException, eMergeFailure,
624                                                "CAlnMixMerger::x_Merge(): "
625                                                "Internal error: tmp_start_i == seq->GetStarts().end()");
626                                 }
627                             }
628                         }
629 #if _DEBUG && _ALNMGR_DEBUG
630                         seg->StartItsConsistencyCheck(*seq1,
631                                                       start,
632                                                       m_MatchIdx);
633                         prev_seg->StartItsConsistencyCheck(*seq1,
634                                                            start,
635                                                            m_MatchIdx);
636 #endif
637 
638 
639                         hi_start_i = start_i; // DONE!
640                     } else if (curr_len == prev_seg->m_Len) {
641                         // x----)
642                         // x----)
643                         hi_start_i = start_i; // DONE!
644                     } else {
645                         // x----)     becomes  x----)x--)
646                         // x-------)
647                         start += prev_seg->m_Len * width1;
648                         curr_len -= prev_seg->m_Len;
649                         if (start_i != starts.end()) {
650                             start_i++;
651                         }
652                     }
653                 } else {
654                     seg = new CAlnMixSegment();
655                     starts[start] = seg;
656                     tmp_start_i = start_i;
657                     if (tmp_start_i != starts.begin()) {
658                         tmp_start_i--;
659                     }
660                     seg->SetStartIterator(seq1, tmp_start_i);
661                     if (start_i != starts.end()  &&
662                         start + curr_len * width1 > start_i->first) {
663                         //       x--..
664                         // x--------..
665                         seg->m_Len = (start_i->first - start) / width1;
666                         seg->m_DsIdx = match->m_DsIdx;
667                     } else {
668                         //       x-----)
669                         // x---)
670                         seg->m_Len = curr_len;
671                         seg->m_DsIdx = match->m_DsIdx;
672                         hi_start_i = start_i;
673                         if (hi_start_i != starts.begin()) {
674                             hi_start_i--; // DONE!
675                         }
676                     }
677                     start += seg->m_Len * width1;
678                     curr_len -= seg->m_Len;
679                     if (lo_start_i == start_i) {
680                         if (lo_start_i != starts.begin()) {
681                             lo_start_i--;
682                         }
683                     }
684                 }
685             }
686 
687             // try to resolve the second row
688             if (seq2) {
689 
690                 while (second_row_fits != eSecondRowFitsOk  &&
691                        second_row_fits != eIgnoreMatch) {
692                     if (!seq2->m_ExtraRow) {
693                         // create an extra row
694                         CRef<CAlnMixSeq> row (new CAlnMixSeq);
695                         row->m_BioseqHandle = seq2->m_BioseqHandle;
696                         row->m_SeqId = seq2->m_SeqId;
697                         row->m_Width = seq2->m_Width;
698                         row->m_Frame = start2 % 3;
699                         row->m_SeqIdx = seq2->m_SeqIdx;
700                         row->m_ChildIdx = seq2->m_ChildIdx + 1;
701                         if (m_MergeFlags & fQuerySeqMergeOnly) {
702                             row->m_DsIdx = match->m_DsIdx;
703                         }
704                         m_ExtraRows.push_back(row);
705                         row->m_ExtraRowIdx = seq2->m_ExtraRowIdx + 1;
706                         seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow = row;
707                         break;
708                     }
709                     seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow;
710 
711                     second_row_fits = x_SecondRowFits(match);
712                     {{
713                         // reset the ones below,
714                         // since match may have been modified
715                         seq1 = match->m_AlnSeq1;
716                         start1 = match->m_Start1;
717                         match_list_iter1 = match->m_MatchIter1;
718                         seq2 = match->m_AlnSeq2;
719                         start2 = match->m_Start2;
720                         if ( seq2 )
721                             match_list_iter2 = match->m_MatchIter2;
722                         curr_len = len = match->m_Len;
723                     }}
724                 }
725                 if (second_row_fits == eIgnoreMatch) {
726                     continue;
727                 }
728 
729                 if (m_MergeFlags & fTruncateOverlaps) {
730                     // we need to reset these shtorcuts
731                     // in case the match was truncated
732                     start1 = match->m_Start1;
733                     start2 = match->m_Start2;
734                     len = match->m_Len;
735                 }
736 
737                 // set the strand if first time
738                 if (seq2->GetStarts().empty()) {
739                     seq2->m_PositiveStrand =
740                         (seq1->m_PositiveStrand ?
741                          !match->m_StrandsDiffer :
742                          match->m_StrandsDiffer);
743                 }
744 
745                 // create row info
746                 CAlnMixStarts& starts2 = match->m_AlnSeq2->SetStarts();
747                 start = start2;
748                 CAlnMixStarts::iterator start2_i
749                     = starts2.lower_bound(start2);
750                 start_i = match->m_StrandsDiffer ? hi_start_i : lo_start_i;
751 
752                 while(start < start2 + len * width2) {
753                     if (start2_i != starts2.end() &&
754                         start2_i->first == start) {
755                         // this position already exists
756 
757                         if (start2_i->second != start_i->second) {
758                             // merge the two segments
759 
760                             // store the seg in a CRef to delay its deletion
761                             // until after the iteration on it is finished
762                             CRef<CAlnMixSegment> tmp_seg(start2_i->second);
763 
764                             ITERATE (CAlnMixSegment::TStartIterators,
765                                      it,
766                                      tmp_seg->m_StartIts) {
767                                 CAlnMixSeq * tmp_seq = it->first;
768                                 tmp_start_i = it->second;
769                                 tmp_start_i->second = start_i->second;
770                                 start_i->second->SetStartIterator(tmp_seq, tmp_start_i);
771                             }
772 #if _DEBUG && _ALNMGR_DEBUG
773                             start_i->second->StartItsConsistencyCheck(*seq2,
774                                                                       start,
775                                                                       m_MatchIdx);
776 #endif
777                         }
778                     } else {
779                         // this position does not exist, create it
780                         seq2->SetStarts()[start] = start_i->second;
781 
782                         // start2_i != starts.begin() because we just
783                         // made an insertion, so decrement is ok
784                         start2_i--;
785 
786                         // point this segment's row start iterator
787                         if (start_i->second->m_StartIts.find(seq2) !=
788                             start_i->second->m_StartIts.end()) {
789                             // consistency check fails
790                             NCBI_THROW(CAlnException, eMergeFailure,
791                                        "CAlnMixMerger::x_Merge(): "
792                                        "Internal error: "
793                                        "Start iterator already exists for seq2.");
794                         }
795                         start_i->second->SetStartIterator(seq2, start2_i);
796 #if _DEBUG && _ALNMGR_DEBUG
797                         start_i->second->StartItsConsistencyCheck(*seq2,
798                                                                   start,
799                                                                   m_MatchIdx);
800 #endif
801                     }
802 
803                     // increment values
804                     start += start_i->second->m_Len * width2;
805                     if (start2_i != starts2.end()) {
806                         start2_i++;
807                     }
808                     if (match->m_StrandsDiffer) {
809                         if (start_i != starts.begin()) {
810                             start_i--;
811                         }
812                     } else {
813                         if (start_i != starts.end()) {
814                             start_i++;
815                         }
816                     }
817                 }
818             }
819         }
820     }
821     x_SetTaskCompleted(m_MatchIdx);
822 
823     m_AlnMixSequences->BuildRows();
824     m_AlnMixSegments->Build(m_MergeFlags & fGapJoin,
825                             m_MergeFlags & fMinGap,
826                             m_MergeFlags & fRemoveLeadTrailGaps);
827     if (m_MergeFlags & fFillUnalignedRegions) {
828         m_AlnMixSegments->FillUnalignedRegions();
829     }
830     x_CreateDenseg();
831 }
832 
833 
834 CAlnMixMerger::TSecondRowFits
x_SecondRowFits(CAlnMixMatch * match) const835 CAlnMixMerger::x_SecondRowFits(CAlnMixMatch * match) const
836 {
837     CAlnMixSeq*&            seq1    = match->m_AlnSeq1;
838     CAlnMixSeq*&            seq2    = match->m_AlnSeq2;
839     TSeqPos&                start1  = match->m_Start1;
840     TSeqPos&                start2  = match->m_Start2;
841     TSeqPos&                len     = match->m_Len;
842     const int&              width1  = seq1->m_Width;
843     const int&              width2  = seq2->m_Width;
844     CAlnMixStarts::const_iterator starts2_i;
845     TSignedSeqPos           delta, delta1, delta2;
846     TSecondRowFits          result  = eSecondRowFitsOk;
847 
848     // subject sequences go on separate rows if requested
849     if (m_MergeFlags & fQuerySeqMergeOnly) {
850         if (seq2->m_DsIdx) {
851             if ( !(m_MergeFlags & fTruncateOverlaps) ) {
852                 if (seq2->m_DsIdx == match->m_DsIdx) {
853                     return result;
854                 } else {
855                     return eForceSeparateRow;
856                 }
857             }
858         } else {
859             seq2->m_DsIdx = match->m_DsIdx;
860             if ( !(m_MergeFlags & fTruncateOverlaps) ) {
861                 return result;
862             }
863         }
864     }
865 
866     if ( !seq2->GetStarts().empty() ) {
867 
868         // check strand
869         while (true) {
870             if (seq2->m_PositiveStrand !=
871                 (seq1->m_PositiveStrand ?
872                  !match->m_StrandsDiffer :
873                  match->m_StrandsDiffer)) {
874                 if (seq2->m_ExtraRow) {
875                     seq2 = seq2->m_ExtraRow;
876                 } else {
877                     return eInconsistentStrand;
878                 }
879             } else {
880                 break;
881             }
882         }
883 
884         // check frame
885         if (seq2->m_Width == 3  &&  seq2->m_Frame != (int)(start2 % 3)) {
886             return eInconsistentFrame;
887         }
888 
889         starts2_i = seq2->GetStarts().lower_bound(start2);
890 
891         // check below
892         if (starts2_i != seq2->GetStarts().begin()) {
893             starts2_i--;
894 
895             // check for inconsistency on the first row
896             if ( !m_IndependentDSs ) {
897                 CAlnMixSegment::TStartIterators::const_iterator seq1_start_it_i =
898                     starts2_i->second->m_StartIts.find(seq1);
899                 if (seq1_start_it_i != starts2_i->second->m_StartIts.end()) {
900                     const TSeqPos& existing_start1 = seq1_start_it_i->second->first;
901                     if (match->m_StrandsDiffer) {
902                         delta = start1 + len * width1 - existing_start1;
903                         if (delta > 0) {
904                             if (m_MergeFlags & fTruncateOverlaps) {
905                                 delta /= width1;
906                                 if (delta < (TSignedSeqPos)len) {
907                                     // target above
908                                     // x----- x-)-)
909                                     // (----x (---x
910                                     // target below
911                                     len -= delta;
912                                     start2 += delta * width2;
913                                 } else if (delta > (TSignedSeqPos)len  &&
914                                            m_MergeFlags & fAllowTranslocation) {
915                                     // Translocation
916                                     // below target above
917                                     // x---- x-)---
918                                     //       (----x (------x
919                                     //       target below
920                                     delta = (existing_start1 +
921                                              starts2_i->second->m_Len - start1) /
922                                         width1;
923                                     if (delta > 0) {
924                                         if (delta < (TSignedSeqPos)len) {
925                                             start1 += delta * width1;
926                                             len -= delta;
927                                         } else {
928                                             return eIgnoreMatch;
929                                         }
930                                     }
931                                     return eTranslocation;
932                                 } else {
933                                     return eIgnoreMatch;
934                                 }
935                             } else {
936                                 return eFirstRowOverlapAbove;
937                             }
938                         }
939                     } else {
940                         delta = existing_start1
941                             + starts2_i->second->m_Len * width1
942                             - start1;
943                         if (delta > 0) {
944                             if (m_MergeFlags & fTruncateOverlaps) {
945                                 delta /= width1;
946                                 if (len > (TSeqPos)delta) {
947                                     // below target
948                                     // x---- x-)--)
949                                     // x---) x----)
950                                     // below target
951                                     len -= delta;
952                                     start1 += delta * width1;
953                                     start2 += delta * width2;
954                                 } else if (delta > (TSignedSeqPos)len  &&
955                                            m_MergeFlags & fAllowTranslocation) {
956                                     // Translocation
957                                     //       target above
958                                     //       x--x-) ----)
959                                     // x---) x----)
960                                     // below target
961                                     delta = (existing_start1 - start1) / width1;
962                                     if (delta < (TSignedSeqPos)len) {
963                                         len = delta;
964                                         if ( !len ) {
965                                             return eIgnoreMatch;
966                                         }
967                                     }
968                                     return eTranslocation;
969                                 } else {
970                                     return eIgnoreMatch;
971                                 }
972                             } else {
973                                 return eFirstRowOverlapBelow;
974                             }
975                         }
976                     }
977                 }
978             }
979 
980             // check for overlap with the segment below on second row
981             delta = starts2_i->first + starts2_i->second->m_Len * width2
982                 - start2;
983             if (delta > 0) {
984                 //       target
985                 // ----- ------
986                 // x---- x-)--)
987                 // below target
988                 if (m_MergeFlags & fTruncateOverlaps) {
989                     delta /= width2;
990                     if (len > (TSeqPos)delta) {
991                         len -= delta;
992                         start2 += delta * width2;
993                         if ( !match->m_StrandsDiffer ) {
994                             start1 += delta * width1;
995                         }
996                     } else {
997                         return eIgnoreMatch;
998                     }
999                 } else {
1000                     return eSecondRowOverlap;
1001                 }
1002             }
1003             if (starts2_i != seq2->GetStarts().end()) {
1004                 starts2_i++;
1005             }
1006         }
1007 
1008         // check the overlapping area for consistency
1009         while (starts2_i != seq2->GetStarts().end()  &&
1010                starts2_i->first < start2 + len * width2) {
1011             if ( !m_IndependentDSs ) {
1012                 CAlnMixSegment::TStartIterators::const_iterator seq1_start_it_i =
1013                     starts2_i->second->m_StartIts.find(seq1);
1014                 if (seq1_start_it_i != starts2_i->second->m_StartIts.end()) {
1015                     const TSeqPos& existing_start1 = seq1_start_it_i->second->first;
1016                     if (match->m_StrandsDiffer) {
1017                         // x---..- x---..--)
1018                         // (---..- (--x..--x
1019                         delta1 = (start1 - existing_start1) / width1 +
1020                             len - starts2_i->second->m_Len;
1021                     } else {
1022                         // x--..- x---...-)
1023                         // x--..- x---...-)
1024                         delta1 = (existing_start1 - start1) / width1;
1025                     }
1026                     delta2 = (starts2_i->first - start2) / width2;
1027                     if (delta1 != delta2) {
1028                         if (m_MergeFlags & fTruncateOverlaps) {
1029                             delta = (delta1 < delta2 ? delta1 : delta2);
1030                             if (delta > 0) {
1031                                 if (match->m_StrandsDiffer) {
1032                                     start1 += (len - delta) * width1;
1033                                 }
1034                                 len = delta;
1035                             } else if (m_MergeFlags & fAllowTranslocation) {
1036                                 return eTranslocation;
1037                             } else {
1038                                 return eInconsistentOverlap;
1039                             }
1040                         } else {
1041                             return eInconsistentOverlap;
1042                         }
1043                     }
1044                 }
1045             }
1046             starts2_i++;
1047         }
1048 
1049         // check above for consistency
1050         if (starts2_i != seq2->GetStarts().end()) {
1051             if ( !m_IndependentDSs ) {
1052                 CAlnMixSegment::TStartIterators::const_iterator seq1_start_it_i =
1053                     starts2_i->second->m_StartIts.find(seq1);
1054                 if (seq1_start_it_i != starts2_i->second->m_StartIts.end()) {
1055                     const TSeqPos& existing_start1 = seq1_start_it_i->second->first;
1056                     if (match->m_StrandsDiffer) {
1057                         delta = existing_start1 +
1058                             starts2_i->second->m_Len * width1 - start1;
1059                         if (delta > 0) {
1060                             // below target
1061                             // x---- x-)--)
1062                             // (---x (----x
1063                             // above target
1064                             if (m_MergeFlags & fTruncateOverlaps) {
1065                                 if (len > (TSeqPos)(delta / width1)) {
1066                                     len -= delta / width1;
1067                                     start1 += delta;
1068                                 } else {
1069                                     if (m_MergeFlags & fAllowTranslocation) {
1070                                         return eFirstRowOverlapBelow;
1071                                     } else {
1072                                         return eIgnoreMatch;
1073                                     }
1074                                 }
1075                             } else {
1076                                 return eFirstRowOverlapBelow;
1077                             }
1078                         }
1079                     } else {
1080                         delta = start1 + len * width1 - existing_start1;
1081                         if (delta > 0) {
1082                             // target above
1083                             // x--x-) ----)
1084                             // x----) x---)
1085                             // target above
1086                             if (m_MergeFlags & fTruncateOverlaps) {
1087                                 if (len <= (TSeqPos)delta / width1) {
1088                                     if (m_MergeFlags & fAllowTranslocation) {
1089                                         return eFirstRowOverlapAbove;
1090                                     } else {
1091                                         return eIgnoreMatch;
1092                                     }
1093                                 } else {
1094                                     len -= delta / width1;
1095                                 }
1096                             } else {
1097                                 return eFirstRowOverlapAbove;
1098                             }
1099                         }
1100                     }
1101                 }
1102             }
1103         }
1104 
1105         // check for inconsistent matches
1106         CAlnMixStarts::const_iterator starts1_i = seq1->GetStarts().find(start1);
1107         if (starts1_i == seq1->GetStarts().end() ||
1108             starts1_i->first != start1) {
1109             // commenting out for now, since moved the function call ahead
1110 //             NCBI_THROW(CAlnException, eMergeFailure,
1111 //                        "CAlnMixMerger::x_SecondRowFits(): "
1112 //                        "Internal error: seq1->m_Starts do not match");
1113         } else {
1114             CAlnMixSegment::TStartIterators::const_iterator seq2_start_it_i;
1115             TSeqPos tmp_start =
1116                 match->m_StrandsDiffer ? start2 + len * width2 : start2;
1117             while (starts1_i != seq1->GetStarts().end()  &&
1118                    starts1_i->first < start1 + len * width1) {
1119 
1120                 const CAlnMixSegment::TStartIterators& seg_start_its =
1121                     starts1_i->second->m_StartIts;
1122 
1123                 if (match->m_StrandsDiffer) {
1124                     tmp_start -= starts1_i->second->m_Len * width2;
1125                 }
1126 
1127                 if ((seq2_start_it_i = seg_start_its.find(seq2)) !=
1128                     seg_start_its.end()) {
1129                     if (seq2_start_it_i->second->first != tmp_start) {
1130                         // found an inconsistent prev match
1131                         return eSecondRowInconsistency;
1132                     }
1133                 }
1134 
1135                 if ( !match->m_StrandsDiffer ) {
1136                     tmp_start += starts1_i->second->m_Len * width2;
1137                 }
1138 
1139                 starts1_i++;
1140             }
1141         }
1142     }
1143     if (m_MergeFlags & fQuerySeqMergeOnly) {
1144         _ASSERT(m_MergeFlags & fTruncateOverlaps);
1145         if (seq2->m_DsIdx == match->m_DsIdx) {
1146             return result;
1147         } else {
1148             return eForceSeparateRow;
1149         }
1150     }
1151     return result;
1152 }
1153 
1154 
1155 void
x_CreateDenseg()1156 CAlnMixMerger::x_CreateDenseg()
1157 {
1158     int numrow  = 0,
1159         numrows = m_Rows.size();
1160     int numseg  = 0,
1161         numsegs = m_AlnMixSegments->m_Segments.size();
1162     int num     = numrows * numsegs;
1163 
1164     m_DS = new CDense_seg();
1165     m_DS->SetDim(numrows);
1166     m_DS->SetNumseg(numsegs);
1167 
1168     m_Aln = new CSeq_align();
1169     m_Aln->SetType(CSeq_align::eType_not_set);
1170     m_Aln->SetSegs().SetDenseg(*m_DS);
1171     m_Aln->SetDim(numrows);
1172 
1173     CDense_seg::TIds&     ids     = m_DS->SetIds();
1174     CDense_seg::TStarts&  starts  = m_DS->SetStarts();
1175     CDense_seg::TStrands& strands = m_DS->SetStrands();
1176     CDense_seg::TLens&    lens    = m_DS->SetLens();
1177 
1178     x_SetTaskName("Building");
1179     x_SetTaskTotal(numsegs);
1180 
1181     ids.resize(numrows);
1182     lens.resize(numsegs);
1183     starts.resize(num, -1);
1184     strands.resize(num, eNa_strand_minus);
1185 
1186     vector<bool> row_empty(numrows, true);
1187 
1188     // ids
1189     numrow = 0;
1190     ITERATE(TSeqs, row_i, m_Rows) {
1191         ids[numrow++] = (*row_i)->m_SeqId;
1192     }
1193 
1194     int offset = 0;
1195     numseg = 0;
1196 
1197     ITERATE(CAlnMixSegments::TSegments,
1198             seg_i,
1199             m_AlnMixSegments->m_Segments) {
1200 
1201         // lens
1202         lens[numseg] = (*seg_i)->m_Len;
1203 
1204         // starts
1205         ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
1206                 (*seg_i)->m_StartIts) {
1207             starts[offset + start_its_i->first->m_RowIdx] =
1208                 start_its_i->second->first;
1209 
1210             if (start_its_i->second->first != -1) {
1211                 row_empty[start_its_i->first->m_RowIdx] = false;
1212             }
1213         }
1214 
1215         // strands
1216         numrow = 0;
1217         ITERATE(TSeqs, row_i, m_Rows) {
1218             if ((*row_i)->m_PositiveStrand) {
1219                 strands[offset + numrow] = eNa_strand_plus;
1220             }
1221             numrow++;
1222         }
1223 
1224         // next segment
1225         numseg++; offset += numrows;
1226         x_SetTaskCompleted(numseg);
1227     }
1228 
1229 
1230     // widths
1231     CDense_seg::TWidths*  widths = NULL;
1232     if ((m_AlnMixMatches->m_ContainsNA  &&  m_AlnMixMatches->m_ContainsAA)  ||
1233         (m_AlnMixMatches->m_AddFlags & CAlnMixMatches::fForceTranslation)) {
1234         widths = &m_DS->SetWidths();
1235         widths->resize(numrows);
1236         numrow = 0;
1237         ITERATE (TSeqs, row_i, m_Rows) {
1238             (*widths)[numrow++] = (*row_i)->m_Width;
1239         }
1240     }
1241 
1242     //remove empty rows
1243     for(int row = numrows-1; row >=0; --row) {
1244         if (row_empty[row]) {
1245             ids.erase(ids.begin()+row);
1246             if (widths) {
1247                 widths->erase(widths->begin()+row);
1248             }
1249             for (int i = (numsegs-1)*numrows + row; i > 0; i -= numrows) {
1250                 starts.erase(starts.begin()+i);
1251                 strands.erase(strands.begin()+i);
1252             }
1253             --numrows;
1254         }
1255     }
1256     m_DS->SetDim(numrows);
1257 
1258 #if _DEBUG
1259     try {
1260         m_DS->Validate(true);
1261     } catch (...) {
1262         _ASSERT(false);
1263     }
1264 #endif
1265 }
1266 
1267 
1268 void
x_SetSeqFrame(CAlnMixMatch * match,CAlnMixSeq * & seq)1269 CAlnMixMerger::x_SetSeqFrame(CAlnMixMatch* match, CAlnMixSeq*& seq)
1270 {
1271     unsigned frame;
1272     if (seq == match->m_AlnSeq1) {
1273         frame = match->m_Start1 % 3;
1274     } else {
1275         frame = match->m_Start2 % 3;
1276     }
1277     if (seq->GetStarts().empty()) {
1278         seq->m_Frame = frame;
1279     } else {
1280         while ((unsigned)seq->m_Frame != frame) {
1281             if (!seq->m_ExtraRow) {
1282                 // create an extra frame
1283                 CRef<CAlnMixSeq> new_seq (new CAlnMixSeq);
1284                 new_seq->m_BioseqHandle = seq->m_BioseqHandle;
1285                 new_seq->m_SeqId = seq->m_SeqId;
1286                 new_seq->m_PositiveStrand = seq->m_PositiveStrand;
1287                 new_seq->m_Width = seq->m_Width;
1288                 new_seq->m_Frame = frame;
1289                 new_seq->m_SeqIdx = seq->m_SeqIdx;
1290                 new_seq->m_ChildIdx = seq->m_ChildIdx + 1;
1291                 if (m_MergeFlags & fQuerySeqMergeOnly) {
1292                     new_seq->m_DsIdx = match->m_DsIdx;
1293                 }
1294                 m_ExtraRows.push_back(new_seq);
1295                 new_seq->m_ExtraRowIdx = seq->m_ExtraRowIdx + 1;
1296                 seq = seq->m_ExtraRow = new_seq;
1297                 break;
1298             }
1299             seq = seq->m_ExtraRow;
1300         }
1301     }
1302 }
1303 
1304 
1305 END_objects_SCOPE // namespace ncbi::objects::
1306 END_NCBI_SCOPE
1307