1 /*  $Id: alnsegments.cpp 488694 2016-01-04 20:27:43Z grichenk $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Kamen Todorov, NCBI
27 *
28 * File Description:
29 *   Alignment segments
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <objtools/alnmgr/alnsegments.hpp>
36 #include <objtools/alnmgr/alnseq.hpp>
37 #include <objtools/alnmgr/alnexception.hpp>
38 #include <objtools/alnmgr/alnmix.hpp>
39 #include <objtools/alnmgr/alnmap.hpp>
40 
41 #include <stack>
42 
43 
44 BEGIN_NCBI_SCOPE
45 BEGIN_objects_SCOPE // namespace ncbi::objects::
46 
47 
CAlnMixSegments(CRef<CAlnMixSequences> & aln_mix_sequences,TCalcScoreMethod calc_score)48 CAlnMixSegments::CAlnMixSegments(CRef<CAlnMixSequences>&  aln_mix_sequences,
49                                  TCalcScoreMethod calc_score)
50     : m_AlnMixSequences(aln_mix_sequences),
51       m_Rows(m_AlnMixSequences->m_Rows),
52       m_ExtraRows(m_AlnMixSequences->m_ExtraRows),
53       x_CalculateScore(calc_score)
54 {}
55 
56 
57 void
Build(bool gap_join,bool min_gap,bool remove_leading_and_trailing_gaps)58 CAlnMixSegments::Build(bool gap_join,
59                        bool min_gap,
60                        bool remove_leading_and_trailing_gaps)
61 {
62     m_AlnMixSequences->InitRowsStartIts();
63     m_AlnMixSequences->InitExtraRowsStartIts();
64 #if _DEBUG && _ALNMGR_DEBUG
65     m_AlnMixSequences->RowsStartItsContsistencyCheck(0);
66 #endif
67 
68 
69     TSegmentsContainer gapped_segs;
70 
71     CAlnMixSequences::TSeqs::iterator refseq_it = m_Rows.begin();
72     bool orig_refseq = true;
73     while (true) {
74         CAlnMixSeq * refseq = 0;
75         while (refseq_it != m_Rows.end()) {
76             refseq = *(refseq_it++);
77             if (refseq->GetStarts().current != refseq->GetStarts().end()) {
78                 break;
79             } else {
80                 refseq = 0;
81             }
82         }
83         if ( !refseq ) {
84             // Done
85 
86             // add the gapped segments if any
87             if (gapped_segs.size()) {
88                 if (gap_join) {
89                     // request to try to align
90                     // gapped segments w/ equal len
91                     x_ConsolidateGaps(gapped_segs);
92                 } else if (min_gap) {
93                     // request to try to align
94                     // all gapped segments
95                     x_MinimizeGaps(gapped_segs);
96                 }
97                 NON_CONST_ITERATE (TSegmentsContainer,
98                                    seg_i, gapped_segs) {
99                     m_Segments.push_back(&**seg_i);
100                 }
101                 gapped_segs.clear();
102             }
103             break; // from the main loop
104         }
105 #if _DEBUG && _ALNMGR_TRACE
106         cerr << "refseq is on row " << refseq->m_RowIdx
107              << " seq " << refseq->m_SeqIdx << "\n";
108 #endif
109         // for each refseq segment
110         while (refseq->GetStarts().current != refseq->GetStarts().end()) {
111             stack< CRef<CAlnMixSegment> > seg_stack;
112             // To prevent infinite loop, remember all segments already on the stack.
113             set< CRef<CAlnMixSegment> > seg_set;
114             seg_stack.push(refseq->GetStarts().current->second);
115             seg_set.insert(refseq->GetStarts().current->second);
116 #if _DEBUG
117             const TSeqPos& refseq_start = refseq->GetStarts().current->first;
118 #if _DEBUG && _ALNMGR_TRACE
119             cerr << "  [row " << refseq->m_RowIdx
120                  << " seq " << refseq->m_SeqIdx
121                  << " start " << refseq_start
122                  << " was pushed into stack\n";
123 #endif
124 #endif
125 
126             while ( !seg_stack.empty() ) {
127 
128                 bool pop_seg = true;
129                 // check the gapped segments on the left
130                 ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
131                          seg_stack.top()->m_StartIts) {
132 
133                     CAlnMixSeq * row = start_its_i->first;
134 
135                     if (row->GetStarts().current != start_its_i->second) {
136 #if _DEBUG
137                         const TSeqPos& curr_row_start = row->GetStarts().current->first;
138                         const TSeqPos& row_start      = start_its_i->second->first;
139 
140                         if (row->m_PositiveStrand ?
141                             curr_row_start >
142                             row_start :
143                             curr_row_start <
144                             row_start) {
145                             string errstr =
146                                 string("CAlnMixSegments::Build():")
147                                 + " Internal error: Integrity broken" +
148                                 " row=" + NStr::IntToString(row->m_RowIdx) +
149                                 " seq=" + NStr::IntToString(row->m_SeqIdx)
150                                 + " curr_row_start="
151                                 + NStr::IntToString(curr_row_start)
152                                 + " row_start=" +
153                                 NStr::IntToString(row_start)
154                                 + " refseq_start=" +
155                                 NStr::IntToString(refseq_start)
156                                 + " strand=" +
157                                 (row->m_PositiveStrand ? "plus" : "minus");
158                             NCBI_THROW(CAlnException, eMergeFailure, errstr);
159                         }
160 #endif
161                         seg_stack.push(row->GetStarts().current->second);
162                         // If the segment is already in the set, we have infinite loop
163                         // and must terminate it.
164                         if (!seg_set.insert(row->GetStarts().current->second).second) {
165                             string errstr =
166                                 string("CAlnMixSegments::Build():")
167                                 + " Internal error: Infinite loop detected.";
168                             NCBI_THROW(CAlnException, eMergeFailure, errstr);
169                         }
170 #if _DEBUG && _ALNMGR_TRACE
171                         cerr << "  [row " << row->m_RowIdx
172                              << " seq " << row->m_SeqIdx
173                              << " start " << curr_row_start
174                              << " (left of start " << row_start << ") "
175                              << "was pushed into stack\n";
176 #endif
177                         pop_seg = false;
178                         break;
179                     }
180                 }
181 
182                 if (pop_seg) {
183 
184                     // inc/dec iterators for each row of the seg
185                     ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
186                              seg_stack.top()->m_StartIts) {
187                         CAlnMixSeq * row = start_its_i->first;
188 
189 #if _DEBUG
190                         const TSeqPos& curr_row_start = row->GetStarts().current->first;
191                         const TSeqPos& row_start      = start_its_i->second->first;
192 
193                         if (row->m_PositiveStrand  &&
194                             curr_row_start >
195                             row_start  ||
196                             !row->m_PositiveStrand  &&
197                             curr_row_start <
198                             row_start) {
199                             string errstr =
200                                 string("CAlnMixSegments::Build():")
201                                 + " Internal error: Integrity broken" +
202                                 " row=" + NStr::IntToString(row->m_RowIdx) +
203                                 " seq=" + NStr::IntToString(row->m_SeqIdx)
204                                 + " curr_row_start="
205                                 + NStr::IntToString(curr_row_start)
206                                 + " row_start=" +
207                                 NStr::IntToString(row_start)
208                                 + " refseq_start=" +
209                                 NStr::IntToString(refseq_start)
210                                 + " strand=" +
211                                 (row->m_PositiveStrand ? "plus" : "minus");
212                             NCBI_THROW(CAlnException, eMergeFailure, errstr);
213                         }
214 #endif
215 
216                         if (row->m_PositiveStrand) {
217                             row->SetStarts().current++;
218                         } else {
219                             if (row->SetStarts().current == row->GetStarts().begin()) {
220                                 row->SetStarts().current = row->GetStarts().end();
221                             } else {
222                                 row->SetStarts().current--;
223                             }
224                         }
225                     }
226 
227                     if (seg_stack.size() > 1) {
228                         // add to the gapped segments
229                         gapped_segs.push_back(seg_stack.top());
230                         seg_stack.pop();
231 #if _DEBUG && _ALNMGR_TRACE
232                         cerr << "  seg popped].\n";
233 #endif
234                     } else {
235                         // add the gapped segments if any
236                         if (gapped_segs.size()) {
237                             if (gap_join) {
238                                 // request to try to align
239                                 // gapped segments w/ equal len
240                                 x_ConsolidateGaps(gapped_segs);
241                             } else if (min_gap) {
242                                 // request to try to align
243                                 // all gapped segments
244                                 x_MinimizeGaps(gapped_segs);
245                             }
246                             if (orig_refseq) {
247                                 NON_CONST_ITERATE (TSegmentsContainer,
248                                                    seg_i, gapped_segs) {
249                                     m_Segments.push_back(&**seg_i);
250                                 }
251                                 gapped_segs.clear();
252                             }
253                         }
254                         // add the refseq segment
255                         if (orig_refseq) {
256                             m_Segments.push_back(seg_stack.top());
257                         } else {
258                             gapped_segs.push_back(seg_stack.top());
259                         }
260                         seg_stack.pop();
261 #if _DEBUG && _ALNMGR_TRACE
262                         cerr << "  refseq seg popped].\n";
263 #endif
264                     } // if (seg_stack.size() > 1)
265                 } // if (popseg)
266             } // while ( !seg_stack.empty() )
267         } // while (refseq->GetStarts().current != refseq->GetStarts().end())
268         orig_refseq = false;
269     } // while (true)
270 
271 
272     if (remove_leading_and_trailing_gaps) {
273         while (m_Segments.size()  &&  m_Segments.front()->m_StartIts.size() < 2) {
274             m_Segments.pop_front();
275         }
276         while (m_Segments.size()  &&  m_Segments.back()->m_StartIts.size() < 2) {
277             m_Segments.pop_back();
278         }
279     }
280 
281 }
282 
283 
284 
285 void
FillUnalignedRegions()286 CAlnMixSegments::FillUnalignedRegions()
287 {
288     vector<TSignedSeqPos> starts;
289     vector<TSeqPos> lens;
290     starts.resize(m_Rows.size(), -1);
291     lens.resize(m_Rows.size(), 0);
292 
293     TSeqPos len = 0;
294     CAlnMap::TNumrow rowidx;
295 
296     TSegments::iterator seg_i = m_Segments.begin();
297     while (seg_i != m_Segments.end()) {
298         len = (*seg_i)->m_Len;
299         ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
300                  (*seg_i)->m_StartIts) {
301             CAlnMixSeq * row = start_its_i->first;
302             rowidx = row->m_RowIdx;
303             TSignedSeqPos& prev_start = starts[rowidx];
304             TSeqPos& prev_len = lens[rowidx];
305             TSeqPos start = start_its_i->second->first;
306             const bool plus = row->m_PositiveStrand;
307             const int& width = row->m_Width;
308             TSeqPos prev_start_plus_len = prev_start + prev_len * width;
309             TSeqPos start_plus_len = start + len * width;
310             if (prev_start >= 0) {
311                 if (plus  &&  prev_start_plus_len < start  ||
312                     !plus  &&  start_plus_len < (TSeqPos) prev_start) {
313                     // create a new seg
314                     CRef<CAlnMixSegment> seg (new CAlnMixSegment);
315                     TSeqPos new_start;
316                     if (row->m_PositiveStrand) {
317                         new_start = prev_start + prev_len * width;
318                         seg->m_Len = (start - new_start) / width;
319                     } else {
320                         new_start = start_plus_len;
321                         seg->m_Len = (prev_start - new_start) / width;
322                     }
323                     row->SetStarts()[new_start] = seg;
324                     CAlnMixStarts::iterator start_i =
325                         start_its_i->second;
326                     seg->SetStartIterator(row,
327                                           row->m_PositiveStrand ?
328                                           --start_i :
329                                           ++start_i);
330 
331                     seg_i = m_Segments.insert(seg_i, seg);
332                     seg_i++;
333                 }
334             }
335             prev_start = start;
336             prev_len = len;
337         }
338         seg_i++;
339     }
340 }
341 
342 
343 void
x_ConsolidateGaps(TSegmentsContainer & gapped_segs)344 CAlnMixSegments::x_ConsolidateGaps(TSegmentsContainer& gapped_segs)
345 {
346     TSegmentsContainer::iterator seg1_i, seg2_i;
347 
348     seg2_i = seg1_i = gapped_segs.begin();
349     if (seg2_i != gapped_segs.end()) {
350         seg2_i++;
351     }
352 
353     bool         cache = false;
354     string       s1;
355     int          score1;
356     CAlnMixSeq * seq1;
357     CAlnMixSeq * seq2;
358 
359     while (seg2_i != gapped_segs.end()) {
360 
361         CAlnMixSegment * seg1 = *seg1_i;
362         CAlnMixSegment * seg2 = *seg2_i;
363 
364         // check if this seg possibly aligns with the previous one
365         bool possible = true;
366 
367         if (seg2->m_Len == seg1->m_Len  &&
368             seg2->m_StartIts.size() == 1) {
369 
370             seq2 = seg2->m_StartIts.begin()->first;
371 
372             // check if this seq was already used
373             ITERATE (CAlnMixSegment::TStartIterators,
374                      st_it,
375                      (*seg1_i)->m_StartIts) {
376                 if (st_it->first == seq2) {
377                     possible = false;
378                     break;
379                 }
380             }
381 
382             // check if score is sufficient
383             if (possible  &&  x_CalculateScore) {
384                 if (!cache) {
385 
386                     seq1 = seg1->m_StartIts.begin()->first;
387 
388                     seq2->GetSeqString(s1,
389                                        seg1->m_StartIts[seq1]->first,
390                                        seg1->m_Len * seq1->m_Width,
391                                        seq1->m_PositiveStrand);
392 
393                     score1 = x_CalculateScore(s1,
394                                               s1,
395                                               seq1->m_IsAA,
396                                               seq1->m_IsAA);
397                     cache = true;
398                 }
399 
400                 string s2;
401                 seq2->GetSeqString(s2,
402                                    seg2->m_StartIts[seq2]->first,
403                                    seg2->m_Len * seq2->m_Width,
404                                    seq2->m_PositiveStrand);
405 
406                 int score2 =
407                     x_CalculateScore(s1, s2, seq1->m_IsAA, seq2->m_IsAA);
408 
409                 if (score2 < 75 * score1 / 100) {
410                     possible = false;
411                 }
412             }
413 
414         } else {
415             possible = false;
416         }
417 
418         if (possible) {
419             // consolidate the ones so far
420 
421             // add the new row
422             seg1->SetStartIterator(seq2, seg2->m_StartIts.begin()->second);
423 
424             // point the row's start position to the beginning seg
425             seg2->m_StartIts.begin()->second->second = seg1;
426 
427             seg2_i = gapped_segs.erase(seg2_i);
428         } else {
429             cache = false;
430             seg1_i++;
431             seg2_i++;
432         }
433     }
434 }
435 
436 
437 void
x_MinimizeGaps(TSegmentsContainer & gapped_segs)438 CAlnMixSegments::x_MinimizeGaps(TSegmentsContainer& gapped_segs)
439 {
440     TSegmentsContainer::iterator  seg_i, seg_i_end, seg_i_begin;
441     CAlnMixSegment       * seg1, * seg2;
442     CRef<CAlnMixSegment> seg;
443     CAlnMixSeq           * seq;
444     TSegmentsContainer            new_segs;
445 
446     seg_i_begin = seg_i_end = seg_i = gapped_segs.begin();
447 
448     typedef map<TSeqPos, CRef<CAlnMixSegment> > TLenMap;
449     TLenMap len_map;
450 
451     while (seg_i_end != gapped_segs.end()) {
452 
453         len_map[(*seg_i_end)->m_Len];
454 
455         // check if this seg can possibly be minimized
456         bool possible = true;
457 
458         seg_i = seg_i_begin;
459         while (seg_i != seg_i_end) {
460             seg1 = *seg_i;
461             seg2 = *seg_i_end;
462 
463             ITERATE (CAlnMixSegment::TStartIterators,
464                      st_it,
465                      seg2->m_StartIts) {
466                 seq = st_it->first;
467                 // check if this seq was already used
468                 if (seg1->m_StartIts.find(seq) != seg1->m_StartIts.end()) {
469                     possible = false;
470                     break;
471                 }
472             }
473             if ( !possible ) {
474                 break;
475             }
476             seg_i++;
477         }
478         seg_i_end++;
479 
480         if ( !possible  ||  seg_i_end == gapped_segs.end()) {
481             // use the accumulated len info to create the new segs
482 
483             // create new segs with appropriate len
484             TSeqPos len_so_far = 0;
485             TLenMap::iterator len_i = len_map.begin();
486             while (len_i != len_map.end()) {
487                 len_i->second = new CAlnMixSegment();
488                 len_i->second->m_Len = len_i->first - len_so_far;
489                 len_so_far += len_i->second->m_Len;
490                 len_i++;
491             }
492 
493             // loop through the accumulated orig segs.
494             // copy info from them into the new segs
495             TLenMap::iterator len_i_end;
496             seg_i = seg_i_begin;
497             while (seg_i != seg_i_end) {
498                 TSeqPos orig_len = (*seg_i)->m_Len;
499 
500                 // determine the span of the current seg
501                 len_i_end = len_map.find(orig_len);
502                 len_i_end++;
503 
504                 // loop through its sequences
505                 NON_CONST_ITERATE (CAlnMixSegment::TStartIterators,
506                                    st_it,
507                                    (*seg_i)->m_StartIts) {
508 
509                     seq = st_it->first;
510                     TSeqPos orig_start = st_it->second->first;
511 
512                     len_i = len_map.begin();
513                     len_so_far = 0;
514                     // loop through the new segs
515                     while (len_i != len_i_end) {
516                         seg = len_i->second;
517 
518                         // calc the start
519                         TSeqPos this_start = orig_start +
520                             (seq->m_PositiveStrand ?
521                              len_so_far :
522                              orig_len - len_so_far - seg->m_Len) *
523                             seq->m_Width;
524 
525                         // create the bindings:
526                         seq->SetStarts()[this_start] = seg;
527                         seg->SetStartIterator(seq, seq->SetStarts().find(this_start));
528                         len_i++;
529                         len_so_far += seg->m_Len;
530                     }
531                 }
532                 seg_i++;
533             }
534             NON_CONST_ITERATE (TLenMap, len_it, len_map) {
535                 new_segs.push_back(len_it->second);
536             }
537             len_map.clear();
538             seg_i_begin = seg_i_end;
539         }
540     }
541     gapped_segs.clear();
542     ITERATE (TSegmentsContainer, new_seg_i, new_segs) {
543         gapped_segs.push_back(*new_seg_i);
544     }
545 }
546 
547 
548 void
StartItsConsistencyCheck(const CAlnMixSeq & seq,const TSeqPos & start,size_t match_idx) const549 CAlnMixSegment::StartItsConsistencyCheck(const CAlnMixSeq& seq,
550                                          const TSeqPos&    start,
551                                          size_t            match_idx) const
552 {
553     ITERATE(TStartIterators, st_it_i, m_StartIts) {
554         // both should point to the same seg
555         if ((*st_it_i).second->second != this) {
556             string errstr =
557                 string("CAlnMixSegment::StartItsConsistencyCheck")
558                 + " [match_idx=" + NStr::NumericToString(match_idx) + "]"
559                 + " The internal consistency check failed for"
560                 + " the segment containing ["
561                 + " row=" + NStr::NumericToString((*st_it_i).first->m_RowIdx)
562                 + " seq=" + NStr::NumericToString((*st_it_i).first->m_SeqIdx)
563                 + " strand=" +
564                 ((*st_it_i).first->m_PositiveStrand ? "plus" : "minus")
565                 + " start=" + NStr::NumericToString((*st_it_i).second->first)
566                 + "] aligned to: ["
567                 + " row=" + NStr::NumericToString(seq.m_RowIdx)
568                 + " seq=" + NStr::NumericToString(seq.m_SeqIdx)
569                 + " strand=" +
570                 (seq.m_PositiveStrand ? "plus" : "minus")
571                 + " start=" + NStr::NumericToString(start)
572                 + "].";
573             NCBI_THROW(CAlnException, eMergeFailure, errstr);
574         }
575     }
576 }
577 
578 
579 END_objects_SCOPE // namespace ncbi::objects::
580 END_NCBI_SCOPE
581