1 /*  $Id: align_cleanup.cpp 618599 2020-10-22 17:38:51Z grichenk $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <algo/sequence/align_cleanup.hpp>
35 
36 #include <objtools/alnmgr/aln_container.hpp>
37 #include <objtools/alnmgr/aln_tests.hpp>
38 #include <objtools/alnmgr/aln_stats.hpp>
39 #include <objtools/alnmgr/pairwise_aln.hpp>
40 #include <objtools/alnmgr/aln_converters.hpp>
41 #include <objtools/alnmgr/aln_builders.hpp>
42 #include <objtools/alnmgr/aln_serial.hpp>
43 #include <objtools/alnmgr/aln_user_options.hpp>
44 #include <objtools/alnmgr/aln_generators.hpp>
45 #include <objtools/alnmgr/seqids_extractor.hpp>
46 
47 #include <objtools/alnmgr/alnmix.hpp>
48 
49 #include <serial/serial.hpp>
50 #include <serial/iterator.hpp>
51 
52 BEGIN_NCBI_SCOPE
53 USING_SCOPE(objects);
54 
55 /////////////////////////////////////////////////////////////////////////////
56 
CreatePairwiseFromMultiple(const CSeq_align & multiple,TAligns & pairwise)57 void CAlignCleanup::CreatePairwiseFromMultiple(const CSeq_align& multiple,
58                                                TAligns&          pairwise)
59 {
60 #if 0
61     CAlnContainer aln_container;
62     aln_container.insert(multiple);
63 
64     /// Types we use here:
65     typedef CSeq_align::TDim TDim;
66 
67     /// Create a vector of seq-ids per seq-align
68     TIdExtract id_extract;
69     TAlnIdMap aln_id_map(id_extract, aln_container.size());
70     size_t count_accepted = 0;
71     ITERATE(CAlnContainer, aln_it, aln_container) {
72         try {
73             aln_id_map.push_back(**aln_it);
74             ++count_accepted;
75         }
76         catch (CAlnException& e) {
77             ERR_POST(Error
78                 << "CAlgoPlugin_AlignCleanup::x_Run_AlignMgr(): "
79                 << "failed to extract IDs: " << e.GetMsg());
80         }
81     }
82 
83     if (count_accepted != aln_container.size()) {
84         if (count_accepted == 0) {
85             NCBI_THROW(CException, eUnknown,
86                        "No valid alignments found");
87             return;
88         }
89 
90         ERR_POST(Warning
91             << count_accepted << "/" << aln_container.size()
92             << " alignments had no IDs to extract.");
93     }
94 
95     ///
96     /// gather statistics about our alignment
97     ///
98     TAlnStats aln_stats(aln_id_map);
99 
100     CAlnUserOptions opts;
101     opts.m_MergeAlgo = CAlnUserOptions::eMergeAllSeqs;
102     opts.m_Direction = CAlnUserOptions::eBothDirections;
103     opts.SetMergeFlags(CAlnUserOptions::fTruncateOverlaps, true);
104 
105     ///
106     /// create a set of anchored alignments
107     ///
108     TAnchoredAlnVec anchored_aln_vec;
109     CreateAnchoredAlnVec(aln_stats, anchored_aln_vec, opts);
110 
111     ITERATE (TAnchoredAlnVec, iter, anchored_aln_vec) {
112         ITERATE(CAnchoredAln::TPairwiseAlnVector,
113                 pairwise_aln_i,
114                 (*iter)->GetPairwiseAlns()) {
115 
116             CRef<CDense_seg> ds =
117                 CreateDensegFromPairwiseAln(**pairwise_aln_i);
118             CRef<CSeq_align> aln(new CSeq_align);
119             aln->SetSegs().SetDenseg(*ds);
120             aln->SetType(CSeq_align::eType_partial);
121             pairwise.push_back(aln);
122         }
123     }
124 #endif
125 
126 #if 1
127     _ASSERT(multiple.GetSegs().IsDenseg());
128 
129     const CDense_seg& seg = multiple.GetSegs().GetDenseg();
130     CDense_seg::TDim max_rows = seg.GetDim();
131     for (CDense_seg::TDim row = 1;  row < max_rows;  ++row) {
132         CRef<CDense_seg> new_seg(new CDense_seg);
133 
134         /// we are creating pairwise alignments
135         new_seg->SetDim(2);
136 
137         /// get IDs right
138         CRef<CSeq_id> id;
139         id.Reset(new CSeq_id);
140         id->Assign(*seg.GetIds()[0]);
141         new_seg->SetIds().push_back(id);
142 
143         id.Reset(new CSeq_id);
144         id->Assign(*seg.GetIds()[row]);
145         new_seg->SetIds().push_back(id);
146 
147         /// copy the starts
148         CDense_seg::TNumseg segs = 0;
149         for (CDense_seg::TNumseg j = 0;  j < seg.GetNumseg();  ++j) {
150             TSignedSeqPos start_0 = seg.GetStarts()[j * max_rows + 0];
151             TSignedSeqPos start_1 = seg.GetStarts()[j * max_rows + row];
152 
153             if ((start_0 < 0  &&  start_1 < 0) // segment is entirely gapped
154                 || (segs==0 && (start_0 < 0  ||  start_1 < 0)) // gapped segment at the beginning
155                 ) {
156                 continue;
157             }
158             new_seg->SetLens().push_back(seg.GetLens()[j]);
159             new_seg->SetStarts().push_back(start_0);
160             new_seg->SetStarts().push_back(start_1);
161             new_seg->SetStrands().push_back(seg.GetStrands()[j * max_rows + 0]);
162             new_seg->SetStrands().push_back(seg.GetStrands()[j * max_rows + row]);
163 
164             ++segs;
165         }
166 
167         while (segs && (new_seg->SetStarts()[segs*2-2] < 0 || new_seg->SetStarts()[segs*2-1] < 0)) {
168             // gapped segment at the end
169             --segs;
170             new_seg->SetLens().resize(segs);
171             new_seg->SetStarts().resize(segs*2);
172             new_seg->SetStrands().resize(segs*2);
173         }
174 
175         /// set the number of segments
176         /// we will clean this up later
177         new_seg->SetNumseg(segs);
178 
179         if (segs) {
180             try {
181                 CRef<CSeq_align> align(new CSeq_align);
182 
183                 /// make sure we set type correctly
184                 align->SetType(multiple.GetType());
185 
186                 align->SetSegs().SetDenseg(*new_seg);
187 
188                 ///
189                 /// validation is optional!
190                 align->Validate(true);
191 
192                 pairwise.push_back(align);
193             }
194             catch (CException& e) {
195                 ERR_POST(Error
196                     << "CAlignCleanup::CreatePairwiseFromMultiple(): "
197                     << "failed to validate: " << e.GetMsg());
198             }
199         }
200     }
201 #endif
202 }
203 
204 
205 /////////////////////////////////////////////////////////////////////////////
206 
CAlignCleanup()207 CAlignCleanup::CAlignCleanup()
208     : m_SortByScore(true)
209     , m_PreserveRows(false)
210     , m_FillUnaligned(false)
211 {
212 }
213 
CAlignCleanup(CScope & scope)214 CAlignCleanup::CAlignCleanup(CScope& scope)
215     : m_Scope(&scope)
216     , m_SortByScore(true)
217     , m_PreserveRows(false)
218     , m_FillUnaligned(false)
219 {
220 }
221 
222 
Cleanup(const TAligns & aligns_in,TAligns & aligns_out,EMode mode)223 void CAlignCleanup::Cleanup(const TAligns& aligns_in,
224                             TAligns&       aligns_out,
225                             EMode          mode)
226 {
227     TConstAligns const_aligns_in;
228     copy(aligns_in.begin(), aligns_in.end(), back_inserter(const_aligns_in));
229     Cleanup(const_aligns_in, aligns_out, mode);
230 }
231 
Cleanup(const TConstAligns & aligns_in,TAligns & aligns_out,EMode mode)232 void CAlignCleanup::Cleanup(const TConstAligns& aligns_in,
233                             TAligns&            aligns_out,
234                             EMode               mode)
235 {
236     size_t size = aligns_in.size();
237     if (size == 0) {
238         return;
239     }
240     if (size == 1) {
241         // short cut: just copy the alignment
242         CRef<CSeq_align> align(new CSeq_align);
243         align->Assign(*aligns_in.front());
244         aligns_out.push_back(align);
245         return;
246     }
247 
248     switch (mode) {
249     case eAlignVec:
250         x_Cleanup_AlignVec(aligns_in, aligns_out);
251         break;
252 
253     case eAnchoredAlign:
254         x_Cleanup_AnchoredAln(aligns_in, aligns_out);
255         break;
256     }
257 }
258 
259 
x_Cleanup_AnchoredAln(const TConstAligns & aligns_in,TAligns & aligns_out)260 void CAlignCleanup::x_Cleanup_AnchoredAln(const TConstAligns& aligns_in,
261                                           TAligns&            aligns_out)
262 {
263     CAlnContainer aln_container;
264 
265     ///
266     /// step 1: add to alignment container
267     ///
268     size_t count = 0;
269     size_t count_invalid = 0;
270     ITERATE (TConstAligns, iter, aligns_in) {
271 
272         try {
273             ++count;
274             CConstRef<CSeq_align> aln = *iter;
275 
276             ///
277             /// validation is optional!
278             aln->Validate(true);
279 
280             aln_container.insert(*aln);
281         }
282         catch (CException& e) {
283             ERR_POST(Error
284                 << "CAlgoPlugin_AlignCleanup::x_Run_AlignMgr(): "
285                 << "failed to validate: " << e.GetMsg());
286             ++count_invalid;
287         }
288     }
289 
290     if (count_invalid) {
291         string msg;
292         msg += NStr::NumericToString(count_invalid);
293         msg += "/";
294         msg += NStr::NumericToString(count);
295         msg += " alignments failed validation.";
296         if (count_invalid == count) {
297             NCBI_THROW(CException, eUnknown, msg);
298         } else {
299             ERR_POST(Warning << msg);
300         }
301     }
302 
303     /// Create a vector of seq-ids per seq-align
304     TIdExtract id_extract;
305     TAlnIdMap aln_id_map(id_extract, aln_container.size());
306     size_t count_accepted = 0;
307     ITERATE(CAlnContainer, aln_it, aln_container) {
308         try {
309             aln_id_map.push_back(**aln_it);
310             ++count_accepted;
311         }
312         catch (CAlnException& e) {
313             ERR_POST(Error
314                 << "CAlgoPlugin_AlignCleanup::x_Run_AlignMgr(): "
315                 << "failed to extract IDs: " << e.GetMsg());
316         }
317     }
318 
319     if (count_accepted != aln_container.size()) {
320         if (count_accepted == 0) {
321             NCBI_THROW(CException, eUnknown,
322                        "No valid alignments found");
323             return;
324         }
325 
326         ERR_POST(Warning
327             << count_accepted << "/" << aln_container.size()
328             << " alignments had no IDs to extract.");
329     }
330 
331     ///
332     /// gather statistics about our alignment
333     ///
334     TAlnStats aln_stats(aln_id_map);
335 
336 
337     // auto-detect self-alignments
338     // if the input set of sequences correspond to one and only one sequence,
339     // force row preservation
340     bool preserve_rows = m_PreserveRows;
341     {{
342          set<CSeq_id_Handle> ids;
343          ITERATE (TAlnStats::TIdVec, i, aln_stats.GetIdVec()) {
344              CSeq_id_Handle idh = CSeq_id_Handle::GetHandle((*i)->GetSeqId());
345              ids.insert(idh);
346          }
347          if (ids.size() == 1) {
348              preserve_rows = true;
349          }
350      }}
351 
352     CAlnUserOptions opts;
353 
354 
355     /// always merge both directions
356     opts.m_Direction = CAlnUserOptions::eBothDirections;
357 
358     ///
359     /// create a set of anchored alignments
360     ///
361     TAnchoredAlnVec anchored_aln_vec;
362     CreateAnchoredAlnVec(aln_stats, anchored_aln_vec, opts);
363 
364     /// always merge all sequences
365     opts.m_MergeAlgo = CAlnUserOptions::eMergeAllSeqs;
366     if (preserve_rows) {
367         opts.m_MergeAlgo = CAlnUserOptions::ePreserveRows;
368     }
369 
370     /// we default to truncating overlaps
371     CAlnUserOptions::TMergeFlags flags =
372         CAlnUserOptions::fTruncateOverlaps |
373         CAlnUserOptions::fUseAnchorAsAlnSeq;
374 
375     /// we may disable soring by scores
376     if ( !m_SortByScore ) {
377         flags |= CAlnUserOptions::fSkipSortByScore;
378     }
379     opts.SetMergeFlags(flags, true);
380 
381     ///
382     /// now, build
383     ///
384     CAnchoredAln out_anchored_aln;
385     BuildAln(anchored_aln_vec, out_anchored_aln, opts);
386 
387     ///
388     /// create dense-segs and return
389     ///
390 #if 0
391     vector< CRef<CSeq_align> > ds_aligns;
392     ds_aligns.push_back(CreateSeqAlignFromAnchoredAln
393                         (out_anchored_aln, CSeq_align::TSegs::e_Denseg));
394 #endif
395 
396     vector< CRef<CSeq_align> > ds_aligns;
397     CreateSeqAlignFromEachPairwiseAln
398         (out_anchored_aln.GetPairwiseAlns(), out_anchored_aln.GetAnchorRow(),
399          ds_aligns, CSeq_align::TSegs::e_Denseg);
400 
401     NON_CONST_ITERATE (vector< CRef<CSeq_align> >, it, ds_aligns) {
402         (*it)->SetType(CSeq_align::eType_partial);
403         aligns_out.push_back(*it);
404     }
405 
406     /// fill unaligned regions
407     if (m_FillUnaligned) {
408         NON_CONST_ITERATE (TAligns, align_iter, aligns_out) {
409             CRef<CDense_seg> ds = (*align_iter)->SetSegs().SetDenseg().FillUnaligned();
410             (*align_iter)->SetSegs().SetDenseg(*ds);
411         }
412     }
413 }
414 
415 
x_Cleanup_AlignVec(const TConstAligns & aligns_in,TAligns & aligns_out)416 void CAlignCleanup::x_Cleanup_AlignVec(const TConstAligns& aligns_in,
417                                        TAligns&            aligns_out)
418 {
419     /// first, sort the alignments by the set of IDs they contain
420     typedef set<CSeq_id_Handle> TIdSet;
421     typedef map<TIdSet, list< CConstRef<CSeq_align> > > TAlignments;
422     TAlignments align_map;
423 
424     bool all_pairwise = true;
425     ITERATE (TConstAligns, iter, aligns_in) {
426         CConstRef<CSeq_align> al = *iter;
427         if (al->GetSegs().IsDenseg()  &&
428             al->GetSegs().GetDenseg().GetDim() != 2) {
429             all_pairwise = false;
430         }
431 
432         TIdSet id_set;
433         CTypeConstIterator<CSeq_id> id_iter(*al);
434         for ( ;  id_iter;  ++id_iter) {
435             CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*id_iter);
436             id_set.insert(idh);
437         }
438 
439         align_map[id_set].push_back(al);
440     }
441 
442     CAlnMix::TMergeFlags merge_flags = CAlnMix::fTruncateOverlaps |
443                                        CAlnMix::fRemoveLeadTrailGaps |
444                                        CAlnMix::fGapJoin |
445                                        CAlnMix::fMinGap;
446     if (m_SortByScore) {
447         merge_flags |= CAlnMix::fSortInputByScore;
448     }
449 
450     /// next, merge each sublist independently, if needed
451     string msg;
452     ITERATE (TAlignments, iter, align_map) {
453         typedef list< CConstRef<CSeq_align> > TAlignList;
454         list<TAlignList> alignments;
455 
456         alignments.push_back(TAlignList());
457         TAlignList& pos_strand = alignments.back();
458 
459         alignments.push_back(TAlignList());
460         TAlignList& neg_strand = alignments.back();
461 
462         ITERATE (list< CConstRef<CSeq_align> >, it, iter->second) {
463             const CSeq_align& align = **it;
464             if (align.GetSegs().IsDenseg()  &&  align.GetSegs().GetDenseg().GetDim() == 2) {
465                 const CDense_seg& ds = align.GetSegs().GetDenseg();
466                 /// common case: dense-seg, particularly pairwise
467                 if (ds.IsSetStrands()  &&  ds.GetStrands()[0] != ds.GetStrands()[1]) {
468                     neg_strand.push_back(*it);
469                 } else {
470                     pos_strand.push_back(*it);
471                 }
472             } else {
473                 /// mixed, so we bain - this is not a common case
474                 pos_strand.insert(pos_strand.end(),
475                                   neg_strand.begin(), neg_strand.end());
476                 for ( ; it != iter->second.end();  ++it) {
477                     pos_strand.push_back(*it);
478                 }
479                 --it; // prevent enclosing loop to increment past end
480             }
481         }
482 
483         /// now, we mix two sets of alignments,
484         /// one negative strand, one positive strand
485         size_t count = 0;
486         ITERATE (list<TAlignList>, it, alignments) {
487             ++count;
488             if (it->empty()) {
489                 continue;
490             }
491             try {
492                 auto_ptr<CAlnMix> mix_ptr( m_Scope ? new CAlnMix(*m_Scope) : new CAlnMix() );
493                 CAlnMix& mix = *mix_ptr;
494                 CAlnMix::TAddFlags flags = 0;
495                 if (iter->first.size() == 1  ||  m_PreserveRows) {
496                     flags |= CAlnMix::fPreserveRows;
497                 }
498                 ITERATE (TAlignList, i, *it) {
499                     mix.Add(**i, flags);
500                 }
501 
502                 mix.Merge(merge_flags);
503 
504                 if (mix.GetDenseg().GetStarts().size() == 0) {
505                     NCBI_THROW(CException, eUnknown,
506                                "Mix produced empty alignment");
507                 }
508 
509                 if (mix.GetDenseg().GetLens().size() == 0) {
510                     NCBI_THROW(CException, eUnknown,
511                                "Mix produced empty alignment");
512                 }
513 
514                 list< CRef<CSeq_align> > aligns;
515                 CRef<CSeq_align> new_align(new CSeq_align);
516                 new_align->Assign(mix.GetSeqAlign());
517 
518                 if (all_pairwise  &&
519                     new_align->GetSegs().IsDenseg()  &&
520                     new_align->GetSegs().GetDenseg().GetDim() > 2) {
521                     CreatePairwiseFromMultiple(*new_align, aligns);
522                 } else {
523                     aligns.push_back(new_align);
524                 }
525 
526                 NON_CONST_ITERATE (list< CRef<CSeq_align> >, align, aligns) {
527                     if ((*align)->GetSegs().IsDenseg()) {
528                         (*align)->SetSegs().SetDenseg().Compact();
529                     }
530                     aligns_out.push_back(*align);
531                 }
532             }
533             catch (CException& e) {
534                 ERR_POST(Error << "CAlgoPlugin_AlignCleanup::Run(): "
535                     "error merging alignments: "
536                     << e.GetMsg());
537                 if ( !msg.empty() ) {
538                     msg += "\n";
539                 }
540                 msg += e.GetMsg();
541             }
542         }
543     }
544 }
545 
546 
547 END_NCBI_SCOPE
548 
549