1 /* $Id: align_cleanup.cpp 618599 2020-10-22 17:38:51Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33
34 #include <algo/sequence/align_cleanup.hpp>
35
36 #include <objtools/alnmgr/aln_container.hpp>
37 #include <objtools/alnmgr/aln_tests.hpp>
38 #include <objtools/alnmgr/aln_stats.hpp>
39 #include <objtools/alnmgr/pairwise_aln.hpp>
40 #include <objtools/alnmgr/aln_converters.hpp>
41 #include <objtools/alnmgr/aln_builders.hpp>
42 #include <objtools/alnmgr/aln_serial.hpp>
43 #include <objtools/alnmgr/aln_user_options.hpp>
44 #include <objtools/alnmgr/aln_generators.hpp>
45 #include <objtools/alnmgr/seqids_extractor.hpp>
46
47 #include <objtools/alnmgr/alnmix.hpp>
48
49 #include <serial/serial.hpp>
50 #include <serial/iterator.hpp>
51
52 BEGIN_NCBI_SCOPE
53 USING_SCOPE(objects);
54
55 /////////////////////////////////////////////////////////////////////////////
56
CreatePairwiseFromMultiple(const CSeq_align & multiple,TAligns & pairwise)57 void CAlignCleanup::CreatePairwiseFromMultiple(const CSeq_align& multiple,
58 TAligns& pairwise)
59 {
60 #if 0
61 CAlnContainer aln_container;
62 aln_container.insert(multiple);
63
64 /// Types we use here:
65 typedef CSeq_align::TDim TDim;
66
67 /// Create a vector of seq-ids per seq-align
68 TIdExtract id_extract;
69 TAlnIdMap aln_id_map(id_extract, aln_container.size());
70 size_t count_accepted = 0;
71 ITERATE(CAlnContainer, aln_it, aln_container) {
72 try {
73 aln_id_map.push_back(**aln_it);
74 ++count_accepted;
75 }
76 catch (CAlnException& e) {
77 ERR_POST(Error
78 << "CAlgoPlugin_AlignCleanup::x_Run_AlignMgr(): "
79 << "failed to extract IDs: " << e.GetMsg());
80 }
81 }
82
83 if (count_accepted != aln_container.size()) {
84 if (count_accepted == 0) {
85 NCBI_THROW(CException, eUnknown,
86 "No valid alignments found");
87 return;
88 }
89
90 ERR_POST(Warning
91 << count_accepted << "/" << aln_container.size()
92 << " alignments had no IDs to extract.");
93 }
94
95 ///
96 /// gather statistics about our alignment
97 ///
98 TAlnStats aln_stats(aln_id_map);
99
100 CAlnUserOptions opts;
101 opts.m_MergeAlgo = CAlnUserOptions::eMergeAllSeqs;
102 opts.m_Direction = CAlnUserOptions::eBothDirections;
103 opts.SetMergeFlags(CAlnUserOptions::fTruncateOverlaps, true);
104
105 ///
106 /// create a set of anchored alignments
107 ///
108 TAnchoredAlnVec anchored_aln_vec;
109 CreateAnchoredAlnVec(aln_stats, anchored_aln_vec, opts);
110
111 ITERATE (TAnchoredAlnVec, iter, anchored_aln_vec) {
112 ITERATE(CAnchoredAln::TPairwiseAlnVector,
113 pairwise_aln_i,
114 (*iter)->GetPairwiseAlns()) {
115
116 CRef<CDense_seg> ds =
117 CreateDensegFromPairwiseAln(**pairwise_aln_i);
118 CRef<CSeq_align> aln(new CSeq_align);
119 aln->SetSegs().SetDenseg(*ds);
120 aln->SetType(CSeq_align::eType_partial);
121 pairwise.push_back(aln);
122 }
123 }
124 #endif
125
126 #if 1
127 _ASSERT(multiple.GetSegs().IsDenseg());
128
129 const CDense_seg& seg = multiple.GetSegs().GetDenseg();
130 CDense_seg::TDim max_rows = seg.GetDim();
131 for (CDense_seg::TDim row = 1; row < max_rows; ++row) {
132 CRef<CDense_seg> new_seg(new CDense_seg);
133
134 /// we are creating pairwise alignments
135 new_seg->SetDim(2);
136
137 /// get IDs right
138 CRef<CSeq_id> id;
139 id.Reset(new CSeq_id);
140 id->Assign(*seg.GetIds()[0]);
141 new_seg->SetIds().push_back(id);
142
143 id.Reset(new CSeq_id);
144 id->Assign(*seg.GetIds()[row]);
145 new_seg->SetIds().push_back(id);
146
147 /// copy the starts
148 CDense_seg::TNumseg segs = 0;
149 for (CDense_seg::TNumseg j = 0; j < seg.GetNumseg(); ++j) {
150 TSignedSeqPos start_0 = seg.GetStarts()[j * max_rows + 0];
151 TSignedSeqPos start_1 = seg.GetStarts()[j * max_rows + row];
152
153 if ((start_0 < 0 && start_1 < 0) // segment is entirely gapped
154 || (segs==0 && (start_0 < 0 || start_1 < 0)) // gapped segment at the beginning
155 ) {
156 continue;
157 }
158 new_seg->SetLens().push_back(seg.GetLens()[j]);
159 new_seg->SetStarts().push_back(start_0);
160 new_seg->SetStarts().push_back(start_1);
161 new_seg->SetStrands().push_back(seg.GetStrands()[j * max_rows + 0]);
162 new_seg->SetStrands().push_back(seg.GetStrands()[j * max_rows + row]);
163
164 ++segs;
165 }
166
167 while (segs && (new_seg->SetStarts()[segs*2-2] < 0 || new_seg->SetStarts()[segs*2-1] < 0)) {
168 // gapped segment at the end
169 --segs;
170 new_seg->SetLens().resize(segs);
171 new_seg->SetStarts().resize(segs*2);
172 new_seg->SetStrands().resize(segs*2);
173 }
174
175 /// set the number of segments
176 /// we will clean this up later
177 new_seg->SetNumseg(segs);
178
179 if (segs) {
180 try {
181 CRef<CSeq_align> align(new CSeq_align);
182
183 /// make sure we set type correctly
184 align->SetType(multiple.GetType());
185
186 align->SetSegs().SetDenseg(*new_seg);
187
188 ///
189 /// validation is optional!
190 align->Validate(true);
191
192 pairwise.push_back(align);
193 }
194 catch (CException& e) {
195 ERR_POST(Error
196 << "CAlignCleanup::CreatePairwiseFromMultiple(): "
197 << "failed to validate: " << e.GetMsg());
198 }
199 }
200 }
201 #endif
202 }
203
204
205 /////////////////////////////////////////////////////////////////////////////
206
CAlignCleanup()207 CAlignCleanup::CAlignCleanup()
208 : m_SortByScore(true)
209 , m_PreserveRows(false)
210 , m_FillUnaligned(false)
211 {
212 }
213
CAlignCleanup(CScope & scope)214 CAlignCleanup::CAlignCleanup(CScope& scope)
215 : m_Scope(&scope)
216 , m_SortByScore(true)
217 , m_PreserveRows(false)
218 , m_FillUnaligned(false)
219 {
220 }
221
222
Cleanup(const TAligns & aligns_in,TAligns & aligns_out,EMode mode)223 void CAlignCleanup::Cleanup(const TAligns& aligns_in,
224 TAligns& aligns_out,
225 EMode mode)
226 {
227 TConstAligns const_aligns_in;
228 copy(aligns_in.begin(), aligns_in.end(), back_inserter(const_aligns_in));
229 Cleanup(const_aligns_in, aligns_out, mode);
230 }
231
Cleanup(const TConstAligns & aligns_in,TAligns & aligns_out,EMode mode)232 void CAlignCleanup::Cleanup(const TConstAligns& aligns_in,
233 TAligns& aligns_out,
234 EMode mode)
235 {
236 size_t size = aligns_in.size();
237 if (size == 0) {
238 return;
239 }
240 if (size == 1) {
241 // short cut: just copy the alignment
242 CRef<CSeq_align> align(new CSeq_align);
243 align->Assign(*aligns_in.front());
244 aligns_out.push_back(align);
245 return;
246 }
247
248 switch (mode) {
249 case eAlignVec:
250 x_Cleanup_AlignVec(aligns_in, aligns_out);
251 break;
252
253 case eAnchoredAlign:
254 x_Cleanup_AnchoredAln(aligns_in, aligns_out);
255 break;
256 }
257 }
258
259
x_Cleanup_AnchoredAln(const TConstAligns & aligns_in,TAligns & aligns_out)260 void CAlignCleanup::x_Cleanup_AnchoredAln(const TConstAligns& aligns_in,
261 TAligns& aligns_out)
262 {
263 CAlnContainer aln_container;
264
265 ///
266 /// step 1: add to alignment container
267 ///
268 size_t count = 0;
269 size_t count_invalid = 0;
270 ITERATE (TConstAligns, iter, aligns_in) {
271
272 try {
273 ++count;
274 CConstRef<CSeq_align> aln = *iter;
275
276 ///
277 /// validation is optional!
278 aln->Validate(true);
279
280 aln_container.insert(*aln);
281 }
282 catch (CException& e) {
283 ERR_POST(Error
284 << "CAlgoPlugin_AlignCleanup::x_Run_AlignMgr(): "
285 << "failed to validate: " << e.GetMsg());
286 ++count_invalid;
287 }
288 }
289
290 if (count_invalid) {
291 string msg;
292 msg += NStr::NumericToString(count_invalid);
293 msg += "/";
294 msg += NStr::NumericToString(count);
295 msg += " alignments failed validation.";
296 if (count_invalid == count) {
297 NCBI_THROW(CException, eUnknown, msg);
298 } else {
299 ERR_POST(Warning << msg);
300 }
301 }
302
303 /// Create a vector of seq-ids per seq-align
304 TIdExtract id_extract;
305 TAlnIdMap aln_id_map(id_extract, aln_container.size());
306 size_t count_accepted = 0;
307 ITERATE(CAlnContainer, aln_it, aln_container) {
308 try {
309 aln_id_map.push_back(**aln_it);
310 ++count_accepted;
311 }
312 catch (CAlnException& e) {
313 ERR_POST(Error
314 << "CAlgoPlugin_AlignCleanup::x_Run_AlignMgr(): "
315 << "failed to extract IDs: " << e.GetMsg());
316 }
317 }
318
319 if (count_accepted != aln_container.size()) {
320 if (count_accepted == 0) {
321 NCBI_THROW(CException, eUnknown,
322 "No valid alignments found");
323 return;
324 }
325
326 ERR_POST(Warning
327 << count_accepted << "/" << aln_container.size()
328 << " alignments had no IDs to extract.");
329 }
330
331 ///
332 /// gather statistics about our alignment
333 ///
334 TAlnStats aln_stats(aln_id_map);
335
336
337 // auto-detect self-alignments
338 // if the input set of sequences correspond to one and only one sequence,
339 // force row preservation
340 bool preserve_rows = m_PreserveRows;
341 {{
342 set<CSeq_id_Handle> ids;
343 ITERATE (TAlnStats::TIdVec, i, aln_stats.GetIdVec()) {
344 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle((*i)->GetSeqId());
345 ids.insert(idh);
346 }
347 if (ids.size() == 1) {
348 preserve_rows = true;
349 }
350 }}
351
352 CAlnUserOptions opts;
353
354
355 /// always merge both directions
356 opts.m_Direction = CAlnUserOptions::eBothDirections;
357
358 ///
359 /// create a set of anchored alignments
360 ///
361 TAnchoredAlnVec anchored_aln_vec;
362 CreateAnchoredAlnVec(aln_stats, anchored_aln_vec, opts);
363
364 /// always merge all sequences
365 opts.m_MergeAlgo = CAlnUserOptions::eMergeAllSeqs;
366 if (preserve_rows) {
367 opts.m_MergeAlgo = CAlnUserOptions::ePreserveRows;
368 }
369
370 /// we default to truncating overlaps
371 CAlnUserOptions::TMergeFlags flags =
372 CAlnUserOptions::fTruncateOverlaps |
373 CAlnUserOptions::fUseAnchorAsAlnSeq;
374
375 /// we may disable soring by scores
376 if ( !m_SortByScore ) {
377 flags |= CAlnUserOptions::fSkipSortByScore;
378 }
379 opts.SetMergeFlags(flags, true);
380
381 ///
382 /// now, build
383 ///
384 CAnchoredAln out_anchored_aln;
385 BuildAln(anchored_aln_vec, out_anchored_aln, opts);
386
387 ///
388 /// create dense-segs and return
389 ///
390 #if 0
391 vector< CRef<CSeq_align> > ds_aligns;
392 ds_aligns.push_back(CreateSeqAlignFromAnchoredAln
393 (out_anchored_aln, CSeq_align::TSegs::e_Denseg));
394 #endif
395
396 vector< CRef<CSeq_align> > ds_aligns;
397 CreateSeqAlignFromEachPairwiseAln
398 (out_anchored_aln.GetPairwiseAlns(), out_anchored_aln.GetAnchorRow(),
399 ds_aligns, CSeq_align::TSegs::e_Denseg);
400
401 NON_CONST_ITERATE (vector< CRef<CSeq_align> >, it, ds_aligns) {
402 (*it)->SetType(CSeq_align::eType_partial);
403 aligns_out.push_back(*it);
404 }
405
406 /// fill unaligned regions
407 if (m_FillUnaligned) {
408 NON_CONST_ITERATE (TAligns, align_iter, aligns_out) {
409 CRef<CDense_seg> ds = (*align_iter)->SetSegs().SetDenseg().FillUnaligned();
410 (*align_iter)->SetSegs().SetDenseg(*ds);
411 }
412 }
413 }
414
415
x_Cleanup_AlignVec(const TConstAligns & aligns_in,TAligns & aligns_out)416 void CAlignCleanup::x_Cleanup_AlignVec(const TConstAligns& aligns_in,
417 TAligns& aligns_out)
418 {
419 /// first, sort the alignments by the set of IDs they contain
420 typedef set<CSeq_id_Handle> TIdSet;
421 typedef map<TIdSet, list< CConstRef<CSeq_align> > > TAlignments;
422 TAlignments align_map;
423
424 bool all_pairwise = true;
425 ITERATE (TConstAligns, iter, aligns_in) {
426 CConstRef<CSeq_align> al = *iter;
427 if (al->GetSegs().IsDenseg() &&
428 al->GetSegs().GetDenseg().GetDim() != 2) {
429 all_pairwise = false;
430 }
431
432 TIdSet id_set;
433 CTypeConstIterator<CSeq_id> id_iter(*al);
434 for ( ; id_iter; ++id_iter) {
435 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*id_iter);
436 id_set.insert(idh);
437 }
438
439 align_map[id_set].push_back(al);
440 }
441
442 CAlnMix::TMergeFlags merge_flags = CAlnMix::fTruncateOverlaps |
443 CAlnMix::fRemoveLeadTrailGaps |
444 CAlnMix::fGapJoin |
445 CAlnMix::fMinGap;
446 if (m_SortByScore) {
447 merge_flags |= CAlnMix::fSortInputByScore;
448 }
449
450 /// next, merge each sublist independently, if needed
451 string msg;
452 ITERATE (TAlignments, iter, align_map) {
453 typedef list< CConstRef<CSeq_align> > TAlignList;
454 list<TAlignList> alignments;
455
456 alignments.push_back(TAlignList());
457 TAlignList& pos_strand = alignments.back();
458
459 alignments.push_back(TAlignList());
460 TAlignList& neg_strand = alignments.back();
461
462 ITERATE (list< CConstRef<CSeq_align> >, it, iter->second) {
463 const CSeq_align& align = **it;
464 if (align.GetSegs().IsDenseg() && align.GetSegs().GetDenseg().GetDim() == 2) {
465 const CDense_seg& ds = align.GetSegs().GetDenseg();
466 /// common case: dense-seg, particularly pairwise
467 if (ds.IsSetStrands() && ds.GetStrands()[0] != ds.GetStrands()[1]) {
468 neg_strand.push_back(*it);
469 } else {
470 pos_strand.push_back(*it);
471 }
472 } else {
473 /// mixed, so we bain - this is not a common case
474 pos_strand.insert(pos_strand.end(),
475 neg_strand.begin(), neg_strand.end());
476 for ( ; it != iter->second.end(); ++it) {
477 pos_strand.push_back(*it);
478 }
479 --it; // prevent enclosing loop to increment past end
480 }
481 }
482
483 /// now, we mix two sets of alignments,
484 /// one negative strand, one positive strand
485 size_t count = 0;
486 ITERATE (list<TAlignList>, it, alignments) {
487 ++count;
488 if (it->empty()) {
489 continue;
490 }
491 try {
492 auto_ptr<CAlnMix> mix_ptr( m_Scope ? new CAlnMix(*m_Scope) : new CAlnMix() );
493 CAlnMix& mix = *mix_ptr;
494 CAlnMix::TAddFlags flags = 0;
495 if (iter->first.size() == 1 || m_PreserveRows) {
496 flags |= CAlnMix::fPreserveRows;
497 }
498 ITERATE (TAlignList, i, *it) {
499 mix.Add(**i, flags);
500 }
501
502 mix.Merge(merge_flags);
503
504 if (mix.GetDenseg().GetStarts().size() == 0) {
505 NCBI_THROW(CException, eUnknown,
506 "Mix produced empty alignment");
507 }
508
509 if (mix.GetDenseg().GetLens().size() == 0) {
510 NCBI_THROW(CException, eUnknown,
511 "Mix produced empty alignment");
512 }
513
514 list< CRef<CSeq_align> > aligns;
515 CRef<CSeq_align> new_align(new CSeq_align);
516 new_align->Assign(mix.GetSeqAlign());
517
518 if (all_pairwise &&
519 new_align->GetSegs().IsDenseg() &&
520 new_align->GetSegs().GetDenseg().GetDim() > 2) {
521 CreatePairwiseFromMultiple(*new_align, aligns);
522 } else {
523 aligns.push_back(new_align);
524 }
525
526 NON_CONST_ITERATE (list< CRef<CSeq_align> >, align, aligns) {
527 if ((*align)->GetSegs().IsDenseg()) {
528 (*align)->SetSegs().SetDenseg().Compact();
529 }
530 aligns_out.push_back(*align);
531 }
532 }
533 catch (CException& e) {
534 ERR_POST(Error << "CAlgoPlugin_AlignCleanup::Run(): "
535 "error merging alignments: "
536 << e.GetMsg());
537 if ( !msg.empty() ) {
538 msg += "\n";
539 }
540 msg += e.GetMsg();
541 }
542 }
543 }
544 }
545
546
547 END_NCBI_SCOPE
548
549