1 /*  $Id: seq_align_mapper.cpp 439337 2014-06-27 16:08:53Z vasilche $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko
27 *
28 * File Description:
29 *   Alignment mapper
30 *
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <objmgr/impl/seq_align_mapper.hpp>
35 #include <objmgr/seq_loc_mapper.hpp>
36 #include <objmgr/objmgr_exception.hpp>
37 #include <objects/seqalign/seqalign__.hpp>
38 #include <algorithm>
39 
40 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)41 BEGIN_SCOPE(objects)
42 
43 
44 CSeq_align_Mapper::CSeq_align_Mapper(const CSeq_align&     align,
45                                      CSeq_loc_Mapper_Base& loc_mapper)
46     : CSeq_align_Mapper_Base(loc_mapper)
47 {
48     x_Init(align);
49 }
50 
51 
CSeq_align_Mapper(CSeq_loc_Mapper_Base & loc_mapper)52 CSeq_align_Mapper::CSeq_align_Mapper(CSeq_loc_Mapper_Base& loc_mapper)
53     : CSeq_align_Mapper_Base(loc_mapper)
54 {
55 }
56 
57 
~CSeq_align_Mapper(void)58 CSeq_align_Mapper::~CSeq_align_Mapper(void)
59 {
60 }
61 
62 
63 // Mapping through CSeq_loc_Conversion_Set
64 
65 struct CConversionRef_Less
66 {
67     bool operator()(const CRef<CSeq_loc_Conversion>& x,
68                     const CRef<CSeq_loc_Conversion>& y) const;
69 };
70 
71 
operator ()(const CRef<CSeq_loc_Conversion> & x,const CRef<CSeq_loc_Conversion> & y) const72 bool CConversionRef_Less::operator()(const CRef<CSeq_loc_Conversion>& x,
73                                      const CRef<CSeq_loc_Conversion>& y) const
74 {
75     if (x->m_Src_id_Handle != y->m_Src_id_Handle) {
76         return x->m_Src_id_Handle < y->m_Src_id_Handle;
77     }
78     // Leftmost first
79     if (x->m_Src_from != y->m_Src_from) {
80         return x->m_Src_from < y->m_Src_from;
81     }
82     // Longest first
83     return x->m_Src_to > y->m_Src_to;
84 }
85 
86 
Convert(CSeq_loc_Conversion_Set & cvts)87 void CSeq_align_Mapper::Convert(CSeq_loc_Conversion_Set& cvts)
88 {
89     m_DstAlign.Reset();
90 
91     if (m_SubAligns.size() > 0) {
92         NON_CONST_ITERATE(TSubAligns, it, m_SubAligns) {
93             dynamic_cast<CSeq_align_Mapper*>(it->GetPointer())->
94                 Convert(cvts);
95         }
96         return;
97     }
98     x_ConvertAlignCvt(cvts);
99 }
100 
101 
x_ConvertAlignCvt(CSeq_loc_Conversion_Set & cvts)102 void CSeq_align_Mapper::x_ConvertAlignCvt(CSeq_loc_Conversion_Set& cvts)
103 {
104     if (cvts.m_CvtByIndex.size() == 0) {
105         // Single mapping
106         _ASSERT(cvts.m_SingleConv);
107         if ( cvts.m_SingleIndex == cvts.kAllIndexes ) {
108             for ( size_t row = 0; row < GetMaxDim(); ++row ) {
109                 x_ConvertRowCvt(*cvts.m_SingleConv, row);
110             }
111         }
112         else {
113             x_ConvertRowCvt(*cvts.m_SingleConv, cvts.m_SingleIndex);
114         }
115         return;
116     }
117     NON_CONST_ITERATE(CSeq_loc_Conversion_Set::TConvByIndex, idx_it,
118                       cvts.m_CvtByIndex) {
119         if ( idx_it->first == cvts.kAllIndexes ) {
120             for ( size_t row = 0; row < GetMaxDim(); ++row ) {
121                 x_ConvertRowCvt(idx_it->second, row);
122             }
123         }
124         else {
125             x_ConvertRowCvt(idx_it->second, idx_it->first);
126         }
127     }
128 }
129 
130 
x_ConvertRowCvt(CSeq_loc_Conversion & cvt,size_t row)131 void CSeq_align_Mapper::x_ConvertRowCvt(CSeq_loc_Conversion& cvt,
132                                         size_t row)
133 {
134     CSeq_id_Handle dst_id;
135     TSegments::iterator seg_it = m_Segs.begin();
136     for ( ; seg_it != m_Segs.end(); ) {
137         if (seg_it->m_Rows.size() <= row) {
138             // No such row in the current segment
139             ++seg_it;
140             m_AlignFlags = eAlign_MultiDim;
141             continue;
142         }
143         CSeq_id_Handle seg_id = x_ConvertSegmentCvt(seg_it, cvt, row);
144         if (dst_id) {
145             if (dst_id != seg_id  &&  m_AlignFlags == eAlign_Normal) {
146                 m_AlignFlags = eAlign_MultiId;
147             }
148             dst_id = seg_id;
149         }
150     }
151 }
152 
153 
x_ConvertRowCvt(TIdMap & cvts,size_t row)154 void CSeq_align_Mapper::x_ConvertRowCvt(TIdMap& cvts,
155                                         size_t row)
156 {
157     CSeq_id_Handle dst_id;
158     TSegments::iterator seg_it = m_Segs.begin();
159     for ( ; seg_it != m_Segs.end(); ) {
160         if (seg_it->m_Rows.size() <= row) {
161             // No such row in the current segment
162             ++seg_it;
163             m_AlignFlags = eAlign_MultiDim;
164             continue;
165         }
166         CSeq_id_Handle seg_id = x_ConvertSegmentCvt(seg_it, cvts, row);
167         if (dst_id) {
168             if (dst_id != seg_id  &&  m_AlignFlags == eAlign_Normal) {
169                 m_AlignFlags = eAlign_MultiId;
170             }
171             dst_id = seg_id;
172         }
173     }
174 }
175 
176 
177 CSeq_id_Handle
x_ConvertSegmentCvt(TSegments::iterator & seg_it,CSeq_loc_Conversion & cvt,size_t row)178 CSeq_align_Mapper::x_ConvertSegmentCvt(TSegments::iterator& seg_it,
179                                        CSeq_loc_Conversion& cvt,
180                                        size_t row)
181 {
182     TSegments::iterator old_it = seg_it;
183     ++seg_it;
184     SAlignment_Segment::SAlignment_Row& aln_row = old_it->m_Rows[row];
185     if (aln_row.m_Id != cvt.m_Src_id_Handle) {
186         return aln_row.m_Id;
187     }
188     if (aln_row.m_Start == kInvalidSeqPos) {
189         // ??? Skipped row - change ID
190         aln_row.m_Id = cvt.m_Dst_id_Handle;
191         aln_row.SetMapped();
192         return aln_row.m_Id;
193     }
194     TRange rg(aln_row.m_Start, aln_row.m_Start + old_it->m_Len - 1);
195     if (!cvt.ConvertInterval(rg.GetFrom(), rg.GetTo(), aln_row.m_Strand) ) {
196         // Do not erase the segment, just change the row ID and reset start
197         aln_row.m_Start = kInvalidSeqPos;
198         aln_row.m_Id = cvt.m_Dst_id_Handle;
199         aln_row.SetMapped();
200         return aln_row.m_Id;
201     }
202 
203     // Prepare insert point depending on the source strand
204     TSegments::iterator ins_point = seg_it;
205     bool src_reverse = aln_row.m_IsSetStrand ? IsReverse(aln_row.m_Strand) : false;
206 
207     // At least part of the interval was converted.
208     TSeqPos dl = cvt.m_Src_from <= rg.GetFrom() ?
209         0 : cvt.m_Src_from - rg.GetFrom();
210     TSeqPos dr = cvt.m_Src_to >= rg.GetTo() ?
211         0 : rg.GetTo() - cvt.m_Src_to;
212     if (dl > 0) {
213         // Add segment for the skipped range
214         SAlignment_Segment& lseg = x_InsertSeg(ins_point, dl,
215             old_it->m_Rows.size(), src_reverse);
216         lseg.m_PartType = old_it->m_PartType;
217         for (size_t r = 0; r < old_it->m_Rows.size(); ++r) {
218             SAlignment_Segment::SAlignment_Row& lrow =
219                 lseg.CopyRow(r, old_it->m_Rows[r]);
220             if (r == row) {
221                 lrow.m_Start = kInvalidSeqPos;
222                 lrow.m_Id = cvt.m_Dst_id_Handle;
223             }
224             else if (lrow.m_Start != kInvalidSeqPos &&
225                 !lrow.SameStrand(aln_row)) {
226                 // Adjust start for minus strand
227                 lrow.m_Start += old_it->m_Len - lseg.m_Len;
228             }
229         }
230     }
231     rg.SetFrom(rg.GetFrom() + dl);
232     SAlignment_Segment& mseg = x_InsertSeg(ins_point,
233         rg.GetLength() - dr,
234         old_it->m_Rows.size(),
235         src_reverse);
236     mseg.m_PartType = old_it->m_PartType;
237     if (!dl  &&  !dr) {
238         // copy scores if there's no truncation
239         mseg.m_Scores = old_it->m_Scores;
240         mseg.m_ScoresGroupIdx = old_it->m_ScoresGroupIdx;
241     }
242     else {
243         // Invalidate all scores related to the segment (this
244         // includes alignment-level scores).
245         x_InvalidateScores(&(*old_it));
246     }
247     for (size_t r = 0; r < old_it->m_Rows.size(); ++r) {
248         SAlignment_Segment::SAlignment_Row& mrow =
249             mseg.CopyRow(r, old_it->m_Rows[r]);
250         if (r == row) {
251             // translate id and coords
252             mrow.m_Id = cvt.m_Dst_id_Handle;
253             mrow.m_Start = cvt.m_LastRange.GetFrom();
254             mrow.m_IsSetStrand |= (cvt.m_LastStrand != eNa_strand_unknown);
255             mrow.m_Strand = cvt.m_LastStrand;
256             mrow.SetMapped();
257             mseg.m_HaveStrands |= mrow.m_IsSetStrand;
258         }
259         else {
260             if (mrow.m_Start != kInvalidSeqPos) {
261                 if (mrow.SameStrand(aln_row)) {
262                     mrow.m_Start += dl;
263                 }
264                 else {
265                     mrow.m_Start += old_it->m_Len - dl - mseg.m_Len;
266                 }
267             }
268         }
269     }
270     cvt.m_LastType = cvt.eMappedObjType_not_set;
271     dl += rg.GetLength() - dr;
272     rg.SetFrom(rg.GetTo() - dr);
273     if (dr > 0) {
274         // Add the remaining unmapped range
275         SAlignment_Segment& rseg = x_InsertSeg(ins_point,
276             dr,
277             old_it->m_Rows.size(),
278             src_reverse);
279         rseg.m_PartType = old_it->m_PartType;
280         for (size_t r = 0; r < old_it->m_Rows.size(); ++r) {
281             SAlignment_Segment::SAlignment_Row& rrow =
282                 rseg.CopyRow(r, old_it->m_Rows[r]);
283             if (r == row) {
284                 rrow.m_Start = kInvalidSeqPos;
285                 rrow.m_Id = cvt.m_Dst_id_Handle;
286             }
287             else {
288                 if (rrow.SameStrand(aln_row)) {
289                     rrow.m_Start += dl;
290                 }
291             }
292         }
293     }
294     m_Segs.erase(old_it);
295     return cvt.m_Dst_id_Handle;
296 }
297 
298 
299 CSeq_id_Handle
x_ConvertSegmentCvt(TSegments::iterator & seg_it,TIdMap & id_map,size_t row)300 CSeq_align_Mapper::x_ConvertSegmentCvt(TSegments::iterator& seg_it,
301                                        TIdMap& id_map,
302                                        size_t row)
303 {
304     TSegments::iterator old_it = seg_it;
305     SAlignment_Segment& seg = *old_it;
306     ++seg_it;
307     SAlignment_Segment::SAlignment_Row& aln_row = seg.m_Rows[row];
308     if (aln_row.m_Start == kInvalidSeqPos) {
309         // skipped row
310         return aln_row.m_Id;
311     }
312     TRange rg(aln_row.m_Start, aln_row.m_Start + seg.m_Len - 1);
313     TIdMap::iterator id_it = id_map.find(aln_row.m_Id);
314     if (id_it == id_map.end()) {
315         // ID not found in the segment, leave the row unchanged
316         return aln_row.m_Id;
317     }
318     TRangeMap& rmap = id_it->second;
319     if ( rmap.empty() ) {
320         // No mappings for this segment
321         return aln_row.m_Id;
322     }
323     // Sorted mappings
324     typedef vector< CRef<CSeq_loc_Conversion> > TSortedConversions;
325     TSortedConversions cvts;
326     for (TRangeMap::iterator rg_it = rmap.begin(rg); rg_it; ++rg_it) {
327         cvts.push_back(rg_it->second);
328     }
329     sort(cvts.begin(), cvts.end(), CConversionRef_Less());
330 
331     // Prepare insert point depending on the source strand
332     TSegments::iterator ins_point = seg_it;
333     bool src_reverse = aln_row.m_IsSetStrand ? IsReverse(aln_row.m_Strand) : false;
334 
335     bool mapped = false;
336     CSeq_id_Handle dst_id;
337     EAlignFlags align_flags = eAlign_Normal;
338     TSeqPos left_shift = 0;
339     for (size_t cvt_idx = 0; cvt_idx < cvts.size(); ++cvt_idx) {
340         CSeq_loc_Conversion& cvt = *cvts[cvt_idx];
341         if (!cvt.ConvertInterval(rg.GetFrom(), rg.GetTo(), aln_row.m_Strand) ) {
342             continue;
343         }
344         // Check destination id
345         if ( dst_id ) {
346             if (cvt.m_Dst_id_Handle != dst_id) {
347                 align_flags = eAlign_MultiId;
348             }
349         }
350         dst_id = cvt.m_Dst_id_Handle;
351 
352         // At least part of the interval was converted.
353         TSeqPos dl = cvt.m_Src_from <= rg.GetFrom() ?
354             0 : cvt.m_Src_from - rg.GetFrom();
355         TSeqPos dr = cvt.m_Src_to >= rg.GetTo() ?
356             0 : rg.GetTo() - cvt.m_Src_to;
357         if (dl > 0) {
358             // Add segment for the skipped range
359             SAlignment_Segment& lseg = x_InsertSeg(ins_point,
360                 dl, seg.m_Rows.size(), src_reverse);
361             lseg.m_PartType = old_it->m_PartType;
362             for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
363                 SAlignment_Segment::SAlignment_Row& lrow =
364                     lseg.CopyRow(r, seg.m_Rows[r]);
365                 if (r == row) {
366                     lrow.m_Start = kInvalidSeqPos;
367                     lrow.m_Id = dst_id;
368                 }
369                 else if (lrow.m_Start != kInvalidSeqPos) {
370                     if (lrow.SameStrand(aln_row)) {
371                         lrow.m_Start += left_shift;
372                     }
373                     else {
374                         lrow.m_Start += seg.m_Len - lseg.m_Len - left_shift;
375                     }
376                 }
377             }
378         }
379         left_shift += dl;
380         SAlignment_Segment& mseg = x_InsertSeg(ins_point,
381             rg.GetLength() - dl - dr, seg.m_Rows.size(), src_reverse);
382         mseg.m_PartType = old_it->m_PartType;
383         if (!dl  &&  !dr) {
384             // copy scores if there's no truncation
385             mseg.m_Scores = seg.m_Scores;
386             mseg.m_ScoresGroupIdx = seg.m_ScoresGroupIdx;
387         }
388         else {
389             // Invalidate all scores related to the segment (this
390             // includes alignment-level scores).
391             x_InvalidateScores(&seg);
392         }
393         for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
394             SAlignment_Segment::SAlignment_Row& mrow =
395                 mseg.CopyRow(r, seg.m_Rows[r]);
396             if (r == row) {
397                 // translate id and coords
398                 mrow.m_Id = cvt.m_Dst_id_Handle;
399                 mrow.m_Start = cvt.m_LastRange.GetFrom();
400                 mrow.m_IsSetStrand |= (cvt.m_LastStrand != eNa_strand_unknown);
401                 mrow.m_Strand = cvt.m_LastStrand;
402                 mrow.SetMapped();
403                 mseg.m_HaveStrands |= mrow.m_IsSetStrand;
404             }
405             else {
406                 if (mrow.m_Start != kInvalidSeqPos) {
407                     if (mrow.SameStrand(aln_row)) {
408                         mrow.m_Start += left_shift;
409                     }
410                     else {
411                         mrow.m_Start += seg.m_Len - left_shift - mseg.m_Len;
412                     }
413                 }
414             }
415         }
416         cvt.m_LastType = cvt.eMappedObjType_not_set;
417         mapped = true;
418         left_shift += mseg.m_Len;
419         rg.SetFrom(aln_row.m_Start + left_shift);
420     }
421     if (align_flags == eAlign_MultiId  &&  m_AlignFlags == eAlign_Normal) {
422         m_AlignFlags = align_flags;
423     }
424     if ( !mapped ) {
425         // Do not erase the segment, just change the row ID and reset start
426         seg.m_Rows[row].m_Start = kInvalidSeqPos;
427         seg.m_Rows[row].m_Id = rmap.begin()->second->m_Dst_id_Handle;
428         seg.m_Rows[row].SetMapped();
429         return seg.m_Rows[row].m_Id;
430     }
431     if (rg.GetFrom() <= rg.GetTo()) {
432         // Add the remaining unmapped range
433         SAlignment_Segment& rseg = x_InsertSeg(ins_point,
434             rg.GetLength(), seg.m_Rows.size(), src_reverse);
435         rseg.m_PartType = old_it->m_PartType;
436         for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
437             SAlignment_Segment::SAlignment_Row& rrow =
438                 rseg.CopyRow(r, seg.m_Rows[r]);
439             if (r == row) {
440                 rrow.m_Start = kInvalidSeqPos;
441                 rrow.m_Id = dst_id;
442             }
443             else if (rrow.m_Start != kInvalidSeqPos) {
444                 if (rrow.SameStrand(aln_row)) {
445                     rrow.m_Start += left_shift;
446                 }
447             }
448         }
449     }
450     m_Segs.erase(old_it);
451     return align_flags == eAlign_MultiId ? CSeq_id_Handle() : dst_id;
452 }
453 
454 
455 CSeq_align_Mapper_Base*
CreateSubAlign(const CSeq_align & align)456 CSeq_align_Mapper::CreateSubAlign(const CSeq_align& align)
457 {
458     return new CSeq_align_Mapper(align, GetLocMapper());
459 }
460 
461 
462 CSeq_align_Mapper_Base*
CreateSubAlign(const CSpliced_seg & spliced,const CSpliced_exon & exon)463 CSeq_align_Mapper::CreateSubAlign(const CSpliced_seg& spliced,
464                                   const CSpliced_exon& exon)
465 {
466     auto_ptr<CSeq_align_Mapper> sub(new CSeq_align_Mapper(GetLocMapper()));
467     sub->InitExon(spliced, exon);
468     return sub.release();
469 }
470 
471 
472 END_SCOPE(objects)
473 END_NCBI_SCOPE
474