1 /* $Id: seq_align_mapper.cpp 439337 2014-06-27 16:08:53Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko
27 *
28 * File Description:
29 * Alignment mapper
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <objmgr/impl/seq_align_mapper.hpp>
35 #include <objmgr/seq_loc_mapper.hpp>
36 #include <objmgr/objmgr_exception.hpp>
37 #include <objects/seqalign/seqalign__.hpp>
38 #include <algorithm>
39
40 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)41 BEGIN_SCOPE(objects)
42
43
44 CSeq_align_Mapper::CSeq_align_Mapper(const CSeq_align& align,
45 CSeq_loc_Mapper_Base& loc_mapper)
46 : CSeq_align_Mapper_Base(loc_mapper)
47 {
48 x_Init(align);
49 }
50
51
CSeq_align_Mapper(CSeq_loc_Mapper_Base & loc_mapper)52 CSeq_align_Mapper::CSeq_align_Mapper(CSeq_loc_Mapper_Base& loc_mapper)
53 : CSeq_align_Mapper_Base(loc_mapper)
54 {
55 }
56
57
~CSeq_align_Mapper(void)58 CSeq_align_Mapper::~CSeq_align_Mapper(void)
59 {
60 }
61
62
63 // Mapping through CSeq_loc_Conversion_Set
64
65 struct CConversionRef_Less
66 {
67 bool operator()(const CRef<CSeq_loc_Conversion>& x,
68 const CRef<CSeq_loc_Conversion>& y) const;
69 };
70
71
operator ()(const CRef<CSeq_loc_Conversion> & x,const CRef<CSeq_loc_Conversion> & y) const72 bool CConversionRef_Less::operator()(const CRef<CSeq_loc_Conversion>& x,
73 const CRef<CSeq_loc_Conversion>& y) const
74 {
75 if (x->m_Src_id_Handle != y->m_Src_id_Handle) {
76 return x->m_Src_id_Handle < y->m_Src_id_Handle;
77 }
78 // Leftmost first
79 if (x->m_Src_from != y->m_Src_from) {
80 return x->m_Src_from < y->m_Src_from;
81 }
82 // Longest first
83 return x->m_Src_to > y->m_Src_to;
84 }
85
86
Convert(CSeq_loc_Conversion_Set & cvts)87 void CSeq_align_Mapper::Convert(CSeq_loc_Conversion_Set& cvts)
88 {
89 m_DstAlign.Reset();
90
91 if (m_SubAligns.size() > 0) {
92 NON_CONST_ITERATE(TSubAligns, it, m_SubAligns) {
93 dynamic_cast<CSeq_align_Mapper*>(it->GetPointer())->
94 Convert(cvts);
95 }
96 return;
97 }
98 x_ConvertAlignCvt(cvts);
99 }
100
101
x_ConvertAlignCvt(CSeq_loc_Conversion_Set & cvts)102 void CSeq_align_Mapper::x_ConvertAlignCvt(CSeq_loc_Conversion_Set& cvts)
103 {
104 if (cvts.m_CvtByIndex.size() == 0) {
105 // Single mapping
106 _ASSERT(cvts.m_SingleConv);
107 if ( cvts.m_SingleIndex == cvts.kAllIndexes ) {
108 for ( size_t row = 0; row < GetMaxDim(); ++row ) {
109 x_ConvertRowCvt(*cvts.m_SingleConv, row);
110 }
111 }
112 else {
113 x_ConvertRowCvt(*cvts.m_SingleConv, cvts.m_SingleIndex);
114 }
115 return;
116 }
117 NON_CONST_ITERATE(CSeq_loc_Conversion_Set::TConvByIndex, idx_it,
118 cvts.m_CvtByIndex) {
119 if ( idx_it->first == cvts.kAllIndexes ) {
120 for ( size_t row = 0; row < GetMaxDim(); ++row ) {
121 x_ConvertRowCvt(idx_it->second, row);
122 }
123 }
124 else {
125 x_ConvertRowCvt(idx_it->second, idx_it->first);
126 }
127 }
128 }
129
130
x_ConvertRowCvt(CSeq_loc_Conversion & cvt,size_t row)131 void CSeq_align_Mapper::x_ConvertRowCvt(CSeq_loc_Conversion& cvt,
132 size_t row)
133 {
134 CSeq_id_Handle dst_id;
135 TSegments::iterator seg_it = m_Segs.begin();
136 for ( ; seg_it != m_Segs.end(); ) {
137 if (seg_it->m_Rows.size() <= row) {
138 // No such row in the current segment
139 ++seg_it;
140 m_AlignFlags = eAlign_MultiDim;
141 continue;
142 }
143 CSeq_id_Handle seg_id = x_ConvertSegmentCvt(seg_it, cvt, row);
144 if (dst_id) {
145 if (dst_id != seg_id && m_AlignFlags == eAlign_Normal) {
146 m_AlignFlags = eAlign_MultiId;
147 }
148 dst_id = seg_id;
149 }
150 }
151 }
152
153
x_ConvertRowCvt(TIdMap & cvts,size_t row)154 void CSeq_align_Mapper::x_ConvertRowCvt(TIdMap& cvts,
155 size_t row)
156 {
157 CSeq_id_Handle dst_id;
158 TSegments::iterator seg_it = m_Segs.begin();
159 for ( ; seg_it != m_Segs.end(); ) {
160 if (seg_it->m_Rows.size() <= row) {
161 // No such row in the current segment
162 ++seg_it;
163 m_AlignFlags = eAlign_MultiDim;
164 continue;
165 }
166 CSeq_id_Handle seg_id = x_ConvertSegmentCvt(seg_it, cvts, row);
167 if (dst_id) {
168 if (dst_id != seg_id && m_AlignFlags == eAlign_Normal) {
169 m_AlignFlags = eAlign_MultiId;
170 }
171 dst_id = seg_id;
172 }
173 }
174 }
175
176
177 CSeq_id_Handle
x_ConvertSegmentCvt(TSegments::iterator & seg_it,CSeq_loc_Conversion & cvt,size_t row)178 CSeq_align_Mapper::x_ConvertSegmentCvt(TSegments::iterator& seg_it,
179 CSeq_loc_Conversion& cvt,
180 size_t row)
181 {
182 TSegments::iterator old_it = seg_it;
183 ++seg_it;
184 SAlignment_Segment::SAlignment_Row& aln_row = old_it->m_Rows[row];
185 if (aln_row.m_Id != cvt.m_Src_id_Handle) {
186 return aln_row.m_Id;
187 }
188 if (aln_row.m_Start == kInvalidSeqPos) {
189 // ??? Skipped row - change ID
190 aln_row.m_Id = cvt.m_Dst_id_Handle;
191 aln_row.SetMapped();
192 return aln_row.m_Id;
193 }
194 TRange rg(aln_row.m_Start, aln_row.m_Start + old_it->m_Len - 1);
195 if (!cvt.ConvertInterval(rg.GetFrom(), rg.GetTo(), aln_row.m_Strand) ) {
196 // Do not erase the segment, just change the row ID and reset start
197 aln_row.m_Start = kInvalidSeqPos;
198 aln_row.m_Id = cvt.m_Dst_id_Handle;
199 aln_row.SetMapped();
200 return aln_row.m_Id;
201 }
202
203 // Prepare insert point depending on the source strand
204 TSegments::iterator ins_point = seg_it;
205 bool src_reverse = aln_row.m_IsSetStrand ? IsReverse(aln_row.m_Strand) : false;
206
207 // At least part of the interval was converted.
208 TSeqPos dl = cvt.m_Src_from <= rg.GetFrom() ?
209 0 : cvt.m_Src_from - rg.GetFrom();
210 TSeqPos dr = cvt.m_Src_to >= rg.GetTo() ?
211 0 : rg.GetTo() - cvt.m_Src_to;
212 if (dl > 0) {
213 // Add segment for the skipped range
214 SAlignment_Segment& lseg = x_InsertSeg(ins_point, dl,
215 old_it->m_Rows.size(), src_reverse);
216 lseg.m_PartType = old_it->m_PartType;
217 for (size_t r = 0; r < old_it->m_Rows.size(); ++r) {
218 SAlignment_Segment::SAlignment_Row& lrow =
219 lseg.CopyRow(r, old_it->m_Rows[r]);
220 if (r == row) {
221 lrow.m_Start = kInvalidSeqPos;
222 lrow.m_Id = cvt.m_Dst_id_Handle;
223 }
224 else if (lrow.m_Start != kInvalidSeqPos &&
225 !lrow.SameStrand(aln_row)) {
226 // Adjust start for minus strand
227 lrow.m_Start += old_it->m_Len - lseg.m_Len;
228 }
229 }
230 }
231 rg.SetFrom(rg.GetFrom() + dl);
232 SAlignment_Segment& mseg = x_InsertSeg(ins_point,
233 rg.GetLength() - dr,
234 old_it->m_Rows.size(),
235 src_reverse);
236 mseg.m_PartType = old_it->m_PartType;
237 if (!dl && !dr) {
238 // copy scores if there's no truncation
239 mseg.m_Scores = old_it->m_Scores;
240 mseg.m_ScoresGroupIdx = old_it->m_ScoresGroupIdx;
241 }
242 else {
243 // Invalidate all scores related to the segment (this
244 // includes alignment-level scores).
245 x_InvalidateScores(&(*old_it));
246 }
247 for (size_t r = 0; r < old_it->m_Rows.size(); ++r) {
248 SAlignment_Segment::SAlignment_Row& mrow =
249 mseg.CopyRow(r, old_it->m_Rows[r]);
250 if (r == row) {
251 // translate id and coords
252 mrow.m_Id = cvt.m_Dst_id_Handle;
253 mrow.m_Start = cvt.m_LastRange.GetFrom();
254 mrow.m_IsSetStrand |= (cvt.m_LastStrand != eNa_strand_unknown);
255 mrow.m_Strand = cvt.m_LastStrand;
256 mrow.SetMapped();
257 mseg.m_HaveStrands |= mrow.m_IsSetStrand;
258 }
259 else {
260 if (mrow.m_Start != kInvalidSeqPos) {
261 if (mrow.SameStrand(aln_row)) {
262 mrow.m_Start += dl;
263 }
264 else {
265 mrow.m_Start += old_it->m_Len - dl - mseg.m_Len;
266 }
267 }
268 }
269 }
270 cvt.m_LastType = cvt.eMappedObjType_not_set;
271 dl += rg.GetLength() - dr;
272 rg.SetFrom(rg.GetTo() - dr);
273 if (dr > 0) {
274 // Add the remaining unmapped range
275 SAlignment_Segment& rseg = x_InsertSeg(ins_point,
276 dr,
277 old_it->m_Rows.size(),
278 src_reverse);
279 rseg.m_PartType = old_it->m_PartType;
280 for (size_t r = 0; r < old_it->m_Rows.size(); ++r) {
281 SAlignment_Segment::SAlignment_Row& rrow =
282 rseg.CopyRow(r, old_it->m_Rows[r]);
283 if (r == row) {
284 rrow.m_Start = kInvalidSeqPos;
285 rrow.m_Id = cvt.m_Dst_id_Handle;
286 }
287 else {
288 if (rrow.SameStrand(aln_row)) {
289 rrow.m_Start += dl;
290 }
291 }
292 }
293 }
294 m_Segs.erase(old_it);
295 return cvt.m_Dst_id_Handle;
296 }
297
298
299 CSeq_id_Handle
x_ConvertSegmentCvt(TSegments::iterator & seg_it,TIdMap & id_map,size_t row)300 CSeq_align_Mapper::x_ConvertSegmentCvt(TSegments::iterator& seg_it,
301 TIdMap& id_map,
302 size_t row)
303 {
304 TSegments::iterator old_it = seg_it;
305 SAlignment_Segment& seg = *old_it;
306 ++seg_it;
307 SAlignment_Segment::SAlignment_Row& aln_row = seg.m_Rows[row];
308 if (aln_row.m_Start == kInvalidSeqPos) {
309 // skipped row
310 return aln_row.m_Id;
311 }
312 TRange rg(aln_row.m_Start, aln_row.m_Start + seg.m_Len - 1);
313 TIdMap::iterator id_it = id_map.find(aln_row.m_Id);
314 if (id_it == id_map.end()) {
315 // ID not found in the segment, leave the row unchanged
316 return aln_row.m_Id;
317 }
318 TRangeMap& rmap = id_it->second;
319 if ( rmap.empty() ) {
320 // No mappings for this segment
321 return aln_row.m_Id;
322 }
323 // Sorted mappings
324 typedef vector< CRef<CSeq_loc_Conversion> > TSortedConversions;
325 TSortedConversions cvts;
326 for (TRangeMap::iterator rg_it = rmap.begin(rg); rg_it; ++rg_it) {
327 cvts.push_back(rg_it->second);
328 }
329 sort(cvts.begin(), cvts.end(), CConversionRef_Less());
330
331 // Prepare insert point depending on the source strand
332 TSegments::iterator ins_point = seg_it;
333 bool src_reverse = aln_row.m_IsSetStrand ? IsReverse(aln_row.m_Strand) : false;
334
335 bool mapped = false;
336 CSeq_id_Handle dst_id;
337 EAlignFlags align_flags = eAlign_Normal;
338 TSeqPos left_shift = 0;
339 for (size_t cvt_idx = 0; cvt_idx < cvts.size(); ++cvt_idx) {
340 CSeq_loc_Conversion& cvt = *cvts[cvt_idx];
341 if (!cvt.ConvertInterval(rg.GetFrom(), rg.GetTo(), aln_row.m_Strand) ) {
342 continue;
343 }
344 // Check destination id
345 if ( dst_id ) {
346 if (cvt.m_Dst_id_Handle != dst_id) {
347 align_flags = eAlign_MultiId;
348 }
349 }
350 dst_id = cvt.m_Dst_id_Handle;
351
352 // At least part of the interval was converted.
353 TSeqPos dl = cvt.m_Src_from <= rg.GetFrom() ?
354 0 : cvt.m_Src_from - rg.GetFrom();
355 TSeqPos dr = cvt.m_Src_to >= rg.GetTo() ?
356 0 : rg.GetTo() - cvt.m_Src_to;
357 if (dl > 0) {
358 // Add segment for the skipped range
359 SAlignment_Segment& lseg = x_InsertSeg(ins_point,
360 dl, seg.m_Rows.size(), src_reverse);
361 lseg.m_PartType = old_it->m_PartType;
362 for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
363 SAlignment_Segment::SAlignment_Row& lrow =
364 lseg.CopyRow(r, seg.m_Rows[r]);
365 if (r == row) {
366 lrow.m_Start = kInvalidSeqPos;
367 lrow.m_Id = dst_id;
368 }
369 else if (lrow.m_Start != kInvalidSeqPos) {
370 if (lrow.SameStrand(aln_row)) {
371 lrow.m_Start += left_shift;
372 }
373 else {
374 lrow.m_Start += seg.m_Len - lseg.m_Len - left_shift;
375 }
376 }
377 }
378 }
379 left_shift += dl;
380 SAlignment_Segment& mseg = x_InsertSeg(ins_point,
381 rg.GetLength() - dl - dr, seg.m_Rows.size(), src_reverse);
382 mseg.m_PartType = old_it->m_PartType;
383 if (!dl && !dr) {
384 // copy scores if there's no truncation
385 mseg.m_Scores = seg.m_Scores;
386 mseg.m_ScoresGroupIdx = seg.m_ScoresGroupIdx;
387 }
388 else {
389 // Invalidate all scores related to the segment (this
390 // includes alignment-level scores).
391 x_InvalidateScores(&seg);
392 }
393 for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
394 SAlignment_Segment::SAlignment_Row& mrow =
395 mseg.CopyRow(r, seg.m_Rows[r]);
396 if (r == row) {
397 // translate id and coords
398 mrow.m_Id = cvt.m_Dst_id_Handle;
399 mrow.m_Start = cvt.m_LastRange.GetFrom();
400 mrow.m_IsSetStrand |= (cvt.m_LastStrand != eNa_strand_unknown);
401 mrow.m_Strand = cvt.m_LastStrand;
402 mrow.SetMapped();
403 mseg.m_HaveStrands |= mrow.m_IsSetStrand;
404 }
405 else {
406 if (mrow.m_Start != kInvalidSeqPos) {
407 if (mrow.SameStrand(aln_row)) {
408 mrow.m_Start += left_shift;
409 }
410 else {
411 mrow.m_Start += seg.m_Len - left_shift - mseg.m_Len;
412 }
413 }
414 }
415 }
416 cvt.m_LastType = cvt.eMappedObjType_not_set;
417 mapped = true;
418 left_shift += mseg.m_Len;
419 rg.SetFrom(aln_row.m_Start + left_shift);
420 }
421 if (align_flags == eAlign_MultiId && m_AlignFlags == eAlign_Normal) {
422 m_AlignFlags = align_flags;
423 }
424 if ( !mapped ) {
425 // Do not erase the segment, just change the row ID and reset start
426 seg.m_Rows[row].m_Start = kInvalidSeqPos;
427 seg.m_Rows[row].m_Id = rmap.begin()->second->m_Dst_id_Handle;
428 seg.m_Rows[row].SetMapped();
429 return seg.m_Rows[row].m_Id;
430 }
431 if (rg.GetFrom() <= rg.GetTo()) {
432 // Add the remaining unmapped range
433 SAlignment_Segment& rseg = x_InsertSeg(ins_point,
434 rg.GetLength(), seg.m_Rows.size(), src_reverse);
435 rseg.m_PartType = old_it->m_PartType;
436 for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
437 SAlignment_Segment::SAlignment_Row& rrow =
438 rseg.CopyRow(r, seg.m_Rows[r]);
439 if (r == row) {
440 rrow.m_Start = kInvalidSeqPos;
441 rrow.m_Id = dst_id;
442 }
443 else if (rrow.m_Start != kInvalidSeqPos) {
444 if (rrow.SameStrand(aln_row)) {
445 rrow.m_Start += left_shift;
446 }
447 }
448 }
449 }
450 m_Segs.erase(old_it);
451 return align_flags == eAlign_MultiId ? CSeq_id_Handle() : dst_id;
452 }
453
454
455 CSeq_align_Mapper_Base*
CreateSubAlign(const CSeq_align & align)456 CSeq_align_Mapper::CreateSubAlign(const CSeq_align& align)
457 {
458 return new CSeq_align_Mapper(align, GetLocMapper());
459 }
460
461
462 CSeq_align_Mapper_Base*
CreateSubAlign(const CSpliced_seg & spliced,const CSpliced_exon & exon)463 CSeq_align_Mapper::CreateSubAlign(const CSpliced_seg& spliced,
464 const CSpliced_exon& exon)
465 {
466 auto_ptr<CSeq_align_Mapper> sub(new CSeq_align_Mapper(GetLocMapper()));
467 sub->InitExon(spliced, exon);
468 return sub.release();
469 }
470
471
472 END_SCOPE(objects)
473 END_NCBI_SCOPE
474