1 /* $Id: seq_loc_mapper.cpp 591839 2019-08-21 15:03:00Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko
27 *
28 * File Description:
29 * Seq-loc mapper
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <objmgr/seq_loc_mapper.hpp>
35 #include <objmgr/scope.hpp>
36 #include <objmgr/object_manager.hpp>
37 #include <objmgr/objmgr_exception.hpp>
38 #include <objmgr/seq_map.hpp>
39 #include <objmgr/seq_map_ci.hpp>
40 #include <objmgr/impl/synonyms.hpp>
41 #include <objmgr/impl/seq_align_mapper.hpp>
42 #include <objmgr/impl/seq_loc_cvt.hpp>
43 #include <objmgr/gc_assembly_parser.hpp>
44 #include <objects/seqloc/Seq_loc.hpp>
45 #include <objects/seqfeat/Seq_feat.hpp>
46 #include <objects/seqfeat/Cdregion.hpp>
47 #include <objects/seqloc/Seq_loc_equiv.hpp>
48 #include <objects/seqloc/Seq_bond.hpp>
49 #include <objects/seqalign/seqalign__.hpp>
50 #include <objects/genomecoll/genome_collection__.hpp>
51 #include <objects/seq/Delta_ext.hpp>
52 #include <objects/seq/Delta_seq.hpp>
53 #include <objects/seq/Seq_literal.hpp>
54 #include <objects/seq/Seq_ext.hpp>
55 #include <objects/seq/Seq_gap.hpp>
56 #include <algorithm>
57
58
59 BEGIN_NCBI_SCOPE
60 BEGIN_SCOPE(objects)
61
62
63 /////////////////////////////////////////////////////////////////////
64 //
65 // CScope_Mapper_Sequence_Info
66 //
67 // Sequence type/length/synonyms provider using CScope to fetch
68 // the information.
69
70
71 class CScope_Mapper_Sequence_Info : public IMapper_Sequence_Info
72 {
73 public:
74 CScope_Mapper_Sequence_Info(CScope* scope);
75
76 virtual TSeqType GetSequenceType(const CSeq_id_Handle& idh);
77 virtual TSeqPos GetSequenceLength(const CSeq_id_Handle& idh);
78 virtual void CollectSynonyms(const CSeq_id_Handle& id,
79 TSynonyms& synonyms);
80 private:
81 CHeapScope m_Scope;
82 };
83
84
CScope_Mapper_Sequence_Info(CScope * scope)85 CScope_Mapper_Sequence_Info::CScope_Mapper_Sequence_Info(CScope* scope)
86 : m_Scope(scope)
87 {
88 }
89
90
91 void CScope_Mapper_Sequence_Info::
CollectSynonyms(const CSeq_id_Handle & id,TSynonyms & synonyms)92 CollectSynonyms(const CSeq_id_Handle& id,
93 TSynonyms& synonyms)
94 {
95 if ( m_Scope.IsNull() ) {
96 synonyms.insert(id);
97 }
98 else {
99 CConstRef<CSynonymsSet> syns =
100 m_Scope.GetScope().GetSynonyms(id);
101 ITERATE(CSynonymsSet, syn_it, *syns) {
102 synonyms.insert(CSynonymsSet::GetSeq_id_Handle(syn_it));
103 }
104 }
105 }
106
107
108 CScope_Mapper_Sequence_Info::TSeqType
GetSequenceType(const CSeq_id_Handle & idh)109 CScope_Mapper_Sequence_Info::GetSequenceType(const CSeq_id_Handle& idh)
110 {
111 if ( m_Scope.IsNull() ) {
112 return CSeq_loc_Mapper_Base::eSeq_unknown;
113 }
114 switch ( m_Scope.GetScope().GetSequenceType(idh) ) {
115 case CSeq_inst::eMol_dna:
116 case CSeq_inst::eMol_rna:
117 case CSeq_inst::eMol_na:
118 return CSeq_loc_Mapper_Base::eSeq_nuc;
119 case CSeq_inst::eMol_aa:
120 return CSeq_loc_Mapper_Base::eSeq_prot;
121 default:
122 return CSeq_loc_Mapper_Base::eSeq_unknown;
123 }
124 }
125
126
GetSequenceLength(const CSeq_id_Handle & idh)127 TSeqPos CScope_Mapper_Sequence_Info::GetSequenceLength(const CSeq_id_Handle& idh)
128 {
129 CBioseq_Handle h;
130 if ( m_Scope.IsNull() ) {
131 return kInvalidSeqPos;
132 }
133 h = m_Scope.GetScope().GetBioseqHandle(idh);
134 return h ? h.GetBioseqLength() : kInvalidSeqPos;
135 }
136
137
138 /////////////////////////////////////////////////////////////////////
139 //
140 // CSeq_loc_Mapper
141 //
142
143
144 /////////////////////////////////////////////////////////////////////
145 //
146 // Initialization of the mapper
147 //
148
149
150 inline
s_IndexToStrand(size_t idx)151 ENa_strand s_IndexToStrand(size_t idx)
152 {
153 _ASSERT(idx != 0);
154 return ENa_strand(idx - 1);
155 }
156
157 #define STRAND_TO_INDEX(is_set, strand) \
158 ((is_set) ? size_t((strand) + 1) : 0)
159
160 #define INDEX_TO_STRAND(idx) \
161 s_IndexToStrand(idx)
162
163
SetOptionsScope(CSeq_loc_Mapper_Options & options,CScope * scope)164 CSeq_loc_Mapper_Options& SetOptionsScope(CSeq_loc_Mapper_Options& options, CScope* scope)
165 {
166 if (!options.GetMapperSequenceInfo()) {
167 options.SetMapperSequenceInfo(new CScope_Mapper_Sequence_Info(scope));
168 }
169 return options;
170 }
171
172
CSeq_loc_Mapper(CMappingRanges * mapping_ranges,CScope * scope,CSeq_loc_Mapper_Options options)173 CSeq_loc_Mapper::CSeq_loc_Mapper(CMappingRanges* mapping_ranges,
174 CScope* scope,
175 CSeq_loc_Mapper_Options options)
176 : CSeq_loc_Mapper_Base(mapping_ranges, SetOptionsScope(options, scope)),
177 m_Scope(scope)
178 {
179 }
180
181
CSeq_loc_Mapper(const CSeq_feat & map_feat,EFeatMapDirection dir,CScope * scope,CSeq_loc_Mapper_Options options)182 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_feat& map_feat,
183 EFeatMapDirection dir,
184 CScope* scope,
185 CSeq_loc_Mapper_Options options)
186 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
187 m_Scope(scope)
188 {
189 x_InitializeFeat(map_feat, dir);
190 }
191
192
CSeq_loc_Mapper(const CSeq_loc & source,const CSeq_loc & target,CScope * scope,CSeq_loc_Mapper_Options options)193 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_loc& source,
194 const CSeq_loc& target,
195 CScope* scope,
196 CSeq_loc_Mapper_Options options)
197 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
198 m_Scope(scope)
199 {
200 x_InitializeLocs(source, target);
201 }
202
203
CSeq_loc_Mapper(const CSeq_align & map_align,const CSeq_id & to_id,CScope * scope,CSeq_loc_Mapper_Options options)204 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_align& map_align,
205 const CSeq_id& to_id,
206 CScope* scope,
207 CSeq_loc_Mapper_Options options)
208 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
209 m_Scope(scope)
210 {
211 x_InitializeAlign(map_align, to_id);
212 }
213
214
CSeq_loc_Mapper(const CSeq_align & map_align,size_t to_row,CScope * scope,CSeq_loc_Mapper_Options options)215 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_align& map_align,
216 size_t to_row,
217 CScope* scope,
218 CSeq_loc_Mapper_Options options)
219 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
220 m_Scope(scope)
221 {
222 x_InitializeAlign(map_align, to_row);
223 }
224
225
CSeq_loc_Mapper(const CSeq_id & from_id,const CSeq_id & to_id,const CSeq_align & map_align,CScope * scope,CSeq_loc_Mapper_Options options)226 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_id& from_id,
227 const CSeq_id& to_id,
228 const CSeq_align& map_align,
229 CScope* scope,
230 CSeq_loc_Mapper_Options options)
231 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
232 m_Scope(scope)
233 {
234 x_InitializeAlign(map_align, to_id, &from_id);
235 }
236
237
CSeq_loc_Mapper(size_t from_row,size_t to_row,const CSeq_align & map_align,CScope * scope,CSeq_loc_Mapper_Options options)238 CSeq_loc_Mapper::CSeq_loc_Mapper(size_t from_row,
239 size_t to_row,
240 const CSeq_align& map_align,
241 CScope* scope,
242 CSeq_loc_Mapper_Options options)
243 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
244 m_Scope(scope)
245 {
246 x_InitializeAlign(map_align, to_row, from_row);
247 }
248
249
CSeq_loc_Mapper(CBioseq_Handle target_seq,ESeqMapDirection direction,CSeq_loc_Mapper_Options options)250 CSeq_loc_Mapper::CSeq_loc_Mapper(CBioseq_Handle target_seq,
251 ESeqMapDirection direction,
252 CSeq_loc_Mapper_Options options)
253 : CSeq_loc_Mapper_Base(SetOptionsScope(options, &target_seq.GetScope())),
254 m_Scope(&target_seq.GetScope())
255 {
256 CConstRef<CSeq_id> top_level_id = target_seq.GetSeqId();
257 if ( !top_level_id ) {
258 // Bioseq handle has no id, try to get one.
259 CConstRef<CSynonymsSet> syns = target_seq.GetSynonyms();
260 if ( !syns->empty() ) {
261 top_level_id = syns->GetSeq_id_Handle(syns->begin()).GetSeqId();
262 }
263 }
264 x_InitializeSeqMap(target_seq.GetSeqMap(),
265 top_level_id.GetPointerOrNull(),
266 direction);
267 if (direction == eSeqMap_Up) {
268 // Ignore seq-map destination ranges, map whole sequence to itself,
269 // use unknown strand only.
270 m_DstRanges.resize(1);
271 m_DstRanges[0].clear();
272 m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_level_id)]
273 .push_back(TRange::GetWhole());
274 }
275 x_PreserveDestinationLocs();
276 }
277
278
CSeq_loc_Mapper(const CSeqMap & seq_map,ESeqMapDirection direction,const CSeq_id * top_level_id,CScope * scope,CSeq_loc_Mapper_Options options)279 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeqMap& seq_map,
280 ESeqMapDirection direction,
281 const CSeq_id* top_level_id,
282 CScope* scope,
283 CSeq_loc_Mapper_Options options)
284 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
285 m_Scope(scope)
286 {
287 x_InitializeSeqMap(seq_map, top_level_id, direction);
288 x_PreserveDestinationLocs();
289 }
290
291
CSeq_loc_Mapper(CBioseq_Handle target_seq,ESeqMapDirection direction,SSeqMapSelector selector,CSeq_loc_Mapper_Options options)292 CSeq_loc_Mapper::CSeq_loc_Mapper(CBioseq_Handle target_seq,
293 ESeqMapDirection direction,
294 SSeqMapSelector selector,
295 CSeq_loc_Mapper_Options options)
296 : CSeq_loc_Mapper_Base(SetOptionsScope(options, &target_seq.GetScope())),
297 m_Scope(&target_seq.GetScope())
298 {
299 CConstRef<CSeq_id> top_id = target_seq.GetSeqId();
300 if ( !top_id ) {
301 // Bioseq handle has no id, try to get one.
302 CConstRef<CSynonymsSet> syns = target_seq.GetSynonyms();
303 if ( !syns->empty() ) {
304 top_id = syns->GetSeq_id_Handle(syns->begin()).GetSeqId();
305 }
306 }
307 selector.SetLinkUsedTSE(target_seq.GetTSE_Handle());
308 x_InitializeSeqMap(target_seq.GetSeqMap(), selector, top_id, direction);
309 if (direction == eSeqMap_Up) {
310 // Ignore seq-map destination ranges, map whole sequence to itself,
311 // use unknown strand only.
312 m_DstRanges.resize(1);
313 m_DstRanges[0].clear();
314 m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_id)]
315 .push_back(TRange::GetWhole());
316 }
317 x_PreserveDestinationLocs();
318 }
319
320
CSeq_loc_Mapper(const CSeqMap & seq_map,ESeqMapDirection direction,SSeqMapSelector selector,const CSeq_id * top_level_id,CScope * scope,CSeq_loc_Mapper_Options options)321 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeqMap& seq_map,
322 ESeqMapDirection direction,
323 SSeqMapSelector selector,
324 const CSeq_id* top_level_id,
325 CScope* scope,
326 CSeq_loc_Mapper_Options options)
327 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
328 m_Scope(scope)
329 {
330 x_InitializeSeqMap(seq_map,
331 selector,
332 top_level_id,
333 direction);
334 x_PreserveDestinationLocs();
335 }
336
337
CSeq_loc_Mapper(size_t depth,const CBioseq_Handle & top_level_seq,ESeqMapDirection direction,CSeq_loc_Mapper_Options options)338 CSeq_loc_Mapper::CSeq_loc_Mapper(size_t depth,
339 const CBioseq_Handle& top_level_seq,
340 ESeqMapDirection direction,
341 CSeq_loc_Mapper_Options options)
342 : CSeq_loc_Mapper_Base(SetOptionsScope(options, &top_level_seq.GetScope())),
343 m_Scope(&top_level_seq.GetScope())
344 {
345 if (depth > 0) {
346 depth--;
347 x_InitializeSeqMap(top_level_seq.GetSeqMap(),
348 depth,
349 top_level_seq.GetSeqId().GetPointer(),
350 direction);
351 }
352 else if (direction == eSeqMap_Up) {
353 // Synonyms conversion
354 CConstRef<CSeq_id> top_level_id = top_level_seq.GetSeqId();
355 m_DstRanges.resize(1);
356 m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_level_id)]
357 .push_back(TRange::GetWhole());
358 }
359 x_PreserveDestinationLocs();
360 }
361
362
CSeq_loc_Mapper(size_t depth,const CSeqMap & top_level_seq,ESeqMapDirection direction,const CSeq_id * top_level_id,CScope * scope,CSeq_loc_Mapper_Options options)363 CSeq_loc_Mapper::CSeq_loc_Mapper(size_t depth,
364 const CSeqMap& top_level_seq,
365 ESeqMapDirection direction,
366 const CSeq_id* top_level_id,
367 CScope* scope,
368 CSeq_loc_Mapper_Options options)
369 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
370 m_Scope(scope)
371 {
372 if (depth > 0) {
373 depth--;
374 x_InitializeSeqMap(top_level_seq, depth, top_level_id, direction);
375 }
376 else if (direction == eSeqMap_Up) {
377 // Synonyms conversion
378 m_DstRanges.resize(1);
379 m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_level_id)]
380 .push_back(TRange::GetWhole());
381 }
382 x_PreserveDestinationLocs();
383 }
384
385
CSeq_loc_Mapper(const CGC_Assembly & gc_assembly,EGCAssemblyAlias to_alias,CScope * scope,EScopeFlag scope_flag)386 CSeq_loc_Mapper::CSeq_loc_Mapper(const CGC_Assembly& gc_assembly,
387 EGCAssemblyAlias to_alias,
388 CScope* scope,
389 EScopeFlag scope_flag)
390 : CSeq_loc_Mapper_Base(CSeq_loc_Mapper_Options(new CScope_Mapper_Sequence_Info(scope))),
391 m_Scope(scope)
392 {
393 // While parsing GC-Assembly the mapper will need to add virtual
394 // bioseqs to the scope. To keep the original scope clean of them,
395 // create a new scope and add the original one as a child.
396 if (scope_flag == eCopyScope) {
397 m_Scope = CHeapScope(new CScope(*CObjectManager::GetInstance()));
398 if ( scope ) {
399 m_Scope.GetScope().AddScope(*scope);
400 }
401 m_MapOptions.SetMapperSequenceInfo(new CScope_Mapper_Sequence_Info(m_Scope));
402 }
403 x_InitGCAssembly(gc_assembly, to_alias);
404 }
405
406
CSeq_loc_Mapper(const CGC_Assembly & gc_assembly,ESeqMapDirection direction,SSeqMapSelector selector,CScope * scope,EScopeFlag scope_flag,CSeq_loc_Mapper_Options options)407 CSeq_loc_Mapper::CSeq_loc_Mapper(const CGC_Assembly& gc_assembly,
408 ESeqMapDirection direction,
409 SSeqMapSelector selector,
410 CScope* scope,
411 EScopeFlag scope_flag,
412 CSeq_loc_Mapper_Options options)
413 : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
414 m_Scope(scope)
415 {
416 // While parsing GC-Assembly the mapper will need to add virtual
417 // bioseqs to the scope. To keep the original scope clean of them,
418 // create a new scope and add the original one as a child.
419 if (scope_flag == eCopyScope) {
420 m_Scope = CHeapScope(new CScope(*CObjectManager::GetInstance()));
421 if ( scope ) {
422 m_Scope.GetScope().AddScope(*scope);
423 }
424 m_MapOptions.SetMapperSequenceInfo(new CScope_Mapper_Sequence_Info(m_Scope));
425 }
426 CGC_Assembly_Parser parser(gc_assembly);
427 CRef<CSeq_entry> entry = parser.GetTSE();
428 m_Scope.GetScope().AddTopLevelSeqEntry(*entry);
429 const CGC_Assembly_Parser::TSeqIds& ids = parser.GetTopLevelSequences();
430 ITERATE(CGC_Assembly_Parser::TSeqIds, id, ids) {
431 CBioseq_Handle h = m_Scope.GetScope().GetBioseqHandle(*id);
432 if ( !h ) continue;
433 x_InitializeSeqMap(h.GetSeqMap(), selector, id->GetSeqId(), direction);
434 if (direction == eSeqMap_Up) {
435 // Ignore seq-map destination ranges, map whole sequence to itself,
436 // use unknown strand only.
437 m_DstRanges.resize(1);
438 m_DstRanges[0].clear();
439 m_DstRanges[0][*id].push_back(TRange::GetWhole());
440 }
441 x_PreserveDestinationLocs();
442 }
443 }
444
445
~CSeq_loc_Mapper(void)446 CSeq_loc_Mapper::~CSeq_loc_Mapper(void)
447 {
448 return;
449 }
450
451
x_InitializeSeqMap(const CSeqMap & seq_map,const CSeq_id * top_id,ESeqMapDirection direction)452 void CSeq_loc_Mapper::x_InitializeSeqMap(const CSeqMap& seq_map,
453 const CSeq_id* top_id,
454 ESeqMapDirection direction)
455 {
456 x_InitializeSeqMap(seq_map, size_t(-1), top_id, direction);
457 }
458
459
x_InitializeSeqMap(const CSeqMap & seq_map,size_t depth,const CSeq_id * top_id,ESeqMapDirection direction)460 void CSeq_loc_Mapper::x_InitializeSeqMap(const CSeqMap& seq_map,
461 size_t depth,
462 const CSeq_id* top_id,
463 ESeqMapDirection direction)
464 {
465 x_InitializeSeqMap(seq_map,
466 SSeqMapSelector(0, depth),
467 top_id,
468 direction);
469 }
470
471
x_InitializeSeqMap(const CSeqMap & seq_map,SSeqMapSelector selector,const CSeq_id * top_id,ESeqMapDirection direction)472 void CSeq_loc_Mapper::x_InitializeSeqMap(const CSeqMap& seq_map,
473 SSeqMapSelector selector,
474 const CSeq_id* top_id,
475 ESeqMapDirection direction)
476 {
477 selector.SetFlags(CSeqMap::fFindRef | CSeqMap::fIgnoreUnresolved)
478 .SetLinkUsedTSE();
479 x_InitializeSeqMap(CSeqMap_CI(ConstRef(&seq_map),
480 m_Scope.GetScopeOrNull(), selector),
481 top_id,
482 direction);
483 }
484
485
x_InitializeSeqMap(CSeqMap_CI seg_it,const CSeq_id * top_id,ESeqMapDirection direction)486 void CSeq_loc_Mapper::x_InitializeSeqMap(CSeqMap_CI seg_it,
487 const CSeq_id* top_id,
488 ESeqMapDirection direction)
489 {
490 if (m_MapOptions.GetMapSingleLevel()) {
491 x_InitializeSeqMapSingleLevel(seg_it, top_id, direction);
492 }
493 else if (direction == eSeqMap_Up) {
494 x_InitializeSeqMapUp(seg_it, top_id);
495 }
496 else {
497 x_InitializeSeqMapDown(seg_it, top_id);
498 }
499 }
500
501
x_InitializeSeqMapUp(CSeqMap_CI seg_it,const CSeq_id * top_id)502 void CSeq_loc_Mapper::x_InitializeSeqMapUp(CSeqMap_CI seg_it,
503 const CSeq_id* top_id)
504 {
505 TSeqPos src_from, src_len, dst_from, dst_len;
506 ENa_strand dst_strand = eNa_strand_unknown;
507 // Mapping up - for each iterator create mapping to the top level.
508 // If top_id is set, top level positions are the iterator's positions.
509 // Otherwise top-level ids and positions must be taked from the
510 // iterators with depth == 1.
511 TSeqPos top_ref_start = 0;
512 TSeqPos top_start = 0;
513 CConstRef<CSeq_id> dst_id(top_id);
514 _ASSERT(seg_it.GetDepth() == 1);
515 while ( seg_it ) {
516 ENa_strand src_strand = seg_it.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
517 src_from = seg_it.GetRefPosition();
518 src_len = seg_it.GetLength();
519 dst_len = src_len;
520 if (top_id) {
521 dst_from = seg_it.GetPosition();
522 x_NextMappingRange(
523 *seg_it.GetRefSeqid().GetSeqId(),
524 src_from, src_len, src_strand,
525 *top_id,
526 dst_from, dst_len, dst_strand);
527 }
528 else /* !top_id */ {
529 if (seg_it.GetDepth() == 1) {
530 // Depth==1 - top level sequences (destination).
531 dst_id.Reset(seg_it.GetRefSeqid().GetSeqId());
532 top_ref_start = seg_it.GetRefPosition();
533 top_start = seg_it.GetPosition();
534 dst_strand = seg_it.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
535 }
536 else {
537 _ASSERT(seg_it.GetPosition() >= top_start);
538 TSeqPos shift = seg_it.GetPosition() - top_start;
539 dst_from = top_ref_start + shift;
540 x_NextMappingRange(
541 *seg_it.GetRefSeqid().GetSeqId(),
542 src_from, src_len, src_strand,
543 *dst_id,
544 dst_from, dst_len, dst_strand);
545 }
546 }
547 ++seg_it;
548 }
549 }
550
551
x_InitializeSeqMapDown(CSeqMap_CI seg_it,const CSeq_id * top_id)552 void CSeq_loc_Mapper::x_InitializeSeqMapDown(CSeqMap_CI seg_it,
553 const CSeq_id* top_id)
554 {
555 TSeqPos src_from, src_len, dst_from, dst_len;
556 ENa_strand dst_strand = eNa_strand_unknown;
557 // eSeqMap_Down
558 // Collect all non-leaf references, create mapping from each non-leaf
559 // to the bottom level.
560 list<CSeqMap_CI> refs;
561 refs.push_back(seg_it);
562 while ( seg_it ) {
563 ++seg_it;
564 // While depth increases push all iterators to the stack.
565 if ( seg_it ) {
566 if (refs.empty() || refs.back().GetDepth() < seg_it.GetDepth()) {
567 refs.push_back(seg_it);
568 continue;
569 }
570 }
571 // End of seq-map or the last iterator was a leaf - create mappings.
572 if ( !refs.empty() ) {
573 CSeqMap_CI leaf = refs.back();
574 // Exclude self-mapping of the leaf reference.
575 refs.pop_back();
576 dst_strand = leaf.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
577 if ( top_id ) {
578 // Add mapping from the top-level sequence if any.
579 src_from = leaf.GetPosition();
580 src_len = leaf.GetLength();
581 dst_from = leaf.GetRefPosition();
582 dst_len = src_len;
583 x_NextMappingRange(
584 *top_id,
585 src_from, src_len, eNa_strand_unknown,
586 *leaf.GetRefSeqid().GetSeqId(),
587 dst_from, dst_len, dst_strand);
588 }
589 // Create mapping from each non-leaf level.
590 ITERATE(list<CSeqMap_CI>, it, refs) {
591 TSeqPos shift = leaf.GetPosition() - it->GetPosition();
592 ENa_strand src_strand = it->GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
593 src_from = it->GetRefPosition() + shift;
594 src_len = leaf.GetLength();
595 dst_from = leaf.GetRefPosition();
596 dst_len = src_len;
597 x_NextMappingRange(
598 *it->GetRefSeqid().GetSeqId(),
599 src_from, src_len, src_strand,
600 *leaf.GetRefSeqid().GetSeqId(),
601 dst_from, dst_len, dst_strand);
602 }
603 while ( !refs.empty() && refs.back().GetDepth() >= seg_it.GetDepth()) {
604 refs.pop_back();
605 }
606 }
607 if ( seg_it ) {
608 refs.push_back(seg_it);
609 }
610 }
611 }
612
613
x_InitializeSeqMapSingleLevel(CSeqMap_CI seg_it,const CSeq_id * top_id,ESeqMapDirection direction)614 void CSeq_loc_Mapper::x_InitializeSeqMapSingleLevel(CSeqMap_CI seg_it,
615 const CSeq_id* top_id,
616 ESeqMapDirection direction)
617 {
618 TSeqPos seg_from, seg_len, ref_from, ref_len;
619 ENa_strand seg_strand = eNa_strand_unknown;
620 ENa_strand ref_strand = eNa_strand_unknown;
621 // Stack of segments for each level.
622 list<CSeqMap_CI> refs;
623 refs.push_back(seg_it);
624 while ( seg_it ) {
625 ++seg_it;
626 // While depth increases push all iterators to the stack.
627 if ( seg_it ) {
628 if (refs.empty() || refs.back().GetDepth() < seg_it.GetDepth()) {
629 refs.push_back(seg_it);
630 continue;
631 }
632 }
633 // End of seq-map or the last iterator was a leaf - create mappings.
634 if ( !refs.empty() ) {
635 CSeqMap_CI ref = refs.back();
636 refs.pop_back();
637 if (direction == eSeqMap_Down) {
638 // Create self-mapping of the leaf reference - we can not use
639 // m_DstRanges here since they will contain ranges for each
640 // level while we need only the bottom.
641 seg_from = ref.GetRefPosition();
642 seg_len = ref.GetLength();
643 ref_from = seg_from;
644 ref_len = seg_len;
645 CConstRef<CSeq_id> id = ref.GetRefSeqid().GetSeqId();
646 x_NextMappingRange(*id, seg_from, seg_len, eNa_strand_unknown,
647 *id, ref_from, ref_len, eNa_strand_unknown);
648 }
649 // Create mapping for each non-leaf level.
650 while ( !refs.empty() ) {
651 const CSeqMap_CI& seg = refs.back();
652 TSeqPos shift = ref.GetPosition() - seg.GetPosition();
653 seg_strand = seg.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
654 seg_from = seg.GetRefPosition() + shift;
655 seg_len = ref.GetLength();
656 ref_from = ref.GetRefPosition();
657 ref_len = seg_len;
658 ref_strand = ref.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
659 switch (direction) {
660 case eSeqMap_Down:
661 x_NextMappingRange(
662 *seg.GetRefSeqid().GetSeqId(),
663 seg_from, seg_len, seg_strand,
664 *ref.GetRefSeqid().GetSeqId(),
665 ref_from, ref_len, ref_strand);
666 break;
667 case eSeqMap_Up:
668 x_NextMappingRange(
669 *ref.GetRefSeqid().GetSeqId(),
670 ref_from, ref_len, ref_strand,
671 *seg.GetRefSeqid().GetSeqId(),
672 seg_from, seg_len, seg_strand);
673 break;
674 }
675 ref = seg;
676 if (ref.GetDepth() >= seg_it.GetDepth()) {
677 refs.pop_back();
678 }
679 else {
680 break;
681 }
682 }
683 // Are there still segments above?
684 if ( refs.empty() ) {
685 // Top level segment.
686 _ASSERT(ref);
687 if (top_id) {
688 // If the top level is a single bioseq, add mapping.
689 seg_from = ref.GetPosition();
690 seg_len = ref.GetLength();
691 ref_from = ref.GetRefPosition();
692 ref_len = seg_len;
693 ref_strand = ref.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
694 switch (direction) {
695 case eSeqMap_Down:
696 x_NextMappingRange(
697 *top_id,
698 seg_from, seg_len, eNa_strand_unknown,
699 *ref.GetRefSeqid().GetSeqId(),
700 ref_from, ref_len, ref_strand);
701 break;
702 case eSeqMap_Up:
703 x_NextMappingRange(
704 *ref.GetRefSeqid().GetSeqId(),
705 ref_from, ref_len, ref_strand,
706 *top_id,
707 seg_from, seg_len, seg_strand);
708 break;
709 }
710 }
711 else if (direction == eSeqMap_Up) {
712 // Create self-mapping of the top-level reference if the top
713 // level is not a single bioseq but rather a seq-map level.
714 ref_from = ref.GetRefPosition();
715 ref_len = ref.GetLength();
716 seg_from = ref_from;
717 seg_len = ref_len;
718 CConstRef<CSeq_id> id = ref.GetRefSeqid().GetSeqId();
719 x_NextMappingRange(*id, ref_from, ref_len, eNa_strand_unknown,
720 *id, seg_from, seg_len, eNa_strand_unknown);
721 }
722 }
723 }
724 if ( seg_it ) {
725 refs.push_back(seg_it);
726 }
727 }
728 // Remove all collected destination ranges - they are not real destinations.
729 m_DstRanges.clear();
730 // If top level is a single sequence, create self-mapping for it.
731 if (top_id && direction == eSeqMap_Up) {
732 m_DstRanges.resize(1);
733 m_DstRanges[0].clear();
734 m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_id)].push_back(TRange::GetWhole());
735 }
736 }
737
738
739 CBioseq_Handle
x_AddVirtualBioseq(const TSynonyms & synonyms,const CGC_Sequence & gc_seq)740 CSeq_loc_Mapper::x_AddVirtualBioseq(const TSynonyms& synonyms,
741 const CGC_Sequence& gc_seq)
742 {
743 CRef<CBioseq> bioseq(new CBioseq);
744 ITERATE(IMapper_Sequence_Info::TSynonyms, syn, synonyms) {
745 CBioseq_Handle h = m_Scope.GetScope().GetBioseqHandle(*syn);
746 if ( h ) {
747 return h;
748 }
749 CRef<CSeq_id> syn_id(new CSeq_id);
750 syn_id->Assign(*syn->GetSeqId());
751 bioseq->SetId().push_back(syn_id);
752 }
753
754 bioseq->SetInst().SetMol(CSeq_inst::eMol_na);
755 if ( gc_seq.CanGetLength() ) {
756 bioseq->SetInst().SetLength(gc_seq.GetLength());
757 }
758
759 // Create virtual bioseq without length/data.
760 bioseq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
761 return m_Scope.GetScope().AddBioseq(*bioseq);
762 }
763
764
x_InitGCAssembly(const CGC_Assembly & gc_assembly,EGCAssemblyAlias to_alias)765 void CSeq_loc_Mapper::x_InitGCAssembly(const CGC_Assembly& gc_assembly,
766 EGCAssemblyAlias to_alias)
767 {
768 if ( gc_assembly.IsUnit() ) {
769 const CGC_AssemblyUnit& unit = gc_assembly.GetUnit();
770 if ( unit.IsSetMols() ) {
771 ITERATE(CGC_AssemblyUnit::TMols, it, unit.GetMols()) {
772 const CGC_Replicon::TSequence& seq = (*it)->GetSequence();
773 if ( seq.IsSingle() ) {
774 x_InitGCSequence(seq.GetSingle(), to_alias);
775 }
776 else {
777 ITERATE(CGC_Replicon::TSequence::TSet, tseq, seq.GetSet()) {
778 x_InitGCSequence(**tseq, to_alias);
779 }
780 }
781 }
782 }
783 if ( unit.IsSetOther_sequences() ) {
784 ITERATE(CGC_Sequence::TSequences, seq, unit.GetOther_sequences()) {
785 ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
786 x_InitGCSequence(**tseq, to_alias);
787 }
788 }
789 }
790 }
791 else if ( gc_assembly.IsAssembly_set() ) {
792 const CGC_AssemblySet& aset = gc_assembly.GetAssembly_set();
793 x_InitGCAssembly(aset.GetPrimary_assembly(), to_alias);
794 if ( aset.IsSetMore_assemblies() ) {
795 ITERATE(CGC_AssemblySet::TMore_assemblies, assm, aset.GetMore_assemblies()) {
796 x_InitGCAssembly(**assm, to_alias);
797 }
798 }
799 }
800 }
801
802
s_IsLocalRandomChrId(const CSeq_id & id)803 inline bool s_IsLocalRandomChrId(const CSeq_id& id)
804 {
805 return id.IsLocal() && id.GetLocal().IsStr() &&
806 id.GetLocal().GetStr().find("_random") != string::npos;
807 }
808
809
x_IsUCSCRandomChr(const CGC_Sequence & gc_seq,CConstRef<CSeq_id> & chr_id,TSynonyms & synonyms) const810 bool CSeq_loc_Mapper::x_IsUCSCRandomChr(const CGC_Sequence& gc_seq,
811 CConstRef<CSeq_id>& chr_id,
812 TSynonyms& synonyms) const
813 {
814 chr_id.Reset();
815 if (!gc_seq.IsSetStructure()) return false;
816
817 CConstRef<CSeq_id> id(&gc_seq.GetSeq_id());
818 synonyms.insert(CSeq_id_Handle::GetHandle(*id));
819 if ( s_IsLocalRandomChrId(*id) ) {
820 chr_id = id;
821 }
822 // Collect all synonyms.
823 ITERATE(CGC_Sequence::TSeq_id_synonyms, it, gc_seq.GetSeq_id_synonyms()) {
824 const CGC_TypedSeqId& gc_id = **it;
825 switch ( gc_id.Which() ) {
826 case CGC_TypedSeqId::e_Genbank:
827 if ( gc_id.GetGenbank().IsSetGi() ) {
828 id.Reset(&gc_id.GetGenbank().GetGi());
829 }
830 break;
831 case CGC_TypedSeqId::e_Refseq:
832 if ( gc_id.GetRefseq().IsSetGi() ) {
833 id.Reset(&gc_id.GetRefseq().GetGi());
834 }
835 break;
836 case CGC_TypedSeqId::e_External:
837 id.Reset(&gc_id.GetExternal().GetId());
838 break;
839 case CGC_TypedSeqId::e_Private:
840 id.Reset(&gc_id.GetPrivate());
841 break;
842 default:
843 continue;
844 }
845 synonyms.insert(CSeq_id_Handle::GetHandle(*id));
846 if ( !chr_id && s_IsLocalRandomChrId(*id) ) {
847 chr_id = id;
848 }
849 }
850
851 if ( !chr_id ) {
852 synonyms.clear();
853 return false;
854 }
855
856 // Use only random chromosome ids, ignore other synonyms (?)
857 string lcl_str = chr_id->GetLocal().GetStr();
858 if ( !NStr::StartsWith(lcl_str, "chr") ) {
859 CSeq_id lcl;
860 lcl.SetLocal().SetStr("chr" + lcl_str);
861 synonyms.insert(CSeq_id_Handle::GetHandle(lcl));
862 }
863
864 return true;
865 }
866
867
s_GetSeqIdAlias(const CGC_TypedSeqId & id,CSeq_loc_Mapper::EGCAssemblyAlias alias)868 const CSeq_id* s_GetSeqIdAlias(const CGC_TypedSeqId& id,
869 CSeq_loc_Mapper::EGCAssemblyAlias alias)
870 {
871 switch ( id.Which() ) {
872 case CGC_TypedSeqId::e_Genbank:
873 if (alias == CSeq_loc_Mapper::eGCA_Genbank) {
874 return id.GetGenbank().IsSetGi() ?
875 &id.GetGenbank().GetGi() : &id.GetGenbank().GetPublic();
876 }
877 if (alias == CSeq_loc_Mapper::eGCA_GenbankAcc) {
878 return &id.GetGenbank().GetPublic();
879 }
880 break;
881 case CGC_TypedSeqId::e_Refseq:
882 if (alias == CSeq_loc_Mapper::eGCA_Refseq) {
883 return id.GetRefseq().IsSetGi() ?
884 &id.GetRefseq().GetGi() : &id.GetRefseq().GetPublic();
885 }
886 if (alias == CSeq_loc_Mapper::eGCA_RefseqAcc) {
887 return &id.GetRefseq().GetPublic();
888 }
889 break;
890 case CGC_TypedSeqId::e_External:
891 if (alias == CSeq_loc_Mapper::eGCA_UCSC &&
892 id.GetExternal().GetExternal() == "UCSC") {
893 return &id.GetExternal().GetId();
894 }
895 break;
896 case CGC_TypedSeqId::e_Private:
897 if (alias == CSeq_loc_Mapper::eGCA_Other) {
898 return &id.GetPrivate();
899 }
900 break;
901 default:
902 break;
903 }
904 return 0;
905 }
906
907
x_InitGCSequence(const CGC_Sequence & gc_seq,EGCAssemblyAlias to_alias)908 void CSeq_loc_Mapper::x_InitGCSequence(const CGC_Sequence& gc_seq,
909 EGCAssemblyAlias to_alias)
910 {
911 if ( gc_seq.IsSetSeq_id_synonyms() ) {
912 CConstRef<CSeq_id> dst_id;
913 ITERATE(CGC_Sequence::TSeq_id_synonyms, it, gc_seq.GetSeq_id_synonyms()) {
914 const CGC_TypedSeqId& id = **it;
915 dst_id.Reset(s_GetSeqIdAlias(id, to_alias));
916 if ( dst_id ) break; // Use the first matching alias
917 }
918 if ( dst_id ) {
919 TSynonyms synonyms;
920 synonyms.insert(CSeq_id_Handle::GetHandle(gc_seq.GetSeq_id()));
921 ITERATE(CGC_Sequence::TSeq_id_synonyms, it, gc_seq.GetSeq_id_synonyms()) {
922 // Add conversion for each synonym which can be used
923 // as a source id.
924 const CGC_TypedSeqId& id = **it;
925 switch ( id.Which() ) {
926 case CGC_TypedSeqId::e_Genbank:
927 if (id.GetGenbank().IsSetGi() && dst_id != &id.GetGenbank().GetGi()) {
928 synonyms.insert(CSeq_id_Handle::GetHandle(id.GetGenbank().GetGi()));
929 }
930 if (dst_id != &id.GetGenbank().GetPublic()) {
931 synonyms.insert(CSeq_id_Handle::GetHandle(id.GetGenbank().GetPublic()));
932 }
933 if ( id.GetGenbank().IsSetGpipe() ) {
934 synonyms.insert(CSeq_id_Handle::GetHandle(id.GetGenbank().GetGpipe()));
935 }
936 break;
937 case CGC_TypedSeqId::e_Refseq:
938 if (id.GetRefseq().IsSetGi() && dst_id != &id.GetRefseq().GetGi()) {
939 synonyms.insert(CSeq_id_Handle::GetHandle(id.GetRefseq().GetGi()));
940 }
941 if (dst_id != &id.GetRefseq().GetPublic()) {
942 synonyms.insert(CSeq_id_Handle::GetHandle(id.GetRefseq().GetPublic()));
943 }
944 if ( id.GetRefseq().IsSetGpipe() ) {
945 synonyms.insert(CSeq_id_Handle::GetHandle(id.GetRefseq().GetGpipe()));
946 }
947 break;
948 case CGC_TypedSeqId::e_Private:
949 // Ignore private local ids - they are not unique.
950 if (id.GetPrivate().IsLocal()) continue;
951 if (dst_id != &id.GetPrivate()) {
952 synonyms.insert(CSeq_id_Handle::GetHandle(id.GetPrivate()));
953 }
954 break;
955 case CGC_TypedSeqId::e_External:
956 if (dst_id != &id.GetExternal().GetId()) {
957 synonyms.insert(CSeq_id_Handle::GetHandle(id.GetExternal().GetId()));
958 }
959 break;
960 default:
961 NCBI_THROW(CAnnotMapperException, eOtherError,
962 "Unsupported alias type in GC-Sequence synonyms");
963 break;
964 }
965 }
966 CBioseq_Handle h = x_AddVirtualBioseq(synonyms, gc_seq);
967 TSeqPos hlen = kInvalidSeqPos;
968 if (h && h.CanGetInst_Length()) {
969 hlen = h.GetInst_Length();
970 }
971 x_AddConversion(gc_seq.GetSeq_id(), 0, eNa_strand_unknown,
972 *dst_id, 0, eNa_strand_unknown,
973 hlen != kInvalidSeqPos ? hlen : TRange::GetWholeLength(),
974 false, 0, hlen, hlen);
975 }
976 else if (to_alias == eGCA_UCSC || to_alias == eGCA_Refseq) {
977 TSynonyms synonyms;
978 CConstRef<CSeq_id> chr_id;
979 // The requested alias type not found,
980 // check for UCSC random chromosomes.
981 if ( x_IsUCSCRandomChr(gc_seq, chr_id, synonyms) ) {
982 _ASSERT(chr_id);
983
984 // Use structure (delta-seq) to initialize the mapper.
985 // Here we use just one level of the delta and parse it
986 // directly rather than use CSeqMap.
987 TSeqPos chr_pos = 0;
988 TSeqPos chr_len = kInvalidSeqPos;
989 ITERATE(CDelta_ext::Tdata, it, gc_seq.GetStructure().Get()) {
990 // Do not create mappings for literals/gaps.
991 if ( (*it)->IsLiteral() ) {
992 chr_pos += (*it)->GetLiteral().GetLength();
993 }
994 if ( !(*it)->IsLoc() ) {
995 continue;
996 }
997 CSeq_loc_CI loc_it((*it)->GetLoc());
998 for (; loc_it; ++loc_it) {
999 if ( loc_it.IsEmpty() ) continue;
1000 TSeqPos seg_pos = loc_it.GetRange().GetFrom();
1001 TSeqPos seg_len = loc_it.GetRange().GetLength();
1002 ENa_strand seg_str = loc_it.IsSetStrand() ?
1003 loc_it.GetStrand() : eNa_strand_unknown;
1004 switch ( to_alias ) {
1005 case eGCA_UCSC:
1006 // Map up to the chr
1007 x_NextMappingRange(loc_it.GetSeq_id(),
1008 seg_pos, seg_len, seg_str,
1009 *chr_id, chr_pos, chr_len,
1010 eNa_strand_unknown);
1011 break;
1012 case eGCA_Refseq:
1013 // Map down to delta parts
1014 x_NextMappingRange(*chr_id, chr_pos, chr_len,
1015 eNa_strand_unknown,
1016 loc_it.GetSeq_id(), seg_pos, seg_len,
1017 seg_str);
1018 break;
1019 default:
1020 break;
1021 }
1022 }
1023 }
1024 x_AddVirtualBioseq(synonyms, gc_seq);
1025 }
1026 }
1027 }
1028 if ( gc_seq.IsSetSequences() ) {
1029 ITERATE(CGC_Sequence::TSequences, seq, gc_seq.GetSequences()) {
1030 ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
1031 x_InitGCSequence(**tseq, to_alias);
1032 }
1033 }
1034 }
1035 }
1036
1037
1038 /////////////////////////////////////////////////////////////////////
1039 //
1040 // Initialization helpers
1041 //
1042
1043
1044 CSeq_align_Mapper_Base*
InitAlignMapper(const CSeq_align & src_align)1045 CSeq_loc_Mapper::InitAlignMapper(const CSeq_align& src_align)
1046 {
1047 return new CSeq_align_Mapper(src_align, *this);
1048 }
1049
1050
1051 END_SCOPE(objects)
1052 END_NCBI_SCOPE
1053