1 /*  $Id: seq_loc_mapper.cpp 591839 2019-08-21 15:03:00Z grichenk $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko
27 *
28 * File Description:
29 *   Seq-loc mapper
30 *
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <objmgr/seq_loc_mapper.hpp>
35 #include <objmgr/scope.hpp>
36 #include <objmgr/object_manager.hpp>
37 #include <objmgr/objmgr_exception.hpp>
38 #include <objmgr/seq_map.hpp>
39 #include <objmgr/seq_map_ci.hpp>
40 #include <objmgr/impl/synonyms.hpp>
41 #include <objmgr/impl/seq_align_mapper.hpp>
42 #include <objmgr/impl/seq_loc_cvt.hpp>
43 #include <objmgr/gc_assembly_parser.hpp>
44 #include <objects/seqloc/Seq_loc.hpp>
45 #include <objects/seqfeat/Seq_feat.hpp>
46 #include <objects/seqfeat/Cdregion.hpp>
47 #include <objects/seqloc/Seq_loc_equiv.hpp>
48 #include <objects/seqloc/Seq_bond.hpp>
49 #include <objects/seqalign/seqalign__.hpp>
50 #include <objects/genomecoll/genome_collection__.hpp>
51 #include <objects/seq/Delta_ext.hpp>
52 #include <objects/seq/Delta_seq.hpp>
53 #include <objects/seq/Seq_literal.hpp>
54 #include <objects/seq/Seq_ext.hpp>
55 #include <objects/seq/Seq_gap.hpp>
56 #include <algorithm>
57 
58 
59 BEGIN_NCBI_SCOPE
60 BEGIN_SCOPE(objects)
61 
62 
63 /////////////////////////////////////////////////////////////////////
64 //
65 // CScope_Mapper_Sequence_Info
66 //
67 //   Sequence type/length/synonyms provider using CScope to fetch
68 //   the information.
69 
70 
71 class CScope_Mapper_Sequence_Info : public IMapper_Sequence_Info
72 {
73 public:
74     CScope_Mapper_Sequence_Info(CScope* scope);
75 
76     virtual TSeqType GetSequenceType(const CSeq_id_Handle& idh);
77     virtual TSeqPos GetSequenceLength(const CSeq_id_Handle& idh);
78     virtual void CollectSynonyms(const CSeq_id_Handle& id,
79                                  TSynonyms&            synonyms);
80 private:
81     CHeapScope m_Scope;
82 };
83 
84 
CScope_Mapper_Sequence_Info(CScope * scope)85 CScope_Mapper_Sequence_Info::CScope_Mapper_Sequence_Info(CScope* scope)
86     : m_Scope(scope)
87 {
88 }
89 
90 
91 void CScope_Mapper_Sequence_Info::
CollectSynonyms(const CSeq_id_Handle & id,TSynonyms & synonyms)92 CollectSynonyms(const CSeq_id_Handle& id,
93                 TSynonyms&            synonyms)
94 {
95     if ( m_Scope.IsNull() ) {
96         synonyms.insert(id);
97     }
98     else {
99         CConstRef<CSynonymsSet> syns =
100             m_Scope.GetScope().GetSynonyms(id);
101         ITERATE(CSynonymsSet, syn_it, *syns) {
102             synonyms.insert(CSynonymsSet::GetSeq_id_Handle(syn_it));
103         }
104     }
105 }
106 
107 
108 CScope_Mapper_Sequence_Info::TSeqType
GetSequenceType(const CSeq_id_Handle & idh)109 CScope_Mapper_Sequence_Info::GetSequenceType(const CSeq_id_Handle& idh)
110 {
111     if ( m_Scope.IsNull() ) {
112         return CSeq_loc_Mapper_Base::eSeq_unknown;
113     }
114     switch ( m_Scope.GetScope().GetSequenceType(idh) ) {
115     case CSeq_inst::eMol_dna:
116     case CSeq_inst::eMol_rna:
117     case CSeq_inst::eMol_na:
118         return CSeq_loc_Mapper_Base::eSeq_nuc;
119     case CSeq_inst::eMol_aa:
120         return CSeq_loc_Mapper_Base::eSeq_prot;
121     default:
122         return CSeq_loc_Mapper_Base::eSeq_unknown;
123     }
124 }
125 
126 
GetSequenceLength(const CSeq_id_Handle & idh)127 TSeqPos CScope_Mapper_Sequence_Info::GetSequenceLength(const CSeq_id_Handle& idh)
128 {
129     CBioseq_Handle h;
130     if ( m_Scope.IsNull() ) {
131         return kInvalidSeqPos;
132     }
133     h = m_Scope.GetScope().GetBioseqHandle(idh);
134     return h ? h.GetBioseqLength() : kInvalidSeqPos;
135 }
136 
137 
138 /////////////////////////////////////////////////////////////////////
139 //
140 // CSeq_loc_Mapper
141 //
142 
143 
144 /////////////////////////////////////////////////////////////////////
145 //
146 //   Initialization of the mapper
147 //
148 
149 
150 inline
s_IndexToStrand(size_t idx)151 ENa_strand s_IndexToStrand(size_t idx)
152 {
153     _ASSERT(idx != 0);
154     return ENa_strand(idx - 1);
155 }
156 
157 #define STRAND_TO_INDEX(is_set, strand) \
158     ((is_set) ? size_t((strand) + 1) : 0)
159 
160 #define INDEX_TO_STRAND(idx) \
161     s_IndexToStrand(idx)
162 
163 
SetOptionsScope(CSeq_loc_Mapper_Options & options,CScope * scope)164 CSeq_loc_Mapper_Options& SetOptionsScope(CSeq_loc_Mapper_Options& options, CScope* scope)
165 {
166     if (!options.GetMapperSequenceInfo()) {
167         options.SetMapperSequenceInfo(new CScope_Mapper_Sequence_Info(scope));
168     }
169     return options;
170 }
171 
172 
CSeq_loc_Mapper(CMappingRanges * mapping_ranges,CScope * scope,CSeq_loc_Mapper_Options options)173 CSeq_loc_Mapper::CSeq_loc_Mapper(CMappingRanges* mapping_ranges,
174                                  CScope*         scope,
175                                  CSeq_loc_Mapper_Options options)
176     : CSeq_loc_Mapper_Base(mapping_ranges, SetOptionsScope(options, scope)),
177       m_Scope(scope)
178 {
179 }
180 
181 
CSeq_loc_Mapper(const CSeq_feat & map_feat,EFeatMapDirection dir,CScope * scope,CSeq_loc_Mapper_Options options)182 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_feat&  map_feat,
183                                  EFeatMapDirection dir,
184                                  CScope*           scope,
185                                  CSeq_loc_Mapper_Options options)
186     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
187       m_Scope(scope)
188 {
189     x_InitializeFeat(map_feat, dir);
190 }
191 
192 
CSeq_loc_Mapper(const CSeq_loc & source,const CSeq_loc & target,CScope * scope,CSeq_loc_Mapper_Options options)193 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_loc& source,
194                                  const CSeq_loc& target,
195                                  CScope*         scope,
196                                  CSeq_loc_Mapper_Options options)
197     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
198       m_Scope(scope)
199 {
200     x_InitializeLocs(source, target);
201 }
202 
203 
CSeq_loc_Mapper(const CSeq_align & map_align,const CSeq_id & to_id,CScope * scope,CSeq_loc_Mapper_Options options)204 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_align&       map_align,
205                                  const CSeq_id&          to_id,
206                                  CScope*                 scope,
207                                  CSeq_loc_Mapper_Options options)
208     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
209       m_Scope(scope)
210 {
211     x_InitializeAlign(map_align, to_id);
212 }
213 
214 
CSeq_loc_Mapper(const CSeq_align & map_align,size_t to_row,CScope * scope,CSeq_loc_Mapper_Options options)215 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_align&       map_align,
216                                  size_t                  to_row,
217                                  CScope*                 scope,
218                                  CSeq_loc_Mapper_Options options)
219     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
220       m_Scope(scope)
221 {
222     x_InitializeAlign(map_align, to_row);
223 }
224 
225 
CSeq_loc_Mapper(const CSeq_id & from_id,const CSeq_id & to_id,const CSeq_align & map_align,CScope * scope,CSeq_loc_Mapper_Options options)226 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeq_id&          from_id,
227                                  const CSeq_id&          to_id,
228                                  const CSeq_align&       map_align,
229                                  CScope*                 scope,
230                                  CSeq_loc_Mapper_Options options)
231     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
232       m_Scope(scope)
233 {
234     x_InitializeAlign(map_align, to_id, &from_id);
235 }
236 
237 
CSeq_loc_Mapper(size_t from_row,size_t to_row,const CSeq_align & map_align,CScope * scope,CSeq_loc_Mapper_Options options)238 CSeq_loc_Mapper::CSeq_loc_Mapper(size_t                  from_row,
239                                  size_t                  to_row,
240                                  const CSeq_align&       map_align,
241                                  CScope*                 scope,
242                                  CSeq_loc_Mapper_Options options)
243     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
244       m_Scope(scope)
245 {
246     x_InitializeAlign(map_align, to_row, from_row);
247 }
248 
249 
CSeq_loc_Mapper(CBioseq_Handle target_seq,ESeqMapDirection direction,CSeq_loc_Mapper_Options options)250 CSeq_loc_Mapper::CSeq_loc_Mapper(CBioseq_Handle   target_seq,
251                                  ESeqMapDirection direction,
252                                  CSeq_loc_Mapper_Options options)
253     : CSeq_loc_Mapper_Base(SetOptionsScope(options, &target_seq.GetScope())),
254       m_Scope(&target_seq.GetScope())
255 {
256     CConstRef<CSeq_id> top_level_id = target_seq.GetSeqId();
257     if ( !top_level_id ) {
258         // Bioseq handle has no id, try to get one.
259         CConstRef<CSynonymsSet> syns = target_seq.GetSynonyms();
260         if ( !syns->empty() ) {
261             top_level_id = syns->GetSeq_id_Handle(syns->begin()).GetSeqId();
262         }
263     }
264     x_InitializeSeqMap(target_seq.GetSeqMap(),
265                        top_level_id.GetPointerOrNull(),
266                        direction);
267     if (direction == eSeqMap_Up) {
268         // Ignore seq-map destination ranges, map whole sequence to itself,
269         // use unknown strand only.
270         m_DstRanges.resize(1);
271         m_DstRanges[0].clear();
272         m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_level_id)]
273             .push_back(TRange::GetWhole());
274     }
275     x_PreserveDestinationLocs();
276 }
277 
278 
CSeq_loc_Mapper(const CSeqMap & seq_map,ESeqMapDirection direction,const CSeq_id * top_level_id,CScope * scope,CSeq_loc_Mapper_Options options)279 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeqMap&   seq_map,
280                                  ESeqMapDirection direction,
281                                  const CSeq_id*   top_level_id,
282                                  CScope*          scope,
283                                  CSeq_loc_Mapper_Options options)
284     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
285       m_Scope(scope)
286 {
287     x_InitializeSeqMap(seq_map, top_level_id, direction);
288     x_PreserveDestinationLocs();
289 }
290 
291 
CSeq_loc_Mapper(CBioseq_Handle target_seq,ESeqMapDirection direction,SSeqMapSelector selector,CSeq_loc_Mapper_Options options)292 CSeq_loc_Mapper::CSeq_loc_Mapper(CBioseq_Handle   target_seq,
293                                  ESeqMapDirection direction,
294                                  SSeqMapSelector  selector,
295                                  CSeq_loc_Mapper_Options options)
296     : CSeq_loc_Mapper_Base(SetOptionsScope(options, &target_seq.GetScope())),
297       m_Scope(&target_seq.GetScope())
298 {
299     CConstRef<CSeq_id> top_id = target_seq.GetSeqId();
300     if ( !top_id ) {
301         // Bioseq handle has no id, try to get one.
302         CConstRef<CSynonymsSet> syns = target_seq.GetSynonyms();
303         if ( !syns->empty() ) {
304             top_id = syns->GetSeq_id_Handle(syns->begin()).GetSeqId();
305         }
306     }
307     selector.SetLinkUsedTSE(target_seq.GetTSE_Handle());
308     x_InitializeSeqMap(target_seq.GetSeqMap(), selector, top_id, direction);
309     if (direction == eSeqMap_Up) {
310         // Ignore seq-map destination ranges, map whole sequence to itself,
311         // use unknown strand only.
312         m_DstRanges.resize(1);
313         m_DstRanges[0].clear();
314         m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_id)]
315             .push_back(TRange::GetWhole());
316     }
317     x_PreserveDestinationLocs();
318 }
319 
320 
CSeq_loc_Mapper(const CSeqMap & seq_map,ESeqMapDirection direction,SSeqMapSelector selector,const CSeq_id * top_level_id,CScope * scope,CSeq_loc_Mapper_Options options)321 CSeq_loc_Mapper::CSeq_loc_Mapper(const CSeqMap&   seq_map,
322                                  ESeqMapDirection direction,
323                                  SSeqMapSelector  selector,
324                                  const CSeq_id*   top_level_id,
325                                  CScope*          scope,
326                                  CSeq_loc_Mapper_Options options)
327     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
328       m_Scope(scope)
329 {
330     x_InitializeSeqMap(seq_map,
331                        selector,
332                        top_level_id,
333                        direction);
334     x_PreserveDestinationLocs();
335 }
336 
337 
CSeq_loc_Mapper(size_t depth,const CBioseq_Handle & top_level_seq,ESeqMapDirection direction,CSeq_loc_Mapper_Options options)338 CSeq_loc_Mapper::CSeq_loc_Mapper(size_t                 depth,
339                                  const CBioseq_Handle&  top_level_seq,
340                                  ESeqMapDirection       direction,
341                                  CSeq_loc_Mapper_Options options)
342     : CSeq_loc_Mapper_Base(SetOptionsScope(options, &top_level_seq.GetScope())),
343       m_Scope(&top_level_seq.GetScope())
344 {
345     if (depth > 0) {
346         depth--;
347         x_InitializeSeqMap(top_level_seq.GetSeqMap(),
348                            depth,
349                            top_level_seq.GetSeqId().GetPointer(),
350                            direction);
351     }
352     else if (direction == eSeqMap_Up) {
353         // Synonyms conversion
354         CConstRef<CSeq_id> top_level_id = top_level_seq.GetSeqId();
355         m_DstRanges.resize(1);
356         m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_level_id)]
357             .push_back(TRange::GetWhole());
358     }
359     x_PreserveDestinationLocs();
360 }
361 
362 
CSeq_loc_Mapper(size_t depth,const CSeqMap & top_level_seq,ESeqMapDirection direction,const CSeq_id * top_level_id,CScope * scope,CSeq_loc_Mapper_Options options)363 CSeq_loc_Mapper::CSeq_loc_Mapper(size_t           depth,
364                                  const CSeqMap&   top_level_seq,
365                                  ESeqMapDirection direction,
366                                  const CSeq_id*   top_level_id,
367                                  CScope*          scope,
368                                  CSeq_loc_Mapper_Options options)
369     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
370       m_Scope(scope)
371 {
372     if (depth > 0) {
373         depth--;
374         x_InitializeSeqMap(top_level_seq, depth, top_level_id, direction);
375     }
376     else if (direction == eSeqMap_Up) {
377         // Synonyms conversion
378         m_DstRanges.resize(1);
379         m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_level_id)]
380             .push_back(TRange::GetWhole());
381     }
382     x_PreserveDestinationLocs();
383 }
384 
385 
CSeq_loc_Mapper(const CGC_Assembly & gc_assembly,EGCAssemblyAlias to_alias,CScope * scope,EScopeFlag scope_flag)386 CSeq_loc_Mapper::CSeq_loc_Mapper(const CGC_Assembly& gc_assembly,
387                                  EGCAssemblyAlias    to_alias,
388                                  CScope*             scope,
389                                  EScopeFlag          scope_flag)
390     : CSeq_loc_Mapper_Base(CSeq_loc_Mapper_Options(new CScope_Mapper_Sequence_Info(scope))),
391       m_Scope(scope)
392 {
393     // While parsing GC-Assembly the mapper will need to add virtual
394     // bioseqs to the scope. To keep the original scope clean of them,
395     // create a new scope and add the original one as a child.
396     if (scope_flag == eCopyScope) {
397         m_Scope = CHeapScope(new CScope(*CObjectManager::GetInstance()));
398         if ( scope ) {
399             m_Scope.GetScope().AddScope(*scope);
400         }
401         m_MapOptions.SetMapperSequenceInfo(new CScope_Mapper_Sequence_Info(m_Scope));
402     }
403     x_InitGCAssembly(gc_assembly, to_alias);
404 }
405 
406 
CSeq_loc_Mapper(const CGC_Assembly & gc_assembly,ESeqMapDirection direction,SSeqMapSelector selector,CScope * scope,EScopeFlag scope_flag,CSeq_loc_Mapper_Options options)407 CSeq_loc_Mapper::CSeq_loc_Mapper(const CGC_Assembly& gc_assembly,
408                                  ESeqMapDirection    direction,
409                                  SSeqMapSelector     selector,
410                                  CScope*             scope,
411                                  EScopeFlag          scope_flag,
412                                  CSeq_loc_Mapper_Options options)
413     : CSeq_loc_Mapper_Base(SetOptionsScope(options, scope)),
414       m_Scope(scope)
415 {
416     // While parsing GC-Assembly the mapper will need to add virtual
417     // bioseqs to the scope. To keep the original scope clean of them,
418     // create a new scope and add the original one as a child.
419     if (scope_flag == eCopyScope) {
420         m_Scope = CHeapScope(new CScope(*CObjectManager::GetInstance()));
421         if ( scope ) {
422             m_Scope.GetScope().AddScope(*scope);
423         }
424         m_MapOptions.SetMapperSequenceInfo(new CScope_Mapper_Sequence_Info(m_Scope));
425     }
426     CGC_Assembly_Parser parser(gc_assembly);
427     CRef<CSeq_entry> entry = parser.GetTSE();
428     m_Scope.GetScope().AddTopLevelSeqEntry(*entry);
429     const CGC_Assembly_Parser::TSeqIds& ids = parser.GetTopLevelSequences();
430     ITERATE(CGC_Assembly_Parser::TSeqIds, id, ids) {
431         CBioseq_Handle h = m_Scope.GetScope().GetBioseqHandle(*id);
432         if ( !h ) continue;
433         x_InitializeSeqMap(h.GetSeqMap(), selector, id->GetSeqId(), direction);
434         if (direction == eSeqMap_Up) {
435             // Ignore seq-map destination ranges, map whole sequence to itself,
436             // use unknown strand only.
437             m_DstRanges.resize(1);
438             m_DstRanges[0].clear();
439             m_DstRanges[0][*id].push_back(TRange::GetWhole());
440         }
441         x_PreserveDestinationLocs();
442     }
443 }
444 
445 
~CSeq_loc_Mapper(void)446 CSeq_loc_Mapper::~CSeq_loc_Mapper(void)
447 {
448     return;
449 }
450 
451 
x_InitializeSeqMap(const CSeqMap & seq_map,const CSeq_id * top_id,ESeqMapDirection direction)452 void CSeq_loc_Mapper::x_InitializeSeqMap(const CSeqMap&   seq_map,
453                                          const CSeq_id*   top_id,
454                                          ESeqMapDirection direction)
455 {
456     x_InitializeSeqMap(seq_map, size_t(-1), top_id, direction);
457 }
458 
459 
x_InitializeSeqMap(const CSeqMap & seq_map,size_t depth,const CSeq_id * top_id,ESeqMapDirection direction)460 void CSeq_loc_Mapper::x_InitializeSeqMap(const CSeqMap&   seq_map,
461                                          size_t           depth,
462                                          const CSeq_id*   top_id,
463                                          ESeqMapDirection direction)
464 {
465     x_InitializeSeqMap(seq_map,
466                        SSeqMapSelector(0, depth),
467                        top_id,
468                        direction);
469 }
470 
471 
x_InitializeSeqMap(const CSeqMap & seq_map,SSeqMapSelector selector,const CSeq_id * top_id,ESeqMapDirection direction)472 void CSeq_loc_Mapper::x_InitializeSeqMap(const CSeqMap&   seq_map,
473                                          SSeqMapSelector  selector,
474                                          const CSeq_id*   top_id,
475                                          ESeqMapDirection direction)
476 {
477     selector.SetFlags(CSeqMap::fFindRef | CSeqMap::fIgnoreUnresolved)
478         .SetLinkUsedTSE();
479     x_InitializeSeqMap(CSeqMap_CI(ConstRef(&seq_map),
480                        m_Scope.GetScopeOrNull(), selector),
481                        top_id,
482                        direction);
483 }
484 
485 
x_InitializeSeqMap(CSeqMap_CI seg_it,const CSeq_id * top_id,ESeqMapDirection direction)486 void CSeq_loc_Mapper::x_InitializeSeqMap(CSeqMap_CI       seg_it,
487                                          const CSeq_id*   top_id,
488                                          ESeqMapDirection direction)
489 {
490     if (m_MapOptions.GetMapSingleLevel()) {
491         x_InitializeSeqMapSingleLevel(seg_it, top_id, direction);
492     }
493     else if (direction == eSeqMap_Up) {
494         x_InitializeSeqMapUp(seg_it, top_id);
495     }
496     else {
497         x_InitializeSeqMapDown(seg_it, top_id);
498     }
499 }
500 
501 
x_InitializeSeqMapUp(CSeqMap_CI seg_it,const CSeq_id * top_id)502 void CSeq_loc_Mapper::x_InitializeSeqMapUp(CSeqMap_CI       seg_it,
503                                            const CSeq_id*   top_id)
504 {
505     TSeqPos src_from, src_len, dst_from, dst_len;
506     ENa_strand dst_strand = eNa_strand_unknown;
507     // Mapping up - for each iterator create mapping to the top level.
508     // If top_id is set, top level positions are the iterator's positions.
509     // Otherwise top-level ids and positions must be taked from the
510     // iterators with depth == 1.
511     TSeqPos top_ref_start = 0;
512     TSeqPos top_start = 0;
513     CConstRef<CSeq_id> dst_id(top_id);
514     _ASSERT(seg_it.GetDepth() == 1);
515     while ( seg_it ) {
516         ENa_strand src_strand = seg_it.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
517         src_from = seg_it.GetRefPosition();
518         src_len = seg_it.GetLength();
519         dst_len = src_len;
520         if (top_id) {
521             dst_from = seg_it.GetPosition();
522             x_NextMappingRange(
523                 *seg_it.GetRefSeqid().GetSeqId(),
524                 src_from, src_len, src_strand,
525                 *top_id,
526                 dst_from, dst_len, dst_strand);
527         }
528         else /* !top_id */ {
529             if (seg_it.GetDepth() == 1) {
530                 // Depth==1 - top level sequences (destination).
531                 dst_id.Reset(seg_it.GetRefSeqid().GetSeqId());
532                 top_ref_start = seg_it.GetRefPosition();
533                 top_start = seg_it.GetPosition();
534                 dst_strand = seg_it.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
535             }
536             else {
537                 _ASSERT(seg_it.GetPosition() >= top_start);
538                 TSeqPos shift = seg_it.GetPosition() - top_start;
539                 dst_from = top_ref_start + shift;
540                 x_NextMappingRange(
541                     *seg_it.GetRefSeqid().GetSeqId(),
542                     src_from, src_len, src_strand,
543                     *dst_id,
544                     dst_from, dst_len, dst_strand);
545             }
546         }
547         ++seg_it;
548     }
549 }
550 
551 
x_InitializeSeqMapDown(CSeqMap_CI seg_it,const CSeq_id * top_id)552 void CSeq_loc_Mapper::x_InitializeSeqMapDown(CSeqMap_CI       seg_it,
553                                              const CSeq_id*   top_id)
554 {
555     TSeqPos src_from, src_len, dst_from, dst_len;
556     ENa_strand dst_strand = eNa_strand_unknown;
557     // eSeqMap_Down
558     // Collect all non-leaf references, create mapping from each non-leaf
559     // to the bottom level.
560     list<CSeqMap_CI> refs;
561     refs.push_back(seg_it);
562     while ( seg_it ) {
563         ++seg_it;
564         // While depth increases push all iterators to the stack.
565         if ( seg_it ) {
566             if (refs.empty()  ||  refs.back().GetDepth() < seg_it.GetDepth()) {
567                 refs.push_back(seg_it);
568                 continue;
569             }
570         }
571         // End of seq-map or the last iterator was a leaf - create mappings.
572         if ( !refs.empty() ) {
573             CSeqMap_CI leaf = refs.back();
574             // Exclude self-mapping of the leaf reference.
575             refs.pop_back();
576             dst_strand = leaf.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
577             if ( top_id ) {
578                 // Add mapping from the top-level sequence if any.
579                 src_from = leaf.GetPosition();
580                 src_len = leaf.GetLength();
581                 dst_from = leaf.GetRefPosition();
582                 dst_len = src_len;
583                 x_NextMappingRange(
584                     *top_id,
585                     src_from, src_len, eNa_strand_unknown,
586                     *leaf.GetRefSeqid().GetSeqId(),
587                     dst_from, dst_len, dst_strand);
588             }
589             // Create mapping from each non-leaf level.
590             ITERATE(list<CSeqMap_CI>, it, refs) {
591                 TSeqPos shift = leaf.GetPosition() - it->GetPosition();
592                 ENa_strand src_strand = it->GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
593                 src_from = it->GetRefPosition() + shift;
594                 src_len = leaf.GetLength();
595                 dst_from = leaf.GetRefPosition();
596                 dst_len = src_len;
597                 x_NextMappingRange(
598                     *it->GetRefSeqid().GetSeqId(),
599                     src_from, src_len, src_strand,
600                     *leaf.GetRefSeqid().GetSeqId(),
601                     dst_from, dst_len, dst_strand);
602             }
603             while ( !refs.empty()  &&  refs.back().GetDepth() >= seg_it.GetDepth()) {
604                 refs.pop_back();
605             }
606         }
607         if ( seg_it ) {
608             refs.push_back(seg_it);
609         }
610     }
611 }
612 
613 
x_InitializeSeqMapSingleLevel(CSeqMap_CI seg_it,const CSeq_id * top_id,ESeqMapDirection direction)614 void CSeq_loc_Mapper::x_InitializeSeqMapSingleLevel(CSeqMap_CI       seg_it,
615                                                     const CSeq_id*   top_id,
616                                                     ESeqMapDirection direction)
617 {
618     TSeqPos seg_from, seg_len, ref_from, ref_len;
619     ENa_strand seg_strand = eNa_strand_unknown;
620     ENa_strand ref_strand = eNa_strand_unknown;
621     // Stack of segments for each level.
622     list<CSeqMap_CI> refs;
623     refs.push_back(seg_it);
624     while ( seg_it ) {
625         ++seg_it;
626         // While depth increases push all iterators to the stack.
627         if ( seg_it ) {
628             if (refs.empty()  ||  refs.back().GetDepth() < seg_it.GetDepth()) {
629                 refs.push_back(seg_it);
630                 continue;
631             }
632         }
633         // End of seq-map or the last iterator was a leaf - create mappings.
634         if ( !refs.empty() ) {
635             CSeqMap_CI ref = refs.back();
636             refs.pop_back();
637             if (direction == eSeqMap_Down) {
638                 // Create self-mapping of the leaf reference - we can not use
639                 // m_DstRanges here since they will contain ranges for each
640                 // level while we need only the bottom.
641                 seg_from = ref.GetRefPosition();
642                 seg_len = ref.GetLength();
643                 ref_from = seg_from;
644                 ref_len = seg_len;
645                 CConstRef<CSeq_id> id = ref.GetRefSeqid().GetSeqId();
646                 x_NextMappingRange(*id, seg_from, seg_len, eNa_strand_unknown,
647                     *id, ref_from, ref_len, eNa_strand_unknown);
648             }
649             // Create mapping for each non-leaf level.
650             while ( !refs.empty() ) {
651                 const CSeqMap_CI& seg = refs.back();
652                 TSeqPos shift = ref.GetPosition() - seg.GetPosition();
653                 seg_strand = seg.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
654                 seg_from = seg.GetRefPosition() + shift;
655                 seg_len = ref.GetLength();
656                 ref_from = ref.GetRefPosition();
657                 ref_len = seg_len;
658                 ref_strand = ref.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
659                 switch (direction) {
660                 case eSeqMap_Down:
661                     x_NextMappingRange(
662                         *seg.GetRefSeqid().GetSeqId(),
663                         seg_from, seg_len, seg_strand,
664                         *ref.GetRefSeqid().GetSeqId(),
665                         ref_from, ref_len, ref_strand);
666                     break;
667                 case eSeqMap_Up:
668                     x_NextMappingRange(
669                         *ref.GetRefSeqid().GetSeqId(),
670                         ref_from, ref_len, ref_strand,
671                         *seg.GetRefSeqid().GetSeqId(),
672                         seg_from, seg_len, seg_strand);
673                     break;
674                 }
675                 ref = seg;
676                 if (ref.GetDepth() >= seg_it.GetDepth()) {
677                     refs.pop_back();
678                 }
679                 else {
680                     break;
681                 }
682             }
683             // Are there still segments above?
684             if ( refs.empty() ) {
685                 // Top level segment.
686                 _ASSERT(ref);
687                 if (top_id) {
688                     // If the top level is a single bioseq, add mapping.
689                     seg_from = ref.GetPosition();
690                     seg_len = ref.GetLength();
691                     ref_from = ref.GetRefPosition();
692                     ref_len = seg_len;
693                     ref_strand = ref.GetRefMinusStrand() ? eNa_strand_minus : eNa_strand_plus;
694                     switch (direction) {
695                     case eSeqMap_Down:
696                         x_NextMappingRange(
697                             *top_id,
698                             seg_from, seg_len, eNa_strand_unknown,
699                             *ref.GetRefSeqid().GetSeqId(),
700                             ref_from, ref_len, ref_strand);
701                         break;
702                     case eSeqMap_Up:
703                         x_NextMappingRange(
704                             *ref.GetRefSeqid().GetSeqId(),
705                             ref_from, ref_len, ref_strand,
706                             *top_id,
707                             seg_from, seg_len, seg_strand);
708                         break;
709                     }
710                 }
711                 else if (direction == eSeqMap_Up) {
712                     // Create self-mapping of the top-level reference if the top
713                     // level is not a single bioseq but rather a seq-map level.
714                     ref_from = ref.GetRefPosition();
715                     ref_len = ref.GetLength();
716                     seg_from = ref_from;
717                     seg_len = ref_len;
718                     CConstRef<CSeq_id> id = ref.GetRefSeqid().GetSeqId();
719                     x_NextMappingRange(*id, ref_from, ref_len, eNa_strand_unknown,
720                         *id, seg_from, seg_len, eNa_strand_unknown);
721                 }
722             }
723         }
724         if ( seg_it ) {
725             refs.push_back(seg_it);
726         }
727     }
728     // Remove all collected destination ranges - they are not real destinations.
729     m_DstRanges.clear();
730     // If top level is a single sequence, create self-mapping for it.
731     if (top_id  &&  direction == eSeqMap_Up) {
732         m_DstRanges.resize(1);
733         m_DstRanges[0].clear();
734         m_DstRanges[0][CSeq_id_Handle::GetHandle(*top_id)].push_back(TRange::GetWhole());
735     }
736 }
737 
738 
739 CBioseq_Handle
x_AddVirtualBioseq(const TSynonyms & synonyms,const CGC_Sequence & gc_seq)740 CSeq_loc_Mapper::x_AddVirtualBioseq(const TSynonyms&    synonyms,
741                                     const CGC_Sequence& gc_seq)
742 {
743     CRef<CBioseq> bioseq(new CBioseq);
744     ITERATE(IMapper_Sequence_Info::TSynonyms, syn, synonyms) {
745         CBioseq_Handle h = m_Scope.GetScope().GetBioseqHandle(*syn);
746         if ( h ) {
747             return h;
748         }
749         CRef<CSeq_id> syn_id(new CSeq_id);
750         syn_id->Assign(*syn->GetSeqId());
751         bioseq->SetId().push_back(syn_id);
752     }
753 
754     bioseq->SetInst().SetMol(CSeq_inst::eMol_na);
755     if ( gc_seq.CanGetLength() ) {
756         bioseq->SetInst().SetLength(gc_seq.GetLength());
757     }
758 
759     // Create virtual bioseq without length/data.
760     bioseq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
761     return m_Scope.GetScope().AddBioseq(*bioseq);
762 }
763 
764 
x_InitGCAssembly(const CGC_Assembly & gc_assembly,EGCAssemblyAlias to_alias)765 void CSeq_loc_Mapper::x_InitGCAssembly(const CGC_Assembly& gc_assembly,
766                                        EGCAssemblyAlias    to_alias)
767 {
768     if ( gc_assembly.IsUnit() ) {
769         const CGC_AssemblyUnit& unit = gc_assembly.GetUnit();
770         if ( unit.IsSetMols() ) {
771             ITERATE(CGC_AssemblyUnit::TMols, it, unit.GetMols()) {
772                 const CGC_Replicon::TSequence& seq = (*it)->GetSequence();
773                 if ( seq.IsSingle() ) {
774                     x_InitGCSequence(seq.GetSingle(), to_alias);
775                 }
776                 else {
777                     ITERATE(CGC_Replicon::TSequence::TSet, tseq, seq.GetSet()) {
778                         x_InitGCSequence(**tseq, to_alias);
779                     }
780                 }
781             }
782         }
783         if ( unit.IsSetOther_sequences() ) {
784             ITERATE(CGC_Sequence::TSequences, seq, unit.GetOther_sequences()) {
785                 ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
786                     x_InitGCSequence(**tseq, to_alias);
787                 }
788             }
789         }
790     }
791     else if ( gc_assembly.IsAssembly_set() ) {
792         const CGC_AssemblySet& aset = gc_assembly.GetAssembly_set();
793         x_InitGCAssembly(aset.GetPrimary_assembly(), to_alias);
794         if ( aset.IsSetMore_assemblies() ) {
795             ITERATE(CGC_AssemblySet::TMore_assemblies, assm, aset.GetMore_assemblies()) {
796                 x_InitGCAssembly(**assm, to_alias);
797             }
798         }
799     }
800 }
801 
802 
s_IsLocalRandomChrId(const CSeq_id & id)803 inline bool s_IsLocalRandomChrId(const CSeq_id& id)
804 {
805     return id.IsLocal()  &&  id.GetLocal().IsStr()  &&
806         id.GetLocal().GetStr().find("_random") != string::npos;
807 }
808 
809 
x_IsUCSCRandomChr(const CGC_Sequence & gc_seq,CConstRef<CSeq_id> & chr_id,TSynonyms & synonyms) const810 bool CSeq_loc_Mapper::x_IsUCSCRandomChr(const CGC_Sequence& gc_seq,
811                                         CConstRef<CSeq_id>& chr_id,
812                                         TSynonyms& synonyms) const
813 {
814     chr_id.Reset();
815     if (!gc_seq.IsSetStructure()) return false;
816 
817     CConstRef<CSeq_id> id(&gc_seq.GetSeq_id());
818     synonyms.insert(CSeq_id_Handle::GetHandle(*id));
819     if ( s_IsLocalRandomChrId(*id) ) {
820         chr_id = id;
821     }
822     // Collect all synonyms.
823     ITERATE(CGC_Sequence::TSeq_id_synonyms, it, gc_seq.GetSeq_id_synonyms()) {
824         const CGC_TypedSeqId& gc_id = **it;
825         switch ( gc_id.Which() ) {
826         case CGC_TypedSeqId::e_Genbank:
827             if ( gc_id.GetGenbank().IsSetGi() ) {
828                 id.Reset(&gc_id.GetGenbank().GetGi());
829             }
830             break;
831         case CGC_TypedSeqId::e_Refseq:
832             if ( gc_id.GetRefseq().IsSetGi() ) {
833                 id.Reset(&gc_id.GetRefseq().GetGi());
834             }
835             break;
836         case CGC_TypedSeqId::e_External:
837             id.Reset(&gc_id.GetExternal().GetId());
838             break;
839         case CGC_TypedSeqId::e_Private:
840             id.Reset(&gc_id.GetPrivate());
841             break;
842         default:
843             continue;
844         }
845         synonyms.insert(CSeq_id_Handle::GetHandle(*id));
846         if ( !chr_id  &&  s_IsLocalRandomChrId(*id) ) {
847             chr_id = id;
848         }
849     }
850 
851     if ( !chr_id ) {
852         synonyms.clear();
853         return false;
854     }
855 
856     // Use only random chromosome ids, ignore other synonyms (?)
857     string lcl_str = chr_id->GetLocal().GetStr();
858     if ( !NStr::StartsWith(lcl_str, "chr") ) {
859         CSeq_id lcl;
860         lcl.SetLocal().SetStr("chr" + lcl_str);
861         synonyms.insert(CSeq_id_Handle::GetHandle(lcl));
862     }
863 
864     return true;
865 }
866 
867 
s_GetSeqIdAlias(const CGC_TypedSeqId & id,CSeq_loc_Mapper::EGCAssemblyAlias alias)868 const CSeq_id* s_GetSeqIdAlias(const CGC_TypedSeqId& id,
869                                CSeq_loc_Mapper::EGCAssemblyAlias alias)
870 {
871     switch ( id.Which() ) {
872     case CGC_TypedSeqId::e_Genbank:
873         if (alias == CSeq_loc_Mapper::eGCA_Genbank) {
874             return id.GetGenbank().IsSetGi() ?
875                 &id.GetGenbank().GetGi() : &id.GetGenbank().GetPublic();
876         }
877         if (alias == CSeq_loc_Mapper::eGCA_GenbankAcc) {
878             return &id.GetGenbank().GetPublic();
879         }
880         break;
881     case CGC_TypedSeqId::e_Refseq:
882         if (alias == CSeq_loc_Mapper::eGCA_Refseq) {
883             return id.GetRefseq().IsSetGi() ?
884                 &id.GetRefseq().GetGi() : &id.GetRefseq().GetPublic();
885         }
886         if (alias == CSeq_loc_Mapper::eGCA_RefseqAcc) {
887             return &id.GetRefseq().GetPublic();
888         }
889         break;
890     case CGC_TypedSeqId::e_External:
891         if (alias == CSeq_loc_Mapper::eGCA_UCSC  &&
892             id.GetExternal().GetExternal() == "UCSC") {
893             return &id.GetExternal().GetId();
894         }
895         break;
896     case CGC_TypedSeqId::e_Private:
897         if (alias == CSeq_loc_Mapper::eGCA_Other) {
898             return &id.GetPrivate();
899         }
900         break;
901     default:
902         break;
903     }
904     return 0;
905 }
906 
907 
x_InitGCSequence(const CGC_Sequence & gc_seq,EGCAssemblyAlias to_alias)908 void CSeq_loc_Mapper::x_InitGCSequence(const CGC_Sequence& gc_seq,
909                                        EGCAssemblyAlias    to_alias)
910 {
911     if ( gc_seq.IsSetSeq_id_synonyms() ) {
912         CConstRef<CSeq_id> dst_id;
913         ITERATE(CGC_Sequence::TSeq_id_synonyms, it, gc_seq.GetSeq_id_synonyms()) {
914             const CGC_TypedSeqId& id = **it;
915             dst_id.Reset(s_GetSeqIdAlias(id, to_alias));
916             if ( dst_id ) break; // Use the first matching alias
917         }
918         if ( dst_id ) {
919             TSynonyms synonyms;
920             synonyms.insert(CSeq_id_Handle::GetHandle(gc_seq.GetSeq_id()));
921             ITERATE(CGC_Sequence::TSeq_id_synonyms, it, gc_seq.GetSeq_id_synonyms()) {
922                 // Add conversion for each synonym which can be used
923                 // as a source id.
924                 const CGC_TypedSeqId& id = **it;
925                 switch ( id.Which() ) {
926                 case CGC_TypedSeqId::e_Genbank:
927                     if (id.GetGenbank().IsSetGi()  &&  dst_id != &id.GetGenbank().GetGi()) {
928                         synonyms.insert(CSeq_id_Handle::GetHandle(id.GetGenbank().GetGi()));
929                     }
930                     if (dst_id != &id.GetGenbank().GetPublic()) {
931                         synonyms.insert(CSeq_id_Handle::GetHandle(id.GetGenbank().GetPublic()));
932                     }
933                     if ( id.GetGenbank().IsSetGpipe() ) {
934                         synonyms.insert(CSeq_id_Handle::GetHandle(id.GetGenbank().GetGpipe()));
935                     }
936                     break;
937                 case CGC_TypedSeqId::e_Refseq:
938                     if (id.GetRefseq().IsSetGi()  &&  dst_id != &id.GetRefseq().GetGi()) {
939                         synonyms.insert(CSeq_id_Handle::GetHandle(id.GetRefseq().GetGi()));
940                     }
941                     if (dst_id != &id.GetRefseq().GetPublic()) {
942                         synonyms.insert(CSeq_id_Handle::GetHandle(id.GetRefseq().GetPublic()));
943                     }
944                     if ( id.GetRefseq().IsSetGpipe() ) {
945                         synonyms.insert(CSeq_id_Handle::GetHandle(id.GetRefseq().GetGpipe()));
946                     }
947                     break;
948                 case CGC_TypedSeqId::e_Private:
949                     // Ignore private local ids - they are not unique.
950                     if (id.GetPrivate().IsLocal()) continue;
951                     if (dst_id != &id.GetPrivate()) {
952                         synonyms.insert(CSeq_id_Handle::GetHandle(id.GetPrivate()));
953                     }
954                     break;
955                 case CGC_TypedSeqId::e_External:
956                     if (dst_id != &id.GetExternal().GetId()) {
957                         synonyms.insert(CSeq_id_Handle::GetHandle(id.GetExternal().GetId()));
958                     }
959                     break;
960                 default:
961                     NCBI_THROW(CAnnotMapperException, eOtherError,
962                                "Unsupported alias type in GC-Sequence synonyms");
963                     break;
964                 }
965             }
966             CBioseq_Handle h = x_AddVirtualBioseq(synonyms, gc_seq);
967             TSeqPos hlen = kInvalidSeqPos;
968             if (h && h.CanGetInst_Length()) {
969                 hlen = h.GetInst_Length();
970             }
971             x_AddConversion(gc_seq.GetSeq_id(), 0, eNa_strand_unknown,
972                 *dst_id, 0, eNa_strand_unknown,
973                 hlen != kInvalidSeqPos ? hlen : TRange::GetWholeLength(),
974                 false, 0, hlen, hlen);
975         }
976         else if (to_alias == eGCA_UCSC  ||  to_alias == eGCA_Refseq) {
977             TSynonyms synonyms;
978             CConstRef<CSeq_id> chr_id;
979             // The requested alias type not found,
980             // check for UCSC random chromosomes.
981             if ( x_IsUCSCRandomChr(gc_seq, chr_id, synonyms) ) {
982                 _ASSERT(chr_id);
983 
984                 // Use structure (delta-seq) to initialize the mapper.
985                 // Here we use just one level of the delta and parse it
986                 // directly rather than use CSeqMap.
987                 TSeqPos chr_pos = 0;
988                 TSeqPos chr_len = kInvalidSeqPos;
989                 ITERATE(CDelta_ext::Tdata, it, gc_seq.GetStructure().Get()) {
990                     // Do not create mappings for literals/gaps.
991                     if ( (*it)->IsLiteral() ) {
992                         chr_pos += (*it)->GetLiteral().GetLength();
993                     }
994                     if ( !(*it)->IsLoc() ) {
995                         continue;
996                     }
997                     CSeq_loc_CI loc_it((*it)->GetLoc());
998                     for (; loc_it; ++loc_it) {
999                         if ( loc_it.IsEmpty() ) continue;
1000                         TSeqPos seg_pos = loc_it.GetRange().GetFrom();
1001                         TSeqPos seg_len = loc_it.GetRange().GetLength();
1002                         ENa_strand seg_str = loc_it.IsSetStrand() ?
1003                             loc_it.GetStrand() : eNa_strand_unknown;
1004                         switch ( to_alias ) {
1005                         case eGCA_UCSC:
1006                             // Map up to the chr
1007                             x_NextMappingRange(loc_it.GetSeq_id(),
1008                                 seg_pos, seg_len, seg_str,
1009                                 *chr_id, chr_pos, chr_len,
1010                                 eNa_strand_unknown);
1011                             break;
1012                         case eGCA_Refseq:
1013                             // Map down to delta parts
1014                             x_NextMappingRange(*chr_id, chr_pos, chr_len,
1015                                 eNa_strand_unknown,
1016                                 loc_it.GetSeq_id(), seg_pos, seg_len,
1017                                 seg_str);
1018                             break;
1019                         default:
1020                             break;
1021                         }
1022                     }
1023                 }
1024                 x_AddVirtualBioseq(synonyms, gc_seq);
1025             }
1026         }
1027     }
1028     if ( gc_seq.IsSetSequences() ) {
1029         ITERATE(CGC_Sequence::TSequences, seq, gc_seq.GetSequences()) {
1030             ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
1031                 x_InitGCSequence(**tseq, to_alias);
1032             }
1033         }
1034     }
1035 }
1036 
1037 
1038 /////////////////////////////////////////////////////////////////////
1039 //
1040 //   Initialization helpers
1041 //
1042 
1043 
1044 CSeq_align_Mapper_Base*
InitAlignMapper(const CSeq_align & src_align)1045 CSeq_loc_Mapper::InitAlignMapper(const CSeq_align& src_align)
1046 {
1047     return new CSeq_align_Mapper(src_align, *this);
1048 }
1049 
1050 
1051 END_SCOPE(objects)
1052 END_NCBI_SCOPE
1053