1 /*  $Id: gc_assembly_parser.cpp 513580 2016-09-13 11:58:16Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:
27 *           Aleksey Grichenko
28 *
29 * File Description:
30 *           GC-Assembly parser used by CScope and CSeq_loc_Mapper to
31 *           convert assemblies to seq-entries.
32 *
33 */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include <objmgr/gc_assembly_parser.hpp>
38 #include <objmgr/error_codes.hpp>
39 #include <objects/genomecoll/genome_collection__.hpp>
40 #include <objects/seq/seq__.hpp>
41 #include <objects/seqloc/seqloc__.hpp>
42 #include <objects/seqset/seqset__.hpp>
43 
44 
45 #define NCBI_USE_ERRCODE_X   ObjMgr_GC_Assembly_Parser
46 
47 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects) const48 BEGIN_SCOPE(objects)
49 
50 
51 const char* CAssemblyParserException::GetErrCodeString(void) const
52 {
53     switch ( GetErrCode() ) {
54     case eUnsupported:      return "eUnsupported";
55     case eOtherError:       return "eOtherError";
56     default:                return CException::GetErrCodeString();
57     }
58 }
59 
60 
61 /////////////////////////////////////////////////////////////////////////////
62 //
63 // CGC_Assembly_Parser
64 //
65 /////////////////////////////////////////////////////////////////////////////
66 
67 
CGC_Assembly_Parser(const CGC_Assembly & assembly,TParserFlags flags)68 CGC_Assembly_Parser::CGC_Assembly_Parser(const CGC_Assembly& assembly,
69                                          TParserFlags        flags)
70     : m_Flags(flags)
71 {
72     m_TSE.Reset(new CSeq_entry);
73     x_InitSeq_entry(m_TSE, null);
74     x_ParseGCAssembly(assembly, m_TSE);
75 }
76 
77 
~CGC_Assembly_Parser(void)78 CGC_Assembly_Parser::~CGC_Assembly_Parser(void)
79 {
80 }
81 
82 
x_InitSeq_entry(CRef<CSeq_entry> entry,CRef<CSeq_entry> parent)83 void CGC_Assembly_Parser::x_InitSeq_entry(CRef<CSeq_entry> entry,
84                                           CRef<CSeq_entry> parent)
85 {
86     entry->SetSet().SetLevel(parent ? parent->GetSet().GetLevel() + 1 : 1);
87     entry->SetSet().SetClass(CBioseq_set::eClass_segset);
88     entry->SetSet().SetSeq_set(); // mandatory member, must be initialized
89     if (parent) {
90         parent->SetSet().SetSeq_set().push_back(entry);
91     }
92 }
93 
94 
x_CopyData(const CGC_AssemblyDesc & assm_desc,CSeq_entry & entry)95 void CGC_Assembly_Parser::x_CopyData(const CGC_AssemblyDesc& assm_desc,
96                                      CSeq_entry&             entry)
97 {
98     if (assm_desc.IsSetDescr()  &&  (m_Flags & fIgnoreDescr) == 0) {
99         const CSeq_descr& descr = assm_desc.GetDescr();
100         ITERATE(CSeq_descr::Tdata, desc, descr.Get()) {
101             CRef<CSeqdesc> desc_copy(new CSeqdesc);
102             desc_copy->Assign(**desc);
103             entry.SetDescr().Set().push_back(desc_copy);
104         }
105     }
106     if (assm_desc.IsSetAnnot()  &&  (m_Flags & fIgnoreAnnots) == 0) {
107         ITERATE(CGC_AssemblyDesc::TAnnot, annot, assm_desc.GetAnnot()) {
108             CRef<CSeq_annot> annot_copy(new CSeq_annot);
109             annot_copy->Assign(**annot);
110             entry.SetAnnot().push_back(annot_copy);
111         }
112     }
113 }
114 
115 
x_ParseGCAssembly(const CGC_Assembly & gc_assembly,CRef<CSeq_entry> parent_entry)116 void CGC_Assembly_Parser::x_ParseGCAssembly(const CGC_Assembly& gc_assembly,
117                                             CRef<CSeq_entry>    parent_entry)
118 {
119     if ( gc_assembly.IsUnit() ) {
120         const CGC_AssemblyUnit& unit = gc_assembly.GetUnit();
121         if (unit.IsSetDesc()) {
122             // Add annotations and descriptions.
123             x_CopyData(unit.GetDesc(), *parent_entry);
124         }
125         if ( unit.IsSetMols() ) {
126             ITERATE(CGC_AssemblyUnit::TMols, it, unit.GetMols()) {
127                 CRef<CSeq_entry> entry(new CSeq_entry);
128                 x_InitSeq_entry(entry, parent_entry);
129                 const CGC_Replicon::TSequence& seq = (*it)->GetSequence();
130                 if ( seq.IsSingle() ) {
131                     x_ParseGCSequence(seq.GetSingle(), NULL, entry, null);
132                 }
133                 else {
134                     ITERATE(CGC_Replicon::TSequence::TSet, its, seq.GetSet()) {
135                         x_ParseGCSequence(**its, NULL, entry, null);
136                     }
137                 }
138             }
139         }
140         if ( unit.IsSetOther_sequences() ) {
141             CRef<CSeq_entry> entry(new CSeq_entry);
142             x_InitSeq_entry(entry, parent_entry);
143             ITERATE(CGC_Sequence::TSequences, seq, unit.GetOther_sequences()) {
144                 ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
145                     x_ParseGCSequence(**tseq, NULL, entry, null);
146                 }
147             }
148         }
149     }
150     else if ( gc_assembly.IsAssembly_set() ) {
151         const CGC_AssemblySet& aset = gc_assembly.GetAssembly_set();
152         if (aset.IsSetDesc()) {
153             // Add annotations and descriptions.
154             x_CopyData(aset.GetDesc(), *parent_entry);
155         }
156         CRef<CSeq_entry> entry(new CSeq_entry);
157         x_InitSeq_entry(entry, parent_entry);
158         x_ParseGCAssembly(aset.GetPrimary_assembly(), entry);
159         if ( aset.IsSetMore_assemblies() ) {
160             ITERATE(CGC_AssemblySet::TMore_assemblies, assm,
161                 aset.GetMore_assemblies()) {
162                 x_ParseGCAssembly(**assm, entry);
163             }
164         }
165     }
166 }
167 
168 
x_ParseGCSequence(const CGC_Sequence & gc_seq,const CGC_Sequence * parent_seq,CRef<CSeq_entry> parent_entry,CRef<CSeq_id> override_id)169 void CGC_Assembly_Parser::x_ParseGCSequence(const CGC_Sequence& gc_seq,
170                                             const CGC_Sequence* parent_seq,
171                                             CRef<CSeq_entry>    parent_entry,
172                                             CRef<CSeq_id>       override_id)
173 {
174     CRef<CSeq_id> id(override_id);
175     if ( !id ) {
176         id.Reset(new CSeq_id);
177         id->Assign(gc_seq.GetSeq_id());
178     }
179 
180     // Special case - structure contains just one (whole) sequence and
181     // the same sequence is mentioned in the synonyms. Must skip this
182     // sequence and use the part instead.
183     CSeq_id_Handle struct_syn;
184     if ( gc_seq.IsSetStructure() ) {
185         if (gc_seq.GetStructure().Get().size() == 1) {
186             const CDelta_seq& delta = *gc_seq.GetStructure().Get().front();
187             if ( delta.IsLoc() ) {
188                 const CSeq_loc& delta_loc = delta.GetLoc();
189                 switch (delta_loc.Which()) {
190                 case CSeq_loc::e_Whole:
191                     struct_syn = CSeq_id_Handle::GetHandle(delta_loc.GetWhole());
192                     break;
193                 case CSeq_loc::e_Int:
194                     if (delta_loc.GetInt().GetFrom() == 0) {
195                         struct_syn = CSeq_id_Handle::GetHandle(delta_loc.GetInt().GetId());
196                     }
197                     break;
198                 default:
199                     break;
200                 }
201             }
202         }
203     }
204     // Same as above, but structure is missing and sequences contain just one item.
205     else if (gc_seq.IsSetSequences()  &&  gc_seq.GetSequences().size() == 1) {
206         const CGC_TaggedSequences& tagged_seq = *gc_seq.GetSequences().front();
207         if (tagged_seq.GetSeqs().size() == 1) {
208             struct_syn = CSeq_id_Handle::GetHandle(tagged_seq.GetSeqs().front()->GetSeq_id());
209         }
210     }
211 
212     // Add synonyms if any.
213     TSeqIds synonyms;
214     synonyms.insert(CSeq_id_Handle::GetHandle(*id));
215     if ( gc_seq.IsSetSeq_id_synonyms() ) {
216         ITERATE(CGC_Sequence::TSeq_id_synonyms, it, gc_seq.GetSeq_id_synonyms()) {
217             // Add conversion for each synonym which can be used
218             // as a source id.
219             const CGC_TypedSeqId& it_id = **it;
220             switch ( it_id.Which() ) {
221             case CGC_TypedSeqId::e_Genbank:
222                 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetGenbank().GetPublic()));
223                 if ( it_id.GetGenbank().IsSetGi() ) {
224                     synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetGenbank().GetGi()));
225                 }
226                 if ( it_id.GetGenbank().IsSetGpipe() ) {
227                     synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetGenbank().GetGpipe()));
228                 }
229                 break;
230             case CGC_TypedSeqId::e_Refseq:
231             {
232                 // If some of the ids is used in the structure (see above),
233                 // ignore all refseq ids.
234                 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetRefseq().GetPublic()));
235                 if ( it_id.GetRefseq().IsSetGi() ) {
236                     synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetRefseq().GetGi()));
237                 }
238                 if (it_id.GetRefseq().IsSetGpipe()) {
239                     synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetRefseq().GetGpipe()));
240                 }
241                 break;
242             }
243             case CGC_TypedSeqId::e_Private:
244                 // Ignore private local ids.
245                 if ((m_Flags & fIgnoreLocalIds) == 0  ||
246                     !it_id.GetPrivate().IsLocal()) {
247                     synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetPrivate()));
248                 }
249                 break;
250             case CGC_TypedSeqId::e_External:
251                 if ((m_Flags & fIgnoreExternalIds) == 0 &&
252                     ((m_Flags & fIgnoreLocalIds) == 0 ||
253                     !it_id.GetExternal().GetId().IsLocal())) {
254                     synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetExternal().GetId()));
255                 }
256                 break;
257             default:
258                 NCBI_THROW(CAssemblyParserException, eUnsupported,
259                            "Unsupported alias type in GC-Sequence synonyms");
260                 break;
261             }
262         }
263         // The sequence is referencing itself?
264         if (synonyms.find(struct_syn) != synonyms.end()) {
265             x_ParseGCSequence(
266                 *gc_seq.GetSequences().front()->GetSeqs().front(),
267                 parent_seq,
268                 parent_entry,
269                 id);
270             return;
271         }
272     }
273 
274     CRef<CSeq_entry> entry;
275     if ( gc_seq.IsSetSequences() ) {
276         entry.Reset(new CSeq_entry);
277         x_InitSeq_entry(entry, parent_entry);
278     }
279     else {
280         entry = parent_entry;
281     }
282 
283     if (gc_seq.IsSetDescr()  &&  (m_Flags & fIgnoreDescr) == 0) {
284         const CSeq_descr& descr = gc_seq.GetDescr();
285         ITERATE(CSeq_descr::Tdata, desc, descr.Get()) {
286             CRef<CSeqdesc> desc_copy(new CSeqdesc);
287             desc_copy->Assign(**desc);
288             entry->SetDescr().Set().push_back(desc_copy);
289         }
290     }
291     if (gc_seq.IsSetAnnot()  &&  (m_Flags & fIgnoreAnnots) == 0) {
292         ITERATE(CGC_Sequence::TAnnot, annot, gc_seq.GetAnnot()) {
293             CRef<CSeq_annot> annot_copy(new CSeq_annot);
294             annot_copy->Assign(**annot);
295             entry->SetAnnot().push_back(annot_copy);
296         }
297     }
298 
299     // Create virtual bioseq and use it to initialize the mapper
300     x_AddBioseq(entry, synonyms, gc_seq);
301     if ( !parent_seq ) {
302         // Save top-level sequences.
303         m_TopSeqs.insert(CSeq_id_Handle::GetHandle(*id));
304     }
305 
306     if ( gc_seq.IsSetSequences() ) {
307         CRef<CSeq_entry> sub_entry(new CSeq_entry);
308         x_InitSeq_entry(sub_entry, entry);
309         ITERATE(CGC_Sequence::TSequences, seq, gc_seq.GetSequences()) {
310             ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
311                 // To create a sub-level of the existing seq-map we need
312                 // both structure at the current level and 'placed' state
313                 // on the child sequences. If this is not true, iterate
314                 // sub-sequences but treat them as top-level sequences rather
315                 // than segments.
316                 const CGC_Sequence* parent = 0;
317                 if (gc_seq.IsSetStructure()  &&
318                     (*seq)->GetState() == CGC_TaggedSequences::eState_placed) {
319                     parent = &gc_seq;
320                 }
321                 x_ParseGCSequence(**tseq, parent, sub_entry, null);
322             }
323         }
324     }
325 }
326 
327 
x_AddBioseq(CRef<CSeq_entry> parent_entry,const TSeqIds & synonyms,const CGC_Sequence & gc_seq)328 void CGC_Assembly_Parser::x_AddBioseq(CRef<CSeq_entry>    parent_entry,
329                                       const TSeqIds&      synonyms,
330                                       const CGC_Sequence& gc_seq)
331 {
332     CRef<CBioseq> bioseq(new CBioseq);
333     ITERATE(TSeqIds, syn, synonyms) {
334         // Do not add bioseqs with duplicate ids.
335         if ((m_Flags & fSkipDuplicates) != 0  &&
336             !m_AllSeqs.insert(*syn).second ) {
337             return;
338         }
339 
340         CRef<CSeq_id> syn_id(new CSeq_id);
341         syn_id->Assign(*syn->GetSeqId());
342         bioseq->SetId().push_back(syn_id);
343     }
344 
345     bioseq->SetInst().SetMol(CSeq_inst::eMol_na);
346     if ( gc_seq.CanGetLength() ) {
347         bioseq->SetInst().SetLength(gc_seq.GetLength());
348     }
349     if ( gc_seq.IsSetStructure() ) {
350         // Create delta sequence
351         bioseq->SetInst().SetRepr(CSeq_inst::eRepr_delta);
352         // const_cast should be safe here - we are not going to modify data
353         bioseq->SetInst().SetExt().SetDelta(
354             const_cast<CDelta_ext&>(gc_seq.GetStructure()));
355     }
356     else {
357         // Create virtual bioseq without length/data.
358         bioseq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
359     }
360     CRef<CSeq_entry> entry(new CSeq_entry);
361     entry->SetSeq(*bioseq);
362     parent_entry->SetSet().SetSeq_set().push_back(entry);
363 }
364 
365 
366 END_SCOPE(objects)
367 END_NCBI_SCOPE
368