1 /* $Id: gc_assembly_parser.cpp 513580 2016-09-13 11:58:16Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:
27 * Aleksey Grichenko
28 *
29 * File Description:
30 * GC-Assembly parser used by CScope and CSeq_loc_Mapper to
31 * convert assemblies to seq-entries.
32 *
33 */
34
35 #include <ncbi_pch.hpp>
36
37 #include <objmgr/gc_assembly_parser.hpp>
38 #include <objmgr/error_codes.hpp>
39 #include <objects/genomecoll/genome_collection__.hpp>
40 #include <objects/seq/seq__.hpp>
41 #include <objects/seqloc/seqloc__.hpp>
42 #include <objects/seqset/seqset__.hpp>
43
44
45 #define NCBI_USE_ERRCODE_X ObjMgr_GC_Assembly_Parser
46
47 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects) const48 BEGIN_SCOPE(objects)
49
50
51 const char* CAssemblyParserException::GetErrCodeString(void) const
52 {
53 switch ( GetErrCode() ) {
54 case eUnsupported: return "eUnsupported";
55 case eOtherError: return "eOtherError";
56 default: return CException::GetErrCodeString();
57 }
58 }
59
60
61 /////////////////////////////////////////////////////////////////////////////
62 //
63 // CGC_Assembly_Parser
64 //
65 /////////////////////////////////////////////////////////////////////////////
66
67
CGC_Assembly_Parser(const CGC_Assembly & assembly,TParserFlags flags)68 CGC_Assembly_Parser::CGC_Assembly_Parser(const CGC_Assembly& assembly,
69 TParserFlags flags)
70 : m_Flags(flags)
71 {
72 m_TSE.Reset(new CSeq_entry);
73 x_InitSeq_entry(m_TSE, null);
74 x_ParseGCAssembly(assembly, m_TSE);
75 }
76
77
~CGC_Assembly_Parser(void)78 CGC_Assembly_Parser::~CGC_Assembly_Parser(void)
79 {
80 }
81
82
x_InitSeq_entry(CRef<CSeq_entry> entry,CRef<CSeq_entry> parent)83 void CGC_Assembly_Parser::x_InitSeq_entry(CRef<CSeq_entry> entry,
84 CRef<CSeq_entry> parent)
85 {
86 entry->SetSet().SetLevel(parent ? parent->GetSet().GetLevel() + 1 : 1);
87 entry->SetSet().SetClass(CBioseq_set::eClass_segset);
88 entry->SetSet().SetSeq_set(); // mandatory member, must be initialized
89 if (parent) {
90 parent->SetSet().SetSeq_set().push_back(entry);
91 }
92 }
93
94
x_CopyData(const CGC_AssemblyDesc & assm_desc,CSeq_entry & entry)95 void CGC_Assembly_Parser::x_CopyData(const CGC_AssemblyDesc& assm_desc,
96 CSeq_entry& entry)
97 {
98 if (assm_desc.IsSetDescr() && (m_Flags & fIgnoreDescr) == 0) {
99 const CSeq_descr& descr = assm_desc.GetDescr();
100 ITERATE(CSeq_descr::Tdata, desc, descr.Get()) {
101 CRef<CSeqdesc> desc_copy(new CSeqdesc);
102 desc_copy->Assign(**desc);
103 entry.SetDescr().Set().push_back(desc_copy);
104 }
105 }
106 if (assm_desc.IsSetAnnot() && (m_Flags & fIgnoreAnnots) == 0) {
107 ITERATE(CGC_AssemblyDesc::TAnnot, annot, assm_desc.GetAnnot()) {
108 CRef<CSeq_annot> annot_copy(new CSeq_annot);
109 annot_copy->Assign(**annot);
110 entry.SetAnnot().push_back(annot_copy);
111 }
112 }
113 }
114
115
x_ParseGCAssembly(const CGC_Assembly & gc_assembly,CRef<CSeq_entry> parent_entry)116 void CGC_Assembly_Parser::x_ParseGCAssembly(const CGC_Assembly& gc_assembly,
117 CRef<CSeq_entry> parent_entry)
118 {
119 if ( gc_assembly.IsUnit() ) {
120 const CGC_AssemblyUnit& unit = gc_assembly.GetUnit();
121 if (unit.IsSetDesc()) {
122 // Add annotations and descriptions.
123 x_CopyData(unit.GetDesc(), *parent_entry);
124 }
125 if ( unit.IsSetMols() ) {
126 ITERATE(CGC_AssemblyUnit::TMols, it, unit.GetMols()) {
127 CRef<CSeq_entry> entry(new CSeq_entry);
128 x_InitSeq_entry(entry, parent_entry);
129 const CGC_Replicon::TSequence& seq = (*it)->GetSequence();
130 if ( seq.IsSingle() ) {
131 x_ParseGCSequence(seq.GetSingle(), NULL, entry, null);
132 }
133 else {
134 ITERATE(CGC_Replicon::TSequence::TSet, its, seq.GetSet()) {
135 x_ParseGCSequence(**its, NULL, entry, null);
136 }
137 }
138 }
139 }
140 if ( unit.IsSetOther_sequences() ) {
141 CRef<CSeq_entry> entry(new CSeq_entry);
142 x_InitSeq_entry(entry, parent_entry);
143 ITERATE(CGC_Sequence::TSequences, seq, unit.GetOther_sequences()) {
144 ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
145 x_ParseGCSequence(**tseq, NULL, entry, null);
146 }
147 }
148 }
149 }
150 else if ( gc_assembly.IsAssembly_set() ) {
151 const CGC_AssemblySet& aset = gc_assembly.GetAssembly_set();
152 if (aset.IsSetDesc()) {
153 // Add annotations and descriptions.
154 x_CopyData(aset.GetDesc(), *parent_entry);
155 }
156 CRef<CSeq_entry> entry(new CSeq_entry);
157 x_InitSeq_entry(entry, parent_entry);
158 x_ParseGCAssembly(aset.GetPrimary_assembly(), entry);
159 if ( aset.IsSetMore_assemblies() ) {
160 ITERATE(CGC_AssemblySet::TMore_assemblies, assm,
161 aset.GetMore_assemblies()) {
162 x_ParseGCAssembly(**assm, entry);
163 }
164 }
165 }
166 }
167
168
x_ParseGCSequence(const CGC_Sequence & gc_seq,const CGC_Sequence * parent_seq,CRef<CSeq_entry> parent_entry,CRef<CSeq_id> override_id)169 void CGC_Assembly_Parser::x_ParseGCSequence(const CGC_Sequence& gc_seq,
170 const CGC_Sequence* parent_seq,
171 CRef<CSeq_entry> parent_entry,
172 CRef<CSeq_id> override_id)
173 {
174 CRef<CSeq_id> id(override_id);
175 if ( !id ) {
176 id.Reset(new CSeq_id);
177 id->Assign(gc_seq.GetSeq_id());
178 }
179
180 // Special case - structure contains just one (whole) sequence and
181 // the same sequence is mentioned in the synonyms. Must skip this
182 // sequence and use the part instead.
183 CSeq_id_Handle struct_syn;
184 if ( gc_seq.IsSetStructure() ) {
185 if (gc_seq.GetStructure().Get().size() == 1) {
186 const CDelta_seq& delta = *gc_seq.GetStructure().Get().front();
187 if ( delta.IsLoc() ) {
188 const CSeq_loc& delta_loc = delta.GetLoc();
189 switch (delta_loc.Which()) {
190 case CSeq_loc::e_Whole:
191 struct_syn = CSeq_id_Handle::GetHandle(delta_loc.GetWhole());
192 break;
193 case CSeq_loc::e_Int:
194 if (delta_loc.GetInt().GetFrom() == 0) {
195 struct_syn = CSeq_id_Handle::GetHandle(delta_loc.GetInt().GetId());
196 }
197 break;
198 default:
199 break;
200 }
201 }
202 }
203 }
204 // Same as above, but structure is missing and sequences contain just one item.
205 else if (gc_seq.IsSetSequences() && gc_seq.GetSequences().size() == 1) {
206 const CGC_TaggedSequences& tagged_seq = *gc_seq.GetSequences().front();
207 if (tagged_seq.GetSeqs().size() == 1) {
208 struct_syn = CSeq_id_Handle::GetHandle(tagged_seq.GetSeqs().front()->GetSeq_id());
209 }
210 }
211
212 // Add synonyms if any.
213 TSeqIds synonyms;
214 synonyms.insert(CSeq_id_Handle::GetHandle(*id));
215 if ( gc_seq.IsSetSeq_id_synonyms() ) {
216 ITERATE(CGC_Sequence::TSeq_id_synonyms, it, gc_seq.GetSeq_id_synonyms()) {
217 // Add conversion for each synonym which can be used
218 // as a source id.
219 const CGC_TypedSeqId& it_id = **it;
220 switch ( it_id.Which() ) {
221 case CGC_TypedSeqId::e_Genbank:
222 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetGenbank().GetPublic()));
223 if ( it_id.GetGenbank().IsSetGi() ) {
224 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetGenbank().GetGi()));
225 }
226 if ( it_id.GetGenbank().IsSetGpipe() ) {
227 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetGenbank().GetGpipe()));
228 }
229 break;
230 case CGC_TypedSeqId::e_Refseq:
231 {
232 // If some of the ids is used in the structure (see above),
233 // ignore all refseq ids.
234 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetRefseq().GetPublic()));
235 if ( it_id.GetRefseq().IsSetGi() ) {
236 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetRefseq().GetGi()));
237 }
238 if (it_id.GetRefseq().IsSetGpipe()) {
239 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetRefseq().GetGpipe()));
240 }
241 break;
242 }
243 case CGC_TypedSeqId::e_Private:
244 // Ignore private local ids.
245 if ((m_Flags & fIgnoreLocalIds) == 0 ||
246 !it_id.GetPrivate().IsLocal()) {
247 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetPrivate()));
248 }
249 break;
250 case CGC_TypedSeqId::e_External:
251 if ((m_Flags & fIgnoreExternalIds) == 0 &&
252 ((m_Flags & fIgnoreLocalIds) == 0 ||
253 !it_id.GetExternal().GetId().IsLocal())) {
254 synonyms.insert(CSeq_id_Handle::GetHandle(it_id.GetExternal().GetId()));
255 }
256 break;
257 default:
258 NCBI_THROW(CAssemblyParserException, eUnsupported,
259 "Unsupported alias type in GC-Sequence synonyms");
260 break;
261 }
262 }
263 // The sequence is referencing itself?
264 if (synonyms.find(struct_syn) != synonyms.end()) {
265 x_ParseGCSequence(
266 *gc_seq.GetSequences().front()->GetSeqs().front(),
267 parent_seq,
268 parent_entry,
269 id);
270 return;
271 }
272 }
273
274 CRef<CSeq_entry> entry;
275 if ( gc_seq.IsSetSequences() ) {
276 entry.Reset(new CSeq_entry);
277 x_InitSeq_entry(entry, parent_entry);
278 }
279 else {
280 entry = parent_entry;
281 }
282
283 if (gc_seq.IsSetDescr() && (m_Flags & fIgnoreDescr) == 0) {
284 const CSeq_descr& descr = gc_seq.GetDescr();
285 ITERATE(CSeq_descr::Tdata, desc, descr.Get()) {
286 CRef<CSeqdesc> desc_copy(new CSeqdesc);
287 desc_copy->Assign(**desc);
288 entry->SetDescr().Set().push_back(desc_copy);
289 }
290 }
291 if (gc_seq.IsSetAnnot() && (m_Flags & fIgnoreAnnots) == 0) {
292 ITERATE(CGC_Sequence::TAnnot, annot, gc_seq.GetAnnot()) {
293 CRef<CSeq_annot> annot_copy(new CSeq_annot);
294 annot_copy->Assign(**annot);
295 entry->SetAnnot().push_back(annot_copy);
296 }
297 }
298
299 // Create virtual bioseq and use it to initialize the mapper
300 x_AddBioseq(entry, synonyms, gc_seq);
301 if ( !parent_seq ) {
302 // Save top-level sequences.
303 m_TopSeqs.insert(CSeq_id_Handle::GetHandle(*id));
304 }
305
306 if ( gc_seq.IsSetSequences() ) {
307 CRef<CSeq_entry> sub_entry(new CSeq_entry);
308 x_InitSeq_entry(sub_entry, entry);
309 ITERATE(CGC_Sequence::TSequences, seq, gc_seq.GetSequences()) {
310 ITERATE(CGC_TaggedSequences::TSeqs, tseq, (*seq)->GetSeqs()) {
311 // To create a sub-level of the existing seq-map we need
312 // both structure at the current level and 'placed' state
313 // on the child sequences. If this is not true, iterate
314 // sub-sequences but treat them as top-level sequences rather
315 // than segments.
316 const CGC_Sequence* parent = 0;
317 if (gc_seq.IsSetStructure() &&
318 (*seq)->GetState() == CGC_TaggedSequences::eState_placed) {
319 parent = &gc_seq;
320 }
321 x_ParseGCSequence(**tseq, parent, sub_entry, null);
322 }
323 }
324 }
325 }
326
327
x_AddBioseq(CRef<CSeq_entry> parent_entry,const TSeqIds & synonyms,const CGC_Sequence & gc_seq)328 void CGC_Assembly_Parser::x_AddBioseq(CRef<CSeq_entry> parent_entry,
329 const TSeqIds& synonyms,
330 const CGC_Sequence& gc_seq)
331 {
332 CRef<CBioseq> bioseq(new CBioseq);
333 ITERATE(TSeqIds, syn, synonyms) {
334 // Do not add bioseqs with duplicate ids.
335 if ((m_Flags & fSkipDuplicates) != 0 &&
336 !m_AllSeqs.insert(*syn).second ) {
337 return;
338 }
339
340 CRef<CSeq_id> syn_id(new CSeq_id);
341 syn_id->Assign(*syn->GetSeqId());
342 bioseq->SetId().push_back(syn_id);
343 }
344
345 bioseq->SetInst().SetMol(CSeq_inst::eMol_na);
346 if ( gc_seq.CanGetLength() ) {
347 bioseq->SetInst().SetLength(gc_seq.GetLength());
348 }
349 if ( gc_seq.IsSetStructure() ) {
350 // Create delta sequence
351 bioseq->SetInst().SetRepr(CSeq_inst::eRepr_delta);
352 // const_cast should be safe here - we are not going to modify data
353 bioseq->SetInst().SetExt().SetDelta(
354 const_cast<CDelta_ext&>(gc_seq.GetStructure()));
355 }
356 else {
357 // Create virtual bioseq without length/data.
358 bioseq->SetInst().SetRepr(CSeq_inst::eRepr_virtual);
359 }
360 CRef<CSeq_entry> entry(new CSeq_entry);
361 entry->SetSeq(*bioseq);
362 parent_entry->SetSet().SetSeq_set().push_back(entry);
363 }
364
365
366 END_SCOPE(objects)
367 END_NCBI_SCOPE
368