1 /* $Id: asn1.cpp 556806 2018-02-05 17:38:50Z souvorov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Vyacheslav Chetvernin
27 *
28 * File Description:
29 * conversion to/from ASN1
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <algo/gnomon/asn1.hpp>
35 #include <algo/sequence/gene_model.hpp>
36 #include <objects/general/general__.hpp>
37 #include <objects/seqset/Bioseq_set.hpp>
38 #include <objects/seqfeat/seqfeat__.hpp>
39 #include <objects/seqloc/seqloc__.hpp>
40 #include <objects/seq/seq__.hpp>
41 #include <objects/seqalign/seqalign__.hpp>
42 #include "gnomon_seq.hpp"
43 #include <algo/gnomon/id_handler.hpp>
44
45 #include <objects/seq/seqport_util.hpp>
46 #include <objmgr/object_manager.hpp>
47 #include <objmgr/util/sequence.hpp>
48 #include <objmgr/seq_vector.hpp>
49 #include <objmgr/seq_annot_ci.hpp>
50 #include <objmgr/feat_ci.hpp>
51 #include <objmgr/align_ci.hpp>
52
53 BEGIN_NCBI_SCOPE
54 BEGIN_SCOPE(gnomon)
55 USING_SCOPE(ncbi::objects);
56
57
58
59 // defined in gnomon_model.cpp
60 typedef map<string,string> TAttributes;
61 void CollectAttributes(const CAlignModel& a, TAttributes& attributes);
62 void ParseAttributes(TAttributes& attributes, CAlignModel& a);
63
64 // defined in gnomon_objmgr.cpp
65 Int8 GetModelId(const CSeq_align& seq_align);
66
67
68 const string kGnomonConstructed = "Is [re]constructed alignment";
69
70
71 struct SModelData {
72 SModelData(const CAlignModel& model, const CEResidueVec& contig_seq, int shift);
73
74 CAlignModel model;
75 CEResidueVec mrna_seq;
76 CRef<CSeq_id> mrna_sid;
77 CRef<CSeq_id> prot_sid;
78
79 bool is_ncrna;
80 };
81
SModelData(const CAlignModel & m,const CEResidueVec & contig_seq,int shift)82 SModelData::SModelData(const CAlignModel& m, const CEResidueVec& contig_seq, int shift) : model(m)
83 {
84 CAlignMap amap = model.GetAlignMap();
85 CCDSInfo cds = model.GetCdsInfo();
86 if(cds.IsMappedToGenome()) {
87 cds = cds.MapFromOrigToEdited(amap); // all coords on mRNA
88 model.SetCdsInfo(cds);
89 }
90
91 amap.MoveOrigin(shift);
92 amap.EditedSequence(contig_seq, mrna_seq, true);
93
94 prot_sid.Reset(new CSeq_id);
95 prot_sid->Assign(*CIdHandler::GnomonProtein(model.ID()));
96 mrna_sid.Reset(new CSeq_id);
97 mrna_sid->Assign(*CIdHandler::GnomonMRNA(model.ID()));
98
99 is_ncrna = m.ReadingFrame().Empty();
100 }
101
102
103 class CAnnotationASN1::CImplementationData {
104 public:
105 CImplementationData(const string& contig_name, const CResidueVec& seq, IEvidence& evdnc, int genetic_code, int sh);
106
107 void ResetASN1();
108 void AddModel(const CAlignModel& model);
109 CRef<CSeq_entry> main_seq_entry;
110
111 private:
112 void CreateModelProducts(SModelData& model);
113 CRef<CSeq_feat> create_internal_feature(const SModelData& md);
114 CRef<CSeq_feat> create_cdregion_feature(SModelData& md);
115 CRef<CSeq_align> model2spliced_seq_align(SModelData& md);
116 // CRef<CSpliced_exon> spliced_exon (const CModelExon& e, EStrand strand) const;
117 CRef<CSeq_loc> create_packed_int_seqloc(const CGeneModel& model, TSignedSeqRange limits_on_mrna = TSignedSeqRange::GetWhole());
118
119 CRef< CUser_object > create_ModelEvidence_user_object(const CGeneModel& model);
120 void AddInternalFeature(const SModelData& md);
121 void DumpEvidence(const SModelData& md);
122
123 string contig_name;
124 CRef<CSeq_id> contig_sid;
125 CEResidueVec contig_seq;
126
127 int gencode;
128 int shift;
129
130 CBioseq_set::TSeq_set* nucprots;
131 CSeq_annot* gnomon_models_annot;
132 CSeq_annot::C_Data::TFtable* feature_table;
133 CSeq_annot::C_Data::TFtable* internal_feature_table;
134 set<Int8> models_in_internal_feature_table;
135
136 typedef map<Int8,CRef<CSeq_feat> > TGeneMap;
137 TGeneMap genes;
138 IEvidence& evidence;
139
140 auto_ptr<CFeatureGenerator> feature_generator;
141 CRef<CScope> scope;
142
143 friend class CAnnotationASN1;
144 };
145
NameAnnot(CSeq_annot & annot,const string & name)146 void NameAnnot(CSeq_annot& annot, const string& name)
147 {
148 annot.SetNameDesc(name);
149 annot.SetTitleDesc(name);
150 }
151
ResetASN1()152 void CAnnotationASN1::CImplementationData::ResetASN1() {
153 main_seq_entry = new CSeq_entry;
154
155 CBioseq_set& bioseq_set = main_seq_entry->SetSet();
156
157 nucprots = &bioseq_set.SetSeq_set();
158
159 gnomon_models_annot = new CSeq_annot;
160 NameAnnot(*gnomon_models_annot, "Gnomon models");
161 CRef<CAnnotdesc> desc(new CAnnotdesc());
162 CRef<CSeq_loc> genomic_seqloc(new CSeq_loc());
163 genomic_seqloc->SetWhole(*contig_sid);
164 desc->SetRegion(*genomic_seqloc);
165 gnomon_models_annot->SetDesc().Set().push_back(desc);
166
167 CRef<CSeq_annot> gnomon_models_annot_ref(gnomon_models_annot);
168 bioseq_set.SetAnnot().push_back(gnomon_models_annot_ref);
169 feature_table = &gnomon_models_annot->SetData().SetFtable();
170
171 CRef<CSeq_annot> seq_annot(new CSeq_annot);
172 NameAnnot(*seq_annot, "Gnomon internal attributes");
173 bioseq_set.SetAnnot().push_back(seq_annot);
174 internal_feature_table = &seq_annot->SetData().SetFtable();
175 }
176
CImplementationData(const string & a_contig_name,const CResidueVec & seq,IEvidence & evdnc,int genetic_code,int sh)177 CAnnotationASN1::CImplementationData::CImplementationData(const string& a_contig_name, const CResidueVec& seq, IEvidence& evdnc, int genetic_code, int sh) :
178 contig_name(a_contig_name),
179 contig_sid(CIdHandler::ToSeq_id(a_contig_name)),
180 gencode(genetic_code),
181 shift(sh),
182 evidence(evdnc)
183 {
184 Convert(seq, contig_seq);
185 ResetASN1();
186
187 CRef<CObjectManager> obj_mgr = CObjectManager::GetInstance();
188 scope.Reset(new CScope(*obj_mgr));
189 scope->AddDefaults();
190 feature_generator.reset(new CFeatureGenerator(*scope));
191 feature_generator->SetFlags(CFeatureGenerator::fCreateGene | CFeatureGenerator::fCreateMrna | CFeatureGenerator::fCreateCdregion | CFeatureGenerator::fForceTranslateCds | CFeatureGenerator::fForceTranscribeMrna | CFeatureGenerator::fDeNovoProducts);
192 feature_generator->SetMinIntron(numeric_limits<TSeqPos>::max());
193 feature_generator->SetAllowedUnaligned(0);
194 }
195
ResetASN1()196 void CAnnotationASN1::ResetASN1() {
197 m_data->ResetASN1();
198 }
199
CAnnotationASN1(const string & contig_name,const CResidueVec & seq,IEvidence & evdnc,int genetic_code,int shift)200 CAnnotationASN1::CAnnotationASN1(const string& contig_name, const CResidueVec& seq, IEvidence& evdnc,
201 int genetic_code, int shift) :
202 m_data(new CImplementationData(contig_name, seq, evdnc, genetic_code, shift))
203 {
204 }
205
~CAnnotationASN1()206 CAnnotationASN1::~CAnnotationASN1()
207 {
208 }
209
GetASN1() const210 CRef<CSeq_entry> CAnnotationASN1::GetASN1() const
211 {
212 return m_data->main_seq_entry;
213 }
214
215
AddModel(const CAlignModel & model)216 void CAnnotationASN1::AddModel(const CAlignModel& model)
217 {
218 m_data->AddModel(model);
219 }
220
CreateModelProducts(SModelData & md)221 void CAnnotationASN1::CImplementationData::CreateModelProducts(SModelData& md)
222 {
223 CRef< CSeq_align > align = model2spliced_seq_align(md);
224 CRef<CSeq_feat> cds_feat;
225
226 CSeq_feat* cds_feat_ptr = NULL;
227 if (!md.is_ncrna) {
228 cds_feat = create_cdregion_feature(md);
229 cds_feat_ptr = cds_feat.GetPointer();
230 }
231
232 CRef<CSeq_entry> model_products(new CSeq_entry);
233 nucprots->push_back(model_products);
234 CRef<CSeq_annot> annot(new CSeq_annot);
235 feature_generator->ConvertAlignToAnnot(*align, *annot, model_products->SetSet(), 0, cds_feat_ptr);
236 }
237
AddInternalFeature(const SModelData & md)238 void CAnnotationASN1::CImplementationData::AddInternalFeature(const SModelData& md)
239 {
240 Int8 id = md.model.ID();
241 if (models_in_internal_feature_table.find(id) == models_in_internal_feature_table.end()) {
242 CRef<CSeq_feat> internal_feat = create_internal_feature(md);
243 internal_feature_table->push_back(internal_feat);
244 models_in_internal_feature_table.insert(id);
245 }
246 }
247
AddModel(const CAlignModel & model)248 void CAnnotationASN1::CImplementationData::AddModel(const CAlignModel& model)
249 {
250 SModelData md(model, contig_seq, shift);
251
252 CRef< CSeq_align > align = model2spliced_seq_align(md);
253 CRef<CSeq_feat> cds_feat;
254 CSeq_feat* cds_feat_ptr = NULL;
255 if (!md.is_ncrna) {
256 cds_feat = create_cdregion_feature(md);
257 cds_feat_ptr = cds_feat.GetPointer();
258 }
259
260 try {
261 CRef<CSeq_entry> model_products(new CSeq_entry);
262 nucprots->push_back(model_products);
263
264 CRef<CSeq_feat> mrna_feat = feature_generator->ConvertAlignToAnnot(*align, *gnomon_models_annot, model_products->SetSet(), model.GeneID(), cds_feat_ptr);
265
266 DumpEvidence(md);
267
268 CRef< CUser_object > user_obj = create_ModelEvidence_user_object(model);
269 mrna_feat->SetExts().push_back(user_obj);
270
271 AddInternalFeature(md);
272 }
273 catch (CException&) {
274 cerr << MSerial_AsnText << *align;
275 throw;
276 }
277 }
278
create_cdregion_feature(SModelData & md)279 CRef<CSeq_feat> CAnnotationASN1::CImplementationData::create_cdregion_feature(SModelData& md)
280 {
281 const CGeneModel& model = md.model;
282 const CCDSInfo& cds = model.GetCdsInfo();
283
284 CRef<CSeq_feat> cdregion_feature(new CSeq_feat);
285
286 CRef<CGenetic_code::C_E> cdrcode(new CGenetic_code::C_E);
287 cdrcode->SetId(gencode);
288 cdregion_feature->SetData().SetCdregion().SetCode().Set().push_back(cdrcode);
289
290 if(model.PStop()) {
291 CCdregion::TCode_break& code_breaks = cdregion_feature->SetData().SetCdregion().SetCode_break();
292 ITERATE(CCDSInfo::TPStops,s,cds.PStops()) {
293 if(!Include(cds.Cds(),*s))
294 continue;
295
296 CRef< CCode_break > code_break(new CCode_break);
297 CRef<CSeq_loc> pstop;
298
299 TSeqPos from = s->GetFrom();
300 TSeqPos to = s->GetTo();
301 _ASSERT(from+2==to);
302
303 pstop.Reset(new CSeq_loc(*md.mrna_sid, from, to, eNa_strand_plus));
304
305 code_break->SetLoc(*pstop);
306 CRef<CCode_break::C_Aa> aa(new CCode_break::C_Aa);
307 if(s->m_status == CCDSInfo::eSelenocysteine)
308 aa->SetNcbieaa('U');
309 else
310 aa->SetNcbieaa('X');
311 code_break->SetAa(*aa);
312 code_breaks.push_back(code_break);
313 }
314 }
315
316 cdregion_feature->SetProduct().SetWhole(*md.prot_sid);
317
318 TSignedSeqRange tlim = model.TranscriptLimits();
319 int altstart = (cds.MaxCdsLimits()&tlim).GetFrom();
320 int start = cds.Cds().GetFrom();
321 int stop = (cds.MaxCdsLimits()&tlim).GetTo();
322
323 int frame = 0;
324 if(!model.HasStart()) {
325 _ASSERT(altstart == 0);
326 frame = start%3;
327 start = 0;
328 }
329 CCdregion::EFrame ncbi_frame = CCdregion::eFrame_one;
330 if(frame == 1) ncbi_frame = CCdregion::eFrame_two;
331 else if(frame == 2) ncbi_frame = CCdregion::eFrame_three;
332 cdregion_feature->SetData().SetCdregion().SetFrame(ncbi_frame);
333
334 CRef<CSeq_loc> cdregion_location;
335
336 cdregion_location.Reset(new CSeq_loc(*md.mrna_sid, start, stop, eNa_strand_plus));
337
338 if (0 < altstart && altstart != start)
339 cdregion_location->SetInt().SetFuzz_from().SetAlt().push_back(altstart);
340
341 if (!model.HasStart())
342 cdregion_location->SetPartialStart(true,eExtreme_Biological);
343 if (!model.HasStop())
344 cdregion_location->SetPartialStop(true,eExtreme_Biological);
345 cdregion_feature->SetLocation(*cdregion_location);
346
347 if (!model.HasStart() || !model.HasStop() || !model.Continuous())
348 cdregion_feature->SetPartial(true);
349
350 return cdregion_feature;
351 }
352
create_packed_int_seqloc(const CGeneModel & model,TSignedSeqRange limits_on_mrna)353 CRef<CSeq_loc> CAnnotationASN1::CImplementationData::create_packed_int_seqloc(const CGeneModel& model, TSignedSeqRange limits_on_mrna)
354 {
355 CRef<CSeq_loc> seq_loc(new CSeq_loc);
356 CPacked_seqint& packed_int = seq_loc->SetPacked_int();
357 ENa_strand strand = model.Strand()==ePlus?eNa_strand_plus:eNa_strand_minus;
358
359 CAlignMap amap = model.GetAlignMap();
360
361 for (size_t i=0; i < model.Exons().size(); ++i) {
362 const CModelExon* e = &model.Exons()[i];
363
364 if(e->Limits().NotEmpty()) { // not a ggap
365 TSignedSeqRange interval_range_on_mrna = amap.MapRangeOrigToEdited(e->Limits()) & limits_on_mrna;
366 if (interval_range_on_mrna.Empty())
367 continue;
368
369 bool extends_to_left = interval_range_on_mrna.GetFrom() > limits_on_mrna.GetFrom();
370 bool extends_to_right = interval_range_on_mrna.GetTo() < limits_on_mrna.GetTo();
371 if(model.Strand() == eMinus)
372 swap(extends_to_left,extends_to_right);
373
374 TSignedSeqRange interval_range = amap.MapRangeEditedToOrig(interval_range_on_mrna);
375 CRef<CSeq_interval> interval(new CSeq_interval(*contig_sid, interval_range.GetFrom(),interval_range.GetTo(), strand));
376
377 if (i > 0 && (!e->m_fsplice || (model.Exons()[i-1].Limits().Empty() && extends_to_left))) {
378 _ASSERT(interval_range.GetFrom() == e->Limits().GetFrom());
379 interval->SetFuzz_from().SetLim(CInt_fuzz::eLim_lt);
380 }
381 if (i < model.Exons().size()-1 && (!e->m_ssplice || (model.Exons()[i+1].Limits().Empty() && extends_to_right))) {
382 _ASSERT(interval_range.GetTo() == e->Limits().GetTo());
383 interval->SetFuzz_to().SetLim(CInt_fuzz::eLim_gt);
384 }
385
386 packed_int.AddInterval(*interval);
387 }
388 }
389 return seq_loc->Merge(CSeq_loc::fSort, NULL);
390 }
391
ExpandSupport(const CSupportInfoSet & src,CSupportInfoSet & dst,IEvidence & evidence)392 void ExpandSupport(const CSupportInfoSet& src, CSupportInfoSet& dst, IEvidence& evidence)
393 {
394 ITERATE(CSupportInfoSet, s, src) {
395 dst.insert(*s);
396
397 const CAlignModel* m = evidence.GetModel(s->GetId());
398 if (m && (m->Type()&(CAlignModel::eChain|CAlignModel::eGnomon))!=0) {
399 _ASSERT( !s->IsCore() );
400 ExpandSupport(m->Support(), dst, evidence);
401 }
402 }
403 }
404
DumpEvidence(const SModelData & md)405 void CAnnotationASN1::CImplementationData::DumpEvidence(const SModelData& md)
406 {
407 const CGeneModel& model = md.model;
408 evidence.GetModel(model.ID()); // complete chains might not get marked as used otherwise
409
410 if (model.Support().empty())
411 return;
412 CRef<CSeq_annot> seq_annot(new CSeq_annot);
413 CSeq_annot::C_Data::TAlign* aligns = &seq_annot->SetData().SetAlign();
414
415 {{
416 string id_str = CIdHandler::ToString(*md.mrna_sid);
417
418 NameAnnot(*seq_annot, "Evidence for " + id_str);
419
420 CRef<CAnnot_id> annot_id(new CAnnot_id);
421 annot_id->SetGeneral().SetDb("GNOMON");
422 CIdHandler::SetId(annot_id->SetGeneral().SetTag(), md.model.ID());
423 seq_annot->SetId().push_back(annot_id);
424 }}
425
426
427 ITERATE(CSupportInfoSet, s, model.Support()) {
428 Int8 id = s->GetId();
429
430 CRef<CSeq_align> a(const_cast<CSeq_align*>(evidence.GetSeq_align(id).GetPointerOrNull()));
431 if (a.NotEmpty()) { //output asn alignments (chainer)
432 aligns->push_back(a);
433 } else {
434 const CAlignModel* m = evidence.GetModel(id);
435 if (m != NULL && (m->Type()&CGeneModel::eChain)) { //output supporting chains (gnomon)
436 auto_ptr<SModelData> smd;
437 smd.reset( new SModelData(*m, contig_seq, shift) );
438 AddInternalFeature(*smd);
439 CreateModelProducts(*smd);
440 aligns->push_back(model2spliced_seq_align(*smd));
441 }
442 }
443 }
444
445 if(!aligns->empty())
446 main_seq_entry->SetSet().SetAnnot().push_back(seq_annot);
447 }
448
CollectUserField(const CUser_field & field,const string & name,vector<string> & values)449 int CollectUserField(const CUser_field& field, const string& name, vector<string>& values)
450 {
451 int count = 0;
452 if (field.HasField(name)) {
453 const CUser_field& fn = field.GetField(name);
454 const CUser_field::C_Data::TStrs& strs = fn.GetData().GetStrs();
455 copy(strs.begin(), strs.end(), back_inserter(values));
456 if(fn.CanGetNum())
457 count = fn.GetNum();
458 else
459 count = strs.size();
460 }
461
462 return count;
463 }
464
465
ModelMethod(const CGeneModel & model)466 string ModelMethod(const CGeneModel& model) {
467 string method;
468 bool ggap = false;
469 for(int i = 1; i < (int)model.Exons().size() && !ggap; ++i) {
470 ggap = model.Exons()[i-1].m_ssplice_sig == "XX" || model.Exons()[i].m_fsplice_sig == "XX";
471 }
472 if(model.Type()&CGeneModel::eChain) {
473 method = ggap ? "Chainer_GapFilled" : "Chainer";
474 } else if(model.Type()&CGeneModel::eGnomon) {
475 if(model.Support().empty())
476 method = "FullAbInitio";
477 else
478 method = ggap ? "PartAbInitio_GapFilled" : "PartAbInitio";
479 } else {
480 method = CGeneModel::TypeToString(model.Type());
481 }
482
483 return method;
484 }
485
486
create_ModelEvidence_user_object(const CGeneModel & model)487 CRef< CUser_object > CAnnotationASN1::CImplementationData::create_ModelEvidence_user_object(const CGeneModel& model)
488 {
489 if(model.Type()&CGeneModel::eChain) {
490 CRef<CUser_object> uo = evidence.GetModelEvidenceUserObject(model.ID());
491 if(uo.NotNull() && uo->HasField("Support"))
492 return uo;
493 }
494
495 CRef< CUser_object > user_obj(new CUser_object);
496 CRef< CObject_id > type(new CObject_id);
497 type->SetStr("ModelEvidence");
498 user_obj->SetType(*type);
499
500 user_obj->AddField("Method",ModelMethod(model));
501
502 if (!model.Support().empty()) {
503 CRef<CUser_field> support_field(new CUser_field());
504 support_field->SetLabel().SetStr("Support");
505 vector<string> chains;
506 vector<string> cores;
507 vector<string> proteins;
508 vector<string> mrnas;
509 vector<string> ests;
510 vector<string> short_reads;
511 vector<string> long_sras;
512 vector<string> others;
513 vector<string> unknown;
514
515 int ests_count = 0;
516 int long_sras_count = 0;
517 int others_count = 0;
518
519 ITERATE(CSupportInfoSet, s, model.Support()) {
520 Int8 id = s->GetId();
521 const CAlignModel* m = evidence.GetModel(id);
522 if(m == 0)
523 continue;
524
525 int type = m->Type();
526 string accession;
527 if (m->Type()&CGeneModel::eChain) {
528 accession = CIdHandler::ToString(*CIdHandler::GnomonMRNA(id));
529 } else {
530 accession = CIdHandler::ToString(*m->GetTargetId());
531 }
532
533 if (s->IsCore())
534 cores.push_back(accession);
535
536 if (type&CGeneModel::eChain) {
537 chains.push_back(accession);
538 CRef<CUser_object> uo = evidence.GetModelEvidenceUserObject(id);
539 if(uo.NotNull() && uo->HasField("Support")) {
540 const CUser_field& support_field = uo->GetField("Support");
541 CollectUserField(support_field, "Core", cores);
542 CollectUserField(support_field, "Proteins", proteins);
543 CollectUserField(support_field, "mRNAs", mrnas);
544 ests_count += CollectUserField(support_field, "ESTs", ests);
545 CollectUserField(support_field, "RNASeq", short_reads);
546 long_sras_count += CollectUserField(support_field, "longSRA", long_sras);
547 others_count += CollectUserField(support_field, "other", others);
548 }
549 } else if (type&CGeneModel::eProt) {
550 proteins.push_back(accession);
551 } else if (type&CGeneModel::emRNA) {
552 mrnas.push_back(accession);
553 } else if (type&CGeneModel::eEST) {
554 if(NStr::StartsWith(accession, "gi|")) {
555 ests.push_back(accession);
556 ests_count += m->Weight();
557 } else if(NStr::StartsWith(accession, "gnl|SRA")) {
558 long_sras.push_back(accession);
559 long_sras_count += m->Weight();
560 } else {
561 others.push_back(accession);
562 others_count += m->Weight();
563 }
564 } else if (type&CGeneModel::eSR) {
565 short_reads.push_back(accession);
566 } else {
567 unknown.push_back(accession);
568 }
569 }
570
571 bool have_something = false;
572
573 if (!chains.empty()) {
574 support_field->AddField("Chains",chains);
575 support_field->SetData().SetFields().back()->SetNum(chains.size());
576 have_something = true;
577 }
578 if (!cores.empty()) {
579 support_field->AddField("Core",cores);
580 support_field->SetData().SetFields().back()->SetNum(cores.size());
581 have_something = true;
582 }
583 if (!proteins.empty()) {
584 sort(proteins.begin(),proteins.end());
585 support_field->AddField("Proteins",proteins);
586 support_field->SetData().SetFields().back()->SetNum(proteins.size());
587 have_something = true;
588 }
589 if (!mrnas.empty()) {
590 sort(mrnas.begin(),mrnas.end());
591 support_field->AddField("mRNAs",mrnas);
592 support_field->SetData().SetFields().back()->SetNum(mrnas.size());
593 have_something = true;
594 }
595 if (!ests.empty()) {
596 sort(ests.begin(),ests.end());
597 support_field->AddField("ESTs",ests);
598 support_field->SetData().SetFields().back()->SetNum(ests_count);
599 have_something = true;
600 }
601 if (!short_reads.empty()) {
602 sort(short_reads.begin(),short_reads.end());
603 support_field->AddField("RNASeq",short_reads);
604 support_field->SetData().SetFields().back()->SetNum(short_reads.size());
605 have_something = true;
606 }
607 if (!long_sras.empty()) {
608 sort(long_sras.begin(),long_sras.end());
609 support_field->AddField("longSRA",long_sras);
610 support_field->SetData().SetFields().back()->SetNum(long_sras_count);
611 have_something = true;
612 }
613 if (!others.empty()) {
614 sort(others.begin(),others.end());
615 support_field->AddField("other",others);
616 support_field->SetData().SetFields().back()->SetNum(others_count);
617 have_something = true;
618 }
619 if (!unknown.empty()) {
620 support_field->AddField("unknown",unknown);
621 support_field->SetData().SetFields().back()->SetNum(unknown.size());
622 have_something = true;
623 }
624
625 if (have_something)
626 user_obj->SetData().push_back(support_field);
627 }
628
629 if(!model.ProteinHit().empty())
630 user_obj->AddField("BestTargetProteinHit",model.ProteinHit());
631 if(model.Status()&CGeneModel::eFullSupCDS)
632 user_obj->AddField("CDS support",string("full"));
633
634 return user_obj;
635 }
636
637
create_internal_feature(const SModelData & md)638 CRef<CSeq_feat> CAnnotationASN1::CImplementationData::create_internal_feature(const SModelData& md)
639 {
640 const CAlignModel& model = md.model;
641
642 CRef<CSeq_feat> feature(new CSeq_feat);
643
644 CRef<CObject_id> obj_id( new CObject_id() );
645 CIdHandler::SetId(*obj_id, model.ID());
646 CRef<CFeat_id> feat_id( new CFeat_id() );
647 feat_id->SetLocal(*obj_id);
648 feature->SetIds().push_back(feat_id);
649
650 CRef<CUser_object> user( new CUser_object);
651 user->SetClass("Gnomon");
652 user->SetType().SetStr("Model Internal Attributes");
653 feature->SetExts().push_back(user);
654
655 if (model.Type() & (CGeneModel::eGnomon | CGeneModel::eChain))
656 user->AddField("Method", ModelMethod(model));
657
658 TAttributes attributes;
659 CollectAttributes(model, attributes);
660 if (model.GetTargetId().NotEmpty())
661 attributes["Target"] = CIdHandler::ToString(*model.GetTargetId());
662 ITERATE (TAttributes, a, attributes) {
663 if (!a->second.empty())
664 user->AddField(a->first, a->second);
665 }
666
667 if (model.Score() != BadScore())
668 user->AddField("cds_score", model.Score());
669
670 const CCDSInfo& cds_info = model.GetCdsInfo();
671 TSignedSeqRange rcds_on_mrna = cds_info.MaxCdsLimits() & model.TranscriptLimits();
672 int frame = 0;
673 if(cds_info.HasStart())
674 rcds_on_mrna.SetFrom(cds_info.Start().GetFrom());
675 else
676 frame = (cds_info.Cds().GetFrom()-rcds_on_mrna.GetFrom())%3;
677
678 if (cds_info.ReadingFrame().NotEmpty() && model.GetAlignMap().MapRangeOrigToEdited(model.Limits(),false).IntersectingWith(rcds_on_mrna)) { // at least some CDS could be projected on genome
679 // create cdregion feature as there is no other place to show CDS on evidence alignment
680
681 _ASSERT((model.Status()&CGeneModel::eReversed) == 0); // needs to be changed if not true
682
683 feature->SetLocation(*create_packed_int_seqloc(model,rcds_on_mrna));
684 CRef<CGenetic_code::C_E> cdrcode(new CGenetic_code::C_E);
685 cdrcode->SetId(1);
686 feature->SetData().SetCdregion().SetCode().Set().push_back(cdrcode);
687
688 CCdregion::EFrame ncbi_frame = CCdregion::eFrame_one;
689 if(frame == 1) ncbi_frame = CCdregion::eFrame_two;
690 else if(frame == 2) ncbi_frame = CCdregion::eFrame_three;
691 feature->SetData().SetCdregion().SetFrame(ncbi_frame);
692 } else {
693 feature->SetLocation(*create_packed_int_seqloc(model,model.TranscriptLimits()));
694 feature->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
695 }
696
697 return feature;
698 }
699
700 //CRef<CSpliced_exon> CAnnotationASN1::CImplementationData::spliced_exon (const CModelExon& e, EStrand strand) const
spliced_exon(const CModelExon & e,EStrand strand)701 CRef<CSpliced_exon> spliced_exon (const CModelExon& e, EStrand strand)
702 {
703 CRef<CSpliced_exon> se(new CSpliced_exon());
704
705 if(e.Limits().NotEmpty()) { // normal exon
706 se->SetGenomic_start(e.GetFrom());
707 se->SetGenomic_end(e.GetTo());
708 } else { // gap filler
709 CRef<CSeq_id> fillerid = CIdHandler::ToSeq_id(e.m_source.m_acc);
710 se->SetGenomic_id(*fillerid);
711 se->SetGenomic_strand(e.m_source.m_strand==ePlus?eNa_strand_plus:eNa_strand_minus);
712 se->SetGenomic_start(e.m_source.m_range.GetFrom());
713 se->SetGenomic_end(e.m_source.m_range.GetTo());
714 }
715
716 if(e.m_ident > 0) {
717 CRef<CScore> scr(new CScore);
718 scr->SetValue().SetReal(e.m_ident);
719 CRef<CObject_id> id(new CObject_id());
720 id->SetStr("idty");
721 scr->SetId(*id);
722 se->SetScores().Set().push_back(scr);
723 }
724
725 if (e.m_fsplice) {
726 // _ASSERT(e.m_fsplice_sig.length() == 2);
727 if (strand==ePlus) {
728 se->SetAcceptor_before_exon().SetBases(e.m_fsplice_sig);
729 } else {
730 se->SetDonor_after_exon().SetBases(e.m_fsplice_sig);
731 }
732 }
733 if (e.m_ssplice) {
734 // _ASSERT(e.m_ssplice_sig.length() == 2);
735 if (strand==ePlus) {
736 se->SetDonor_after_exon().SetBases(e.m_ssplice_sig);
737 } else {
738 se->SetAcceptor_before_exon().SetBases(e.m_ssplice_sig);
739 }
740 }
741 return se;
742 }
743
744
NucPosToProtPos(int nultripos)745 CRef<CProduct_pos> NucPosToProtPos(int nultripos) {
746 CRef<CProduct_pos> pos(new CProduct_pos);
747 pos->SetProtpos().SetFrame(nultripos%3 + 1);
748 pos->SetProtpos().SetAmin(nultripos/3);
749 return pos;
750 }
751
AlignModelToSeqalign(const CAlignModel & model,CSeq_id & mrnaid,CSeq_id & contigid,bool is_align,bool is_protalign,bool stop_found)752 CRef<CSeq_align> AlignModelToSeqalign(const CAlignModel& model, CSeq_id& mrnaid, CSeq_id& contigid, bool is_align, bool is_protalign, bool stop_found) {
753
754 CRef< CSeq_align > seq_align( new CSeq_align );
755
756 if(model.ID() > 0) {
757 CRef<CObject_id> id(new CObject_id());
758 CIdHandler::SetId(*id, model.ID());
759 seq_align->SetId().push_back(id);
760 }
761
762 seq_align->SetType(CSeq_align::eType_partial);
763 CSpliced_seg& spliced_seg = seq_align->SetSegs().SetSpliced();
764
765 if(is_protalign) {
766 spliced_seg.SetProduct_type(CSpliced_seg::eProduct_type_protein);
767 int product_length = model.TargetLen()/3;
768 if(stop_found)
769 --product_length;
770 spliced_seg.SetProduct_length(product_length);
771 } else {
772 spliced_seg.SetProduct_type(CSpliced_seg::eProduct_type_transcript);
773 spliced_seg.SetProduct_length(model.TargetLen());
774 }
775 if (model.Status() & CAlignModel::ePolyA) {
776 spliced_seg.SetPoly_a((model.Status() & CAlignModel::eReversed)? model.PolyALen() - 1 : model.TargetLen() - model.PolyALen());
777 }
778
779 spliced_seg.SetProduct_id(mrnaid);
780 spliced_seg.SetGenomic_id(contigid);
781 spliced_seg.SetProduct_strand((model.Status() & CGeneModel::eReversed)==0 ? eNa_strand_plus : eNa_strand_minus);
782 spliced_seg.SetGenomic_strand(model.Strand()==ePlus?eNa_strand_plus:eNa_strand_minus);
783
784 CSpliced_seg::TExons& exons = spliced_seg.SetExons();
785
786 TInDels indels = (is_protalign ? model.GetInDels(false) : model.FrameShifts());
787 TInDels::const_iterator indel_i = indels.begin();
788 for (size_t i=0; i < model.Exons().size(); ++i) {
789 const CModelExon *e = &model.Exons()[i];
790
791 CRef<CSpliced_exon> se = spliced_exon(*e,model.Strand());
792
793 TSignedSeqRange transcript_exon = model.TranscriptExon(i);
794 if(is_protalign) {
795 se->SetProduct_start(*NucPosToProtPos(transcript_exon.GetFrom()));
796 se->SetProduct_end(*NucPosToProtPos(transcript_exon.GetTo()));
797 } else {
798 se->SetProduct_start().SetNucpos(transcript_exon.GetFrom());
799 se->SetProduct_end().SetNucpos(transcript_exon.GetTo());
800 }
801
802 if(e->Limits().NotEmpty()) { // normal exon
803 int last_chunk = e->GetFrom();
804 for( ;indel_i != indels.end() && indel_i->Loc() <= e->GetTo()+1; ++indel_i) {
805 const CInDelInfo& indel = *indel_i;
806 _ASSERT( e->GetFrom() <= indel.Loc() );
807
808 if (indel.Loc()-last_chunk > 0) {
809 CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
810 if(is_align) {
811 if(is_protalign)
812 chunk->SetDiag(indel.Loc()-last_chunk);
813 else
814 chunk->SetMatch(indel.Loc()-last_chunk);
815 } else
816 chunk->SetMatch(indel.Loc()-last_chunk);
817 se->SetParts().push_back(chunk);
818 last_chunk = indel.Loc();
819 }
820
821 if(indel.IsMismatch()) {
822 _ASSERT(!is_protalign);
823 CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
824 chunk->SetMismatch(indel.Len());
825 se->SetParts().push_back(chunk);
826
827 last_chunk += indel.Len();
828 } else if (indel.IsInsertion()) {
829 CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
830 chunk->SetGenomic_ins(indel.Len());
831 se->SetParts().push_back(chunk);
832
833 last_chunk += indel.Len();
834 } else {
835 CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
836 chunk->SetProduct_ins(indel.Len());
837 se->SetParts().push_back(chunk);
838 }
839 }
840 if (e->GetFrom() <= last_chunk && last_chunk <= e->GetTo()) {
841 CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
842 if(is_align) {
843 if(is_protalign)
844 chunk->SetDiag(e->GetTo()-last_chunk+1);
845 else
846 chunk->SetMatch(e->GetTo()-last_chunk+1);
847 } else
848 chunk->SetMatch(e->GetTo()-last_chunk+1);
849 se->SetParts().push_back(chunk);
850 }
851 } else { // gap filler
852 CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
853 if(is_align) {
854 if(is_protalign)
855 chunk->SetDiag(e->m_source.m_range.GetLength());
856 else
857 chunk->SetMatch(e->m_source.m_range.GetLength());
858 } else
859 chunk->SetMatch(e->m_source.m_range.GetLength());
860 se->SetParts().push_back(chunk);
861 }
862
863 exons.push_back(se);
864 }
865 _ASSERT( indel_i == indels.end() );
866
867 if (model.Strand() == eMinus) {
868 // reverse if minus strand (don't forget to reverse chunks)
869 exons.reverse();
870 NON_CONST_ITERATE(CSpliced_seg::TExons, exon_i, exons) {
871 CSpliced_exon& se = **exon_i;
872 if (se.IsSetParts())
873 se.SetParts().reverse();
874 }
875 }
876
877 if(!model.FrameShifts().empty() && !is_protalign) {
878 TSignedSeqRange tlim = model.TranscriptLimits();
879 int left_not_aligned = tlim.GetFrom();
880 int right_not_aligned = model.TargetLen()-tlim.GetTo()-1;
881 if(model.Strand() == eMinus)
882 swap(left_not_aligned, right_not_aligned);
883
884 string mismatches(left_not_aligned, 'N'); // will include mismatches and deletions
885 string mismstatus; // will include status for mismatches and indels
886 TInDels::const_iterator indl = model.FrameShifts().begin();
887 for(int e = 0; e < (int)model.Exons().size(); ++e) {
888 if(model.Exons()[e].Limits().Empty())
889 continue;
890 if(e > 0 && !model.Exons()[e].m_fsplice) {
891 int len;
892 if(model.Orientation() == ePlus)
893 len = model.TranscriptExon(e).GetFrom()-model.TranscriptExon(e-1).GetTo()-1;
894 else
895 len = model.TranscriptExon(e-1).GetFrom()-model.TranscriptExon(e).GetTo()-1;
896 mismatches += string(len, 'N');
897 }
898 for( ; indl != model.FrameShifts().end() && indl->IntersectingWith(model.Exons()[e].GetFrom(), model.Exons()[e].GetTo()); ++indl) {
899 switch(indl->GetStatus()) {
900 case CInDelInfo::eGenomeNotCorrect:
901 mismstatus.push_back('n'); break;
902 case CInDelInfo::eGenomeCorrect:
903 mismstatus.push_back('c'); break;
904 default:
905 mismstatus.push_back('u'); break;
906 }
907 if(!indl->IsInsertion())
908 mismatches += indl->GetInDelV();
909 }
910 }
911 mismatches += string(right_not_aligned, 'N');
912
913 if(mismatches.find_first_not_of('N') != string::npos) {
914 CRef<CUser_object> userm( new CUser_object);
915 CRef<CObject_id> typem(new CObject_id);
916 typem->SetStr("MismatchedBases");
917 userm->SetType(*typem);
918 userm->AddField("Bases", mismatches);
919 seq_align->SetExt().push_back(userm);
920 }
921
922 if(mismstatus.find_first_not_of('u') != string::npos) {
923 CRef<CUser_object> users( new CUser_object);
924 CRef<CObject_id> types(new CObject_id);
925 types->SetStr("MismatchedBasesStatus");
926 users->SetType(*types);
927 users->AddField("Status", mismstatus);
928 seq_align->SetExt().push_back(users);
929 }
930 }
931
932 return seq_align;
933 }
934
MakeSeqAlign(const string & contig) const935 CRef<objects::CSeq_align> CAlignModel::MakeSeqAlign(const string& contig) const {
936
937 bool is_align = !(Type()&(eChain|eGnomon));
938
939 CAlignModel model = *this;
940 if(is_align && (Type()&eProt) && HasStop()) {
941 _ASSERT(GetCdsInfo().IsMappedToGenome());
942 TSignedSeqRange lim = GetCdsInfo().Start()+GetCdsInfo().ReadingFrame();
943 model.Clip(lim,eRemoveExons); // protein alignments doesn't include stop
944 }
945
946 CRef<CSeq_id> contig_sid(CIdHandler::ToSeq_id(contig));
947 CRef<CSeq_id> mrna_sid(new CSeq_id);
948 mrna_sid->Assign(*GetTargetId());
949 CRef<CSeq_align> seq_align = AlignModelToSeqalign(model, *mrna_sid, *contig_sid, is_align, is_align && (Type()&eProt), is_align && (Type()&eProt) && HasStop());
950
951 if(is_align) {
952 CSpliced_seg& spliced_seg = seq_align->SetSegs().SetSpliced();
953
954 if(Ident() > 0) {
955 int matches = Ident()*seq_align->GetAlignLength()+0.5;
956 seq_align->SetNamedScore("matches", matches);
957 }
958
959 if(Status()&eBestPlacement)
960 seq_align->SetNamedScore("rank", 1);
961
962 if(Status()&eUnknownOrientation)
963 seq_align->SetNamedScore("ambiguous_orientation", 1);
964
965 if(Weight() > 1)
966 seq_align->SetNamedScore("count", int(Weight()+0.5));
967
968 if(Type()&eProt) {
969 if(HasStart()) {
970 CRef<CSpliced_seg_modifier> modi(new CSpliced_seg_modifier);
971 modi->SetStart_codon_found(true);
972 spliced_seg.SetModifiers().push_back(modi);
973 }
974 if(HasStop()) {
975 CRef<CSpliced_seg_modifier> modi(new CSpliced_seg_modifier);
976 modi->SetStop_codon_found(true);
977 spliced_seg.SetModifiers().push_back(modi);
978 }
979 }
980 }
981
982 return seq_align;
983 }
984
model2spliced_seq_align(SModelData & md)985 CRef<CSeq_align> CAnnotationASN1::CImplementationData::model2spliced_seq_align(SModelData& md)
986 {
987 const CAlignModel& model = md.model;
988
989 CRef<CSeq_align> seq_align = AlignModelToSeqalign(model, *md.mrna_sid, *contig_sid, false, false, false);
990
991 CRef<CUser_object> user( new CUser_object);
992 user->SetClass("Gnomon");
993 CRef< CObject_id > type(new CObject_id);
994 type->SetStr("AlignmentAttributes");
995 user->SetType(*type);
996 seq_align->SetExt().push_back(user);
997 user->AddField(kGnomonConstructed, true);
998
999 #ifdef _DEBUG
1000 try {
1001 seq_align->Validate(true);
1002 } catch (...) {
1003 _ASSERT(false);
1004 }
1005 // try {
1006 // CNcbiOstrstream ostream;
1007 // ostream << MSerial_AsnBinary << *seq_align;
1008 // } catch (...) {
1009 // _ASSERT(false);
1010 // }
1011 #endif
1012
1013 return seq_align;
1014 }
1015
GetModelEvidenceUserObject(const CSeq_feat_Handle & feat)1016 CRef<CUser_object> GetModelEvidenceUserObject(const CSeq_feat_Handle& feat)
1017 {
1018 ITERATE(CSeq_feat::TExts, uo, feat.GetSeq_feat()->GetExts()) {
1019 if ((*uo)->GetType().GetStr() == "ModelEvidence" ) {
1020 return *uo;
1021 }
1022 }
1023 return CRef<CUser_object>();
1024 }
1025
RestoreModelMethod(const CSeq_feat_Handle & feat,CAlignModel & model)1026 void RestoreModelMethod(const CSeq_feat_Handle& feat, CAlignModel& model)
1027 {
1028 const CUser_object& user = *feat.GetSeq_feat()->GetExts().front();
1029 _ASSERT( user.GetType().GetStr() == "Model Internal Attributes" || user.GetType().GetStr() == "ModelEvidence" );
1030 if (user.HasField("Method")) {
1031 string method = user.GetField("Method").GetData().GetStr();
1032 if (method.find("AbInitio") != string::npos || method.find("Gnomon") != string::npos)
1033 model.SetType(CGeneModel::eGnomon);
1034 else if (method.find("Chainer") != string::npos)
1035 model.SetType(CGeneModel::eChain);
1036 }
1037 }
1038
RestoreModelAttributes(const CSeq_feat_Handle & feat_handle,CAlignModel & model)1039 void RestoreModelAttributes(const CSeq_feat_Handle& feat_handle, CAlignModel& model)
1040 {
1041 TAttributes attributes;
1042
1043 _ASSERT(feat_handle);
1044
1045 const CUser_object& user = *feat_handle.GetOriginalSeq_feat()->GetExts().front();
1046 _ASSERT( user.GetClass() == "Gnomon" );
1047 _ASSERT( user.GetType().GetStr() == "Model Internal Attributes" );
1048
1049
1050 model.SetType(0);
1051 RestoreModelMethod(feat_handle, model);
1052
1053 ITERATE(CUser_object::TData, f, user.GetData()) {
1054 const CUser_field& fld = **f;
1055 if (fld.GetData().IsStr())
1056 attributes[fld.GetLabel().GetStr()] = fld.GetData().GetStr();
1057 else if (fld.GetLabel().GetStr() == "cds_score")
1058 attributes["cds_score"] = NStr::DoubleToString(fld.GetData().GetReal());
1059 }
1060
1061 ParseAttributes(attributes, model);
1062
1063 if (attributes.find("cds_score") != attributes.end()) {
1064 double score = NStr::StringToDouble(attributes["cds_score"]);
1065 CCDSInfo cds_info = model.GetCdsInfo();
1066 cds_info.SetScore(score, cds_info.OpenCds());
1067 model.SetCdsInfo(cds_info);
1068 }
1069
1070 if(model.GetCdsInfo().ReadingFrame().NotEmpty()) {
1071 CCDSInfo cds_info_t = model.GetCdsInfo();
1072 CCDSInfo cds_info_g = cds_info_t.MapFromEditedToOrig(model.GetAlignMap());
1073 if(cds_info_g.ReadingFrame().NotEmpty()) // successful projection
1074 model.SetCdsInfo(cds_info_g);
1075 }
1076 }
1077
RestoreModelReadingFrame(const CSeq_feat_Handle & feat,CAlignModel & model)1078 void RestoreModelReadingFrame(const CSeq_feat_Handle& feat, CAlignModel& model)
1079 {
1080 if (feat && feat.GetFeatType() == CSeqFeatData::e_Cdregion) {
1081 TSeqRange cds_range = feat.GetLocation().GetTotalRange();
1082 TSignedSeqRange rf = TSignedSeqRange(cds_range.GetFrom(), cds_range.GetTo());
1083 if (!feat.GetLocation().GetId()->Match(*CIdHandler::GnomonMRNA(model.ID()))) {
1084 rf = model.GetAlignMap().MapRangeOrigToEdited(rf, false);
1085 }
1086
1087 if (feat.GetData().GetCdregion().CanGetFrame()) {
1088
1089 CCdregion::EFrame ncbi_frame = feat.GetData().GetCdregion().GetFrame();
1090 int phase = 0;
1091 switch (ncbi_frame) {
1092 case CCdregion::eFrame_not_set:
1093 case CCdregion::eFrame_one:
1094 phase = 0;
1095 break;
1096 case CCdregion::eFrame_two:
1097 phase = 1;
1098 break;
1099 case CCdregion::eFrame_three:
1100 phase = 2;
1101 break;
1102 default:
1103 _ASSERT( false);
1104 }
1105
1106 bool notreversed = (model.Status()&CGeneModel::eReversed) == 0;
1107 if(notreversed) {
1108 rf.SetFrom(rf.GetFrom()+phase);
1109 rf.SetTo(rf.GetTo()-rf.GetLength()%3);
1110 } else {
1111 rf.SetTo(rf.GetTo()-phase);
1112 rf.SetFrom(rf.GetFrom()+rf.GetLength()%3);
1113 }
1114 }
1115
1116 CCDSInfo cds_info(false); // mrna coordinates; will be projected to genome at the end if possible
1117 cds_info.SetReadingFrame(rf, false);
1118 model.SetCdsInfo(cds_info);
1119 }
1120 }
1121
RestoreModel(const CSeq_feat_Handle & internal_feat,const CSeq_feat_Handle & cds_feat,const CSeq_align & align)1122 CAlignModel* RestoreModel(const CSeq_feat_Handle& internal_feat, const CSeq_feat_Handle& cds_feat, const CSeq_align& align)
1123 {
1124 CAlignModel* model = new CAlignModel(align);
1125
1126 RestoreModelReadingFrame(cds_feat, *model);
1127
1128 RestoreModelAttributes(internal_feat, *model);
1129
1130 return model;
1131 }
1132
RestoreModelFromPublicMrnaFeature(const CSeq_feat_Handle & feat)1133 CAlignModel* RestoreModelFromPublicMrnaFeature(const CSeq_feat_Handle& feat)
1134 {
1135 CScope& scope = feat.GetScope();
1136 CBioseq_Handle mrna_handle = scope.GetBioseqHandle(*feat.GetProduct().GetId());
1137 CConstRef<CBioseq> mrna = mrna_handle.GetCompleteBioseq();
1138 _ASSERT(mrna->IsNa());
1139
1140 const CSeq_align& align = *mrna->GetInst().GetHist().GetAssembly().front();
1141 const CObject_id& obj_id = *align.GetId().back();
1142
1143 CFeat_CI cds_feat(mrna_handle);
1144 while (cds_feat && !cds_feat->GetOriginalFeature().GetData().IsCdregion())
1145 ++cds_feat;
1146
1147 const CTSE_Handle& tse_handle = feat.GetAnnot().GetTSE_Handle();
1148 CSeq_feat_Handle feat_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Rna, obj_id);
1149 if (!feat_handle) {
1150 feat_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Cdregion, obj_id);
1151 }
1152
1153 return RestoreModel(feat_handle, *cds_feat, align);
1154 }
1155
RestoreModelFromInternalGnomonFeature(const CSeq_feat_Handle & feat)1156 CAlignModel* RestoreModelFromInternalGnomonFeature(const CSeq_feat_Handle& feat)
1157 {
1158 Int8 id = CIdHandler::GetId(feat.GetOriginalSeq_feat()->GetIds().front()->GetLocal());
1159
1160 CScope& scope = feat.GetScope();
1161 CConstRef<CSeq_id> mrna_seq_id = CIdHandler::GnomonMRNA(id);
1162 CBioseq_Handle mrna_handle = scope.GetBioseqHandle(*mrna_seq_id);
1163 if (!mrna_handle)
1164 return NULL;
1165 CConstRef<CBioseq> mrna = mrna_handle.GetCompleteBioseq();
1166 _ASSERT(mrna->IsNa());
1167
1168 const CSeq_align& align = *mrna->GetInst().GetHist().GetAssembly().front();
1169
1170 return RestoreModel(feat, feat, align);
1171 }
1172
IsGnomonConstructed(const CSeq_align & seq_align)1173 bool IsGnomonConstructed(const CSeq_align& seq_align)
1174 {
1175 if (seq_align.CanGetExt()) {
1176 CSeq_align::TExt ext = seq_align.GetExt();
1177 ITERATE(CSeq_align::TExt, u, ext) {
1178 if ((*u)->CanGetClass() && (*u)->GetClass() == "Gnomon" &&
1179 (*u)->HasField(kGnomonConstructed) &&
1180 (*u)->GetField(kGnomonConstructed).GetData().GetBool())
1181 return true;
1182 }
1183 }
1184 return false;
1185 }
1186
ExtractSupportModels(Int8 model_id,TAlignModelList & evidence_models,list<CRef<CSeq_align>> & evidence_alignments,CSeq_entry_Handle seq_entry_handle,map<string,CRef<CSeq_annot>> & seq_annot_map,set<Int8> & processed_ids)1187 void ExtractSupportModels(Int8 model_id,
1188 TAlignModelList& evidence_models, list<CRef<CSeq_align> >& evidence_alignments,
1189 CSeq_entry_Handle seq_entry_handle, map<string, CRef<CSeq_annot> >& seq_annot_map,
1190 set<Int8>& processed_ids)
1191 {
1192 map<string, CRef<CSeq_annot> >::iterator annot = seq_annot_map.find("Evidence for "+CIdHandler::ToString(*CIdHandler::GnomonMRNA(model_id)));
1193 if (annot == seq_annot_map.end())
1194 return;
1195
1196 CSeq_annot::TData::TAlign aligns = annot->second->SetData().SetAlign();
1197
1198 NON_CONST_ITERATE (CSeq_annot::TData::TAlign, align_ci, aligns) {
1199 CSeq_align& seq_align = **align_ci;
1200 const CObject_id& obj_id = *seq_align.GetId().back();
1201 Int8 id = CIdHandler::GetId(obj_id);
1202 if (!processed_ids.insert(id).second) // already there
1203 continue;
1204
1205 const CTSE_Handle& tse_handle = seq_entry_handle.GetTSE_Handle();
1206 CSeq_feat_Handle feat_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Rna, obj_id);
1207 if (!feat_handle)
1208 feat_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Cdregion, obj_id);
1209
1210 CSeq_feat_Handle cds_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Cdregion, "cds."+NStr::NumericToString(id));
1211 if(!cds_handle)
1212 cds_handle = feat_handle;
1213
1214 auto_ptr<CAlignModel> model( RestoreModel(feat_handle, cds_handle, seq_align) );
1215 evidence_models.push_back(*model);
1216
1217 ExtractSupportModels(id, evidence_models, evidence_alignments, seq_entry_handle, seq_annot_map, processed_ids);
1218
1219 if (IsGnomonConstructed(seq_align))
1220 continue;
1221
1222 CRef<CSeq_align> align_ref(&seq_align);
1223 evidence_alignments.push_back(align_ref);
1224 }
1225 }
1226
ExtractModels(objects::CSeq_entry & seq_entry,TAlignModelList & model_list,TAlignModelList & evidence_models,list<CRef<CSeq_align>> & evidence_alignments,map<Int8,CRef<CUser_object>> & model_evidence_uo)1227 string CAnnotationASN1::ExtractModels(objects::CSeq_entry& seq_entry,
1228 TAlignModelList& model_list,
1229 TAlignModelList& evidence_models,
1230 list<CRef<CSeq_align> >& evidence_alignments,
1231 map<Int8, CRef<CUser_object> >& model_evidence_uo)
1232 {
1233 CScope scope(*CObjectManager::GetInstance());
1234 CSeq_entry_Handle seq_entry_handle = scope.AddTopLevelSeqEntry(seq_entry);
1235
1236 map<string, CRef<CSeq_annot> > seq_annot_map;
1237
1238 string contig;
1239
1240 ITERATE(CBioseq_set::TAnnot, annot, seq_entry.SetSet().SetAnnot()) {
1241 CAnnot_descr::Tdata::const_iterator iter = (*annot)->GetDesc().Get().begin();
1242 string name;
1243 string region;
1244 for ( ; iter != (*annot)->GetDesc().Get().end(); ++iter) {
1245 if ((*iter)->IsName() ) {
1246 name = (*iter)->GetName();
1247 }
1248 if ((*iter)->IsRegion() ) {
1249 region = CIdHandler::ToString(*(*iter)->GetRegion().GetId());
1250 }
1251 }
1252 if (!name.empty()) {
1253 seq_annot_map[name] = *annot;
1254 if (name=="Gnomon models") {
1255 contig = region;
1256 }
1257 }
1258 }
1259
1260 CRef<CSeq_annot> feature_table = seq_annot_map["Gnomon models"];
1261 _ASSERT( feature_table.NotEmpty() );
1262 _ASSERT( feature_table->IsFtable() );
1263
1264 CRef<CSeq_annot> internal_feature_table = seq_annot_map["Gnomon internal attributes"];
1265 _ASSERT( internal_feature_table.NotEmpty() );
1266 _ASSERT( internal_feature_table->IsFtable() );
1267
1268 SAnnotSelector sel;
1269 sel.SetFeatType(CSeqFeatData::e_Rna);
1270 CFeat_CI feat_ci(scope.GetSeq_annotHandle(*feature_table), sel);
1271
1272 if (contig.empty() && feat_ci) {
1273 contig = CIdHandler::ToString(*feat_ci->GetLocation().GetId());
1274 }
1275
1276 set<Int8> processed_ids;
1277 for (; feat_ci; ++feat_ci) {
1278 auto_ptr<CAlignModel> model( RestoreModelFromPublicMrnaFeature(feat_ci->GetSeq_feat_Handle()) );
1279 model_list.push_back(*model);
1280 processed_ids.insert(model->ID());
1281 ExtractSupportModels(model->ID(), evidence_models, evidence_alignments, seq_entry_handle, seq_annot_map, processed_ids);
1282 model_evidence_uo[model->ID()] = GetModelEvidenceUserObject(feat_ci->GetSeq_feat_Handle());
1283 }
1284
1285 CFeat_CI internal_feat_ci(scope.GetSeq_annotHandle(*internal_feature_table));
1286 for (; internal_feat_ci; ++internal_feat_ci) {
1287 Int8 id = CIdHandler::GetId(internal_feat_ci->GetOriginalFeature().GetIds().front()->GetLocal());
1288 if (processed_ids.find(id) != processed_ids.end()) // already there
1289 continue;
1290 auto_ptr<CAlignModel> model( RestoreModelFromInternalGnomonFeature(internal_feat_ci->GetSeq_feat_Handle()) );
1291 if (model.get() == NULL)
1292 continue;
1293 evidence_models.push_back(*model);
1294 processed_ids.insert(id);
1295 ExtractSupportModels(model->ID(), evidence_models, evidence_alignments, seq_entry_handle, seq_annot_map, processed_ids);
1296 }
1297
1298
1299 return contig;
1300 }
1301
1302 END_SCOPE(gnomon)
1303 END_NCBI_SCOPE
1304