1 /*  $Id: asn1.cpp 556806 2018-02-05 17:38:50Z souvorov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Vyacheslav Chetvernin
27  *
28  * File Description:
29  * conversion to/from ASN1
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <algo/gnomon/asn1.hpp>
35 #include <algo/sequence/gene_model.hpp>
36 #include <objects/general/general__.hpp>
37 #include <objects/seqset/Bioseq_set.hpp>
38 #include <objects/seqfeat/seqfeat__.hpp>
39 #include <objects/seqloc/seqloc__.hpp>
40 #include <objects/seq/seq__.hpp>
41 #include <objects/seqalign/seqalign__.hpp>
42 #include "gnomon_seq.hpp"
43 #include <algo/gnomon/id_handler.hpp>
44 
45 #include <objects/seq/seqport_util.hpp>
46 #include <objmgr/object_manager.hpp>
47 #include <objmgr/util/sequence.hpp>
48 #include <objmgr/seq_vector.hpp>
49 #include <objmgr/seq_annot_ci.hpp>
50 #include <objmgr/feat_ci.hpp>
51 #include <objmgr/align_ci.hpp>
52 
53 BEGIN_NCBI_SCOPE
54 BEGIN_SCOPE(gnomon)
55 USING_SCOPE(ncbi::objects);
56 
57 
58 
59 // defined in gnomon_model.cpp
60 typedef map<string,string> TAttributes;
61 void CollectAttributes(const CAlignModel& a, TAttributes& attributes);
62 void ParseAttributes(TAttributes& attributes, CAlignModel& a);
63 
64 // defined in gnomon_objmgr.cpp
65 Int8 GetModelId(const CSeq_align& seq_align);
66 
67 
68 const string kGnomonConstructed = "Is [re]constructed alignment";
69 
70 
71 struct SModelData {
72     SModelData(const CAlignModel& model, const CEResidueVec& contig_seq, int shift);
73 
74     CAlignModel model;
75     CEResidueVec mrna_seq;
76     CRef<CSeq_id> mrna_sid;
77     CRef<CSeq_id> prot_sid;
78 
79     bool is_ncrna;
80 };
81 
SModelData(const CAlignModel & m,const CEResidueVec & contig_seq,int shift)82 SModelData::SModelData(const CAlignModel& m, const CEResidueVec& contig_seq, int shift) : model(m)
83 {
84     CAlignMap amap = model.GetAlignMap();
85     CCDSInfo cds = model.GetCdsInfo();
86     if(cds.IsMappedToGenome()) {
87         cds = cds.MapFromOrigToEdited(amap);  // all coords on mRNA
88         model.SetCdsInfo(cds);
89     }
90 
91     amap.MoveOrigin(shift);
92     amap.EditedSequence(contig_seq, mrna_seq, true);
93 
94     prot_sid.Reset(new CSeq_id);
95     prot_sid->Assign(*CIdHandler::GnomonProtein(model.ID()));
96     mrna_sid.Reset(new CSeq_id);
97     mrna_sid->Assign(*CIdHandler::GnomonMRNA(model.ID()));
98 
99     is_ncrna = m.ReadingFrame().Empty();
100 }
101 
102 
103 class CAnnotationASN1::CImplementationData {
104 public:
105     CImplementationData(const string& contig_name, const CResidueVec& seq, IEvidence& evdnc, int genetic_code, int sh);
106 
107     void ResetASN1();
108     void AddModel(const CAlignModel& model);
109     CRef<CSeq_entry> main_seq_entry;
110 
111 private:
112     void CreateModelProducts(SModelData& model);
113     CRef<CSeq_feat> create_internal_feature(const SModelData& md);
114     CRef<CSeq_feat> create_cdregion_feature(SModelData& md);
115     CRef<CSeq_align> model2spliced_seq_align(SModelData& md);
116     //    CRef<CSpliced_exon> spliced_exon (const CModelExon& e, EStrand strand) const;
117     CRef<CSeq_loc> create_packed_int_seqloc(const CGeneModel& model, TSignedSeqRange limits_on_mrna = TSignedSeqRange::GetWhole());
118 
119     CRef< CUser_object > create_ModelEvidence_user_object(const CGeneModel& model);
120     void AddInternalFeature(const SModelData& md);
121     void DumpEvidence(const SModelData& md);
122 
123     string contig_name;
124     CRef<CSeq_id> contig_sid;
125     CEResidueVec contig_seq;
126 
127     int gencode;
128     int shift;
129 
130     CBioseq_set::TSeq_set* nucprots;
131     CSeq_annot* gnomon_models_annot;
132     CSeq_annot::C_Data::TFtable* feature_table;
133     CSeq_annot::C_Data::TFtable* internal_feature_table;
134     set<Int8> models_in_internal_feature_table;
135 
136     typedef map<Int8,CRef<CSeq_feat> > TGeneMap;
137     TGeneMap genes;
138     IEvidence& evidence;
139 
140     auto_ptr<CFeatureGenerator> feature_generator;
141     CRef<CScope> scope;
142 
143     friend class CAnnotationASN1;
144 };
145 
NameAnnot(CSeq_annot & annot,const string & name)146 void NameAnnot(CSeq_annot& annot, const string& name)
147 {
148     annot.SetNameDesc(name);
149     annot.SetTitleDesc(name);
150 }
151 
ResetASN1()152 void CAnnotationASN1::CImplementationData::ResetASN1() {
153     main_seq_entry = new CSeq_entry;
154 
155     CBioseq_set& bioseq_set = main_seq_entry->SetSet();
156 
157     nucprots = &bioseq_set.SetSeq_set();
158 
159     gnomon_models_annot = new CSeq_annot;
160     NameAnnot(*gnomon_models_annot, "Gnomon models");
161     CRef<CAnnotdesc> desc(new CAnnotdesc());
162     CRef<CSeq_loc> genomic_seqloc(new CSeq_loc());
163     genomic_seqloc->SetWhole(*contig_sid);
164     desc->SetRegion(*genomic_seqloc);
165     gnomon_models_annot->SetDesc().Set().push_back(desc);
166 
167     CRef<CSeq_annot> gnomon_models_annot_ref(gnomon_models_annot);
168     bioseq_set.SetAnnot().push_back(gnomon_models_annot_ref);
169     feature_table = &gnomon_models_annot->SetData().SetFtable();
170 
171     CRef<CSeq_annot> seq_annot(new CSeq_annot);
172     NameAnnot(*seq_annot, "Gnomon internal attributes");
173     bioseq_set.SetAnnot().push_back(seq_annot);
174     internal_feature_table = &seq_annot->SetData().SetFtable();
175 }
176 
CImplementationData(const string & a_contig_name,const CResidueVec & seq,IEvidence & evdnc,int genetic_code,int sh)177 CAnnotationASN1::CImplementationData::CImplementationData(const string& a_contig_name, const CResidueVec& seq, IEvidence& evdnc, int genetic_code, int sh) :
178     contig_name(a_contig_name),
179     contig_sid(CIdHandler::ToSeq_id(a_contig_name)),
180     gencode(genetic_code),
181     shift(sh),
182     evidence(evdnc)
183 {
184     Convert(seq, contig_seq);
185     ResetASN1();
186 
187     CRef<CObjectManager> obj_mgr = CObjectManager::GetInstance();
188     scope.Reset(new CScope(*obj_mgr));
189     scope->AddDefaults();
190     feature_generator.reset(new CFeatureGenerator(*scope));
191     feature_generator->SetFlags(CFeatureGenerator::fCreateGene | CFeatureGenerator::fCreateMrna | CFeatureGenerator::fCreateCdregion | CFeatureGenerator::fForceTranslateCds | CFeatureGenerator::fForceTranscribeMrna | CFeatureGenerator::fDeNovoProducts);
192     feature_generator->SetMinIntron(numeric_limits<TSeqPos>::max());
193     feature_generator->SetAllowedUnaligned(0);
194 }
195 
ResetASN1()196 void CAnnotationASN1::ResetASN1() {
197     m_data->ResetASN1();
198 }
199 
CAnnotationASN1(const string & contig_name,const CResidueVec & seq,IEvidence & evdnc,int genetic_code,int shift)200 CAnnotationASN1::CAnnotationASN1(const string& contig_name, const CResidueVec& seq, IEvidence& evdnc,
201                                  int genetic_code, int shift) :
202     m_data(new CImplementationData(contig_name, seq, evdnc, genetic_code, shift))
203 {
204 }
205 
~CAnnotationASN1()206 CAnnotationASN1::~CAnnotationASN1()
207 {
208 }
209 
GetASN1() const210 CRef<CSeq_entry> CAnnotationASN1::GetASN1() const
211 {
212     return m_data->main_seq_entry;
213 }
214 
215 
AddModel(const CAlignModel & model)216 void CAnnotationASN1::AddModel(const CAlignModel& model)
217 {
218     m_data->AddModel(model);
219 }
220 
CreateModelProducts(SModelData & md)221 void CAnnotationASN1::CImplementationData::CreateModelProducts(SModelData& md)
222 {
223     CRef< CSeq_align > align = model2spliced_seq_align(md);
224     CRef<CSeq_feat> cds_feat;
225 
226     CSeq_feat* cds_feat_ptr = NULL;
227     if (!md.is_ncrna) {
228          cds_feat = create_cdregion_feature(md);
229          cds_feat_ptr = cds_feat.GetPointer();
230     }
231 
232     CRef<CSeq_entry> model_products(new CSeq_entry);
233     nucprots->push_back(model_products);
234     CRef<CSeq_annot> annot(new CSeq_annot);
235     feature_generator->ConvertAlignToAnnot(*align, *annot, model_products->SetSet(), 0, cds_feat_ptr);
236 }
237 
AddInternalFeature(const SModelData & md)238 void CAnnotationASN1::CImplementationData::AddInternalFeature(const SModelData& md)
239 {
240     Int8 id = md.model.ID();
241     if (models_in_internal_feature_table.find(id) == models_in_internal_feature_table.end()) {
242         CRef<CSeq_feat> internal_feat = create_internal_feature(md);
243         internal_feature_table->push_back(internal_feat);
244         models_in_internal_feature_table.insert(id);
245     }
246 }
247 
AddModel(const CAlignModel & model)248 void CAnnotationASN1::CImplementationData::AddModel(const CAlignModel& model)
249 {
250     SModelData md(model, contig_seq, shift);
251 
252     CRef< CSeq_align > align = model2spliced_seq_align(md);
253     CRef<CSeq_feat> cds_feat;
254     CSeq_feat* cds_feat_ptr = NULL;
255     if (!md.is_ncrna) {
256          cds_feat = create_cdregion_feature(md);
257          cds_feat_ptr = cds_feat.GetPointer();
258     }
259 
260     try {
261         CRef<CSeq_entry> model_products(new CSeq_entry);
262         nucprots->push_back(model_products);
263 
264         CRef<CSeq_feat> mrna_feat = feature_generator->ConvertAlignToAnnot(*align, *gnomon_models_annot, model_products->SetSet(), model.GeneID(), cds_feat_ptr);
265 
266         DumpEvidence(md);
267 
268         CRef< CUser_object > user_obj = create_ModelEvidence_user_object(model);
269         mrna_feat->SetExts().push_back(user_obj);
270 
271         AddInternalFeature(md);
272     }
273     catch (CException&) {
274         cerr << MSerial_AsnText << *align;
275         throw;
276     }
277 }
278 
create_cdregion_feature(SModelData & md)279 CRef<CSeq_feat> CAnnotationASN1::CImplementationData::create_cdregion_feature(SModelData& md)
280 {
281     const CGeneModel& model = md.model;
282     const CCDSInfo& cds = model.GetCdsInfo();
283 
284     CRef<CSeq_feat> cdregion_feature(new CSeq_feat);
285 
286     CRef<CGenetic_code::C_E> cdrcode(new CGenetic_code::C_E);
287     cdrcode->SetId(gencode);
288     cdregion_feature->SetData().SetCdregion().SetCode().Set().push_back(cdrcode);
289 
290    if(model.PStop()) {
291         CCdregion::TCode_break& code_breaks = cdregion_feature->SetData().SetCdregion().SetCode_break();
292         ITERATE(CCDSInfo::TPStops,s,cds.PStops()) {
293             if(!Include(cds.Cds(),*s))
294                continue;
295 
296             CRef< CCode_break > code_break(new CCode_break);
297             CRef<CSeq_loc> pstop;
298 
299             TSeqPos from = s->GetFrom();
300             TSeqPos to = s->GetTo();
301             _ASSERT(from+2==to);
302 
303             pstop.Reset(new CSeq_loc(*md.mrna_sid, from, to, eNa_strand_plus));
304 
305             code_break->SetLoc(*pstop);
306             CRef<CCode_break::C_Aa> aa(new CCode_break::C_Aa);
307             if(s->m_status == CCDSInfo::eSelenocysteine)
308                 aa->SetNcbieaa('U');
309             else
310                 aa->SetNcbieaa('X');
311             code_break->SetAa(*aa);
312             code_breaks.push_back(code_break);
313         }
314     }
315 
316     cdregion_feature->SetProduct().SetWhole(*md.prot_sid);
317 
318     TSignedSeqRange tlim = model.TranscriptLimits();
319     int altstart = (cds.MaxCdsLimits()&tlim).GetFrom();
320     int start = cds.Cds().GetFrom();
321     int stop = (cds.MaxCdsLimits()&tlim).GetTo();
322 
323     int frame = 0;
324     if(!model.HasStart()) {
325         _ASSERT(altstart == 0);
326         frame = start%3;
327         start = 0;
328     }
329     CCdregion::EFrame ncbi_frame = CCdregion::eFrame_one;
330     if(frame == 1) ncbi_frame = CCdregion::eFrame_two;
331     else if(frame == 2) ncbi_frame = CCdregion::eFrame_three;
332     cdregion_feature->SetData().SetCdregion().SetFrame(ncbi_frame);
333 
334     CRef<CSeq_loc> cdregion_location;
335 
336     cdregion_location.Reset(new CSeq_loc(*md.mrna_sid, start, stop, eNa_strand_plus));
337 
338     if (0 < altstart && altstart != start)
339         cdregion_location->SetInt().SetFuzz_from().SetAlt().push_back(altstart);
340 
341     if (!model.HasStart())
342         cdregion_location->SetPartialStart(true,eExtreme_Biological);
343     if (!model.HasStop())
344         cdregion_location->SetPartialStop(true,eExtreme_Biological);
345     cdregion_feature->SetLocation(*cdregion_location);
346 
347     if (!model.HasStart() || !model.HasStop() || !model.Continuous())
348         cdregion_feature->SetPartial(true);
349 
350     return cdregion_feature;
351 }
352 
create_packed_int_seqloc(const CGeneModel & model,TSignedSeqRange limits_on_mrna)353 CRef<CSeq_loc> CAnnotationASN1::CImplementationData::create_packed_int_seqloc(const CGeneModel& model, TSignedSeqRange limits_on_mrna)
354 {
355     CRef<CSeq_loc> seq_loc(new CSeq_loc);
356     CPacked_seqint& packed_int = seq_loc->SetPacked_int();
357     ENa_strand strand = model.Strand()==ePlus?eNa_strand_plus:eNa_strand_minus;
358 
359     CAlignMap amap = model.GetAlignMap();
360 
361     for (size_t i=0; i < model.Exons().size(); ++i) {
362         const CModelExon* e = &model.Exons()[i];
363 
364         if(e->Limits().NotEmpty()) {  // not a ggap
365             TSignedSeqRange interval_range_on_mrna = amap.MapRangeOrigToEdited(e->Limits()) & limits_on_mrna;
366             if (interval_range_on_mrna.Empty())
367                 continue;
368 
369             bool extends_to_left = interval_range_on_mrna.GetFrom() > limits_on_mrna.GetFrom();
370             bool extends_to_right = interval_range_on_mrna.GetTo() < limits_on_mrna.GetTo();
371             if(model.Strand() == eMinus)
372                 swap(extends_to_left,extends_to_right);
373 
374             TSignedSeqRange interval_range = amap.MapRangeEditedToOrig(interval_range_on_mrna);
375             CRef<CSeq_interval> interval(new CSeq_interval(*contig_sid, interval_range.GetFrom(),interval_range.GetTo(), strand));
376 
377             if (i > 0 && (!e->m_fsplice || (model.Exons()[i-1].Limits().Empty() && extends_to_left))) {
378                 _ASSERT(interval_range.GetFrom() == e->Limits().GetFrom());
379                 interval->SetFuzz_from().SetLim(CInt_fuzz::eLim_lt);
380             }
381             if (i < model.Exons().size()-1 && (!e->m_ssplice || (model.Exons()[i+1].Limits().Empty() && extends_to_right))) {
382                 _ASSERT(interval_range.GetTo() == e->Limits().GetTo());
383                 interval->SetFuzz_to().SetLim(CInt_fuzz::eLim_gt);
384             }
385 
386             packed_int.AddInterval(*interval);
387         }
388     }
389     return seq_loc->Merge(CSeq_loc::fSort, NULL);
390 }
391 
ExpandSupport(const CSupportInfoSet & src,CSupportInfoSet & dst,IEvidence & evidence)392 void ExpandSupport(const CSupportInfoSet& src, CSupportInfoSet& dst, IEvidence& evidence)
393 {
394     ITERATE(CSupportInfoSet, s, src) {
395         dst.insert(*s);
396 
397         const CAlignModel* m = evidence.GetModel(s->GetId());
398         if (m && (m->Type()&(CAlignModel::eChain|CAlignModel::eGnomon))!=0) {
399             _ASSERT( !s->IsCore() );
400             ExpandSupport(m->Support(), dst, evidence);
401         }
402     }
403 }
404 
DumpEvidence(const SModelData & md)405 void CAnnotationASN1::CImplementationData::DumpEvidence(const SModelData& md)
406 {
407     const CGeneModel& model = md.model;
408     evidence.GetModel(model.ID()); // complete chains might not get marked as used otherwise
409 
410     if (model.Support().empty())
411         return;
412     CRef<CSeq_annot> seq_annot(new CSeq_annot);
413     CSeq_annot::C_Data::TAlign* aligns = &seq_annot->SetData().SetAlign();
414 
415     {{
416          string id_str = CIdHandler::ToString(*md.mrna_sid);
417 
418          NameAnnot(*seq_annot, "Evidence for " + id_str);
419 
420          CRef<CAnnot_id> annot_id(new CAnnot_id);
421          annot_id->SetGeneral().SetDb("GNOMON");
422          CIdHandler::SetId(annot_id->SetGeneral().SetTag(), md.model.ID());
423          seq_annot->SetId().push_back(annot_id);
424      }}
425 
426 
427     ITERATE(CSupportInfoSet, s, model.Support()) {
428         Int8 id = s->GetId();
429 
430         CRef<CSeq_align> a(const_cast<CSeq_align*>(evidence.GetSeq_align(id).GetPointerOrNull()));
431         if (a.NotEmpty()) {                                     //output asn alignments (chainer)
432             aligns->push_back(a);
433         } else {
434             const CAlignModel* m = evidence.GetModel(id);
435             if (m != NULL && (m->Type()&CGeneModel::eChain)) { //output supporting chains (gnomon)
436                 auto_ptr<SModelData> smd;
437                 smd.reset( new SModelData(*m, contig_seq, shift) );
438                 AddInternalFeature(*smd);
439                 CreateModelProducts(*smd);
440                 aligns->push_back(model2spliced_seq_align(*smd));
441             }
442         }
443     }
444 
445     if(!aligns->empty())
446         main_seq_entry->SetSet().SetAnnot().push_back(seq_annot);
447 }
448 
CollectUserField(const CUser_field & field,const string & name,vector<string> & values)449 int CollectUserField(const CUser_field& field, const string& name, vector<string>& values)
450 {
451     int count = 0;
452     if (field.HasField(name)) {
453         const CUser_field& fn =  field.GetField(name);
454         const CUser_field::C_Data::TStrs& strs = fn.GetData().GetStrs();
455         copy(strs.begin(), strs.end(), back_inserter(values));
456         if(fn.CanGetNum())
457             count = fn.GetNum();
458         else
459             count = strs.size();
460     }
461 
462     return count;
463 }
464 
465 
ModelMethod(const CGeneModel & model)466 string ModelMethod(const CGeneModel& model) {
467     string method;
468     bool ggap = false;
469     for(int i = 1; i < (int)model.Exons().size() && !ggap; ++i) {
470         ggap = model.Exons()[i-1].m_ssplice_sig == "XX" || model.Exons()[i].m_fsplice_sig == "XX";
471     }
472     if(model.Type()&CGeneModel::eChain) {
473         method = ggap ? "Chainer_GapFilled" : "Chainer";
474     } else if(model.Type()&CGeneModel::eGnomon) {
475         if(model.Support().empty())
476             method = "FullAbInitio";
477         else
478             method = ggap ? "PartAbInitio_GapFilled" : "PartAbInitio";
479     } else {
480         method = CGeneModel::TypeToString(model.Type());
481     }
482 
483     return method;
484 }
485 
486 
create_ModelEvidence_user_object(const CGeneModel & model)487 CRef< CUser_object > CAnnotationASN1::CImplementationData::create_ModelEvidence_user_object(const CGeneModel& model)
488 {
489     if(model.Type()&CGeneModel::eChain) {
490         CRef<CUser_object> uo = evidence.GetModelEvidenceUserObject(model.ID());
491         if(uo.NotNull() && uo->HasField("Support"))
492             return uo;
493     }
494 
495     CRef< CUser_object > user_obj(new CUser_object);
496     CRef< CObject_id > type(new CObject_id);
497     type->SetStr("ModelEvidence");
498     user_obj->SetType(*type);
499 
500     user_obj->AddField("Method",ModelMethod(model));
501 
502     if (!model.Support().empty()) {
503         CRef<CUser_field> support_field(new CUser_field());
504         support_field->SetLabel().SetStr("Support");
505         vector<string> chains;
506         vector<string> cores;
507         vector<string> proteins;
508         vector<string> mrnas;
509         vector<string> ests;
510         vector<string> short_reads;
511         vector<string> long_sras;
512         vector<string> others;
513         vector<string> unknown;
514 
515         int ests_count = 0;
516         int long_sras_count = 0;
517         int others_count = 0;
518 
519         ITERATE(CSupportInfoSet, s, model.Support()) {
520             Int8 id = s->GetId();
521             const CAlignModel* m = evidence.GetModel(id);
522             if(m == 0)
523                 continue;
524 
525             int type = m->Type();
526             string accession;
527             if (m->Type()&CGeneModel::eChain) {
528                 accession = CIdHandler::ToString(*CIdHandler::GnomonMRNA(id));
529             } else {
530                 accession = CIdHandler::ToString(*m->GetTargetId());
531             }
532 
533             if (s->IsCore())
534                 cores.push_back(accession);
535 
536             if (type&CGeneModel::eChain) {
537                 chains.push_back(accession);
538                 CRef<CUser_object> uo = evidence.GetModelEvidenceUserObject(id);
539                 if(uo.NotNull() && uo->HasField("Support")) {
540                     const CUser_field& support_field = uo->GetField("Support");
541                     CollectUserField(support_field, "Core", cores);
542                     CollectUserField(support_field, "Proteins", proteins);
543                     CollectUserField(support_field, "mRNAs", mrnas);
544                     ests_count += CollectUserField(support_field, "ESTs", ests);
545                     CollectUserField(support_field, "RNASeq", short_reads);
546                     long_sras_count += CollectUserField(support_field, "longSRA", long_sras);
547                     others_count += CollectUserField(support_field, "other", others);
548                 }
549             } else if (type&CGeneModel::eProt) {
550                 proteins.push_back(accession);
551             } else if (type&CGeneModel::emRNA) {
552                 mrnas.push_back(accession);
553             } else if (type&CGeneModel::eEST) {
554                 if(NStr::StartsWith(accession, "gi|")) {
555                     ests.push_back(accession);
556                     ests_count += m->Weight();
557                 } else if(NStr::StartsWith(accession, "gnl|SRA")) {
558                     long_sras.push_back(accession);
559                     long_sras_count += m->Weight();
560                 } else {
561                     others.push_back(accession);
562                     others_count += m->Weight();
563                 }
564             } else if (type&CGeneModel::eSR) {
565                 short_reads.push_back(accession);
566             } else {
567                 unknown.push_back(accession);
568             }
569         }
570 
571         bool have_something = false;
572 
573         if (!chains.empty()) {
574             support_field->AddField("Chains",chains);
575             support_field->SetData().SetFields().back()->SetNum(chains.size());
576             have_something = true;
577         }
578         if (!cores.empty()) {
579             support_field->AddField("Core",cores);
580             support_field->SetData().SetFields().back()->SetNum(cores.size());
581             have_something = true;
582         }
583         if (!proteins.empty()) {
584             sort(proteins.begin(),proteins.end());
585             support_field->AddField("Proteins",proteins);
586             support_field->SetData().SetFields().back()->SetNum(proteins.size());
587             have_something = true;
588         }
589         if (!mrnas.empty()) {
590             sort(mrnas.begin(),mrnas.end());
591             support_field->AddField("mRNAs",mrnas);
592             support_field->SetData().SetFields().back()->SetNum(mrnas.size());
593             have_something = true;
594         }
595         if (!ests.empty()) {
596             sort(ests.begin(),ests.end());
597             support_field->AddField("ESTs",ests);
598             support_field->SetData().SetFields().back()->SetNum(ests_count);
599             have_something = true;
600         }
601         if (!short_reads.empty()) {
602             sort(short_reads.begin(),short_reads.end());
603             support_field->AddField("RNASeq",short_reads);
604             support_field->SetData().SetFields().back()->SetNum(short_reads.size());
605             have_something = true;
606         }
607         if (!long_sras.empty()) {
608             sort(long_sras.begin(),long_sras.end());
609             support_field->AddField("longSRA",long_sras);
610             support_field->SetData().SetFields().back()->SetNum(long_sras_count);
611             have_something = true;
612         }
613         if (!others.empty()) {
614             sort(others.begin(),others.end());
615             support_field->AddField("other",others);
616             support_field->SetData().SetFields().back()->SetNum(others_count);
617             have_something = true;
618         }
619         if (!unknown.empty()) {
620             support_field->AddField("unknown",unknown);
621             support_field->SetData().SetFields().back()->SetNum(unknown.size());
622             have_something = true;
623         }
624 
625         if (have_something)
626             user_obj->SetData().push_back(support_field);
627     }
628 
629     if(!model.ProteinHit().empty())
630         user_obj->AddField("BestTargetProteinHit",model.ProteinHit());
631     if(model.Status()&CGeneModel::eFullSupCDS)
632         user_obj->AddField("CDS support",string("full"));
633 
634     return user_obj;
635 }
636 
637 
create_internal_feature(const SModelData & md)638 CRef<CSeq_feat> CAnnotationASN1::CImplementationData::create_internal_feature(const SModelData& md)
639 {
640     const CAlignModel& model = md.model;
641 
642     CRef<CSeq_feat> feature(new CSeq_feat);
643 
644     CRef<CObject_id> obj_id( new CObject_id() );
645     CIdHandler::SetId(*obj_id, model.ID());
646     CRef<CFeat_id> feat_id( new CFeat_id() );
647     feat_id->SetLocal(*obj_id);
648     feature->SetIds().push_back(feat_id);
649 
650     CRef<CUser_object> user( new CUser_object);
651     user->SetClass("Gnomon");
652     user->SetType().SetStr("Model Internal Attributes");
653     feature->SetExts().push_back(user);
654 
655     if (model.Type() & (CGeneModel::eGnomon | CGeneModel::eChain))
656         user->AddField("Method", ModelMethod(model));
657 
658     TAttributes attributes;
659     CollectAttributes(model, attributes);
660     if (model.GetTargetId().NotEmpty())
661         attributes["Target"] = CIdHandler::ToString(*model.GetTargetId());
662     ITERATE (TAttributes, a, attributes) {
663         if (!a->second.empty())
664             user->AddField(a->first, a->second);
665     }
666 
667     if (model.Score() != BadScore())
668         user->AddField("cds_score", model.Score());
669 
670     const CCDSInfo& cds_info = model.GetCdsInfo();
671     TSignedSeqRange rcds_on_mrna = cds_info.MaxCdsLimits() & model.TranscriptLimits();
672     int frame = 0;
673     if(cds_info.HasStart())
674         rcds_on_mrna.SetFrom(cds_info.Start().GetFrom());
675     else
676         frame = (cds_info.Cds().GetFrom()-rcds_on_mrna.GetFrom())%3;
677 
678     if (cds_info.ReadingFrame().NotEmpty() && model.GetAlignMap().MapRangeOrigToEdited(model.Limits(),false).IntersectingWith(rcds_on_mrna)) {    // at least some CDS could be projected on genome
679         // create cdregion feature as there is no other place to show CDS on evidence alignment
680 
681         _ASSERT((model.Status()&CGeneModel::eReversed) == 0);  // needs to be changed if not true
682 
683         feature->SetLocation(*create_packed_int_seqloc(model,rcds_on_mrna));
684         CRef<CGenetic_code::C_E> cdrcode(new CGenetic_code::C_E);
685         cdrcode->SetId(1);
686         feature->SetData().SetCdregion().SetCode().Set().push_back(cdrcode);
687 
688         CCdregion::EFrame ncbi_frame = CCdregion::eFrame_one;
689         if(frame == 1) ncbi_frame = CCdregion::eFrame_two;
690         else if(frame == 2) ncbi_frame = CCdregion::eFrame_three;
691         feature->SetData().SetCdregion().SetFrame(ncbi_frame);
692     } else {
693         feature->SetLocation(*create_packed_int_seqloc(model,model.TranscriptLimits()));
694         feature->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
695     }
696 
697     return feature;
698 }
699 
700 //CRef<CSpliced_exon> CAnnotationASN1::CImplementationData::spliced_exon (const CModelExon& e, EStrand strand) const
spliced_exon(const CModelExon & e,EStrand strand)701 CRef<CSpliced_exon> spliced_exon (const CModelExon& e, EStrand strand)
702 {
703     CRef<CSpliced_exon> se(new CSpliced_exon());
704 
705     if(e.Limits().NotEmpty()) { // normal exon
706         se->SetGenomic_start(e.GetFrom());
707         se->SetGenomic_end(e.GetTo());
708     } else {    // gap filler
709         CRef<CSeq_id> fillerid = CIdHandler::ToSeq_id(e.m_source.m_acc);
710         se->SetGenomic_id(*fillerid);
711         se->SetGenomic_strand(e.m_source.m_strand==ePlus?eNa_strand_plus:eNa_strand_minus);
712         se->SetGenomic_start(e.m_source.m_range.GetFrom());
713         se->SetGenomic_end(e.m_source.m_range.GetTo());
714     }
715 
716     if(e.m_ident > 0) {
717         CRef<CScore> scr(new CScore);
718         scr->SetValue().SetReal(e.m_ident);
719         CRef<CObject_id> id(new CObject_id());
720         id->SetStr("idty");
721         scr->SetId(*id);
722         se->SetScores().Set().push_back(scr);
723     }
724 
725     if (e.m_fsplice) {
726         //        _ASSERT(e.m_fsplice_sig.length() == 2);
727         if (strand==ePlus) {
728             se->SetAcceptor_before_exon().SetBases(e.m_fsplice_sig);
729         } else {
730             se->SetDonor_after_exon().SetBases(e.m_fsplice_sig);
731         }
732     }
733     if (e.m_ssplice) {
734         //        _ASSERT(e.m_ssplice_sig.length() == 2);
735         if (strand==ePlus) {
736             se->SetDonor_after_exon().SetBases(e.m_ssplice_sig);
737         } else {
738             se->SetAcceptor_before_exon().SetBases(e.m_ssplice_sig);
739         }
740     }
741     return se;
742 }
743 
744 
NucPosToProtPos(int nultripos)745 CRef<CProduct_pos> NucPosToProtPos(int nultripos) {
746     CRef<CProduct_pos> pos(new CProduct_pos);
747     pos->SetProtpos().SetFrame(nultripos%3 + 1);
748     pos->SetProtpos().SetAmin(nultripos/3);
749     return pos;
750 }
751 
AlignModelToSeqalign(const CAlignModel & model,CSeq_id & mrnaid,CSeq_id & contigid,bool is_align,bool is_protalign,bool stop_found)752 CRef<CSeq_align> AlignModelToSeqalign(const CAlignModel& model, CSeq_id& mrnaid, CSeq_id& contigid, bool is_align, bool is_protalign, bool stop_found) {
753 
754     CRef< CSeq_align > seq_align( new CSeq_align );
755 
756     if(model.ID() > 0) {
757         CRef<CObject_id> id(new CObject_id());
758         CIdHandler::SetId(*id, model.ID());
759         seq_align->SetId().push_back(id);
760     }
761 
762     seq_align->SetType(CSeq_align::eType_partial);
763     CSpliced_seg& spliced_seg = seq_align->SetSegs().SetSpliced();
764 
765     if(is_protalign) {
766         spliced_seg.SetProduct_type(CSpliced_seg::eProduct_type_protein);
767         int product_length = model.TargetLen()/3;
768         if(stop_found)
769             --product_length;
770         spliced_seg.SetProduct_length(product_length);
771     } else {
772         spliced_seg.SetProduct_type(CSpliced_seg::eProduct_type_transcript);
773         spliced_seg.SetProduct_length(model.TargetLen());
774     }
775     if (model.Status() & CAlignModel::ePolyA) {
776         spliced_seg.SetPoly_a((model.Status() & CAlignModel::eReversed)? model.PolyALen() - 1 : model.TargetLen() - model.PolyALen());
777     }
778 
779     spliced_seg.SetProduct_id(mrnaid);
780     spliced_seg.SetGenomic_id(contigid);
781     spliced_seg.SetProduct_strand((model.Status() & CGeneModel::eReversed)==0 ? eNa_strand_plus : eNa_strand_minus);
782     spliced_seg.SetGenomic_strand(model.Strand()==ePlus?eNa_strand_plus:eNa_strand_minus);
783 
784     CSpliced_seg::TExons& exons = spliced_seg.SetExons();
785 
786     TInDels indels = (is_protalign ? model.GetInDels(false) : model.FrameShifts());
787     TInDels::const_iterator indel_i = indels.begin();
788     for (size_t i=0; i < model.Exons().size(); ++i) {
789         const CModelExon *e = &model.Exons()[i];
790 
791         CRef<CSpliced_exon> se = spliced_exon(*e,model.Strand());
792 
793         TSignedSeqRange transcript_exon = model.TranscriptExon(i);
794         if(is_protalign) {
795             se->SetProduct_start(*NucPosToProtPos(transcript_exon.GetFrom()));
796             se->SetProduct_end(*NucPosToProtPos(transcript_exon.GetTo()));
797         } else {
798             se->SetProduct_start().SetNucpos(transcript_exon.GetFrom());
799             se->SetProduct_end().SetNucpos(transcript_exon.GetTo());
800         }
801 
802         if(e->Limits().NotEmpty()) {    // normal exon
803             int last_chunk = e->GetFrom();
804             for( ;indel_i != indels.end() && indel_i->Loc() <= e->GetTo()+1; ++indel_i) {
805                 const CInDelInfo& indel = *indel_i;
806                 _ASSERT( e->GetFrom() <= indel.Loc() );
807 
808                 if (indel.Loc()-last_chunk > 0) {
809                     CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
810                     if(is_align) {
811                         if(is_protalign)
812                             chunk->SetDiag(indel.Loc()-last_chunk);
813                         else
814                             chunk->SetMatch(indel.Loc()-last_chunk);
815                     } else
816                         chunk->SetMatch(indel.Loc()-last_chunk);
817                     se->SetParts().push_back(chunk);
818                     last_chunk = indel.Loc();
819                 }
820 
821                 if(indel.IsMismatch()) {
822                     _ASSERT(!is_protalign);
823                     CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
824                     chunk->SetMismatch(indel.Len());
825                     se->SetParts().push_back(chunk);
826 
827                     last_chunk += indel.Len();
828                 } else if (indel.IsInsertion()) {
829                     CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
830                     chunk->SetGenomic_ins(indel.Len());
831                     se->SetParts().push_back(chunk);
832 
833                     last_chunk += indel.Len();
834                 } else {
835                     CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
836                     chunk->SetProduct_ins(indel.Len());
837                     se->SetParts().push_back(chunk);
838                 }
839             }
840             if (e->GetFrom() <= last_chunk && last_chunk <= e->GetTo()) {
841                 CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
842                 if(is_align) {
843                     if(is_protalign)
844                         chunk->SetDiag(e->GetTo()-last_chunk+1);
845                     else
846                         chunk->SetMatch(e->GetTo()-last_chunk+1);
847                 } else
848                     chunk->SetMatch(e->GetTo()-last_chunk+1);
849                 se->SetParts().push_back(chunk);
850             }
851         }  else  {   // gap filler
852             CRef< CSpliced_exon_chunk > chunk(new CSpliced_exon_chunk);
853             if(is_align) {
854                 if(is_protalign)
855                     chunk->SetDiag(e->m_source.m_range.GetLength());
856                 else
857                     chunk->SetMatch(e->m_source.m_range.GetLength());
858             } else
859                 chunk->SetMatch(e->m_source.m_range.GetLength());
860             se->SetParts().push_back(chunk);
861         }
862 
863         exons.push_back(se);
864     }
865     _ASSERT( indel_i == indels.end() );
866 
867     if (model.Strand() == eMinus) {
868         //    reverse if minus strand (don't forget to reverse chunks)
869         exons.reverse();
870         NON_CONST_ITERATE(CSpliced_seg::TExons, exon_i, exons) {
871             CSpliced_exon& se = **exon_i;
872             if (se.IsSetParts())
873                 se.SetParts().reverse();
874         }
875     }
876 
877     if(!model.FrameShifts().empty() && !is_protalign) {
878         TSignedSeqRange tlim = model.TranscriptLimits();
879         int left_not_aligned = tlim.GetFrom();
880         int right_not_aligned = model.TargetLen()-tlim.GetTo()-1;
881         if(model.Strand() == eMinus)
882             swap(left_not_aligned, right_not_aligned);
883 
884         string mismatches(left_not_aligned, 'N'); // will include mismatches and deletions
885         string mismstatus;                        // will include status for mismatches and indels
886         TInDels::const_iterator indl = model.FrameShifts().begin();
887         for(int e = 0; e < (int)model.Exons().size(); ++e) {
888             if(model.Exons()[e].Limits().Empty())
889                 continue;
890             if(e > 0 && !model.Exons()[e].m_fsplice) {
891                 int len;
892                 if(model.Orientation() == ePlus)
893                     len = model.TranscriptExon(e).GetFrom()-model.TranscriptExon(e-1).GetTo()-1;
894                 else
895                     len = model.TranscriptExon(e-1).GetFrom()-model.TranscriptExon(e).GetTo()-1;
896                 mismatches += string(len, 'N');
897             }
898             for( ; indl != model.FrameShifts().end() && indl->IntersectingWith(model.Exons()[e].GetFrom(), model.Exons()[e].GetTo()); ++indl) {
899                 switch(indl->GetStatus()) {
900                 case CInDelInfo::eGenomeNotCorrect:
901                     mismstatus.push_back('n'); break;
902                 case CInDelInfo::eGenomeCorrect:
903                     mismstatus.push_back('c'); break;
904                 default:
905                     mismstatus.push_back('u'); break;
906                 }
907                 if(!indl->IsInsertion())
908                     mismatches += indl->GetInDelV();
909             }
910         }
911         mismatches += string(right_not_aligned, 'N');
912 
913         if(mismatches.find_first_not_of('N') != string::npos) {
914             CRef<CUser_object> userm( new CUser_object);
915             CRef<CObject_id> typem(new CObject_id);
916             typem->SetStr("MismatchedBases");
917             userm->SetType(*typem);
918             userm->AddField("Bases", mismatches);
919             seq_align->SetExt().push_back(userm);
920         }
921 
922         if(mismstatus.find_first_not_of('u') != string::npos) {
923             CRef<CUser_object> users( new CUser_object);
924             CRef<CObject_id> types(new CObject_id);
925             types->SetStr("MismatchedBasesStatus");
926             users->SetType(*types);
927             users->AddField("Status", mismstatus);
928             seq_align->SetExt().push_back(users);
929         }
930     }
931 
932     return seq_align;
933 }
934 
MakeSeqAlign(const string & contig) const935 CRef<objects::CSeq_align> CAlignModel::MakeSeqAlign(const string& contig) const {
936 
937     bool is_align = !(Type()&(eChain|eGnomon));
938 
939     CAlignModel model = *this;
940     if(is_align && (Type()&eProt) && HasStop()) {
941         _ASSERT(GetCdsInfo().IsMappedToGenome());
942         TSignedSeqRange lim = GetCdsInfo().Start()+GetCdsInfo().ReadingFrame();
943         model.Clip(lim,eRemoveExons);   // protein alignments doesn't include stop
944     }
945 
946     CRef<CSeq_id> contig_sid(CIdHandler::ToSeq_id(contig));
947     CRef<CSeq_id> mrna_sid(new CSeq_id);
948     mrna_sid->Assign(*GetTargetId());
949     CRef<CSeq_align> seq_align = AlignModelToSeqalign(model, *mrna_sid, *contig_sid, is_align, is_align && (Type()&eProt), is_align && (Type()&eProt) && HasStop());
950 
951     if(is_align) {
952         CSpliced_seg& spliced_seg = seq_align->SetSegs().SetSpliced();
953 
954         if(Ident() > 0) {
955             int matches = Ident()*seq_align->GetAlignLength()+0.5;
956             seq_align->SetNamedScore("matches", matches);
957         }
958 
959         if(Status()&eBestPlacement)
960             seq_align->SetNamedScore("rank", 1);
961 
962         if(Status()&eUnknownOrientation)
963             seq_align->SetNamedScore("ambiguous_orientation", 1);
964 
965         if(Weight() > 1)
966             seq_align->SetNamedScore("count", int(Weight()+0.5));
967 
968         if(Type()&eProt) {
969             if(HasStart()) {
970                 CRef<CSpliced_seg_modifier> modi(new CSpliced_seg_modifier);
971                 modi->SetStart_codon_found(true);
972                 spliced_seg.SetModifiers().push_back(modi);
973             }
974             if(HasStop()) {
975                 CRef<CSpliced_seg_modifier> modi(new CSpliced_seg_modifier);
976                 modi->SetStop_codon_found(true);
977                 spliced_seg.SetModifiers().push_back(modi);
978             }
979         }
980     }
981 
982     return seq_align;
983 }
984 
model2spliced_seq_align(SModelData & md)985 CRef<CSeq_align> CAnnotationASN1::CImplementationData::model2spliced_seq_align(SModelData& md)
986 {
987     const CAlignModel& model = md.model;
988 
989     CRef<CSeq_align> seq_align = AlignModelToSeqalign(model, *md.mrna_sid, *contig_sid, false, false, false);
990 
991     CRef<CUser_object> user( new CUser_object);
992     user->SetClass("Gnomon");
993     CRef< CObject_id > type(new CObject_id);
994     type->SetStr("AlignmentAttributes");
995     user->SetType(*type);
996     seq_align->SetExt().push_back(user);
997     user->AddField(kGnomonConstructed, true);
998 
999 #ifdef _DEBUG
1000     try {
1001         seq_align->Validate(true);
1002     } catch (...) {
1003         _ASSERT(false);
1004     }
1005 //     try {
1006 //         CNcbiOstrstream ostream;
1007 //         ostream << MSerial_AsnBinary << *seq_align;
1008 //     } catch (...) {
1009 //         _ASSERT(false);
1010 //     }
1011 #endif
1012 
1013     return seq_align;
1014 }
1015 
GetModelEvidenceUserObject(const CSeq_feat_Handle & feat)1016 CRef<CUser_object> GetModelEvidenceUserObject(const CSeq_feat_Handle& feat)
1017 {
1018     ITERATE(CSeq_feat::TExts, uo, feat.GetSeq_feat()->GetExts()) {
1019         if ((*uo)->GetType().GetStr() == "ModelEvidence" ) {
1020             return *uo;
1021         }
1022     }
1023     return CRef<CUser_object>();
1024 }
1025 
RestoreModelMethod(const CSeq_feat_Handle & feat,CAlignModel & model)1026 void RestoreModelMethod(const CSeq_feat_Handle& feat, CAlignModel& model)
1027 {
1028     const CUser_object& user = *feat.GetSeq_feat()->GetExts().front();
1029     _ASSERT( user.GetType().GetStr() == "Model Internal Attributes" || user.GetType().GetStr() == "ModelEvidence" );
1030     if (user.HasField("Method")) {
1031         string method = user.GetField("Method").GetData().GetStr();
1032         if (method.find("AbInitio") != string::npos || method.find("Gnomon") != string::npos)
1033             model.SetType(CGeneModel::eGnomon);
1034         else if (method.find("Chainer") != string::npos)
1035             model.SetType(CGeneModel::eChain);
1036     }
1037 }
1038 
RestoreModelAttributes(const CSeq_feat_Handle & feat_handle,CAlignModel & model)1039 void RestoreModelAttributes(const CSeq_feat_Handle& feat_handle, CAlignModel& model)
1040 {
1041     TAttributes attributes;
1042 
1043     _ASSERT(feat_handle);
1044 
1045     const CUser_object& user = *feat_handle.GetOriginalSeq_feat()->GetExts().front();
1046     _ASSERT( user.GetClass() == "Gnomon" );
1047     _ASSERT( user.GetType().GetStr() == "Model Internal Attributes" );
1048 
1049 
1050     model.SetType(0);
1051     RestoreModelMethod(feat_handle, model);
1052 
1053     ITERATE(CUser_object::TData, f, user.GetData()) {
1054         const CUser_field& fld = **f;
1055         if (fld.GetData().IsStr())
1056             attributes[fld.GetLabel().GetStr()] = fld.GetData().GetStr();
1057         else if (fld.GetLabel().GetStr() == "cds_score")
1058             attributes["cds_score"] = NStr::DoubleToString(fld.GetData().GetReal());
1059     }
1060 
1061     ParseAttributes(attributes, model);
1062 
1063     if (attributes.find("cds_score") != attributes.end()) {
1064         double score = NStr::StringToDouble(attributes["cds_score"]);
1065         CCDSInfo cds_info = model.GetCdsInfo();
1066         cds_info.SetScore(score, cds_info.OpenCds());
1067         model.SetCdsInfo(cds_info);
1068     }
1069 
1070     if(model.GetCdsInfo().ReadingFrame().NotEmpty()) {
1071         CCDSInfo cds_info_t = model.GetCdsInfo();
1072         CCDSInfo cds_info_g = cds_info_t.MapFromEditedToOrig(model.GetAlignMap());
1073         if(cds_info_g.ReadingFrame().NotEmpty())   // successful projection
1074             model.SetCdsInfo(cds_info_g);
1075     }
1076 }
1077 
RestoreModelReadingFrame(const CSeq_feat_Handle & feat,CAlignModel & model)1078 void RestoreModelReadingFrame(const CSeq_feat_Handle& feat, CAlignModel& model)
1079 {
1080     if (feat && feat.GetFeatType() == CSeqFeatData::e_Cdregion) {
1081         TSeqRange cds_range = feat.GetLocation().GetTotalRange();
1082         TSignedSeqRange rf = TSignedSeqRange(cds_range.GetFrom(), cds_range.GetTo());
1083         if (!feat.GetLocation().GetId()->Match(*CIdHandler::GnomonMRNA(model.ID()))) {
1084             rf =  model.GetAlignMap().MapRangeOrigToEdited(rf, false);
1085         }
1086 
1087         if (feat.GetData().GetCdregion().CanGetFrame()) {
1088 
1089             CCdregion::EFrame ncbi_frame = feat.GetData().GetCdregion().GetFrame();
1090             int phase = 0;
1091             switch (ncbi_frame) {
1092             case CCdregion::eFrame_not_set:
1093             case CCdregion::eFrame_one:
1094                 phase = 0;
1095                 break;
1096             case CCdregion::eFrame_two:
1097                 phase = 1;
1098                 break;
1099             case CCdregion::eFrame_three:
1100                 phase = 2;
1101                 break;
1102             default:
1103                 _ASSERT( false);
1104             }
1105 
1106             bool notreversed = (model.Status()&CGeneModel::eReversed) == 0;
1107             if(notreversed) {
1108                 rf.SetFrom(rf.GetFrom()+phase);
1109                 rf.SetTo(rf.GetTo()-rf.GetLength()%3);
1110             } else {
1111                 rf.SetTo(rf.GetTo()-phase);
1112                 rf.SetFrom(rf.GetFrom()+rf.GetLength()%3);
1113             }
1114         }
1115 
1116         CCDSInfo cds_info(false);  // mrna coordinates; will be projected to genome at the end if possible
1117         cds_info.SetReadingFrame(rf, false);
1118         model.SetCdsInfo(cds_info);
1119     }
1120 }
1121 
RestoreModel(const CSeq_feat_Handle & internal_feat,const CSeq_feat_Handle & cds_feat,const CSeq_align & align)1122 CAlignModel* RestoreModel(const CSeq_feat_Handle& internal_feat, const CSeq_feat_Handle& cds_feat, const CSeq_align& align)
1123 {
1124     CAlignModel* model = new CAlignModel(align);
1125 
1126     RestoreModelReadingFrame(cds_feat, *model);
1127 
1128     RestoreModelAttributes(internal_feat, *model);
1129 
1130     return model;
1131 }
1132 
RestoreModelFromPublicMrnaFeature(const CSeq_feat_Handle & feat)1133 CAlignModel* RestoreModelFromPublicMrnaFeature(const CSeq_feat_Handle& feat)
1134 {
1135     CScope& scope = feat.GetScope();
1136     CBioseq_Handle mrna_handle = scope.GetBioseqHandle(*feat.GetProduct().GetId());
1137     CConstRef<CBioseq> mrna = mrna_handle.GetCompleteBioseq();
1138     _ASSERT(mrna->IsNa());
1139 
1140     const CSeq_align& align = *mrna->GetInst().GetHist().GetAssembly().front();
1141     const CObject_id& obj_id = *align.GetId().back();
1142 
1143     CFeat_CI cds_feat(mrna_handle);
1144     while (cds_feat && !cds_feat->GetOriginalFeature().GetData().IsCdregion())
1145         ++cds_feat;
1146 
1147     const CTSE_Handle& tse_handle = feat.GetAnnot().GetTSE_Handle();
1148     CSeq_feat_Handle feat_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Rna, obj_id);
1149     if (!feat_handle) {
1150         feat_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Cdregion, obj_id);
1151     }
1152 
1153     return RestoreModel(feat_handle, *cds_feat, align);
1154 }
1155 
RestoreModelFromInternalGnomonFeature(const CSeq_feat_Handle & feat)1156 CAlignModel* RestoreModelFromInternalGnomonFeature(const CSeq_feat_Handle& feat)
1157 {
1158     Int8 id = CIdHandler::GetId(feat.GetOriginalSeq_feat()->GetIds().front()->GetLocal());
1159 
1160     CScope& scope = feat.GetScope();
1161     CConstRef<CSeq_id> mrna_seq_id = CIdHandler::GnomonMRNA(id);
1162     CBioseq_Handle mrna_handle = scope.GetBioseqHandle(*mrna_seq_id);
1163     if (!mrna_handle)
1164         return NULL;
1165     CConstRef<CBioseq> mrna = mrna_handle.GetCompleteBioseq();
1166     _ASSERT(mrna->IsNa());
1167 
1168     const CSeq_align& align = *mrna->GetInst().GetHist().GetAssembly().front();
1169 
1170     return RestoreModel(feat, feat, align);
1171 }
1172 
IsGnomonConstructed(const CSeq_align & seq_align)1173 bool IsGnomonConstructed(const CSeq_align& seq_align)
1174 {
1175     if (seq_align.CanGetExt()) {
1176         CSeq_align::TExt ext = seq_align.GetExt();
1177         ITERATE(CSeq_align::TExt, u, ext) {
1178             if ((*u)->CanGetClass() && (*u)->GetClass() == "Gnomon" &&
1179                 (*u)->HasField(kGnomonConstructed) &&
1180                 (*u)->GetField(kGnomonConstructed).GetData().GetBool())
1181                 return true;
1182         }
1183     }
1184     return false;
1185 }
1186 
ExtractSupportModels(Int8 model_id,TAlignModelList & evidence_models,list<CRef<CSeq_align>> & evidence_alignments,CSeq_entry_Handle seq_entry_handle,map<string,CRef<CSeq_annot>> & seq_annot_map,set<Int8> & processed_ids)1187 void ExtractSupportModels(Int8 model_id,
1188                           TAlignModelList& evidence_models, list<CRef<CSeq_align> >& evidence_alignments,
1189                           CSeq_entry_Handle seq_entry_handle, map<string, CRef<CSeq_annot> >& seq_annot_map,
1190                           set<Int8>& processed_ids)
1191 {
1192     map<string, CRef<CSeq_annot> >::iterator annot = seq_annot_map.find("Evidence for "+CIdHandler::ToString(*CIdHandler::GnomonMRNA(model_id)));
1193     if (annot == seq_annot_map.end())
1194         return;
1195 
1196     CSeq_annot::TData::TAlign aligns = annot->second->SetData().SetAlign();
1197 
1198     NON_CONST_ITERATE (CSeq_annot::TData::TAlign, align_ci, aligns) {
1199         CSeq_align& seq_align = **align_ci;
1200         const CObject_id& obj_id = *seq_align.GetId().back();
1201         Int8 id = CIdHandler::GetId(obj_id);
1202         if (!processed_ids.insert(id).second) // already there
1203             continue;
1204 
1205         const CTSE_Handle& tse_handle = seq_entry_handle.GetTSE_Handle();
1206         CSeq_feat_Handle feat_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Rna, obj_id);
1207         if (!feat_handle)
1208             feat_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Cdregion, obj_id);
1209 
1210         CSeq_feat_Handle cds_handle = tse_handle.GetFeatureWithId(CSeqFeatData::e_Cdregion, "cds."+NStr::NumericToString(id));
1211         if(!cds_handle)
1212             cds_handle = feat_handle;
1213 
1214         auto_ptr<CAlignModel> model( RestoreModel(feat_handle, cds_handle, seq_align) );
1215         evidence_models.push_back(*model);
1216 
1217         ExtractSupportModels(id, evidence_models, evidence_alignments, seq_entry_handle, seq_annot_map, processed_ids);
1218 
1219         if (IsGnomonConstructed(seq_align))
1220             continue;
1221 
1222         CRef<CSeq_align> align_ref(&seq_align);
1223         evidence_alignments.push_back(align_ref);
1224     }
1225 }
1226 
ExtractModels(objects::CSeq_entry & seq_entry,TAlignModelList & model_list,TAlignModelList & evidence_models,list<CRef<CSeq_align>> & evidence_alignments,map<Int8,CRef<CUser_object>> & model_evidence_uo)1227 string CAnnotationASN1::ExtractModels(objects::CSeq_entry& seq_entry,
1228                                       TAlignModelList& model_list,
1229                                       TAlignModelList& evidence_models,
1230                                       list<CRef<CSeq_align> >& evidence_alignments,
1231                                       map<Int8, CRef<CUser_object> >& model_evidence_uo)
1232 {
1233     CScope scope(*CObjectManager::GetInstance());
1234     CSeq_entry_Handle seq_entry_handle = scope.AddTopLevelSeqEntry(seq_entry);
1235 
1236     map<string, CRef<CSeq_annot> > seq_annot_map;
1237 
1238     string contig;
1239 
1240     ITERATE(CBioseq_set::TAnnot, annot, seq_entry.SetSet().SetAnnot()) {
1241         CAnnot_descr::Tdata::const_iterator iter = (*annot)->GetDesc().Get().begin();
1242         string name;
1243         string region;
1244         for ( ;  iter != (*annot)->GetDesc().Get().end();  ++iter) {
1245             if ((*iter)->IsName() ) {
1246                 name = (*iter)->GetName();
1247             }
1248             if ((*iter)->IsRegion() ) {
1249                 region = CIdHandler::ToString(*(*iter)->GetRegion().GetId());
1250             }
1251         }
1252         if (!name.empty()) {
1253             seq_annot_map[name] = *annot;
1254             if (name=="Gnomon models") {
1255                 contig = region;
1256             }
1257         }
1258     }
1259 
1260     CRef<CSeq_annot> feature_table = seq_annot_map["Gnomon models"];
1261     _ASSERT( feature_table.NotEmpty() );
1262     _ASSERT( feature_table->IsFtable() );
1263 
1264     CRef<CSeq_annot> internal_feature_table = seq_annot_map["Gnomon internal attributes"];
1265     _ASSERT( internal_feature_table.NotEmpty() );
1266     _ASSERT( internal_feature_table->IsFtable() );
1267 
1268     SAnnotSelector sel;
1269     sel.SetFeatType(CSeqFeatData::e_Rna);
1270     CFeat_CI feat_ci(scope.GetSeq_annotHandle(*feature_table), sel);
1271 
1272     if (contig.empty() && feat_ci) {
1273         contig = CIdHandler::ToString(*feat_ci->GetLocation().GetId());
1274     }
1275 
1276     set<Int8> processed_ids;
1277     for (; feat_ci; ++feat_ci) {
1278         auto_ptr<CAlignModel> model( RestoreModelFromPublicMrnaFeature(feat_ci->GetSeq_feat_Handle()) );
1279         model_list.push_back(*model);
1280         processed_ids.insert(model->ID());
1281         ExtractSupportModels(model->ID(), evidence_models, evidence_alignments, seq_entry_handle, seq_annot_map, processed_ids);
1282         model_evidence_uo[model->ID()] = GetModelEvidenceUserObject(feat_ci->GetSeq_feat_Handle());
1283     }
1284 
1285     CFeat_CI internal_feat_ci(scope.GetSeq_annotHandle(*internal_feature_table));
1286     for (; internal_feat_ci; ++internal_feat_ci) {
1287         Int8 id = CIdHandler::GetId(internal_feat_ci->GetOriginalFeature().GetIds().front()->GetLocal());
1288         if (processed_ids.find(id) != processed_ids.end()) // already there
1289             continue;
1290         auto_ptr<CAlignModel> model( RestoreModelFromInternalGnomonFeature(internal_feat_ci->GetSeq_feat_Handle()) );
1291         if (model.get() == NULL)
1292             continue;
1293         evidence_models.push_back(*model);
1294         processed_ids.insert(id);
1295         ExtractSupportModels(model->ID(), evidence_models, evidence_alignments, seq_entry_handle, seq_annot_map, processed_ids);
1296     }
1297 
1298 
1299     return contig;
1300 }
1301 
1302 END_SCOPE(gnomon)
1303 END_NCBI_SCOPE
1304