1 /*  $Id: gnomon_objmgr.cpp 477875 2015-09-02 14:23:40Z souvorov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Mike DiCuccio
27  *
28  * File Description: gnomon library parts requiring object manager support
29  * to allow apps not using these to be linked w/o object manager libs
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <algo/gnomon/gnomon.hpp>
35 #include <algo/gnomon/chainer.hpp>
36 #include <algo/gnomon/gnomon_exception.hpp>
37 #include <algo/gnomon/id_handler.hpp>
38 #include "hmm.hpp"
39 #include "hmm_inlines.hpp"
40 
41 #include <objects/seqloc/Seq_loc.hpp>
42 #include <objects/seqloc/Seq_id.hpp>
43 #include <objects/seqloc/Packed_seqint.hpp>
44 #include <objects/seqloc/Seq_interval.hpp>
45 #include <objects/seqfeat/Seq_feat.hpp>
46 #include <objects/seqfeat/RNA_ref.hpp>
47 #include <objects/seqfeat/Cdregion.hpp>
48 #include <objmgr/util/sequence.hpp>
49 #include <objects/seqalign/seqalign__.hpp>
50 #include <objects/general/general__.hpp>
51 #include <objmgr/object_manager.hpp>
52 
53 #include <stdio.h>
54 
55 #include <functional>
56 
57 BEGIN_NCBI_SCOPE
58 BEGIN_SCOPE(gnomon)
59 USING_SCOPE(ncbi::objects);
60 
debug()61 void debug()
62 {
63 }
64 
GetModelId(const CSeq_align & seq_align)65 Int8 GetModelId(const CSeq_align& seq_align)
66 {
67     Int8 id = 0;
68     if(seq_align.CanGetId())
69         seq_align.GetId().back()->GetIdType(id);  // 0 if no integer id
70 
71     return id;
72     //    return CIdHandler::GetId(*seq_align.GetId().back());
73 }
74 
CAlignModel(const CSeq_align & seq_align)75 CAlignModel::CAlignModel(const CSeq_align& seq_align) :
76     CGeneModel(seq_align.GetSegs().GetSpliced().GetGenomic_strand()==eNa_strand_minus?eMinus:ePlus, GetModelId(seq_align), emRNA)
77 {
78 #ifdef _DEBUG
79     debug();
80 #endif
81 
82     const CSpliced_seg& sps = seq_align.GetSegs().GetSpliced();
83 
84     bool is_protein = false;
85     if(sps.CanGetProduct_type() && sps.GetProduct_type()==CSpliced_seg::eProduct_type_protein) {
86         SetType(eProt);
87         is_protein = true;
88     }
89 
90     bool is_product_reversed = false;
91     if(sps.CanGetProduct_strand() && sps.GetProduct_strand()==eNa_strand_minus) {
92         Status() |= CGeneModel::eReversed;
93         is_product_reversed = true;
94     }
95 
96     SetTargetId(sps.GetProduct_id());
97 
98     int product_len = sps.CanGetProduct_length()?sps.GetProduct_length():0;
99     if (is_protein)
100         product_len *=3;
101     int prod_prev = -1;
102     bool prev_3_prime_splice = false;
103 
104     int target_len = product_len;
105 
106     vector<TSignedSeqRange> transcript_exons;
107     TInDels indels;
108 
109     if(!sps.CanGetGenomic_id())
110         NCBI_THROW(CGnomonException, eGenericError, "CSpliced_seg must have genomic id");
111 
112     const CSeq_id& genomic_id = sps.GetGenomic_id();
113 
114     string mismatches;
115     string mismstatus;
116 
117     if(seq_align.CanGetExt()) {
118         int count = 0;
119         ITERATE(CSeq_align::TExt, i, seq_align.GetExt()) {
120             if((*i)->IsSetType() && (*i)->GetType().IsStr()) {
121                 string type = (*i)->GetType().GetStr();
122                 if(type == "RNASeq-Counts") {
123                     ITERATE(CUser_object::TData, j, (*i)->GetData()) {
124                         if(*j && (*j)->CanGetLabel() && (*j)->GetLabel().IsStr()) {
125                             string label = (*j)->GetLabel().GetStr();
126                             if(NStr::EndsWith(label, "alignments") || label.find(' ') == string::npos)
127                                 count += (*j)->GetData().GetInt();
128                         }
129                     }
130                 } else if(type == "MismatchedBases") {
131                     mismatches = (*i)->GetData().front()->GetData().GetStr();
132                 } else if(type == "MismatchedBasesStatus") {
133                     mismstatus = (*i)->GetData().front()->GetData().GetStr();
134                 }
135             }
136         }
137         if(count > 0)
138             SetWeight(count);
139         if(Strand() == eMinus) {
140             reverse(mismatches.begin(),mismatches.end());
141             reverse(mismstatus.begin(),mismstatus.end());
142         }
143     }
144 
145 #ifdef _DEBUG
146     bool ggap_model= false;
147 #endif
148 
149     ITERATE(CSpliced_seg::TExons, e_it, sps.GetExons()) {
150         const CSpliced_exon& exon = **e_it;
151         int prod_cur_start = exon.GetProduct_start().AsSeqPos();
152         int prod_cur_end = exon.GetProduct_end().AsSeqPos();
153         if (is_product_reversed) {
154             int tmp = prod_cur_start;
155             prod_cur_start = product_len - prod_cur_end -1;
156             prod_cur_end = product_len - tmp -1;
157         }
158         int nuc_cur_start = exon.GetGenomic_start();
159         int nuc_cur_end = exon.GetGenomic_end();
160 
161         //        bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases() && exon.GetAcceptor_before_exon().GetBases().size()==2;
162         bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases();
163 
164         if (prod_prev+1 != prod_cur_start || !prev_3_prime_splice || !cur_5_prime_splice) {
165             AddHole();
166             if(!mismatches.empty())   // also will take care about first exon
167                 mismatches = mismatches.substr(prod_cur_start - prod_prev -1);
168         }
169 
170         //        prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases() && exon.GetDonor_after_exon().GetBases().size()==2;
171         prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases();
172 
173         string fs, ss;
174         if(exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases()) {
175             fs = exon.GetAcceptor_before_exon().GetBases();
176         }
177         if(exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases()) {
178             ss = exon.GetDonor_after_exon().GetBases();
179         }
180         if(Strand() == eMinus) {
181             swap(fs,ss);
182         }
183 
184         double eident = 0;
185         if(exon.CanGetScores() && exon.GetScores().CanGet()) {
186             CScore_set::Tdata scores = exon.GetScores().Get();
187             ITERATE(CScore_set::Tdata, it, scores) {
188                 if((*it)->CanGetId() && (*it)->GetId().IsStr()) {
189                     if((*it)->GetId().GetStr() == "idty") {
190                         eident = (*it)->GetValue().GetReal();
191                         break;
192                     }
193                 }
194             }
195         }
196 
197         if(!exon.CanGetGenomic_id() || !(exon.GetGenomic_id() < genomic_id || genomic_id < exon.GetGenomic_id())) {  // normal exon
198             AddNormalExon(TSignedSeqRange(nuc_cur_start,nuc_cur_end), fs, ss, eident, Strand() == eMinus);
199         } else {  // genomic gap
200             if(!exon.CanGetGenomic_strand())
201                 NCBI_THROW(CGnomonException, eGenericError, "CSpliced_exon for gap filling must have genomic strand");
202             CConstRef<objects::CSeq_id> fill_id(&exon.GetGenomic_id());
203             CScope scope(*CObjectManager::GetInstance());
204             scope.AddDefaults();
205 
206             string transcript = GetDNASequence(fill_id,scope);
207             string fill_seq = transcript.substr(nuc_cur_start, nuc_cur_end-nuc_cur_start+1);
208 
209             CInDelInfo::SSource fill_src;
210             fill_src.m_acc = CIdHandler::ToString(*fill_id);
211             fill_src.m_strand = ePlus;
212             fill_src.m_range = TSignedSeqRange(nuc_cur_start, nuc_cur_end);
213 
214             if(exon.GetGenomic_strand() == eNa_strand_minus) {
215                 fill_src.m_strand = eMinus;
216                 ReverseComplement(fill_seq.begin(),fill_seq.end());
217             }
218 
219             AddGgapExon(eident, fill_seq, fill_src, Strand() == eMinus);
220 #ifdef _DEBUG
221             ggap_model = true;
222 #endif
223         }
224         transcript_exons.push_back(TSignedSeqRange(exon.GetProduct_start().AsSeqPos(), exon.GetProduct_end().AsSeqPos()));
225 
226         _ASSERT(transcript_exons.back().NotEmpty());
227 
228         int pos = 0;
229         int prod_pos = prod_cur_start;
230 
231         ITERATE(CSpliced_exon::TParts, p_it, exon.GetParts()) {
232             const CSpliced_exon_chunk& chunk = **p_it;
233             CInDelInfo::EStatus indelstatus = CInDelInfo::eUnknown;
234             if(!mismstatus.empty() && (chunk.IsProduct_ins() || chunk.IsGenomic_ins() || chunk.IsMismatch())) {
235                 if(mismstatus[0] == 'n')
236                     indelstatus = CInDelInfo::eGenomeNotCorrect;
237                 else if(mismstatus[0] == 'c')
238                     indelstatus = CInDelInfo::eGenomeCorrect;
239                 mismstatus = mismstatus.substr(1);
240             }
241             if (chunk.IsProduct_ins()) {
242                 string v = kEmptyStr;
243                 int product_ins = chunk.GetProduct_ins();
244                 if(!mismatches.empty()) {
245                     v = mismatches.substr(0,product_ins);
246                     mismatches = mismatches.substr(product_ins);
247                 }
248                 if(Strand() == eMinus)
249                     reverse(v.begin(),v.end());
250                 CInDelInfo fs(Strand()==ePlus?nuc_cur_start+pos:nuc_cur_end-pos+1, product_ins, CInDelInfo::eDel, v);
251                 if (Strand() == ePlus)
252                     indels.push_back(fs);
253                 else
254                     indels.insert(indels.begin(), fs);
255                 prod_pos += chunk.GetProduct_ins();
256             } else if (chunk.IsGenomic_ins()) {
257                 const int genomic_ins = chunk.GetGenomic_ins();
258                 if (Strand()==ePlus)
259                     indels.push_back(CInDelInfo(nuc_cur_start+pos, genomic_ins, CInDelInfo::eIns));
260                 else
261                     indels.insert(indels.begin(), CInDelInfo(nuc_cur_end-pos-genomic_ins+1, genomic_ins, CInDelInfo::eIns));
262                 pos += genomic_ins;
263             } else if (chunk.IsMatch()) {
264                 pos += chunk.GetMatch();
265                 prod_pos += chunk.GetMatch();
266             } else if (chunk.IsMismatch()) {
267                 int mismatch_len = chunk.GetMismatch();
268                 string v(mismatch_len,'N');
269                 if(!mismatches.empty()) {
270                     _ASSERT(mismatch_len <= (int)mismatches.length());
271                     v = mismatches.substr(0,mismatch_len);
272                     mismatches = mismatches.substr(mismatch_len);
273                 }
274                 if(Strand() == ePlus) {
275                     indels.push_back(CInDelInfo(nuc_cur_start+pos, mismatch_len, CInDelInfo::eMism, v));
276                 } else {
277                     reverse(v.begin(),v.end());
278                     indels.insert(indels.begin(), CInDelInfo(nuc_cur_end-pos-mismatch_len+1, mismatch_len, CInDelInfo::eMism, v));
279                 }
280                 pos += mismatch_len;
281                 prod_pos += mismatch_len;
282             } else { // if (chunk.IsDiag())
283                 pos += chunk.GetDiag();
284                 prod_pos += chunk.GetDiag();
285             }
286             if(indelstatus != CInDelInfo::eUnknown) {
287                 if(Strand() == ePlus)
288                     indels.back().SetStatus(indelstatus);
289                 else
290                     indels.front().SetStatus(indelstatus);
291             }
292         }
293 
294         prod_prev = prod_cur_end;
295     }
296 
297     _ASSERT(mismatches.empty() || (product_len - prod_prev - 1 == (int)mismatches.length()));
298 
299     sort(transcript_exons.begin(),transcript_exons.end());
300     bool minusstrand = Strand() == eMinus;
301     EStrand orientation = (is_product_reversed == minusstrand) ? ePlus : eMinus;
302     if(orientation == eMinus)
303        reverse(transcript_exons.begin(),transcript_exons.end());
304 
305     if (is_protein) {
306         _ASSERT(orientation == Strand());
307         if (sps.CanGetModifiers()) {
308             ITERATE(CSpliced_seg::TModifiers, m, sps.GetModifiers()) {
309                 if ((*m)->IsStop_codon_found()) {
310                     target_len += 3;
311                     if (Strand() == ePlus) {
312                         ExtendRight( 3 );
313                         _ASSERT((transcript_exons.back().GetTo()+1)%3 == 0);
314                         transcript_exons.back().SetTo(transcript_exons.back().GetTo()+3);
315                     } else {
316                         ExtendLeft( 3 );
317                         _ASSERT((transcript_exons.front().GetTo()+1)%3 == 0);
318                         transcript_exons.front().SetTo(transcript_exons.front().GetTo()+3);
319                     }
320                 }
321             }
322         }
323     }
324 
325     //convert tandem indels into indel+mism
326     bool keepdoing = true;
327     while(keepdoing) {
328         keepdoing = false;
329         NON_CONST_ITERATE(TInDels, indl, indels) {
330             TInDels::iterator indl_next = indl;
331             if(++indl_next == indels.end())
332                 break;
333 
334             if(indl->InDelEnd() == indl_next->Loc()) {  // tandem
335                 string new_seq = indl->GetInDelV()+indl_next->GetInDelV();
336                 int new_seq_len =  new_seq.size();
337                 if(indl->GetType() == indl_next->GetType()) {  // combine same
338                     *indl = CInDelInfo(indl->Loc(), indl->Len()+indl_next->Len(), indl->GetType(), new_seq);
339                     indels.erase(indl_next);
340                     keepdoing = true;
341                 } else if(!indl->IsMismatch() && !indl_next->IsMismatch()) { // tandem indels
342                     if(indl->Len() == indl_next->Len()) {
343                         *indl = CInDelInfo(indl->Loc(), indl->Len(), CInDelInfo::eMism, new_seq);
344                         indels.erase(indl_next);
345                     } else if(indl->Len() < indl_next->Len()) {
346                         *indl = CInDelInfo(indl->Loc(), indl->Len(), CInDelInfo::eMism, new_seq.substr(0,indl->Len()));
347                         *indl_next = CInDelInfo(indl->InDelEnd(), indl_next->Len()-indl->Len(), indl_next->GetType(), new_seq.substr(indl->Len()));
348                     } else {  // indl_next->Len() < indl->Len()
349                         *indl = CInDelInfo(indl->Loc(), indl->Len()-indl_next->Len(), indl->GetType(), new_seq.substr(0,new_seq_len-indl_next->Len()));
350                         *indl_next = CInDelInfo(indl->InDelEnd(), indl_next->Len(), CInDelInfo::eMism, new_seq.substr(new_seq_len-indl_next->Len()));
351                     }
352                     keepdoing = true;
353                 }
354             }
355         }
356     }
357 
358     m_alignmap = CAlignMap(Exons(), transcript_exons, indels, orientation, target_len );
359     FrameShifts() = indels;
360 
361     TSignedSeqRange newlimits = m_alignmap.ShrinkToRealPoints(Limits(),is_protein);
362     if(newlimits != Limits()) {
363         Clip(newlimits,CAlignModel::eRemoveExons);
364     }
365 
366     for (CGeneModel::TExons::const_iterator piece_begin = Exons().begin(); piece_begin != Exons().end(); ++piece_begin) {
367         _ASSERT( !piece_begin->m_fsplice );
368 
369         if(piece_begin->Limits().Empty()) {   // ggap
370             _ASSERT(piece_begin->m_ssplice);
371             ++piece_begin;
372             _ASSERT(piece_begin->Limits().NotEmpty());
373         }
374 
375         CGeneModel::TExons::const_iterator piece_end;
376         for (piece_end = piece_begin; piece_end != Exons().end() && piece_end->m_ssplice; ++piece_end) ;
377         _ASSERT( piece_end != Exons().end() );
378 
379         CGeneModel::TExons::const_iterator piece_end_g = piece_end;
380         if(piece_end_g->Limits().Empty()) {   // ggap
381             _ASSERT(piece_end_g->m_fsplice);
382             --piece_end_g;
383             _ASSERT(piece_end_g->Limits().NotEmpty());
384         }
385 
386         TSignedSeqRange piece_range(piece_begin->GetFrom(),piece_end_g->GetTo());
387 
388         piece_range = m_alignmap.ShrinkToRealPoints(piece_range, is_protein); // finds first projectable interval (on codon boundaries  for proteins)
389 
390         /*
391         TSignedSeqRange pr;
392         while(pr != piece_range) {
393             pr = piece_range;
394             ITERATE(TInDels, i, indels) { // here we check that no indels touch our interval from outside
395                 if((i->IsDeletion() && i->Loc() == pr.GetFrom()) || ((i->IsInsertion() || i->IsMismatch()) && i->Loc()+i->Len() == pr.GetFrom()))
396                     pr.SetFrom(pr.GetFrom()+1);
397                 else if(i->Loc() == pr.GetTo()+1)
398                     pr.SetTo(pr.GetTo()-1);
399             }
400             if(pr != piece_range)
401                 piece_range = m_alignmap.ShrinkToRealPoints(pr, is_protein);
402         }
403         */
404 
405         _ASSERT(piece_range.NotEmpty());
406         _ASSERT(piece_range.IntersectingWith(piece_begin->Limits()) && piece_range.IntersectingWith(piece_end_g->Limits()));
407 
408         if(piece_range.GetFrom() != piece_begin->GetFrom() || piece_range.GetTo() != piece_end_g->GetTo()) {
409             _ASSERT(!ggap_model);
410             Clip(piece_range, CGeneModel::eDontRemoveExons);
411         }
412 
413         piece_begin = piece_end;
414     }
415 
416     if (is_protein) {
417         TSignedSeqRange reading_frame =  m_alignmap.MapRangeOrigToEdited(Limits(), true);
418         TSignedSeqRange start, stop;
419         if (sps.CanGetModifiers()) {
420             ITERATE(CSpliced_seg::TModifiers, m, sps.GetModifiers()) {
421                 if ((*m)->IsStart_codon_found()) {
422                     start = TSignedSeqRange(reading_frame.GetFrom(),reading_frame.GetFrom()+2);
423                     reading_frame.SetFrom(start.GetTo()+1);
424                 } else if ((*m)->IsStop_codon_found()) {
425                     stop = TSignedSeqRange(reading_frame.GetTo()-2,reading_frame.GetTo());
426                     reading_frame.SetTo(stop.GetFrom()-1);
427                 }
428             }
429         }
430 
431         CCDSInfo cds_info_t;
432         cds_info_t.SetReadingFrame(reading_frame, true);
433         if (start.NotEmpty()) {
434             cds_info_t.SetStart(start, false);
435         }
436         if (stop.NotEmpty()) {
437             cds_info_t.SetStop(stop, false);
438         }
439         CCDSInfo cds_info_g = cds_info_t.MapFromEditedToOrig(m_alignmap);
440         if(cds_info_g.ReadingFrame().NotEmpty())   // successful projection
441             SetCdsInfo(cds_info_g);
442         else
443             SetCdsInfo(cds_info_t);
444     }
445 
446     if (sps.IsSetPoly_a()) {
447         Status() |= CGeneModel::ePolyA;
448     }
449 
450     if(seq_align.CanGetScore()) {
451         const CSeq_align::TScore& score = seq_align.GetScore();
452         ITERATE(CSeq_align::TScore, it, score) {
453             if((*it)->CanGetId() && (*it)->GetId().IsStr()) {
454                 string scr = (*it)->GetId().GetStr();
455                 if((scr == "N of matches") || (scr == "num_ident") || (scr == "matches")) {
456                     double ident = (*it)->GetValue().GetInt();
457                     ident /= seq_align.GetAlignLength();
458                     SetIdent(ident);
459                 } else if(scr == "rank" && (*it)->GetValue().GetInt() == 1) {
460                     Status() |= CGeneModel::eBestPlacement;
461                 } else if(scr == "ambiguous_orientation") {
462                     Status() |= CGeneModel::eUnknownOrientation;
463                 } else if(scr == "count") {
464                     _ASSERT(Weight() == 1 || Weight() == (*it)->GetValue().GetInt());
465                     SetWeight((*it)->GetValue().GetInt());
466                 }
467             }
468         }
469     }
470 }
471 
472 
473 
GetCdsDnaSequence(const CResidueVec & contig_sequence) const474 string CGeneModel::GetCdsDnaSequence (const CResidueVec& contig_sequence) const
475 {
476     if(ReadingFrame().Empty())
477         return kEmptyStr;
478 
479     CAlignMap amap = CGeneModel::GetAlignMap();
480     CCDSInfo cds_info = GetCdsInfo();
481     if(cds_info.IsMappedToGenome())
482         cds_info = cds_info.MapFromOrigToEdited(amap);
483     int cds_len = cds_info.Cds().GetLength();
484     int cds_start = cds_info.Cds().GetFrom()-TranscriptLimits().GetFrom();
485 
486     CResidueVec mrna;
487     amap.EditedSequence(contig_sequence,mrna);
488 
489     string cds_seq(cds_len,'A');
490     copy(mrna.begin()+cds_start, mrna.begin()+cds_start+cds_len, cds_seq.begin());
491 
492     if(Status() & eReversed)
493         ReverseComplement(cds_seq.begin(),cds_seq.end());
494 
495     return cds_seq;
496 }
497 
GetProtein(const CResidueVec & contig_sequence) const498 string CGeneModel::GetProtein (const CResidueVec& contig_sequence) const
499 {
500     string cds_seq = GetCdsDnaSequence(contig_sequence);
501     string prot_seq;
502 
503     objects::CSeqTranslator::Translate(cds_seq, prot_seq, objects::CSeqTranslator::fIs5PrimePartial);
504 
505     if(PStop()) {
506         CCDSInfo cds_info = GetCdsInfo();
507         if(cds_info.IsMappedToGenome()) {
508             CAlignMap amap = GetAlignMap();
509             cds_info = cds_info.MapFromOrigToEdited(amap);
510         }
511         ITERATE(CCDSInfo::TPStops, stp, cds_info.PStops()) {
512             if(stp->m_status == CCDSInfo::eSelenocysteine)
513                 prot_seq[(stp->GetFrom()- cds_info.Cds().GetFrom())/3] = 'U';
514         }
515     }
516 
517     return prot_seq;
518 }
519 
GetProtein(const CResidueVec & contig_sequence,const CGenetic_code * gencode) const520 string CGeneModel::GetProtein (const CResidueVec& contig_sequence, const CGenetic_code* gencode) const
521 {
522     string cds_seq = GetCdsDnaSequence(contig_sequence);
523     string prot_seq;
524 
525     objects::CSeqTranslator::Translate(cds_seq, prot_seq, objects::CSeqTranslator::fDefault, gencode );
526     if (prot_seq[0] == '-') {
527         string first_triplet = cds_seq.substr(0, 3);
528         string first_aa;
529         objects::CSeqTranslator::Translate(first_triplet, first_aa, objects::CSeqTranslator::fIs5PrimePartial, gencode );
530         prot_seq = first_aa+prot_seq.substr(1);
531     }
532 
533     if(PStop()) {
534         CCDSInfo cds_info = GetCdsInfo();
535         if(cds_info.IsMappedToGenome()) {
536             CAlignMap amap = GetAlignMap();
537             cds_info = cds_info.MapFromOrigToEdited(amap);
538         }
539         ITERATE(CCDSInfo::TPStops, stp, cds_info.PStops()) {
540             if(stp->m_status == CCDSInfo::eSelenocysteine)
541                 prot_seq[(stp->GetFrom()- cds_info.Cds().GetFrom())/3] = 'U';
542         }
543     }
544 
545     return prot_seq;
546 }
547 
548 //
549 // helper function - convert a vector<TSignedSeqRange> into a compact CSeq_loc
550 //
551 namespace {
552 
s_ExonDataToLoc(const vector<TSignedSeqRange> & vec,ENa_strand strand,const CSeq_id & id)553 CRef<CSeq_loc> s_ExonDataToLoc(const vector<TSignedSeqRange>& vec,
554                                ENa_strand strand, const CSeq_id& id)
555 {
556     CRef<CSeq_loc> loc(new CSeq_loc());
557 
558     CPacked_seqint::Tdata data;
559     ITERATE (vector<TSignedSeqRange>, iter, vec) {
560         CRef<CSeq_interval> ival(new CSeq_interval);
561         ival->SetFrom(iter->GetFrom());
562         ival->SetTo(iter->GetTo());
563         ival->SetStrand(strand);
564         ival->SetId().Assign(id);
565 
566         data.push_back(ival);
567     }
568 
569     if (data.size() == 1) {
570         loc->SetInt(*data.front());
571     } else {
572         loc->SetPacked_int().Set().swap(data);
573     }
574 
575     return loc;
576 }
577 
578 } //end unnamed namespace
579 
GetAnnot(const CSeq_id & id)580 CRef<CSeq_annot> CGnomonEngine::GetAnnot(const CSeq_id& id)
581 {
582     TGeneModelList genes = GetGenes();
583 
584     CRef<objects::CSeq_annot> annot(new CSeq_annot());
585 
586     annot->SetNameDesc("Gnomon gene scan output");
587 
588     CSeq_annot::C_Data::TFtable& ftable = annot->SetData().SetFtable();
589 
590     unsigned int counter = 0;
591     string locus_tag_base("GNOMON_");
592     ITERATE (TGeneModelList, it, genes) {
593         const CGeneModel& igene = *it;
594         int strand = igene.Strand();
595         TSignedSeqRange cds_limits = igene.RealCdsLimits();
596 
597         vector<TSignedSeqRange> mrna_vec;
598         copy(igene.Exons().begin(), igene.Exons().end(), back_inserter(mrna_vec));
599         vector<TSignedSeqRange> cds_vec;
600 
601         for (size_t j = 0;  j < mrna_vec.size();  ++j) {
602             TSignedSeqRange intersect(mrna_vec[j] & cds_limits);
603             if (!intersect.Empty()) {
604                 cds_vec.push_back(intersect);
605             }
606         }
607 
608         // stop-codon removal from cds
609         if (igene.HasStop()) {
610             if (strand == ePlus) {
611                 _ASSERT(cds_vec.back().GetLength()>=3);
612                 cds_vec.back().SetTo(cds_vec.back().GetTo() - 3);
613             } else {
614                 _ASSERT(cds_vec.front().GetLength()>=3);
615                 cds_vec.front().SetFrom(cds_vec.front().GetFrom() + 3);
616             }
617         }
618 
619         //
620         // create our mRNA
621         CRef<CSeq_feat> feat_mrna;
622         if (mrna_vec.size()) {
623             feat_mrna = new CSeq_feat();
624             feat_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
625             feat_mrna->SetLocation
626                 (*s_ExonDataToLoc(mrna_vec,
627                  (strand == ePlus ? eNa_strand_plus : eNa_strand_minus), id));
628         } else {
629             continue;
630         }
631 
632         //
633         // create the accompanying CDS
634         CRef<CSeq_feat> feat_cds;
635         if (!cds_vec.empty()) {
636             feat_cds = new CSeq_feat();
637             feat_cds->SetData().SetCdregion().SetFrame(CCdregion::TFrame(1+(strand == ePlus?(igene.ReadingFrame().GetFrom()-cds_limits.GetFrom())%3:(cds_limits.GetTo()-igene.ReadingFrame().GetTo())%3)));
638 
639             feat_cds->SetLocation
640                 (*s_ExonDataToLoc(cds_vec,
641                  (strand == ePlus ? eNa_strand_plus : eNa_strand_minus), id));
642         }
643 
644         //
645         // create a dummy gene feature as well
646         CRef<CSeq_feat> feat_gene(new CSeq_feat());
647 
648         char buf[32];
649         sprintf(buf, "%04u", ++counter);
650         string name(locus_tag_base);
651         name += buf;
652         feat_gene->SetData().SetGene().SetLocus_tag(name);
653         feat_gene->SetLocation().SetInt()
654             .SetFrom(feat_mrna->GetLocation().GetTotalRange().GetFrom());
655         feat_gene->SetLocation().SetInt()
656             .SetTo(feat_mrna->GetLocation().GetTotalRange().GetTo());
657         feat_gene->SetLocation().SetInt().SetStrand
658             (strand == ePlus ? eNa_strand_plus : eNa_strand_minus);
659 
660         feat_gene->SetLocation().SetId
661             (sequence::GetId(feat_mrna->GetLocation(), 0));
662 
663         ftable.push_back(feat_gene);
664         ftable.push_back(feat_mrna);
665         if (feat_cds) {
666             ftable.push_back(feat_cds);
667         }
668     }
669     return annot;
670 }
671 
672 
673 //
674 //
675 // This function deduces the frame from 5p coordinate of the CDS which should be on the 5p codon boundary
676 //
677 //
GetScore(CConstRef<CHMMParameters> hmm_params,const CSeq_loc & cds,CScope & scope,int * const gccontent,double * const startscore)678 double CCodingPropensity::GetScore(CConstRef<CHMMParameters> hmm_params, const CSeq_loc& cds, CScope& scope, int *const gccontent, double *const startscore)
679 {
680     *gccontent = 0;
681     const CSeq_id* seq_id = cds.GetId();
682     if (seq_id == NULL)
683 	NCBI_THROW(CGnomonException, eGenericError, "CCodingPropensity: CDS has multiple ids or no id at all");
684 
685     // Need to know GC content in order to load correct models.
686     // Compute on the whole transcript, not just CDS.
687     CBioseq_Handle xcript_hand = scope.GetBioseqHandle(*seq_id);
688     CSeqVector xcript_vec = xcript_hand.GetSeqVector();
689     xcript_vec.SetIupacCoding();
690     unsigned int gc_count = 0;
691     CSeqVector_CI xcript_iter(xcript_vec);
692     for( ;  xcript_iter;  ++xcript_iter) {
693         if (*xcript_iter == 'G' || *xcript_iter == 'C') {
694             ++gc_count;
695         }
696     }
697     *gccontent = static_cast<unsigned int>(100.0 * gc_count / xcript_vec.size() + 0.5);
698 
699     const CMC3_CodingRegion<5>&   cdr = dynamic_cast<const CMC3_CodingRegion<5>&>(hmm_params->GetParameter(CMC3_CodingRegion<5>::class_id(), *gccontent));
700     const CMC_NonCodingRegion<5>& ncdr = dynamic_cast<const CMC_NonCodingRegion<5>&>(hmm_params->GetParameter(CMC_NonCodingRegion<5>::class_id(), *gccontent));
701 
702     // Represent coding sequence as enum Nucleotides
703     CSeqVector vec(cds, scope);
704     vec.SetIupacCoding();
705     CEResidueVec seq;
706     seq.reserve(vec.size());
707     CSeqVector_CI iter(vec);
708     for( ;  iter;  ++iter) {
709 	seq.push_back(fromACGT(*iter));
710     }
711 
712     // Sum coding and non-coding scores across coding sequence.
713     // Don't include stop codon!
714     double score = 0;
715     for (unsigned int i = 5;  i < seq.size() - 3;  ++i)
716         score += cdr.Score(seq, i, i % 3) - ncdr.Score(seq, i);
717 
718     if(startscore) {
719         //
720         // start evalustion needs 17 bases total (11 before ATG and 3 after)
721         // if there is not enough sequence it will be substituted by Ns which will degrade the score
722         //
723 
724         const CWMM_Start& start = dynamic_cast<const CWMM_Start&>(hmm_params->GetParameter(CWMM_Start::class_id(), *gccontent));
725 
726         int totallen = xcript_vec.size();
727         int left, right;
728         int extraNs5p = 0;
729         int extrabases = start.Left()+2;            // (11) extrabases left of ATG needed for start (6) and noncoding (5) evaluation
730         if(cds.GetStrand() == eNa_strand_plus) {
731             int startposition = cds.GetTotalRange().GetFrom();
732             if(startposition < extrabases) {
733                 left = 0;
734                 extraNs5p = extrabases-startposition;
735             } else {
736                 left = startposition-extrabases;
737             }
738             right = min(startposition+2+start.Right(),totallen-1);
739         } else {
740             int startposition = cds.GetTotalRange().GetTo();
741             if(startposition+extrabases >= totallen) {
742                 right = totallen-1;
743                 extraNs5p = startposition+extrabases-totallen+1;
744             } else {
745                 right = startposition+extrabases;
746             }
747             left = max(0,startposition-2-start.Right());
748         }
749 
750 
751         CRef<CSeq_loc> startarea(new CSeq_loc());
752         CSeq_interval& d = startarea->SetInt();
753         d.SetStrand(cds.GetStrand());
754         CRef<CSeq_id> id(new CSeq_id());
755         id->Assign(*seq_id);
756         d.SetId(*id);
757         d.SetFrom(left);
758         d.SetTo(right);
759 
760         CSeqVector sttvec(*startarea, scope);
761         sttvec.SetIupacCoding();
762         CEResidueVec sttseq;
763         sttseq.resize(extraNs5p, enN);                         // fill with Ns if necessery
764         for(unsigned int i = 0; i < sttvec.size(); ++i) {
765             sttseq.push_back(fromACGT(sttvec[i]));
766         }
767         sttseq.resize(5+start.Left()+start.Right(), enN);      // add Ns if necessery
768 
769         *startscore = start.Score(sttseq, extrabases+2);
770         if(*startscore != BadScore()) {
771             for(unsigned int i = 5; i < sttseq.size(); ++i) {
772                 *startscore -= ncdr.Score(sttseq, i);
773             }
774         }
775     }
776 
777     return score;
778 }
779 
780 END_SCOPE(gnomon)
781 END_NCBI_SCOPE
782