1 /*  $Id: parse.cpp 396800 2013-04-22 18:54:22Z souvorov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software / database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software / database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <algo/gnomon/gnomon_model.hpp>
34 #include <algo/gnomon/gnomon_exception.hpp>
35 #include <objects/general/Object_id.hpp>
36 #include "hmm.hpp"
37 #include "parse.hpp"
38 #include "hmm_inlines.hpp"
39 
40 
41 BEGIN_NCBI_SCOPE
42 BEGIN_SCOPE(gnomon)
43 
44 USING_SCOPE(objects);
45 
46 template<class L, class R> inline
s_TooFar(const L & left,const R & right,double score)47 bool s_TooFar(const L& left, const R& right, double score)
48 {
49     if(left.MScore() == BadScore()) return true;
50     int len = right.Stop()-left.Stop();
51     return (len > kTooFarLen && score+left.MScore() < right.Score());
52 }
53 
54 struct SRState
55 {
SRStateSRState56     SRState(const CHMM_State& l, const CHMM_State& r)
57     {
58         lsp = r.LeftState();
59         rsp = const_cast<CHMM_State*>(&r);
60         rsp->UpdateLeftState(l);
61     }
~SRStateSRState62     ~SRState() { rsp->UpdateLeftState(*lsp); }
63 
64     const CHMM_State* lsp;
65     CHMM_State* rsp;
66 };
67 
68 template<class Left, class Right> inline
s_EvaluateNewScore(const Left & left,const Right & right,double & rscore,bool & openrgn,bool rightanchor=false)69 bool s_EvaluateNewScore(const Left& left, const Right& right, double& rscore, bool& openrgn, bool rightanchor = false)
70 {
71 
72     rscore = BadScore();
73 
74     SRState rst(left,right);
75 
76     int len = right.Stop()-left.Stop();
77     if(len > right.MaxLen()) return false;
78     if(!right.NoRightEnd() && len < right.MinLen()) return true;
79 
80     double scr, score = 0;
81     if(left.isPlus())
82     {
83         scr = left.BranchScore(right);
84         if(scr == BadScore()) return true;
85     }
86     else
87     {
88         scr = right.BranchScore(left);
89         if(scr == BadScore()) return true;
90         scr += right.DenScore()-left.DenScore();
91     }
92     score += scr;
93 
94     // this check is frame-dependent and MUST be after BranchScore
95     if(right.StopInside()) return false;
96 
97     if(right.NoRightEnd() && !rightanchor) scr = right.ClosingLengthScore();
98     else scr = right.LengthScore();
99     if(scr == BadScore()) return true;
100     score += scr;
101 
102     scr = right.RgnScore();
103     if(scr == BadScore()) return true;
104     score += scr;
105 
106     if(!right.NoRightEnd())
107     {
108         scr = right.TermScore();
109         if(scr == BadScore()) return true;
110         score += scr;
111     }
112 
113     openrgn = right.OpenRgn();
114     rscore = score;
115     return true;
116 }
117 
118 // leftprot   0 no protein, 1 there is a protein
119 // rightprot  0 no protein, 1 must be a protein, -1 don't care
120 template<class L, class R> // template for all exons
s_ForwardStep(const L & left,R & right,int leftprot,int rightprot)121 inline bool s_ForwardStep(const L& left, R& right, int leftprot, int rightprot)
122 {
123     double score;
124     bool openrgn;
125     if(!s_EvaluateNewScore(left,right,score,openrgn)) return false;
126     else if(score == BadScore()) return true;
127 
128     int protnum = right.GetSeqScores().ProtNumber(left.Stop(),right.Stop());
129 
130     if(rightprot == 0 && (leftprot != 0 || protnum != 0))    // there is a protein
131     {
132         return true;
133     }
134 
135     if(leftprot == 0)
136     {
137         if(rightprot == 1 && protnum == 0) return true;      // no proteins
138         else if(protnum > 0) --protnum;                      // for first protein no penalty
139     }
140 
141     if(left.Score() != BadScore() && openrgn)
142     {
143          double scr = score-protnum*right.GetSeqScores().MultiProtPenalty()+left.Score();
144         if(scr > right.Score())
145         {
146             right.UpdateLeftState(left);
147             right.UpdateScore(scr);
148         }
149     }
150 
151     return openrgn;
152 }
153 
154 template<class L>
s_ForwardStep(const L & left,CIntron & right)155 inline bool s_ForwardStep(const L& left, CIntron& right)
156 {
157     double score;
158     bool openrgn;
159     if(!s_EvaluateNewScore(left,right,score,openrgn)) return false;
160     else if(score == BadScore()) return true;
161 
162     if(left.Score() != BadScore() && openrgn)
163     {
164         double scr = score+left.Score();
165         if(scr > right.Score())
166         {
167             right.UpdateLeftState(left);
168             right.UpdateScore(scr);
169         }
170     }
171 
172     return (openrgn && !s_TooFar(left, right, score));
173 }
174 
175 template<class L>
s_ForwardStep(const L & left,CIntergenic & right,bool rightanchor)176 inline bool s_ForwardStep(const L& left, CIntergenic& right, bool rightanchor)
177 {
178     double score;
179     bool openrgn;
180     if(!s_EvaluateNewScore(left,right,score,openrgn, rightanchor)) return false;
181     else if(score == BadScore()) return true;
182 
183     if(left.Score() != BadScore() && openrgn)
184     {
185         double scr = score+left.Score();
186         if(scr > right.Score())
187         {
188             right.UpdateLeftState(left);
189             right.UpdateScore(scr);
190         }
191     }
192 
193     return (openrgn && !s_TooFar(left, right, score));
194 }
195 
196 
197 template<class L, class R> // template for all exons
s_MakeStep(vector<L> & lvec,vector<R> & rvec,int leftprot,int rightprot)198 inline void s_MakeStep(vector<L>& lvec, vector<R>& rvec, int leftprot, int rightprot)
199 {
200     if(lvec.empty()) return;
201     typename vector<L>::reverse_iterator i = lvec.rbegin();
202     if(i->Stop() == rvec.back().Stop()) ++i;
203     for(;i != lvec.rend() && s_ForwardStep(*i,rvec.back(),leftprot,rightprot);++i);
204 
205     if(rvec.size() > 1) {
206         rvec.back().UpdatePrevExon(rvec[rvec.size()-2]);
207     }
208 }
209 
210 template<class L>
s_MakeStep(vector<L> & lvec,vector<CIntron> & rvec)211 inline void s_MakeStep(vector<L>& lvec, vector<CIntron>& rvec)
212 {
213     if(lvec.empty()) return;
214     CIntron& right = rvec.back();
215     typename vector<L>::reverse_iterator i = lvec.rbegin();
216 
217     int rlimit = right.Stop();
218     if(i->Stop() == rlimit) ++i;
219     int nearlimit = max(0,rlimit-kTooFarLen);
220     while(i != lvec.rend() && i->Stop() >= nearlimit)
221     {
222         if(!s_ForwardStep(*i,right)) return;
223         ++i;
224     }
225     if(i == lvec.rend()) return;
226     for(const L* p = &*i; p !=0; p = p->PrevExon())
227     {
228         if(!s_ForwardStep(*p,right)) return;
229     }
230 }
231 
232 
233 template<class L>
s_MakeStep(const CSeqScores & seqscr,vector<L> & lvec,vector<CIntergenic> & rvec,bool rightanchor=false)234 inline void s_MakeStep(const CSeqScores& seqscr, vector<L>& lvec, vector<CIntergenic>& rvec, bool rightanchor = false)
235 {
236     if(lvec.empty()) return;
237     CIntergenic& right = rvec.back();
238     int i = lvec.size()-1;
239     int rlimit = right.Stop();
240     if(lvec[i].Stop() == rlimit) --i;
241     while(i >= 0 && lvec[i].Stop() >= seqscr.LeftAlignmentBoundary(right.Stop())) --i;  // no intergenics in alignment
242     int nearlimit = max(0,rlimit-kTooFarLen);
243     while(i >= 0 && lvec[i].Stop() >= nearlimit)
244     {
245         if(!s_ForwardStep(lvec[i--],right, rightanchor)) return;
246     }
247     if(i < 0) return;
248     for(const L* p = &lvec[i]; p !=0; p = p->PrevExon())
249     {
250         if(!s_ForwardStep(*p,right, rightanchor)) return;
251     }
252 }
253 
s_MakeStep(const CSeqScores & seqscr,const CExonParameters & exon_params,EStrand strand,int point,vector<CIntergenic> & lvec,vector<CSingleExon> & rvec)254 inline void s_MakeStep(const CSeqScores& seqscr, const CExonParameters& exon_params, EStrand strand, int point, vector<CIntergenic>& lvec, vector<CSingleExon>& rvec)            // L - intergenic, R - single exon
255 {
256     rvec.push_back(CSingleExon(strand, point, seqscr, exon_params));
257     s_MakeStep(lvec, rvec, 0, -1);
258     if(rvec.back().Score() == BadScore()) rvec.pop_back();
259 }
260 
261 template<class R>
s_MakeStep(const CSeqScores & seqscr,const CExonParameters & exon_params,EStrand strand,int point,vector<CIntergenic> & lvec,vector<R> rvec[][3])262 inline void s_MakeStep(const CSeqScores& seqscr, const CExonParameters& exon_params, EStrand strand, int point, vector<CIntergenic>& lvec, vector<R> rvec[][3])                 // L - intergenic, R - first/last exon
263 {
264     for(int kr = 0; kr < 2; ++kr)
265     {
266         for(int phr = 0; phr < 3; ++phr)
267         {
268             rvec[kr][phr].push_back(R(strand,phr,point,seqscr,exon_params));
269             s_MakeStep(lvec, rvec[kr][phr], 0, kr);
270             if(rvec[kr][phr].back().Score() == BadScore()) rvec[kr][phr].pop_back();
271         }
272     }
273 }
274 
275 template<class R>
s_MakeStep(const CSeqScores & seqscr,const CExonParameters & exon_params,EStrand strand,int point,vector<CIntron> lvec[][3],vector<R> & rvec)276 inline void s_MakeStep(const CSeqScores& seqscr, const CExonParameters& exon_params, EStrand strand, int point, vector<CIntron> lvec[][3], vector<R>& rvec)                 // L - intron R - first/last exon
277 {
278     rvec.push_back(R(strand,0,point,seqscr,exon_params));          // phase is bogus, it will be redefined in constructor
279     for(int kl = 0; kl < 2; ++kl)
280     {
281         for(int phl = 0; phl < 3; ++phl)
282         {
283             s_MakeStep(lvec[kl][phl], rvec, kl, -1);
284         }
285     }
286     if(rvec.back().Score() == BadScore()) rvec.pop_back();
287 }
288 
s_MakeStep(const CSeqScores & seqscr,const CExonParameters & exon_params,EStrand strand,int point,vector<CIntron> lvec[][3],vector<CInternalExon> rvec[][3])289 inline void s_MakeStep(const CSeqScores& seqscr, const CExonParameters& exon_params, EStrand strand, int point, vector<CIntron> lvec[][3], vector<CInternalExon> rvec[][3])   // L - intron, R - internal exon
290 {
291     for(int phr = 0; phr < 3; ++phr)
292     {
293         for(int kr = 0; kr < 2; ++kr)
294         {
295             rvec[kr][phr].push_back(CInternalExon(strand,phr,point,seqscr,exon_params));
296             for(int phl = 0; phl < 3; ++phl)
297             {
298                 for(int kl = 0; kl < 2; ++kl)
299                 {
300                     s_MakeStep(lvec[kl][phl], rvec[kr][phr], kl, kr);
301                 }
302             }
303             if(rvec[kr][phr].back().Score() == BadScore()) rvec[kr][phr].pop_back();
304         }
305     }
306 }
307 
308 template<class L1, class L2>
s_MakeStep(const CSeqScores & seqscr,const CIntronParameters & intron_params,EStrand strand,int point,vector<L1> lvec1[][3],vector<L2> lvec2[][3],vector<CIntron> rvec[][3],int shift)309 inline void s_MakeStep(const CSeqScores& seqscr, const CIntronParameters& intron_params, EStrand strand, int point, vector<L1> lvec1[][3], vector<L2> lvec2[][3], vector<CIntron> rvec[][3], int shift)    // L1,L2 - exons, R -intron
310 {
311     for(int k = 0; k < 2; ++k)
312     {
313         for(int phl = 0; phl < 3; ++phl)
314         {
315             int phr = (shift+phl)%3;
316             rvec[k][phr].push_back(CIntron(strand,phr,point,seqscr,intron_params));
317             if(k == 1) rvec[k][phr].back().UpdateScore(BadScore());   // proteins don't come from outside
318             s_MakeStep(lvec1[k][phl], rvec[k][phr]);
319             s_MakeStep(lvec2[k][phl], rvec[k][phr]);
320             if(rvec[k][phr].back().Score() == BadScore()) rvec[k][phr].pop_back();
321         }
322     }
323 }
324 
AddProbabilities(double scr1,double scr2)325 double AddProbabilities(double scr1, double scr2)
326 {
327     if(scr1 == BadScore()) return scr2;
328     else if(scr2 == BadScore()) return scr1;
329     else if(scr1 >= scr2)  return scr1+log(1+exp(scr2-scr1));
330     else return scr2+log(1+exp(scr1-scr2));
331 }
332 
CParse(const CSeqScores & ss,const CIntronParameters & intron_params,const CIntergenicParameters & intergenic_params,const CExonParameters & exon_params,bool leftanchor,bool rightanchor)333 CParse::CParse(const CSeqScores& ss,
334                const CIntronParameters&     intron_params,
335                const CIntergenicParameters& intergenic_params,
336                const CExonParameters&       exon_params,
337                bool leftanchor, bool rightanchor)
338     : m_seqscr(ss)
339 {
340     try
341     {
342         m_igplus.reserve(m_seqscr.StartNumber(ePlus)+1);
343         m_igminus.reserve(m_seqscr.StopNumber(eMinus)+1);
344         m_feminus.reserve(m_seqscr.StartNumber(eMinus)+1);
345         m_leplus.reserve(m_seqscr.StopNumber(ePlus)+1);
346         m_seplus.reserve(m_seqscr.StopNumber(ePlus)+2);
347         m_seminus.reserve(m_seqscr.StartNumber(eMinus)+1);
348         for(int k = 0; k < 2; ++k)
349         {
350             for(int phase = 0; phase < 3; ++phase)
351             {
352                 m_inplus[k][phase].reserve(m_seqscr.AcceptorNumber(ePlus)+1);
353                 m_inminus[k][phase].reserve(m_seqscr.DonorNumber(eMinus)+1);
354                 m_feplus[k][phase].reserve(m_seqscr.DonorNumber(ePlus)+1);
355                 m_ieplus[k][phase].reserve(m_seqscr.DonorNumber(ePlus)+1);
356                 m_ieminus[k][phase].reserve(m_seqscr.AcceptorNumber(eMinus)+1);
357                 m_leminus[k][phase].reserve(m_seqscr.AcceptorNumber(eMinus)+1);
358             }
359         }
360     }
361     catch(bad_alloc)
362     {
363         NCBI_THROW(CGnomonException, eMemoryLimit, "Not enough memory in CParse");
364     }
365 
366     int len = m_seqscr.SeqLen();
367 
368     double BigScore = 10000;
369     if (leftanchor) {
370         m_seplus.push_back(CSingleExon(ePlus,0,m_seqscr,exon_params));
371         m_seplus.back().UpdateScore(BigScore);               // this is a bogus anchor to unforce intergenic start at 0
372     }
373 
374     for(int i = 0; i < len; ++i)
375     {
376         if(m_seqscr.AcceptorScore(i,ePlus) != BadScore())
377         {
378             s_MakeStep(m_seqscr, intron_params, ePlus,i,m_ieplus,m_feplus,m_inplus,1);
379         }
380 
381         if(m_seqscr.AcceptorScore(i,eMinus) != BadScore())
382         {
383             s_MakeStep(m_seqscr, exon_params, eMinus,i,m_inminus,m_ieminus);
384             s_MakeStep(m_seqscr, exon_params, eMinus,i,m_igminus,m_leminus);
385         }
386 
387         if(m_seqscr.DonorScore(i,ePlus) != BadScore())
388         {
389             s_MakeStep(m_seqscr, exon_params, ePlus,i,m_inplus,m_ieplus);
390             s_MakeStep(m_seqscr, exon_params, ePlus,i,m_igplus,m_feplus);
391         }
392 
393         if(m_seqscr.DonorScore(i,eMinus) != BadScore())
394         {
395             s_MakeStep(m_seqscr, intron_params, eMinus,i,m_leminus,m_ieminus,m_inminus,0);
396         }
397 
398         if(m_seqscr.StartScore(i,ePlus) != BadScore())
399         {
400             m_igplus.push_back(CIntergenic(ePlus,i,m_seqscr,intergenic_params));
401             s_MakeStep(m_seqscr, m_seplus,m_igplus);
402             s_MakeStep(m_seqscr, m_seminus,m_igplus);
403             s_MakeStep(m_seqscr, m_leplus,m_igplus);
404             s_MakeStep(m_seqscr, m_feminus,m_igplus);
405         }
406 
407         if(m_seqscr.StartScore(i,eMinus) != BadScore())
408         {
409             s_MakeStep(m_seqscr, exon_params, eMinus,i,m_inminus,m_feminus);
410             s_MakeStep(m_seqscr, exon_params, eMinus,i,m_igminus,m_seminus);
411         }
412 
413         if(m_seqscr.StopScore(i,ePlus) != BadScore())
414         {
415             s_MakeStep(m_seqscr, exon_params, ePlus,i,m_inplus,m_leplus);
416             s_MakeStep(m_seqscr, exon_params, ePlus,i,m_igplus,m_seplus);
417         }
418 
419         if(m_seqscr.StopScore(i,eMinus) != BadScore())
420         {
421             m_igminus.push_back(CIntergenic(eMinus,i,m_seqscr,intergenic_params));
422             s_MakeStep(m_seqscr, m_seplus,m_igminus);
423             s_MakeStep(m_seqscr, m_seminus,m_igminus);
424             s_MakeStep(m_seqscr, m_leplus,m_igminus);
425             s_MakeStep(m_seqscr, m_feminus,m_igminus);
426         }
427     }
428 
429     m_igplus.push_back(CIntergenic(ePlus,-1,m_seqscr,intergenic_params));                // never is popped
430     s_MakeStep(m_seqscr, m_seplus,m_igplus,rightanchor);
431     s_MakeStep(m_seqscr, m_seminus,m_igplus,rightanchor);
432     s_MakeStep(m_seqscr, m_leplus,m_igplus,rightanchor);
433     s_MakeStep(m_seqscr, m_feminus,m_igplus,rightanchor);
434 
435     m_igminus.push_back(CIntergenic(eMinus,-1,m_seqscr,intergenic_params));               // never is popped
436     s_MakeStep(m_seqscr, m_seplus,m_igminus,rightanchor);
437     s_MakeStep(m_seqscr, m_seminus,m_igminus,rightanchor);
438     s_MakeStep(m_seqscr, m_leplus,m_igminus,rightanchor);
439     s_MakeStep(m_seqscr, m_feminus,m_igminus,rightanchor);
440 
441     m_igplus.back().UpdateScore(AddProbabilities(m_igplus.back().Score(),m_igminus.back().Score()));
442 
443     const CHMM_State* p = &m_igplus.back();
444 
445     if (!rightanchor) {
446         s_MakeStep(m_seqscr, intron_params, ePlus,-1,m_ieplus,m_feplus,m_inplus,1);        // may be popped
447         s_MakeStep(m_seqscr, intron_params, eMinus,-1,m_leminus,m_ieminus,m_inminus,0);    // may be popped
448 
449         for(int k = 0; k < 2; ++k) {
450             for(int ph = 0; ph < 3; ++ph) {
451                 if(!m_inplus[k][ph].empty() && m_inplus[k][ph].back().NoRightEnd() && m_inplus[k][ph].back().Score() > p->Score()) p = &m_inplus[k][ph].back();
452                 if(!m_inminus[k][ph].empty() && m_inminus[k][ph].back().NoRightEnd() && m_inminus[k][ph].back().Score() > p->Score()) p = &m_inminus[k][ph].back();
453             }
454         }
455     }
456     m_path = p;
457 
458     if(leftanchor) {                                       // return scores to normal and forget the connection to the left bogus anchor
459         for(p = m_path ; p != 0; p = p->LeftState()) {
460             const_cast<CHMM_State*>(p)->UpdateScore(p->Score()-BigScore);
461             if(p->LeftState() == &m_seplus[0])
462                 const_cast<CHMM_State*>(p)->ClearLeftState();
463         }
464     }
465 }
466 
Out(T t,int w,CNcbiOstream & to=cout)467 template<class T> void Out(T t, int w, CNcbiOstream& to = cout)
468 {
469     to.width(w);
470     to.setf(IOS_BASE::right,IOS_BASE::adjustfield);
471     to << t;
472 }
473 
Out(double t,int w,CNcbiOstream & to=cout,int prec=1)474 void Out(double t, int w, CNcbiOstream& to = cout, int prec = 1)
475 {
476     to.width(w);
477     to.setf(IOS_BASE::right,IOS_BASE::adjustfield);
478     to.setf(IOS_BASE::fixed,IOS_BASE::floatfield);
479     to.precision(prec);
480 
481     if(t > 1000000000) to << "+Inf";
482     else if(t < -1000000000) to << "-Inf";
483     else to << t;
484 }
485 
AddSupport(const TGeneModelList & align_list,TSignedSeqRange inside_range,CGeneModel & gene,TSignedSeqRange reading_frame)486 void AddSupport(const TGeneModelList& align_list, TSignedSeqRange inside_range, CGeneModel& gene, TSignedSeqRange reading_frame)
487 {
488     CGeneModel support;
489     support.SetStrand(gene.Strand());
490     const CGeneModel* supporting_align = NULL;
491 
492     ITERATE(TGeneModelList, it, align_list) {
493         const CGeneModel& algn(*it);
494 
495         if ((algn.Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0) continue;
496         if(!Include(inside_range,algn.MaxCdsLimits())) continue;
497 
498         if ((algn.ReadingFrame().Empty()?algn.Limits():algn.ReadingFrame()).IntersectingWith(reading_frame)) {
499             support.Extend(algn, false);
500             support.AddSupport(CSupportInfo(algn.ID(),false));
501             supporting_align = &algn;
502 
503             if((algn.Status()&CGeneModel::ePolyA) != 0)
504                 gene.Status() |= CGeneModel::ePolyA;
505 
506             ITERATE(list< CRef<CSeq_id> >, i, algn.TrustedmRNA()) {
507                 gene.InsertTrustedmRNA(*i);
508             }
509             ITERATE(list< CRef<CSeq_id> >, i, algn.TrustedProt()) {
510                 gene.InsertTrustedProt(*i);
511             }
512 
513         }
514     }
515 
516     if (!support.Exons().empty()) {
517         _ASSERT( supporting_align != NULL );
518         gene.FrameShifts() = support.FrameShifts();
519         //        CCDSInfo cds_info;
520         //        cds_info.SetReadingFrame(reading_frame);
521         //        gene.SetCdsInfo(cds_info);
522 
523         //        gene.Extend(support, support.Continuous());
524         gene.Extend(support, false);
525         gene.ReplaceSupport(support.Support());
526         if ( Include(support.Limits(),gene.Limits()) && support.Continuous() ) {
527             if (gene.Support().size()==1)
528                 gene = *supporting_align;
529             gene.Status() |= CGeneModel::eFullSupCDS;
530         }
531     }
532 #ifdef _DEBUG
533     ITERATE(TGeneModelList, it, align_list) {
534         const CGeneModel& algn(*it);
535 
536         if ((algn.Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0) continue;
537         if(!Include(inside_range,algn.MaxCdsLimits())) continue;
538 
539         if ((algn.ReadingFrame().Empty()?algn.Limits():algn.ReadingFrame()).IntersectingWith(reading_frame)) {
540             _ASSERT( algn.isCompatible(gene) );
541         }
542     }
543 #endif
544 }
545 
GetGenes() const546 TGeneModelList CParse::GetGenes() const
547 {
548     TGeneModelList genes;
549     vector< vector<const CExon*> > gen_exons;
550     const CAlignMap& seq_map = m_seqscr.FrameShiftedSeqMap();
551 
552     if(dynamic_cast<const CIntron*>(Path()) && Path()->LeftState())
553         gen_exons.push_back(vector<const CExon*>());
554 
555     for(const CHMM_State* p = Path(); p != 0; p = p->LeftState())
556         if(p->isExon()) {
557             if(p->isGeneRightEnd())
558                 gen_exons.push_back(vector<const CExon*>());
559 
560             gen_exons.back().push_back(static_cast<const CExon*>(p));
561         }
562 
563     const EResidue* seqp = m_seqscr.SeqPtr(0, ePlus);
564 
565     ITERATE(vector< vector<const CExon*> >, g_it, gen_exons) {
566         EStrand strand = g_it->back()->Strand();
567         genes.push_front(CGeneModel(strand,0,CGeneModel::eGnomon));
568         CGeneModel& gene = genes.front();
569 
570         double score = 0;
571 
572         CGeneModel local_gene(strand);
573         for(int i = (int)g_it->size()-1; i >= 0; --i) {
574             const CExon* pe = (*g_it)[i];
575             //        for (vector<const CExon*>::const_reverse_iterator e_it = g_it->rbegin(); e_it != g_it->rend(); ++e_it) {
576             //            const CExon* pe = *e_it;
577 
578             TSignedSeqRange local_exon(pe->Start(),pe->Stop());
579             local_gene.AddExon(local_exon);
580             TSignedSeqRange exon = seq_map.MapRangeEditedToOrig(local_exon);
581 
582             string fsig, ssig;
583             if(i != (int)g_it->size()-1) {
584                 fsig.push_back(toACGT(seqp[local_exon.GetFrom()-2]));
585                 fsig.push_back(toACGT(seqp[local_exon.GetFrom()-1]));
586             }
587             if(i != 0) {
588                 ssig.push_back(toACGT(seqp[local_exon.GetTo()+1]));
589                 ssig.push_back(toACGT(seqp[local_exon.GetTo()+2]));
590             }
591             if(pe->isMinus()) {
592                 ReverseComplement(fsig.begin(),fsig.end());
593                 ReverseComplement(ssig.begin(),ssig.end());
594             }
595 
596             gene.AddExon(exon, fsig, ssig);
597 
598             score += pe->RgnScore()+pe->ExonScore();
599         }
600         CAlignMap localmap(local_gene.Exons(), local_gene.FrameShifts(), local_gene.Strand());
601 
602         TSignedSeqRange local_reading_frame(g_it->back()->Start(), g_it->front()->Stop());
603 
604         if (!g_it->back()->isGeneLeftEnd()) {
605             const CExon* pe = g_it->back();
606             int estart = pe->Start();
607             int estop = pe->Stop();
608             int rframe = pe->Phase();
609             int lframe = pe->isPlus() ? (rframe-(estop-estart))%3 : (rframe+(estop-estart))%3;
610             if (lframe < 0)
611                 lframe += 3;
612             int del = pe->isPlus() ? (3-lframe)%3 : (1+lframe)%3;
613 
614             if(localmap.FShiftedLen(local_reading_frame) <= del) {
615                 genes.pop_front();
616                 continue;
617             }
618 
619             local_reading_frame.SetFrom(localmap.FShiftedMove(local_reading_frame.GetFrom(),del));    // do it hard way in case we will get into next exon
620         }
621         if (!g_it->front()->isGeneRightEnd()) {
622             const CExon* pe = g_it->front();
623             int rframe = pe->Phase();
624             int del = pe->isPlus() ? (1+rframe)%3 : (3-rframe)%3;
625 
626             if(localmap.FShiftedLen(local_reading_frame) <= del) {
627                 genes.pop_front();
628                 continue;
629             }
630 
631             local_reading_frame.SetTo(localmap.FShiftedMove(local_reading_frame.GetTo(),-del));      // do it hard way in case we will get into next exon
632         }
633 
634         if (local_reading_frame.Empty()) {
635             genes.pop_front();
636             continue;
637         }
638 
639         TSignedSeqRange reading_frame = seq_map.MapRangeEditedToOrig(local_reading_frame, true);
640         _ASSERT(reading_frame.NotEmpty() && reading_frame.GetFrom() >= 0 && reading_frame.GetTo() >= 0);
641         _ASSERT(gene.Limits().GetFrom() <= reading_frame.GetFrom() && gene.FShiftedLen(gene.Limits().GetFrom(),reading_frame.GetFrom(),false) <=3);
642         _ASSERT(gene.Limits().GetTo() >= reading_frame.GetTo() && gene.FShiftedLen(reading_frame.GetTo(),gene.Limits().GetTo(),false) <= 3);
643 
644         AddSupport(m_seqscr.Alignments(),
645                    TSignedSeqRange(m_seqscr.From(),m_seqscr.To()),
646                    gene, reading_frame);
647 
648         TSignedSeqRange start, stop;
649         CAlignMap gene_map = gene.GetAlignMap();
650         if (g_it->back()->isGeneLeftEnd()) {
651             int utr_len = gene_map.FShiftedLen(TSignedSeqRange(gene.Limits().GetFrom(),reading_frame.GetFrom()), CAlignMap::eSinglePoint, CAlignMap::eLeftEnd) - 1;
652             if(utr_len >= 3 || m_seqscr.isReadingFrameLeftEnd(local_reading_frame.GetFrom(),strand)) {                                       // extend gene only if start/stop is conventional
653                 if (utr_len < 3 ) {
654                     gene.ExtendLeft(3-utr_len);
655                     gene_map = gene.GetAlignMap();
656                 }
657 
658                 TSignedSeqRange rf = gene_map.MapRangeOrigToEdited(reading_frame, true);
659                 TSignedSeqRange stt(rf.GetFrom()-3,rf.GetFrom()-1);
660                 TSignedSeqRange stp(rf.GetTo()+1,rf.GetTo()+3);
661                 if (strand == eMinus)
662                     swap(stt,stp);
663                 start = gene_map.MapRangeEditedToOrig(stt, false);
664             }
665         }
666 
667         if (g_it->front()->isGeneRightEnd()) {
668             int utr_len = gene_map.FShiftedLen(TSignedSeqRange(reading_frame.GetTo(),gene.Limits().GetTo()), CAlignMap::eRightEnd, CAlignMap::eSinglePoint) - 1;
669             if(utr_len >= 3 || m_seqscr.isReadingFrameRightEnd(local_reading_frame.GetTo(),strand)) {                                        // extend gene only if start/stop is conventional
670                 if (utr_len < 3 ) {
671                     gene.ExtendRight(3-utr_len);
672                     gene_map = gene.GetAlignMap();
673                 }
674 
675                 TSignedSeqRange rf = gene_map.MapRangeOrigToEdited(reading_frame, true);
676                 TSignedSeqRange stt(rf.GetFrom()-3,rf.GetFrom()-1);
677                 TSignedSeqRange stp(rf.GetTo()+1,rf.GetTo()+3);
678                 if (strand == eMinus)
679                     swap(stt,stp);
680                 stop = gene_map.MapRangeEditedToOrig(stp, false);
681             }
682         }
683 
684         if (strand == eMinus)
685             swap(start,stop);
686 
687         CCDSInfo cds_info;
688         if (gene.GetCdsInfo().ProtReadingFrame().NotEmpty())
689             cds_info.SetReadingFrame(gene.GetCdsInfo().ProtReadingFrame(), true);
690         cds_info.SetReadingFrame(reading_frame);
691         if (start.NotEmpty())
692             cds_info.SetStart(start, gene.ConfirmedStart() && start == gene.GetCdsInfo().Start());
693         if (stop.NotEmpty())
694             cds_info.SetStop(stop, gene.ConfirmedStop());
695 
696         ITERATE(CCDSInfo::TPStops,s,gene.GetCdsInfo().PStops())
697             cds_info.AddPStop(*s);
698 
699         cds_info.SetScore(score);
700         gene.SetCdsInfo(cds_info);
701     }
702 
703     return genes;
704 }
705 
PrintInfo() const706 void CParse::PrintInfo() const
707 {
708     vector<const CHMM_State*> states;
709     for(const CHMM_State* p = Path(); p != 0; p = p->LeftState()) states.push_back(p);
710     reverse(states.begin(),states.end());
711     const CAlignMap& seq_map = m_seqscr.FrameShiftedSeqMap();
712 
713     Out(" ",15);
714     Out("Start",11);
715     Out("Stop",11);
716     Out("Score",8);
717     Out("BrScr",8);
718     Out("LnScr",8);
719     Out("RgnScr",8);
720     Out("TrmScr",8);
721     Out("AccScr",8);
722     cout << endl;
723 
724     for(int i = 0; i < (int)states.size(); ++i)
725     {
726         const CHMM_State* p = states[i];
727         if(dynamic_cast<const CIntergenic*>(p)) cout << endl;
728 
729         Out(p->GetStateName(),13);
730         Out(p->isPlus() ? '+' : '-',2);
731         int a = seq_map.MapEditedToOrig(p->Start());
732         int b = seq_map.MapEditedToOrig(p->Stop());
733         Out(a,11);
734         Out(b,11);
735         SStateScores sc = p->GetStateScores();
736         Out(sc.m_score,8);
737         Out(sc.m_branch,8);
738         Out(sc.m_length,8);
739         Out(sc.m_region,8);
740         Out(sc.m_term,8);
741         Out(p->Score(),8);
742         cout << endl;
743     }
744 }
745 
746 
747 END_SCOPE(gnomon)
748 END_NCBI_SCOPE
749