1 /* $Id: parse.cpp 396800 2013-04-22 18:54:22Z souvorov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software / database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software / database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alexandre Souvorov
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <algo/gnomon/gnomon_model.hpp>
34 #include <algo/gnomon/gnomon_exception.hpp>
35 #include <objects/general/Object_id.hpp>
36 #include "hmm.hpp"
37 #include "parse.hpp"
38 #include "hmm_inlines.hpp"
39
40
41 BEGIN_NCBI_SCOPE
42 BEGIN_SCOPE(gnomon)
43
44 USING_SCOPE(objects);
45
46 template<class L, class R> inline
s_TooFar(const L & left,const R & right,double score)47 bool s_TooFar(const L& left, const R& right, double score)
48 {
49 if(left.MScore() == BadScore()) return true;
50 int len = right.Stop()-left.Stop();
51 return (len > kTooFarLen && score+left.MScore() < right.Score());
52 }
53
54 struct SRState
55 {
SRStateSRState56 SRState(const CHMM_State& l, const CHMM_State& r)
57 {
58 lsp = r.LeftState();
59 rsp = const_cast<CHMM_State*>(&r);
60 rsp->UpdateLeftState(l);
61 }
~SRStateSRState62 ~SRState() { rsp->UpdateLeftState(*lsp); }
63
64 const CHMM_State* lsp;
65 CHMM_State* rsp;
66 };
67
68 template<class Left, class Right> inline
s_EvaluateNewScore(const Left & left,const Right & right,double & rscore,bool & openrgn,bool rightanchor=false)69 bool s_EvaluateNewScore(const Left& left, const Right& right, double& rscore, bool& openrgn, bool rightanchor = false)
70 {
71
72 rscore = BadScore();
73
74 SRState rst(left,right);
75
76 int len = right.Stop()-left.Stop();
77 if(len > right.MaxLen()) return false;
78 if(!right.NoRightEnd() && len < right.MinLen()) return true;
79
80 double scr, score = 0;
81 if(left.isPlus())
82 {
83 scr = left.BranchScore(right);
84 if(scr == BadScore()) return true;
85 }
86 else
87 {
88 scr = right.BranchScore(left);
89 if(scr == BadScore()) return true;
90 scr += right.DenScore()-left.DenScore();
91 }
92 score += scr;
93
94 // this check is frame-dependent and MUST be after BranchScore
95 if(right.StopInside()) return false;
96
97 if(right.NoRightEnd() && !rightanchor) scr = right.ClosingLengthScore();
98 else scr = right.LengthScore();
99 if(scr == BadScore()) return true;
100 score += scr;
101
102 scr = right.RgnScore();
103 if(scr == BadScore()) return true;
104 score += scr;
105
106 if(!right.NoRightEnd())
107 {
108 scr = right.TermScore();
109 if(scr == BadScore()) return true;
110 score += scr;
111 }
112
113 openrgn = right.OpenRgn();
114 rscore = score;
115 return true;
116 }
117
118 // leftprot 0 no protein, 1 there is a protein
119 // rightprot 0 no protein, 1 must be a protein, -1 don't care
120 template<class L, class R> // template for all exons
s_ForwardStep(const L & left,R & right,int leftprot,int rightprot)121 inline bool s_ForwardStep(const L& left, R& right, int leftprot, int rightprot)
122 {
123 double score;
124 bool openrgn;
125 if(!s_EvaluateNewScore(left,right,score,openrgn)) return false;
126 else if(score == BadScore()) return true;
127
128 int protnum = right.GetSeqScores().ProtNumber(left.Stop(),right.Stop());
129
130 if(rightprot == 0 && (leftprot != 0 || protnum != 0)) // there is a protein
131 {
132 return true;
133 }
134
135 if(leftprot == 0)
136 {
137 if(rightprot == 1 && protnum == 0) return true; // no proteins
138 else if(protnum > 0) --protnum; // for first protein no penalty
139 }
140
141 if(left.Score() != BadScore() && openrgn)
142 {
143 double scr = score-protnum*right.GetSeqScores().MultiProtPenalty()+left.Score();
144 if(scr > right.Score())
145 {
146 right.UpdateLeftState(left);
147 right.UpdateScore(scr);
148 }
149 }
150
151 return openrgn;
152 }
153
154 template<class L>
s_ForwardStep(const L & left,CIntron & right)155 inline bool s_ForwardStep(const L& left, CIntron& right)
156 {
157 double score;
158 bool openrgn;
159 if(!s_EvaluateNewScore(left,right,score,openrgn)) return false;
160 else if(score == BadScore()) return true;
161
162 if(left.Score() != BadScore() && openrgn)
163 {
164 double scr = score+left.Score();
165 if(scr > right.Score())
166 {
167 right.UpdateLeftState(left);
168 right.UpdateScore(scr);
169 }
170 }
171
172 return (openrgn && !s_TooFar(left, right, score));
173 }
174
175 template<class L>
s_ForwardStep(const L & left,CIntergenic & right,bool rightanchor)176 inline bool s_ForwardStep(const L& left, CIntergenic& right, bool rightanchor)
177 {
178 double score;
179 bool openrgn;
180 if(!s_EvaluateNewScore(left,right,score,openrgn, rightanchor)) return false;
181 else if(score == BadScore()) return true;
182
183 if(left.Score() != BadScore() && openrgn)
184 {
185 double scr = score+left.Score();
186 if(scr > right.Score())
187 {
188 right.UpdateLeftState(left);
189 right.UpdateScore(scr);
190 }
191 }
192
193 return (openrgn && !s_TooFar(left, right, score));
194 }
195
196
197 template<class L, class R> // template for all exons
s_MakeStep(vector<L> & lvec,vector<R> & rvec,int leftprot,int rightprot)198 inline void s_MakeStep(vector<L>& lvec, vector<R>& rvec, int leftprot, int rightprot)
199 {
200 if(lvec.empty()) return;
201 typename vector<L>::reverse_iterator i = lvec.rbegin();
202 if(i->Stop() == rvec.back().Stop()) ++i;
203 for(;i != lvec.rend() && s_ForwardStep(*i,rvec.back(),leftprot,rightprot);++i);
204
205 if(rvec.size() > 1) {
206 rvec.back().UpdatePrevExon(rvec[rvec.size()-2]);
207 }
208 }
209
210 template<class L>
s_MakeStep(vector<L> & lvec,vector<CIntron> & rvec)211 inline void s_MakeStep(vector<L>& lvec, vector<CIntron>& rvec)
212 {
213 if(lvec.empty()) return;
214 CIntron& right = rvec.back();
215 typename vector<L>::reverse_iterator i = lvec.rbegin();
216
217 int rlimit = right.Stop();
218 if(i->Stop() == rlimit) ++i;
219 int nearlimit = max(0,rlimit-kTooFarLen);
220 while(i != lvec.rend() && i->Stop() >= nearlimit)
221 {
222 if(!s_ForwardStep(*i,right)) return;
223 ++i;
224 }
225 if(i == lvec.rend()) return;
226 for(const L* p = &*i; p !=0; p = p->PrevExon())
227 {
228 if(!s_ForwardStep(*p,right)) return;
229 }
230 }
231
232
233 template<class L>
s_MakeStep(const CSeqScores & seqscr,vector<L> & lvec,vector<CIntergenic> & rvec,bool rightanchor=false)234 inline void s_MakeStep(const CSeqScores& seqscr, vector<L>& lvec, vector<CIntergenic>& rvec, bool rightanchor = false)
235 {
236 if(lvec.empty()) return;
237 CIntergenic& right = rvec.back();
238 int i = lvec.size()-1;
239 int rlimit = right.Stop();
240 if(lvec[i].Stop() == rlimit) --i;
241 while(i >= 0 && lvec[i].Stop() >= seqscr.LeftAlignmentBoundary(right.Stop())) --i; // no intergenics in alignment
242 int nearlimit = max(0,rlimit-kTooFarLen);
243 while(i >= 0 && lvec[i].Stop() >= nearlimit)
244 {
245 if(!s_ForwardStep(lvec[i--],right, rightanchor)) return;
246 }
247 if(i < 0) return;
248 for(const L* p = &lvec[i]; p !=0; p = p->PrevExon())
249 {
250 if(!s_ForwardStep(*p,right, rightanchor)) return;
251 }
252 }
253
s_MakeStep(const CSeqScores & seqscr,const CExonParameters & exon_params,EStrand strand,int point,vector<CIntergenic> & lvec,vector<CSingleExon> & rvec)254 inline void s_MakeStep(const CSeqScores& seqscr, const CExonParameters& exon_params, EStrand strand, int point, vector<CIntergenic>& lvec, vector<CSingleExon>& rvec) // L - intergenic, R - single exon
255 {
256 rvec.push_back(CSingleExon(strand, point, seqscr, exon_params));
257 s_MakeStep(lvec, rvec, 0, -1);
258 if(rvec.back().Score() == BadScore()) rvec.pop_back();
259 }
260
261 template<class R>
s_MakeStep(const CSeqScores & seqscr,const CExonParameters & exon_params,EStrand strand,int point,vector<CIntergenic> & lvec,vector<R> rvec[][3])262 inline void s_MakeStep(const CSeqScores& seqscr, const CExonParameters& exon_params, EStrand strand, int point, vector<CIntergenic>& lvec, vector<R> rvec[][3]) // L - intergenic, R - first/last exon
263 {
264 for(int kr = 0; kr < 2; ++kr)
265 {
266 for(int phr = 0; phr < 3; ++phr)
267 {
268 rvec[kr][phr].push_back(R(strand,phr,point,seqscr,exon_params));
269 s_MakeStep(lvec, rvec[kr][phr], 0, kr);
270 if(rvec[kr][phr].back().Score() == BadScore()) rvec[kr][phr].pop_back();
271 }
272 }
273 }
274
275 template<class R>
s_MakeStep(const CSeqScores & seqscr,const CExonParameters & exon_params,EStrand strand,int point,vector<CIntron> lvec[][3],vector<R> & rvec)276 inline void s_MakeStep(const CSeqScores& seqscr, const CExonParameters& exon_params, EStrand strand, int point, vector<CIntron> lvec[][3], vector<R>& rvec) // L - intron R - first/last exon
277 {
278 rvec.push_back(R(strand,0,point,seqscr,exon_params)); // phase is bogus, it will be redefined in constructor
279 for(int kl = 0; kl < 2; ++kl)
280 {
281 for(int phl = 0; phl < 3; ++phl)
282 {
283 s_MakeStep(lvec[kl][phl], rvec, kl, -1);
284 }
285 }
286 if(rvec.back().Score() == BadScore()) rvec.pop_back();
287 }
288
s_MakeStep(const CSeqScores & seqscr,const CExonParameters & exon_params,EStrand strand,int point,vector<CIntron> lvec[][3],vector<CInternalExon> rvec[][3])289 inline void s_MakeStep(const CSeqScores& seqscr, const CExonParameters& exon_params, EStrand strand, int point, vector<CIntron> lvec[][3], vector<CInternalExon> rvec[][3]) // L - intron, R - internal exon
290 {
291 for(int phr = 0; phr < 3; ++phr)
292 {
293 for(int kr = 0; kr < 2; ++kr)
294 {
295 rvec[kr][phr].push_back(CInternalExon(strand,phr,point,seqscr,exon_params));
296 for(int phl = 0; phl < 3; ++phl)
297 {
298 for(int kl = 0; kl < 2; ++kl)
299 {
300 s_MakeStep(lvec[kl][phl], rvec[kr][phr], kl, kr);
301 }
302 }
303 if(rvec[kr][phr].back().Score() == BadScore()) rvec[kr][phr].pop_back();
304 }
305 }
306 }
307
308 template<class L1, class L2>
s_MakeStep(const CSeqScores & seqscr,const CIntronParameters & intron_params,EStrand strand,int point,vector<L1> lvec1[][3],vector<L2> lvec2[][3],vector<CIntron> rvec[][3],int shift)309 inline void s_MakeStep(const CSeqScores& seqscr, const CIntronParameters& intron_params, EStrand strand, int point, vector<L1> lvec1[][3], vector<L2> lvec2[][3], vector<CIntron> rvec[][3], int shift) // L1,L2 - exons, R -intron
310 {
311 for(int k = 0; k < 2; ++k)
312 {
313 for(int phl = 0; phl < 3; ++phl)
314 {
315 int phr = (shift+phl)%3;
316 rvec[k][phr].push_back(CIntron(strand,phr,point,seqscr,intron_params));
317 if(k == 1) rvec[k][phr].back().UpdateScore(BadScore()); // proteins don't come from outside
318 s_MakeStep(lvec1[k][phl], rvec[k][phr]);
319 s_MakeStep(lvec2[k][phl], rvec[k][phr]);
320 if(rvec[k][phr].back().Score() == BadScore()) rvec[k][phr].pop_back();
321 }
322 }
323 }
324
AddProbabilities(double scr1,double scr2)325 double AddProbabilities(double scr1, double scr2)
326 {
327 if(scr1 == BadScore()) return scr2;
328 else if(scr2 == BadScore()) return scr1;
329 else if(scr1 >= scr2) return scr1+log(1+exp(scr2-scr1));
330 else return scr2+log(1+exp(scr1-scr2));
331 }
332
CParse(const CSeqScores & ss,const CIntronParameters & intron_params,const CIntergenicParameters & intergenic_params,const CExonParameters & exon_params,bool leftanchor,bool rightanchor)333 CParse::CParse(const CSeqScores& ss,
334 const CIntronParameters& intron_params,
335 const CIntergenicParameters& intergenic_params,
336 const CExonParameters& exon_params,
337 bool leftanchor, bool rightanchor)
338 : m_seqscr(ss)
339 {
340 try
341 {
342 m_igplus.reserve(m_seqscr.StartNumber(ePlus)+1);
343 m_igminus.reserve(m_seqscr.StopNumber(eMinus)+1);
344 m_feminus.reserve(m_seqscr.StartNumber(eMinus)+1);
345 m_leplus.reserve(m_seqscr.StopNumber(ePlus)+1);
346 m_seplus.reserve(m_seqscr.StopNumber(ePlus)+2);
347 m_seminus.reserve(m_seqscr.StartNumber(eMinus)+1);
348 for(int k = 0; k < 2; ++k)
349 {
350 for(int phase = 0; phase < 3; ++phase)
351 {
352 m_inplus[k][phase].reserve(m_seqscr.AcceptorNumber(ePlus)+1);
353 m_inminus[k][phase].reserve(m_seqscr.DonorNumber(eMinus)+1);
354 m_feplus[k][phase].reserve(m_seqscr.DonorNumber(ePlus)+1);
355 m_ieplus[k][phase].reserve(m_seqscr.DonorNumber(ePlus)+1);
356 m_ieminus[k][phase].reserve(m_seqscr.AcceptorNumber(eMinus)+1);
357 m_leminus[k][phase].reserve(m_seqscr.AcceptorNumber(eMinus)+1);
358 }
359 }
360 }
361 catch(bad_alloc)
362 {
363 NCBI_THROW(CGnomonException, eMemoryLimit, "Not enough memory in CParse");
364 }
365
366 int len = m_seqscr.SeqLen();
367
368 double BigScore = 10000;
369 if (leftanchor) {
370 m_seplus.push_back(CSingleExon(ePlus,0,m_seqscr,exon_params));
371 m_seplus.back().UpdateScore(BigScore); // this is a bogus anchor to unforce intergenic start at 0
372 }
373
374 for(int i = 0; i < len; ++i)
375 {
376 if(m_seqscr.AcceptorScore(i,ePlus) != BadScore())
377 {
378 s_MakeStep(m_seqscr, intron_params, ePlus,i,m_ieplus,m_feplus,m_inplus,1);
379 }
380
381 if(m_seqscr.AcceptorScore(i,eMinus) != BadScore())
382 {
383 s_MakeStep(m_seqscr, exon_params, eMinus,i,m_inminus,m_ieminus);
384 s_MakeStep(m_seqscr, exon_params, eMinus,i,m_igminus,m_leminus);
385 }
386
387 if(m_seqscr.DonorScore(i,ePlus) != BadScore())
388 {
389 s_MakeStep(m_seqscr, exon_params, ePlus,i,m_inplus,m_ieplus);
390 s_MakeStep(m_seqscr, exon_params, ePlus,i,m_igplus,m_feplus);
391 }
392
393 if(m_seqscr.DonorScore(i,eMinus) != BadScore())
394 {
395 s_MakeStep(m_seqscr, intron_params, eMinus,i,m_leminus,m_ieminus,m_inminus,0);
396 }
397
398 if(m_seqscr.StartScore(i,ePlus) != BadScore())
399 {
400 m_igplus.push_back(CIntergenic(ePlus,i,m_seqscr,intergenic_params));
401 s_MakeStep(m_seqscr, m_seplus,m_igplus);
402 s_MakeStep(m_seqscr, m_seminus,m_igplus);
403 s_MakeStep(m_seqscr, m_leplus,m_igplus);
404 s_MakeStep(m_seqscr, m_feminus,m_igplus);
405 }
406
407 if(m_seqscr.StartScore(i,eMinus) != BadScore())
408 {
409 s_MakeStep(m_seqscr, exon_params, eMinus,i,m_inminus,m_feminus);
410 s_MakeStep(m_seqscr, exon_params, eMinus,i,m_igminus,m_seminus);
411 }
412
413 if(m_seqscr.StopScore(i,ePlus) != BadScore())
414 {
415 s_MakeStep(m_seqscr, exon_params, ePlus,i,m_inplus,m_leplus);
416 s_MakeStep(m_seqscr, exon_params, ePlus,i,m_igplus,m_seplus);
417 }
418
419 if(m_seqscr.StopScore(i,eMinus) != BadScore())
420 {
421 m_igminus.push_back(CIntergenic(eMinus,i,m_seqscr,intergenic_params));
422 s_MakeStep(m_seqscr, m_seplus,m_igminus);
423 s_MakeStep(m_seqscr, m_seminus,m_igminus);
424 s_MakeStep(m_seqscr, m_leplus,m_igminus);
425 s_MakeStep(m_seqscr, m_feminus,m_igminus);
426 }
427 }
428
429 m_igplus.push_back(CIntergenic(ePlus,-1,m_seqscr,intergenic_params)); // never is popped
430 s_MakeStep(m_seqscr, m_seplus,m_igplus,rightanchor);
431 s_MakeStep(m_seqscr, m_seminus,m_igplus,rightanchor);
432 s_MakeStep(m_seqscr, m_leplus,m_igplus,rightanchor);
433 s_MakeStep(m_seqscr, m_feminus,m_igplus,rightanchor);
434
435 m_igminus.push_back(CIntergenic(eMinus,-1,m_seqscr,intergenic_params)); // never is popped
436 s_MakeStep(m_seqscr, m_seplus,m_igminus,rightanchor);
437 s_MakeStep(m_seqscr, m_seminus,m_igminus,rightanchor);
438 s_MakeStep(m_seqscr, m_leplus,m_igminus,rightanchor);
439 s_MakeStep(m_seqscr, m_feminus,m_igminus,rightanchor);
440
441 m_igplus.back().UpdateScore(AddProbabilities(m_igplus.back().Score(),m_igminus.back().Score()));
442
443 const CHMM_State* p = &m_igplus.back();
444
445 if (!rightanchor) {
446 s_MakeStep(m_seqscr, intron_params, ePlus,-1,m_ieplus,m_feplus,m_inplus,1); // may be popped
447 s_MakeStep(m_seqscr, intron_params, eMinus,-1,m_leminus,m_ieminus,m_inminus,0); // may be popped
448
449 for(int k = 0; k < 2; ++k) {
450 for(int ph = 0; ph < 3; ++ph) {
451 if(!m_inplus[k][ph].empty() && m_inplus[k][ph].back().NoRightEnd() && m_inplus[k][ph].back().Score() > p->Score()) p = &m_inplus[k][ph].back();
452 if(!m_inminus[k][ph].empty() && m_inminus[k][ph].back().NoRightEnd() && m_inminus[k][ph].back().Score() > p->Score()) p = &m_inminus[k][ph].back();
453 }
454 }
455 }
456 m_path = p;
457
458 if(leftanchor) { // return scores to normal and forget the connection to the left bogus anchor
459 for(p = m_path ; p != 0; p = p->LeftState()) {
460 const_cast<CHMM_State*>(p)->UpdateScore(p->Score()-BigScore);
461 if(p->LeftState() == &m_seplus[0])
462 const_cast<CHMM_State*>(p)->ClearLeftState();
463 }
464 }
465 }
466
Out(T t,int w,CNcbiOstream & to=cout)467 template<class T> void Out(T t, int w, CNcbiOstream& to = cout)
468 {
469 to.width(w);
470 to.setf(IOS_BASE::right,IOS_BASE::adjustfield);
471 to << t;
472 }
473
Out(double t,int w,CNcbiOstream & to=cout,int prec=1)474 void Out(double t, int w, CNcbiOstream& to = cout, int prec = 1)
475 {
476 to.width(w);
477 to.setf(IOS_BASE::right,IOS_BASE::adjustfield);
478 to.setf(IOS_BASE::fixed,IOS_BASE::floatfield);
479 to.precision(prec);
480
481 if(t > 1000000000) to << "+Inf";
482 else if(t < -1000000000) to << "-Inf";
483 else to << t;
484 }
485
AddSupport(const TGeneModelList & align_list,TSignedSeqRange inside_range,CGeneModel & gene,TSignedSeqRange reading_frame)486 void AddSupport(const TGeneModelList& align_list, TSignedSeqRange inside_range, CGeneModel& gene, TSignedSeqRange reading_frame)
487 {
488 CGeneModel support;
489 support.SetStrand(gene.Strand());
490 const CGeneModel* supporting_align = NULL;
491
492 ITERATE(TGeneModelList, it, align_list) {
493 const CGeneModel& algn(*it);
494
495 if ((algn.Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0) continue;
496 if(!Include(inside_range,algn.MaxCdsLimits())) continue;
497
498 if ((algn.ReadingFrame().Empty()?algn.Limits():algn.ReadingFrame()).IntersectingWith(reading_frame)) {
499 support.Extend(algn, false);
500 support.AddSupport(CSupportInfo(algn.ID(),false));
501 supporting_align = &algn;
502
503 if((algn.Status()&CGeneModel::ePolyA) != 0)
504 gene.Status() |= CGeneModel::ePolyA;
505
506 ITERATE(list< CRef<CSeq_id> >, i, algn.TrustedmRNA()) {
507 gene.InsertTrustedmRNA(*i);
508 }
509 ITERATE(list< CRef<CSeq_id> >, i, algn.TrustedProt()) {
510 gene.InsertTrustedProt(*i);
511 }
512
513 }
514 }
515
516 if (!support.Exons().empty()) {
517 _ASSERT( supporting_align != NULL );
518 gene.FrameShifts() = support.FrameShifts();
519 // CCDSInfo cds_info;
520 // cds_info.SetReadingFrame(reading_frame);
521 // gene.SetCdsInfo(cds_info);
522
523 // gene.Extend(support, support.Continuous());
524 gene.Extend(support, false);
525 gene.ReplaceSupport(support.Support());
526 if ( Include(support.Limits(),gene.Limits()) && support.Continuous() ) {
527 if (gene.Support().size()==1)
528 gene = *supporting_align;
529 gene.Status() |= CGeneModel::eFullSupCDS;
530 }
531 }
532 #ifdef _DEBUG
533 ITERATE(TGeneModelList, it, align_list) {
534 const CGeneModel& algn(*it);
535
536 if ((algn.Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0) continue;
537 if(!Include(inside_range,algn.MaxCdsLimits())) continue;
538
539 if ((algn.ReadingFrame().Empty()?algn.Limits():algn.ReadingFrame()).IntersectingWith(reading_frame)) {
540 _ASSERT( algn.isCompatible(gene) );
541 }
542 }
543 #endif
544 }
545
GetGenes() const546 TGeneModelList CParse::GetGenes() const
547 {
548 TGeneModelList genes;
549 vector< vector<const CExon*> > gen_exons;
550 const CAlignMap& seq_map = m_seqscr.FrameShiftedSeqMap();
551
552 if(dynamic_cast<const CIntron*>(Path()) && Path()->LeftState())
553 gen_exons.push_back(vector<const CExon*>());
554
555 for(const CHMM_State* p = Path(); p != 0; p = p->LeftState())
556 if(p->isExon()) {
557 if(p->isGeneRightEnd())
558 gen_exons.push_back(vector<const CExon*>());
559
560 gen_exons.back().push_back(static_cast<const CExon*>(p));
561 }
562
563 const EResidue* seqp = m_seqscr.SeqPtr(0, ePlus);
564
565 ITERATE(vector< vector<const CExon*> >, g_it, gen_exons) {
566 EStrand strand = g_it->back()->Strand();
567 genes.push_front(CGeneModel(strand,0,CGeneModel::eGnomon));
568 CGeneModel& gene = genes.front();
569
570 double score = 0;
571
572 CGeneModel local_gene(strand);
573 for(int i = (int)g_it->size()-1; i >= 0; --i) {
574 const CExon* pe = (*g_it)[i];
575 // for (vector<const CExon*>::const_reverse_iterator e_it = g_it->rbegin(); e_it != g_it->rend(); ++e_it) {
576 // const CExon* pe = *e_it;
577
578 TSignedSeqRange local_exon(pe->Start(),pe->Stop());
579 local_gene.AddExon(local_exon);
580 TSignedSeqRange exon = seq_map.MapRangeEditedToOrig(local_exon);
581
582 string fsig, ssig;
583 if(i != (int)g_it->size()-1) {
584 fsig.push_back(toACGT(seqp[local_exon.GetFrom()-2]));
585 fsig.push_back(toACGT(seqp[local_exon.GetFrom()-1]));
586 }
587 if(i != 0) {
588 ssig.push_back(toACGT(seqp[local_exon.GetTo()+1]));
589 ssig.push_back(toACGT(seqp[local_exon.GetTo()+2]));
590 }
591 if(pe->isMinus()) {
592 ReverseComplement(fsig.begin(),fsig.end());
593 ReverseComplement(ssig.begin(),ssig.end());
594 }
595
596 gene.AddExon(exon, fsig, ssig);
597
598 score += pe->RgnScore()+pe->ExonScore();
599 }
600 CAlignMap localmap(local_gene.Exons(), local_gene.FrameShifts(), local_gene.Strand());
601
602 TSignedSeqRange local_reading_frame(g_it->back()->Start(), g_it->front()->Stop());
603
604 if (!g_it->back()->isGeneLeftEnd()) {
605 const CExon* pe = g_it->back();
606 int estart = pe->Start();
607 int estop = pe->Stop();
608 int rframe = pe->Phase();
609 int lframe = pe->isPlus() ? (rframe-(estop-estart))%3 : (rframe+(estop-estart))%3;
610 if (lframe < 0)
611 lframe += 3;
612 int del = pe->isPlus() ? (3-lframe)%3 : (1+lframe)%3;
613
614 if(localmap.FShiftedLen(local_reading_frame) <= del) {
615 genes.pop_front();
616 continue;
617 }
618
619 local_reading_frame.SetFrom(localmap.FShiftedMove(local_reading_frame.GetFrom(),del)); // do it hard way in case we will get into next exon
620 }
621 if (!g_it->front()->isGeneRightEnd()) {
622 const CExon* pe = g_it->front();
623 int rframe = pe->Phase();
624 int del = pe->isPlus() ? (1+rframe)%3 : (3-rframe)%3;
625
626 if(localmap.FShiftedLen(local_reading_frame) <= del) {
627 genes.pop_front();
628 continue;
629 }
630
631 local_reading_frame.SetTo(localmap.FShiftedMove(local_reading_frame.GetTo(),-del)); // do it hard way in case we will get into next exon
632 }
633
634 if (local_reading_frame.Empty()) {
635 genes.pop_front();
636 continue;
637 }
638
639 TSignedSeqRange reading_frame = seq_map.MapRangeEditedToOrig(local_reading_frame, true);
640 _ASSERT(reading_frame.NotEmpty() && reading_frame.GetFrom() >= 0 && reading_frame.GetTo() >= 0);
641 _ASSERT(gene.Limits().GetFrom() <= reading_frame.GetFrom() && gene.FShiftedLen(gene.Limits().GetFrom(),reading_frame.GetFrom(),false) <=3);
642 _ASSERT(gene.Limits().GetTo() >= reading_frame.GetTo() && gene.FShiftedLen(reading_frame.GetTo(),gene.Limits().GetTo(),false) <= 3);
643
644 AddSupport(m_seqscr.Alignments(),
645 TSignedSeqRange(m_seqscr.From(),m_seqscr.To()),
646 gene, reading_frame);
647
648 TSignedSeqRange start, stop;
649 CAlignMap gene_map = gene.GetAlignMap();
650 if (g_it->back()->isGeneLeftEnd()) {
651 int utr_len = gene_map.FShiftedLen(TSignedSeqRange(gene.Limits().GetFrom(),reading_frame.GetFrom()), CAlignMap::eSinglePoint, CAlignMap::eLeftEnd) - 1;
652 if(utr_len >= 3 || m_seqscr.isReadingFrameLeftEnd(local_reading_frame.GetFrom(),strand)) { // extend gene only if start/stop is conventional
653 if (utr_len < 3 ) {
654 gene.ExtendLeft(3-utr_len);
655 gene_map = gene.GetAlignMap();
656 }
657
658 TSignedSeqRange rf = gene_map.MapRangeOrigToEdited(reading_frame, true);
659 TSignedSeqRange stt(rf.GetFrom()-3,rf.GetFrom()-1);
660 TSignedSeqRange stp(rf.GetTo()+1,rf.GetTo()+3);
661 if (strand == eMinus)
662 swap(stt,stp);
663 start = gene_map.MapRangeEditedToOrig(stt, false);
664 }
665 }
666
667 if (g_it->front()->isGeneRightEnd()) {
668 int utr_len = gene_map.FShiftedLen(TSignedSeqRange(reading_frame.GetTo(),gene.Limits().GetTo()), CAlignMap::eRightEnd, CAlignMap::eSinglePoint) - 1;
669 if(utr_len >= 3 || m_seqscr.isReadingFrameRightEnd(local_reading_frame.GetTo(),strand)) { // extend gene only if start/stop is conventional
670 if (utr_len < 3 ) {
671 gene.ExtendRight(3-utr_len);
672 gene_map = gene.GetAlignMap();
673 }
674
675 TSignedSeqRange rf = gene_map.MapRangeOrigToEdited(reading_frame, true);
676 TSignedSeqRange stt(rf.GetFrom()-3,rf.GetFrom()-1);
677 TSignedSeqRange stp(rf.GetTo()+1,rf.GetTo()+3);
678 if (strand == eMinus)
679 swap(stt,stp);
680 stop = gene_map.MapRangeEditedToOrig(stp, false);
681 }
682 }
683
684 if (strand == eMinus)
685 swap(start,stop);
686
687 CCDSInfo cds_info;
688 if (gene.GetCdsInfo().ProtReadingFrame().NotEmpty())
689 cds_info.SetReadingFrame(gene.GetCdsInfo().ProtReadingFrame(), true);
690 cds_info.SetReadingFrame(reading_frame);
691 if (start.NotEmpty())
692 cds_info.SetStart(start, gene.ConfirmedStart() && start == gene.GetCdsInfo().Start());
693 if (stop.NotEmpty())
694 cds_info.SetStop(stop, gene.ConfirmedStop());
695
696 ITERATE(CCDSInfo::TPStops,s,gene.GetCdsInfo().PStops())
697 cds_info.AddPStop(*s);
698
699 cds_info.SetScore(score);
700 gene.SetCdsInfo(cds_info);
701 }
702
703 return genes;
704 }
705
PrintInfo() const706 void CParse::PrintInfo() const
707 {
708 vector<const CHMM_State*> states;
709 for(const CHMM_State* p = Path(); p != 0; p = p->LeftState()) states.push_back(p);
710 reverse(states.begin(),states.end());
711 const CAlignMap& seq_map = m_seqscr.FrameShiftedSeqMap();
712
713 Out(" ",15);
714 Out("Start",11);
715 Out("Stop",11);
716 Out("Score",8);
717 Out("BrScr",8);
718 Out("LnScr",8);
719 Out("RgnScr",8);
720 Out("TrmScr",8);
721 Out("AccScr",8);
722 cout << endl;
723
724 for(int i = 0; i < (int)states.size(); ++i)
725 {
726 const CHMM_State* p = states[i];
727 if(dynamic_cast<const CIntergenic*>(p)) cout << endl;
728
729 Out(p->GetStateName(),13);
730 Out(p->isPlus() ? '+' : '-',2);
731 int a = seq_map.MapEditedToOrig(p->Start());
732 int b = seq_map.MapEditedToOrig(p->Stop());
733 Out(a,11);
734 Out(b,11);
735 SStateScores sc = p->GetStateScores();
736 Out(sc.m_score,8);
737 Out(sc.m_branch,8);
738 Out(sc.m_length,8);
739 Out(sc.m_region,8);
740 Out(sc.m_term,8);
741 Out(p->Score(),8);
742 cout << endl;
743 }
744 }
745
746
747 END_SCOPE(gnomon)
748 END_NCBI_SCOPE
749