1 /* $Id: score.cpp 626289 2021-02-25 14:09:07Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software / database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software / database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alexandre Souvorov
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <algo/gnomon/gnomon.hpp>
34 #include <algo/gnomon/chainer.hpp>
35 #include <algo/gnomon/gnomon_exception.hpp>
36 #include "score.hpp"
37 #include "hmm.hpp"
38 #include "hmm_inlines.hpp"
39 #include "gnomon_engine.hpp"
40
41 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(gnomon)42 BEGIN_SCOPE(gnomon)
43
44 bool CSeqScores::isStart(int i, int strand) const
45 {
46 const CEResidueVec& ss = m_seq[strand];
47 int ii = (strand == ePlus) ? i : SeqLen()-1-i;
48 if(ii < 0 || ii+2 >= SeqLen()) return false; //out of range
49 else if(ss[ii] != enA || ss[ii+1] != enT || ss[ii+2] != enG) return false;
50 else return true;
51 }
52
isStop(int i,int strand) const53 bool CSeqScores::isStop(int i, int strand) const
54 {
55 const CEResidueVec& ss = m_seq[strand];
56 int ii = (strand == ePlus) ? i : SeqLen()-1-i;
57 if(ii < 0 || ii+2 >= SeqLen()) return false; //out of range
58 if((ss[ii] != enT || ss[ii+1] != enA || ss[ii+2] != enA) &&
59 (ss[ii] != enT || ss[ii+1] != enA || ss[ii+2] != enG) &&
60 (ss[ii] != enT || ss[ii+1] != enG || ss[ii+2] != enA)) return false;
61 else return true;
62 }
63
isReadingFrameLeftEnd(int i,int strand) const64 bool CSeqScores::isReadingFrameLeftEnd(int i, int strand) const
65 {
66 if(strand == ePlus) {
67 return isStart(i-3, strand);
68 } else {
69 return isStop(i-1,strand);
70 }
71 }
72
isReadingFrameRightEnd(int i,int strand) const73 bool CSeqScores::isReadingFrameRightEnd(int i, int strand) const
74 {
75 if(strand == ePlus) {
76 return isStop(i+1, strand);
77 } else {
78 return isStart(i+3,strand);
79 }
80 }
81
isAG(int i,int strand) const82 bool CSeqScores::isAG(int i, int strand) const
83 {
84 const CEResidueVec& ss = m_seq[strand];
85 int ii = (strand == ePlus) ? i : SeqLen()-1-i;
86 if(ii-1 < 0 || ii >= SeqLen()) return false; //out of range
87 if(ss[ii-1] != enA || ss[ii] != enG) return false;
88 else return true;
89 }
90
isGT(int i,int strand) const91 bool CSeqScores::isGT(int i, int strand) const
92 {
93 const CEResidueVec& ss = m_seq[strand];
94 int ii = (strand == ePlus) ? i : SeqLen()-1-i;
95 if(ii < 0 || ii+1 >= SeqLen()) return false; //out of range
96 if(ss[ii] != enG || ss[ii+1] != enT) return false;
97 else return true;
98 }
99
isConsensusIntron(int i,int j,int strand) const100 bool CSeqScores::isConsensusIntron(int i, int j, int strand) const
101 {
102 if(strand == ePlus) return (m_dscr[ePlus][i-1] != BadScore()) && (m_ascr[ePlus][j] != BadScore());
103 else return (m_ascr[eMinus][i-1] != BadScore()) && (m_dscr[eMinus][j] != BadScore());
104 // if(strand == ePlus) return isGT(i,strand) && isAG(j,strand);
105 // else return isAG(i,strand) && isGT(j,strand);
106 }
107
SeqPtr(int i,int strand) const108 const EResidue* CSeqScores::SeqPtr(int i, int strand) const
109 {
110 const CEResidueVec& ss = m_seq[strand];
111 int ii = (strand == ePlus) ? i : SeqLen()-1-i;
112 return &ss.front()+ii;
113 }
114
StopInside(int a,int b,int strand,int frame) const115 bool CSeqScores::StopInside(int a, int b, int strand, int frame) const
116 {
117 return (a <= m_laststop[strand][frame][b]);
118 }
119
OpenCodingRegion(int a,int b,int strand,int frame) const120 bool CSeqScores::OpenCodingRegion(int a, int b, int strand, int frame) const
121 {
122 return (a > m_notinexon[strand][frame][b]);
123 }
124
OpenNonCodingRegion(int a,int b,int strand) const125 bool CSeqScores::OpenNonCodingRegion(int a, int b, int strand) const
126 {
127 return (a > m_notinintron[strand][b]);
128 }
129
CodingScore(int a,int b,int strand,int frame) const130 double CSeqScores::CodingScore(int a, int b, int strand, int frame) const
131 {
132 if(a > b) return 0; // for splitted start/stop
133 double score = m_cdrscr[strand][frame][b];
134 if(a > 0) score -= m_cdrscr[strand][frame][a-1];
135 return score;
136 }
137
NonCodingScore(int a,int b,int strand) const138 double CSeqScores::NonCodingScore(int a, int b, int strand) const
139 {
140 double score = m_ncdrscr[strand][b];
141 if(a > 0) score -= m_ncdrscr[strand][a-1];
142 return score;
143 }
144
OpenIntergenicRegion(int a,int b) const145 bool CSeqScores::OpenIntergenicRegion(int a, int b) const
146 {
147 return (a > m_notining[b]);
148 }
149
IntergenicScore(int a,int b,int strand) const150 double CSeqScores::IntergenicScore(int a, int b, int strand) const
151 {
152 _ASSERT(strand == 0 || strand == 1);
153 _ASSERT(b>= 0 && b < (int)m_ingscr[strand].size());
154 double score = m_ingscr[strand][b];
155 if(a > 0) {
156 _ASSERT(a-1 < (int)m_ingscr[strand].size());
157 score -= m_ingscr[strand][a-1];
158 }
159 return score;
160 }
161
ConstructSequenceAndMaps(const TGeneModelList & aligns,const CResidueVec & original_sequence)162 CResidueVec CSeqScores::ConstructSequenceAndMaps(const TGeneModelList& aligns, const CResidueVec& original_sequence)
163 {
164 TSignedSeqRange chunk(m_chunk_start, m_chunk_stop);
165 ITERATE(TGeneModelList, it, aligns) {
166 const CGeneModel& align = *it;
167 if (Include(chunk, align.MaxCdsLimits()))
168 m_fshifts.insert(m_fshifts.end(),align.FrameShifts().begin(),align.FrameShifts().end());
169 }
170 sort(m_fshifts.begin(),m_fshifts.end());
171 m_map = CAlignMap(m_chunk_start, m_chunk_stop, m_fshifts.begin(), m_fshifts.end());
172 CResidueVec sequence;
173 m_map.EditedSequence(original_sequence, sequence);
174
175 _ASSERT( m_map.MapEditedToOrig(0) == From() && m_map.MapOrigToEdited(From()) == 0 );
176 _ASSERT( m_map.MapEditedToOrig((int)sequence.size()-1) == To() && m_map.MapOrigToEdited(To()) == (int)sequence.size()-1 );
177
178 return sequence;
179 }
180
181 struct SAlignOrder
182 {
operator ()SAlignOrder183 bool operator() (const CGeneModel& a, const CGeneModel& b) const
184 {
185 TSignedSeqPos x = a.ReadingFrame().NotEmpty() ? a.ReadingFrame().GetFrom() : a.Limits().GetFrom();
186 TSignedSeqPos y = b.ReadingFrame().NotEmpty() ? b.ReadingFrame().GetFrom() : b.Limits().GetFrom();
187
188 if (x==y)
189 return a.ID()<b.ID();
190 return x < y;
191 }
192 };
193
194 typedef set<CGeneModel,SAlignOrder> TAlignSet;
195
196 struct CIndelMapper: public CRangeMapper {
CIndelMapperCIndelMapper197 CIndelMapper(const CAlignMap& seq_map):
198 m_seq_map(seq_map) {}
199
200 const CAlignMap& m_seq_map;
201
operator ()CIndelMapper202 TSignedSeqRange operator() (TSignedSeqRange r, bool withextras = true) const
203 {
204 // if(withextras)
205 // r = m_seq_map.ShrinkToRealPoints(r); // if exon starts/ends with insertion move to projectable points
206 return m_seq_map.MapRangeOrigToEdited(r, withextras);
207 }
208 };
209
210 struct Is_ID_equal {
211 Int8 m_id;
Is_ID_equalIs_ID_equal212 Is_ID_equal(Int8 id): m_id(id) {}
operator ()Is_ID_equal213 bool operator()(const CGeneModel& a)
214 {
215 return a.ID()==m_id;
216 }
217 };
218
AlignLeftLimit(const CGeneModel & a)219 TSignedSeqPos AlignLeftLimit(const CGeneModel& a)
220 {
221 return ( a.MaxCdsLimits().Empty() ? a.Limits().GetFrom() : a.MaxCdsLimits().GetFrom() );
222 }
223
s_AlignLeftLimitOrder(const CGeneModel & ap,const CGeneModel & bp)224 static bool s_AlignLeftLimitOrder(const CGeneModel& ap, const CGeneModel& bp)
225 {
226 return (AlignLeftLimit(ap) < AlignLeftLimit(bp));
227 }
228
229
CSeqScores(const CTerminal & a,const CTerminal & d,const CTerminal & stt,const CTerminal & stp,const CCodingRegion & cr,const CNonCodingRegion & ncr,const CNonCodingRegion & ing,const CIntronParameters & intron_params,TSignedSeqPos from,TSignedSeqPos to,const TGeneModelList & cls,const TInDels & initial_fshifts,double mpp,const CGnomonEngine & gnomon)230 CSeqScores::CSeqScores (const CTerminal& a, const CTerminal& d, const CTerminal& stt, const CTerminal& stp,
231 const CCodingRegion& cr, const CNonCodingRegion& ncr, const CNonCodingRegion& ing,
232 const CIntronParameters& intron_params,
233 TSignedSeqPos from, TSignedSeqPos to, const TGeneModelList& cls, const TInDels& initial_fshifts, double mpp, const CGnomonEngine& gnomon)
234 : m_acceptor(a), m_donor(d), m_start(stt), m_stop(stp), m_cdr(cr), m_ncdr(ncr), m_intrg(ing),
235 m_align_list(cls), m_fshifts(initial_fshifts), m_map(from,to), m_chunk_start(from), m_chunk_stop(to), m_mpp(mpp)
236 {
237 m_align_list.sort(s_AlignLeftLimitOrder);
238 NON_CONST_ITERATE(TGeneModelList, it, m_align_list) {
239 CGeneModel& align = *it;
240 CCDSInfo cds_info = align.GetCdsInfo();
241 bool fixed = false;
242 for(unsigned int i = 1; i < align.Exons().size(); ++i) {
243 if (!align.Exons()[i-1].m_ssplice || !align.Exons()[i].m_fsplice) {
244 int hole_len = align.Exons()[i].GetFrom()-align.Exons()[i-1].GetTo()-1;
245 if(hole_len <= intron_params.MinLen()) {
246 fixed = true;
247 TSignedSeqRange pstop(align.Exons()[i-1].GetTo(),align.Exons()[i].GetFrom()); // to make sure GetScore doesn't complain about "new" pstops
248 cds_info.AddPStop(pstop, CCDSInfo::eUnknown);
249 if(hole_len%3 != 0) {
250 Int8 p = align.Exons()[i-1].GetTo();
251 p = (p+align.Exons()[i].GetFrom())/2;
252 align.FrameShifts().push_back(CInDelInfo(p, hole_len%3, CInDelInfo::eIns));
253 }
254
255 CGeneModel a(align.Strand());
256 a.AddExon(TSignedSeqRange(align.Exons()[i-1].GetTo(),align.Exons()[i].GetFrom()));
257 align.Extend(a);
258
259 --i;
260 }
261 }
262 }
263 if(fixed) {
264 align.SetCdsInfo(cds_info);
265 sort(align.FrameShifts().begin(),align.FrameShifts().end());
266 gnomon.GetScore(align); // calculate possible new pstops
267 }
268 }
269 }
270
271 //void CSeqScores::Init( CResidueVec& original_sequence, bool repeats, bool leftwall, bool rightwall, double consensuspenalty, const CIntergenicParameters& intergenic_params,
Init(CResidueVec & original_sequence,bool leftwall,bool rightwall,double consensuspenalty,const CGnomonAnnotator_Base::TIntMap & notbridgeable_gaps_len,const CGnomonAnnotator_Base::TGgapInfo & ggapinfo)272 void CSeqScores::Init( CResidueVec& original_sequence, bool leftwall, bool rightwall, double consensuspenalty,
273 const CGnomonAnnotator_Base::TIntMap& notbridgeable_gaps_len, const CGnomonAnnotator_Base::TGgapInfo& ggapinfo)
274 {
275 CResidueVec sequence = ConstructSequenceAndMaps(m_align_list,original_sequence);
276
277 TSignedSeqPos len = sequence.size();
278
279 try {
280 m_notining.resize(len,-1);
281 m_inalign.resize(len,numeric_limits<int>::max());
282 m_protnum.resize(len,0);
283
284 for(int strand = 0; strand < 2; ++strand) {
285 m_seq[strand].resize(len);
286 m_ascr[strand].resize(len,BadScore());
287 m_dscr[strand].resize(len,BadScore());
288 m_sttscr[strand].resize(len,BadScore());
289 m_stpscr[strand].resize(len,BadScore());
290 m_ncdrscr[strand].resize(len,BadScore());
291 m_ingscr[strand].resize(len,BadScore());
292 m_notinintron[strand].resize(len,-1);
293 for(int frame = 0; frame < 3; ++frame) {
294 m_cdrscr[strand][frame].resize(len,BadScore());
295 m_laststop[strand][frame].resize(len,-1);
296 m_notinexon[strand][frame].resize(len,-1);
297 }
298 for(int ph = 0; ph < 2; ++ph) {
299 m_asplit[strand][ph].resize(len,0);
300 m_dsplit[strand][ph].resize(len,0);
301 }
302 }
303 } catch(bad_alloc) {
304 NCBI_THROW(CGnomonException, eMemoryLimit, "Not enough memory in CSeqScores");
305 }
306
307
308 //block not bridgeable gaps
309 ITERATE(CGnomonAnnotator_Base::TIntMap, ig, notbridgeable_gaps_len) {
310 int left = ig->first;
311 int right = ig->first+ig->second-1;
312 if(left > m_chunk_stop || right < m_chunk_start)
313 continue;
314
315 left = m_map.MapOrigToEdited(max(left,m_chunk_start));
316 _ASSERT(left >= 0);
317 right = m_map.MapOrigToEdited(min(right,m_chunk_stop));
318 _ASSERT(right >= 0);
319 for(int pnt = left; pnt <= right; ++pnt) {
320 m_notinexon[ePlus][0][pnt] = pnt;
321 m_notinexon[ePlus][1][pnt] = pnt;
322 m_notinexon[ePlus][2][pnt] = pnt;
323 m_notinexon[eMinus][0][pnt] = pnt;
324 m_notinexon[eMinus][1][pnt] = pnt;
325 m_notinexon[eMinus][2][pnt] = pnt;
326 m_notinintron[ePlus][pnt] = pnt;
327 m_notinintron[eMinus][pnt] = pnt;
328 }
329 }
330
331 ITERATE(CGnomonAnnotator_Base::TGgapInfo, ig, ggapinfo) { // block ab initio in ggaps
332 for(int jj = ig->first; jj <= ig->first + (int)ig->second->GetInDelV().length() - 1; ++jj) {
333 if(jj >= m_chunk_start && jj <= m_chunk_stop) {
334 int j = m_map.MapOrigToEdited(jj);
335 m_laststop[ePlus][0][j] = j;
336 m_laststop[ePlus][1][j] = j;
337 m_laststop[ePlus][2][j] = j;
338 m_laststop[eMinus][0][j] = j;
339 m_laststop[eMinus][1][j] = j;
340 m_laststop[eMinus][2][j] = j;
341 }
342 }
343 }
344
345 const int RepeatMargin = 25;
346 for(int rpta = 0; rpta < len; ) {
347 while(rpta < len && isupper(sequence[rpta]))
348 ++rpta;
349 int rptb = rpta;
350 while(rptb+1 < len && islower(sequence[rptb+1]))
351 ++rptb;
352 if(rptb-rpta+1 > 2*RepeatMargin) {
353 for(TSignedSeqPos i = rpta+RepeatMargin; i <= rptb-RepeatMargin; ++i) { // masking repeats
354 m_laststop[ePlus][0][i] = i;
355 m_laststop[ePlus][1][i] = i;
356 m_laststop[ePlus][2][i] = i;
357 m_laststop[eMinus][0][i] = i;
358 m_laststop[eMinus][1][i] = i;
359 m_laststop[eMinus][2][i] = i;
360 }
361 }
362 rpta = rptb+1;
363 }
364
365 for(TSignedSeqPos i = 0; i < len; ++i) {
366 char c = sequence[i];
367 EResidue l = fromACGT(c);
368 m_seq[ePlus][i] = l;
369 m_seq[eMinus][len-1-i] = k_toMinus[l];
370 }
371
372 const int NLimit = 6;
373 CEResidueVec::iterator it = search_n(m_seq[ePlus].begin(),m_seq[ePlus].end(),NLimit,enN);
374 while(it != m_seq[ePlus].end()) // masking big chunks of Ns
375 {
376 for(; it != m_seq[ePlus].end() && *it == enN; ++it)
377 {
378 int j = it-m_seq[ePlus].begin();
379 m_laststop[ePlus][0][j] = j;
380 m_laststop[ePlus][1][j] = j;
381 m_laststop[ePlus][2][j] = j;
382 m_laststop[eMinus][0][j] = j;
383 m_laststop[eMinus][1][j] = j;
384 m_laststop[eMinus][2][j] = j;
385 }
386 it = search_n(it,m_seq[ePlus].end(),NLimit,enN);
387 }
388
389 CIndelMapper mapper(m_map);
390
391 TAlignSet allaligns;
392
393 TSignedSeqRange chunk(m_chunk_start, m_chunk_stop);
394
395 ITERATE(TGeneModelList, it, m_align_list) {
396 const CGeneModel& origalign = *it;
397 TSignedSeqRange limits = origalign.Limits() & chunk;
398 if (limits.Empty() || (origalign.MaxCdsLimits().NotEmpty() && !Include(limits, origalign.MaxCdsLimits())))
399 continue;
400
401 CGeneModel align = origalign;
402 align.Clip(chunk, CGeneModel::eRemoveExons);
403
404 bool opposite = false;
405 if((align.Type() & CGeneModel::eNested)!=0 || (align.Type() & CGeneModel::eWall)!=0) {
406 ITERATE(TGeneModelList, jt, m_align_list) {
407 if(it == jt) continue;
408 const CGeneModel& aa = *jt;
409 if((aa.Type() & CGeneModel::eWall)==0 && (aa.Type() & CGeneModel::eNested)==0 &&
410 aa.MaxCdsLimits().NotEmpty() && aa.MaxCdsLimits().IntersectingWith(align.Limits())) {
411 opposite = true;
412 break;
413 }
414 }
415 }
416
417 align.Remap(mapper);
418 align.FrameShifts().clear();
419
420 EStrand strand = align.Strand();
421
422 if((align.Type()&CGeneModel::eWall)==0 && (align.Type()&CGeneModel::eNested)==0) {
423 CGeneModel al = align;
424 int extrabases = m_start.Left()-3; // bases before ATG needed for score;
425 int extraNs5p = 0; // extra Ns if too close to the end of contig
426 if(extrabases > al.Limits().GetFrom()) {
427 extraNs5p = extrabases - al.Limits().GetFrom();
428 if(al.Limits().GetFrom() > 0) al.ExtendLeft(al.Limits().GetFrom());
429 } else {
430 al.ExtendLeft(extrabases);
431 }
432 int extraNs3p = 0; // extra Ns if too close to the end of contig
433 if(al.Limits().GetTo()+extrabases > len-1) {
434 extraNs3p = al.Limits().GetTo()+extrabases-(len-1);
435 if(al.Limits().GetTo() < len-1) al.ExtendRight(len-1-al.Limits().GetTo());
436 } else {
437 al.ExtendRight(extrabases);
438 }
439 if(strand == eMinus) swap(extraNs5p,extraNs3p);
440
441 CEResidueVec mRNA;
442 CAlignMap mrnamap(al.GetAlignMap());
443 mrnamap.EditedSequence(m_seq[ePlus], mRNA);
444 mRNA.insert(mRNA.begin(),extraNs5p,enN);
445 mRNA.insert(mRNA.end(),extraNs3p,enN);
446
447 // Here we are after splitted start/stops but this loop will score nonsplitted starts/stops as well
448 // "starts/stops" crossing a hole will get some score also but they will be ignored because a hole is alwais incide CDS
449 for(int k = extraNs5p; k < (int)mRNA.size()-extraNs3p; ++k) {
450 int i;
451 if(strand == ePlus) {
452 i = mrnamap.MapEditedToOrig(k-extraNs5p+1)-1; // the position of G or right before the first coding exon if start is completely in another exon
453 if(align.MaxCdsLimits().Empty() || Include(align.MaxCdsLimits(),i))
454 m_sttscr[strand][i] = m_start.Score(mRNA,k);
455
456 i = mrnamap.MapEditedToOrig(k-extraNs5p); // the last position in the last coding exon (usually right before T)
457 if(align.MaxCdsLimits().Empty() || Include(align.MaxCdsLimits(),i))
458 m_stpscr[strand][i] = m_stop.Score(mRNA,k);
459 } else {
460 i = mrnamap.MapEditedToOrig(k-extraNs5p+1); // the last position in the last coding exon (usually right before G)
461 if(i >= 0 && (align.MaxCdsLimits().Empty() || Include(align.MaxCdsLimits(),i)))
462 m_sttscr[strand][i] = m_start.Score(mRNA,k);
463
464 i = mrnamap.MapEditedToOrig(k-extraNs5p)-1; // the position of T or right before the first coding exon if stop is completely in another exon
465 if(i >= 0 && (align.MaxCdsLimits().Empty() || Include(align.MaxCdsLimits(),i)))
466 m_stpscr[strand][i] = m_stop.Score(mRNA,k);
467 }
468 }
469 }
470
471 _ASSERT(align.FShiftedLen(align.ReadingFrame(), false)%3==0);
472
473 if(align.MaxCdsLimits().NotEmpty()) {
474 limits = align.MaxCdsLimits();
475 align.Clip(limits, CGeneModel::eRemoveExons);
476 } else {
477 limits = align.Limits();
478 }
479
480 _ASSERT( align.Exons().size() > 0 );
481
482 if((align.Type() & CGeneModel::eNested)!=0) {
483 // int a = max(0,int(limits.GetFrom())-intergenic_params.MinLen()+1);
484 // int b = min(len-1,limits.GetTo()+intergenic_params.MinLen()-1);
485 int a = limits.GetFrom();
486 int b = limits.GetTo();
487
488 if(opposite) {
489 for(int pnt = a; pnt <= b; ++pnt) { // allow prediction on the opposite strand of nested models
490 m_notinexon[strand][0][pnt] = pnt;
491 m_notinexon[strand][1][pnt] = pnt;
492 m_notinexon[strand][2][pnt] = pnt;
493 }
494 } else {
495 for(int pnt = a; pnt <= b; ++pnt) {
496 m_notinexon[ePlus][0][pnt] = pnt;
497 m_notinexon[ePlus][1][pnt] = pnt;
498 m_notinexon[ePlus][2][pnt] = pnt;
499 m_notinexon[eMinus][0][pnt] = pnt;
500 m_notinexon[eMinus][1][pnt] = pnt;
501 m_notinexon[eMinus][2][pnt] = pnt;
502 }
503 }
504 continue;
505 } else if((align.Type() & CGeneModel::eWall)!=0) {
506 // int a = max(0,int(limits.GetFrom())-intergenic_params.MinLen()+1);
507 // int b = min(len-1,limits.GetTo()+intergenic_params.MinLen()-1);
508 int a = limits.GetFrom();
509 int b = limits.GetTo();
510
511 if(opposite) {
512 for(int pnt = a; pnt <= b; ++pnt) { // allow prediction on the opposite strand of complete models
513 m_notinexon[strand][0][pnt] = pnt;
514 m_notinexon[strand][1][pnt] = pnt;
515 m_notinexon[strand][2][pnt] = pnt;
516
517 m_notinintron[strand][pnt] = pnt;
518 }
519 } else {
520 for(int pnt = a; pnt <= b; ++pnt) {
521 m_notinexon[ePlus][0][pnt] = pnt;
522 m_notinexon[ePlus][1][pnt] = pnt;
523 m_notinexon[ePlus][2][pnt] = pnt;
524 m_notinexon[eMinus][0][pnt] = pnt;
525 m_notinexon[eMinus][1][pnt] = pnt;
526 m_notinexon[eMinus][2][pnt] = pnt;
527
528 m_notinintron[ePlus][pnt] = pnt;
529 m_notinintron[eMinus][pnt] = pnt;
530 }
531 }
532 continue;
533 }
534
535 allaligns.insert(align);
536
537 ITERATE(CGeneModel::TExons, e, align.Exons()) {
538 for(TSignedSeqPos i = e->GetFrom(); i <= e->GetTo(); ++i) { // ignoring repeats and allowing Ns in alignmnets
539 m_laststop[strand][0][i] = -1;
540 m_laststop[strand][1][i] = -1;
541 m_laststop[strand][2][i] = -1;
542 }
543 }
544
545
546 if((align.Type() & CGeneModel::eProt)!=0)
547 m_protnum[align.ReadingFrame().GetFrom()] = 1;
548
549 if(align.ReadingFrame().NotEmpty()) { // enforcing frames of known CDSes and protein pieces
550 TSignedSeqPos right = align.ReadingFrame().GetTo();
551 int ff = (strand == ePlus) ? 2-right%3 : right%3; // see cdrscr
552 for(int frame = 0; frame < 3; ++frame) {
553 if(frame != ff)
554 m_notinexon[strand][frame][right] = right;
555 }
556
557 ITERATE(CGeneModel::TExons, e, align.Exons()) {
558 right = e->GetTo();
559 if (!e->m_ssplice && right < align.Limits().GetTo()) {
560 ff = (strand == ePlus) ? 2-right%3 : right%3;
561 for(int frame = 0; frame < 3; ++frame) {
562 if(frame != ff)
563 m_notinexon[strand][frame][right] = right;
564 }
565 }
566 }
567 }
568
569 if(align.ConfirmedStart()) { // enforcing confirmed start
570 int pnt;
571 if(align.Strand() == ePlus) {
572 pnt = align.ReadingFrame().GetFrom()-1;
573 } else {
574 pnt = align.ReadingFrame().GetTo()+1;
575 }
576 m_notinexon[ePlus][0][pnt] = pnt;
577 m_notinexon[ePlus][1][pnt] = pnt;
578 m_notinexon[ePlus][2][pnt] = pnt;
579 m_notinexon[eMinus][0][pnt] = pnt;
580 m_notinexon[eMinus][1][pnt] = pnt;
581 m_notinexon[eMinus][2][pnt] = pnt;
582
583 m_notinintron[ePlus][pnt] = pnt;
584 m_notinintron[eMinus][pnt] = pnt;
585 }
586
587 // restricting prediction to MaxCdsLimits if not infinite
588 if(align.GetCdsInfo().MaxCdsLimits().NotEmpty()) {
589 if(TSignedSeqRange::GetWholeFrom() < align.GetCdsInfo().MaxCdsLimits().GetFrom()) {
590 m_notinexon[ePlus][0][limits.GetFrom()] = limits.GetFrom();
591 m_notinexon[ePlus][1][limits.GetFrom()] = limits.GetFrom();
592 m_notinexon[ePlus][2][limits.GetFrom()] = limits.GetFrom();
593 m_notinintron[ePlus][limits.GetFrom()] = limits.GetFrom();
594 m_notinexon[eMinus][0][limits.GetFrom()] = limits.GetFrom();
595 m_notinexon[eMinus][1][limits.GetFrom()] = limits.GetFrom();
596 m_notinexon[eMinus][2][limits.GetFrom()] = limits.GetFrom();
597 m_notinintron[eMinus][limits.GetFrom()] = limits.GetFrom();
598 }
599 if(align.GetCdsInfo().MaxCdsLimits().GetTo() < TSignedSeqRange::GetWholeTo()) {
600 m_notinexon[ePlus][0][limits.GetTo()] = limits.GetTo();
601 m_notinexon[ePlus][1][limits.GetTo()] = limits.GetTo();
602 m_notinexon[ePlus][2][limits.GetTo()] = limits.GetTo();
603 m_notinintron[ePlus][limits.GetTo()] = limits.GetTo();
604 m_notinexon[eMinus][0][limits.GetTo()] = limits.GetTo();
605 m_notinexon[eMinus][1][limits.GetTo()] = limits.GetTo();
606 m_notinexon[eMinus][2][limits.GetTo()] = limits.GetTo();
607 m_notinintron[eMinus][limits.GetTo()] = limits.GetTo();
608 }
609 }
610 }
611
612 for(TSignedSeqPos i = 1; i < len; ++i)
613 m_protnum[i] += m_protnum[i-1];
614
615 TAlignSet::iterator it_prev = allaligns.end();
616
617 for(TAlignSet::iterator it = allaligns.begin(); it != allaligns.end(); ++it)
618 {
619 const CGeneModel& algn(*it);
620
621 if (it_prev != allaligns.end()) {
622 const CGeneModel& algn_prev(*it_prev);
623 if(algn.Limits().IntersectingWith(algn_prev.Limits())) {
624 //we can not handle overlapping alignments - at this point it is input error
625 CNcbiOstrstream ost;
626 ost << "MaxCDS of alignment " << algn.ID() << " intersects MaxCDS of alignment " << algn_prev.ID();
627 NCBI_THROW(CGnomonException, eGenericError, CNcbiOstrstreamToString(ost));
628 }
629 }
630 it_prev = it;
631
632 int strand = algn.Strand();
633 int otherstrand = (strand == ePlus) ? eMinus : ePlus;
634
635
636 for(unsigned int k = 1; k < algn.Exons().size(); ++k) // accept NONCONSENSUS alignment splices
637 {
638 if(algn.Exons()[k-1].m_ssplice)
639 {
640 int i = algn.Exons()[k-1].GetTo();
641 if(strand == ePlus) m_dscr[strand][i] = 0; // donor score on the last base of exon
642 else m_ascr[strand][i] = 0; // acceptor score on the last base of exon
643 }
644 if(algn.Exons()[k].m_fsplice)
645 {
646 int i = algn.Exons()[k].GetFrom();
647 if(strand == ePlus) m_ascr[strand][i-1] = 0; // acceptor score on the last base of intron
648 else m_dscr[strand][i-1] = 0; // donor score on the last base of intron
649 }
650 }
651
652 for(unsigned int k = 1; k < algn.Exons().size(); ++k) // enforsing introns
653 {
654 int introna = algn.Exons()[k-1].GetTo()+1;
655 int intronb = algn.Exons()[k].GetFrom()-1;
656
657 if(algn.Exons()[k-1].m_ssplice)
658 {
659 int pnt = introna;
660 m_notinexon[strand][0][pnt] = pnt;
661 m_notinexon[strand][1][pnt] = pnt;
662 m_notinexon[strand][2][pnt] = pnt;
663 }
664
665 if(algn.Exons()[k].m_fsplice)
666 {
667 int pnt = intronb;
668 m_notinexon[strand][0][pnt] = pnt;
669 m_notinexon[strand][1][pnt] = pnt;
670 m_notinexon[strand][2][pnt] = pnt;
671 }
672
673 if(algn.Exons()[k-1].m_ssplice && algn.Exons()[k].m_fsplice)
674 {
675 for(int pnt = introna; pnt <= intronb; ++pnt)
676 {
677 m_notinexon[strand][0][pnt] = pnt;
678 m_notinexon[strand][1][pnt] = pnt;
679 m_notinexon[strand][2][pnt] = pnt;
680 }
681 }
682 }
683
684 for(unsigned int k = 0; k < algn.Exons().size(); ++k) // enforcing exons
685 {
686 int exona = algn.Exons()[k].GetFrom();
687 int exonb = algn.Exons()[k].GetTo();
688 for(int pnt = exona; pnt <= exonb; ++pnt)
689 {
690 m_notinintron[strand][pnt] = pnt;
691 }
692 }
693
694 TSignedSeqPos a = algn.Limits().GetFrom();
695 TSignedSeqPos b = algn.Limits().GetTo();
696
697 for(TSignedSeqPos i = a; i <= b && i < len-1; ++i) // first and last points are conserved to deal with intergenics going outside
698 {
699 m_inalign[i] = max(TSignedSeqPos(1),a); // one CDS maximum per alignment
700 m_notinintron[otherstrand][i] = i; // no other strand models
701 m_notinexon[otherstrand][0][i] = i;
702 m_notinexon[otherstrand][1][i] = i;
703 m_notinexon[otherstrand][2][i] = i;
704 }
705
706 if(strand == ePlus) a += 3; // start is a part of intergenic!!!!
707 else b -= 3;
708
709 m_notining[b] = a; // at least one CDS per alignment
710 if(algn.ReadingFrame().NotEmpty()) {
711 for(TSignedSeqPos i = algn.ReadingFrame().GetFrom(); i <= algn.ReadingFrame().GetTo(); ++i)
712 m_notining[i] = i;
713 }
714 }
715
716
717 if(leftwall) m_notinintron[ePlus][0] = m_notinintron[eMinus][0] = 0; // no partials
718 if(rightwall) m_notinintron[ePlus][len-1] = m_notinintron[eMinus][len-1] = len-1;
719
720 for(int strand = 0; strand < 2; ++strand)
721 {
722 TIVec& nin = m_notinintron[strand];
723 for(TSignedSeqPos i = 1; i < len; ++i) nin[i] = max(nin[i-1],nin[i]);
724
725 for(int frame = 0; frame < 3; ++frame)
726 {
727 TIVec& nex = m_notinexon[strand][frame];
728 for(TSignedSeqPos i = 1; i < len; ++i) nex[i] = max(nex[i-1],nex[i]);
729 }
730 }
731 for(TSignedSeqPos i = 1; i < len; ++i) m_notining[i] = max(m_notining[i-1],m_notining[i]);
732
733 for(int strand = 0; strand < 2; ++strand)
734 {
735 CEResidueVec& s = m_seq[strand];
736
737 m_anum[strand] = 0;
738 m_dnum[strand] = 0;
739 m_sttnum[strand] = 0;
740 m_stpnum[strand] = 0;
741
742 if(strand == ePlus)
743 {
744 for(TSignedSeqPos i = 0; i < len; ++i)
745 {
746 m_ascr[strand][i] = max(m_ascr[strand][i],m_acceptor.Score(s,i));
747 m_dscr[strand][i] = max(m_dscr[strand][i],m_donor.Score(s,i));
748 /*
749 if(m_ascr[strand][i] != BadScore())
750 cout << "Acceptor\t" << i+1+chunk.GetFrom() << "\t" << m_ascr[strand][i] << "\t+" << endl;
751 if(m_dscr[strand][i] != BadScore())
752 cout << "Donor\t" << i+2+chunk.GetFrom() << "\t" << m_dscr[strand][i] << "\t+" << endl;
753 */
754 m_sttscr[strand][i] = max(m_sttscr[strand][i],m_start.Score(s,i));
755 m_stpscr[strand][i] = max(m_stpscr[strand][i],m_stop.Score(s,i));
756 if(m_ascr[strand][i] != BadScore()) ++m_anum[strand];
757 if(m_dscr[strand][i] != BadScore()) ++m_dnum[strand];
758 if(m_sttscr[strand][i] != BadScore()) ++m_sttnum[strand];
759 }
760 }
761 else
762 {
763 for(TSignedSeqPos i = 0; i < len; ++i)
764 {
765 int ii = len-2-i; // extra -1 because ii is point on the "right"
766 m_ascr[strand][i] = max(m_ascr[strand][i],m_acceptor.Score(s,ii));
767 m_dscr[strand][i] = max(m_dscr[strand][i],m_donor.Score(s,ii));
768 /*
769 if(m_ascr[strand][i] != BadScore())
770 cout << "Acceptor\t" << i+2+chunk.GetFrom() << "\t" << m_ascr[strand][i] << "\t-" << endl;
771 if(m_dscr[strand][i] != BadScore())
772 cout << "Donor\t" << i+1+chunk.GetFrom() << "\t" << m_dscr[strand][i] << "\t-" << endl;
773 */
774 m_sttscr[strand][i] = max(m_sttscr[strand][i],m_start.Score(s,ii));
775 m_stpscr[strand][i] = max(m_stpscr[strand][i],m_stop.Score(s,ii));
776 if(m_ascr[strand][i] != BadScore()) ++m_anum[strand];
777 if(m_dscr[strand][i] != BadScore()) ++m_dnum[strand];
778 }
779 }
780 }
781
782
783 for(TAlignSet::iterator it = allaligns.begin(); it != allaligns.end(); ++it)
784 {
785 const CGeneModel& algn(*it);
786 int strand = algn.Strand();
787 TSignedSeqRange cds_lim = algn.ReadingFrame();
788 if(cds_lim.Empty()) continue;
789
790 if(strand == ePlus)
791 {
792 for(unsigned int k = 0; k < algn.Exons().size(); ++k) { // ignoring STOPS in CDS
793 TSignedSeqPos a = max(algn.Exons()[k].GetFrom(),cds_lim.GetFrom());
794 if(a > 0) --a;
795 TSignedSeqPos b = min(algn.Exons()[k].GetTo(),cds_lim.GetTo())-1; // -1 don't touch the real stop
796 for(TSignedSeqPos i = a; i <= b; ++i)
797 m_stpscr[strand][i] = BadScore(); // score on last coding base
798 }
799 }
800 else
801 {
802 for(unsigned int k = 0; k < algn.Exons().size(); ++k) { // ignoring STOPS in CDS
803 TSignedSeqPos a = max(algn.Exons()[k].GetFrom(),cds_lim.GetFrom());
804 TSignedSeqPos b = min(algn.Exons()[k].GetTo(),cds_lim.GetTo());
805 for(TSignedSeqPos i = a; i <= b; ++i)
806 m_stpscr[strand][i] = BadScore(); // score on last intergenic (first stop) base
807 }
808 }
809 }
810
811 for(int strand = 0; strand < 2; ++strand)
812 {
813 for(TSignedSeqPos i = 0; i < len; ++i)
814 {
815 if(m_sttscr[strand][i] != BadScore()) ++m_sttnum[strand];
816
817 if(m_stpscr[strand][i] != BadScore())
818 {
819 if(strand == ePlus)
820 {
821 int& lstp = m_laststop[strand][2-i%3][i+3];
822 lstp = max(int(i)+1,lstp);
823 ++m_stpnum[strand];
824 }
825 else
826 {
827 int& lstp = m_laststop[strand][i%3][i];
828 lstp = max(int(i)-2,lstp);
829 ++m_stpnum[strand];
830 }
831 }
832 }
833 }
834
835
836 for(int strand = 0; strand < 2; ++strand)
837 {
838 CEResidueVec& s = m_seq[strand];
839
840 for(TSignedSeqPos i = 0; i < len; ++i)
841 {
842 TSignedSeqPos ii = strand == ePlus ? i : len-1-i;
843
844 double score = m_ncdr.Score(s,ii);
845 if(score == BadScore()) score = 0;
846 m_ncdrscr[strand][i] = score;
847 if(i > 0) m_ncdrscr[strand][i] += m_ncdrscr[strand][i-1];
848
849 score = m_intrg.Score(s,ii);
850 if(score == BadScore()) score = 0;
851 m_ingscr[strand][i] = score;
852 if(i > 0) m_ingscr[strand][i] += m_ingscr[strand][i-1];
853 }
854
855 for(int frame = 0; frame < 3; ++frame)
856 {
857 for(TSignedSeqPos i = 0; i < len; ++i)
858 {
859 int codonshift,ii;
860 if(strand == ePlus) // left end of codon is shifted by frame bases to left
861 {
862 codonshift = (frame+i)%3;
863 ii = i;
864 }
865 else // right end of codone is shifted by frame bases to right
866 {
867 codonshift = (frame-(int)i)%3;
868 if(codonshift < 0) codonshift += 3;
869 ii = len-1-i;
870 }
871
872 double score = m_cdr.Score(s,ii,codonshift);
873 if(score == BadScore()) score = 0;
874
875 m_cdrscr[strand][frame][i] = score;
876 if(i > 0)
877 {
878 m_cdrscr[strand][frame][i] += m_cdrscr[strand][frame][i-1];
879
880 TIVec& lstp = m_laststop[strand][frame];
881 lstp[i] = max(lstp[i-1],lstp[i]);
882 }
883 }
884 }
885 }
886
887 for(int strand = 0; strand < 2; ++strand)
888 {
889 for(int frame = 0; frame < 3; ++frame)
890 {
891 for(TSignedSeqPos i = 0; i < len; ++i)
892 {
893 m_cdrscr[strand][frame][i] -= m_ncdrscr[strand][i];
894 }
895 }
896 for(TSignedSeqPos i = 0; i < len; ++i)
897 {
898 m_ingscr[strand][i] -= m_ncdrscr[strand][i];
899
900 int left, right;
901 if(m_dscr[strand][i] != BadScore())
902 {
903 const CTerminal& t = m_donor;
904 left = i+1-(strand==ePlus ? t.Left():t.Right());
905 right = i+(strand==ePlus ? t.Right():t.Left());
906 left = max(0,left);
907 right = min(len-1,right);
908 m_dscr[strand][i] -= NonCodingScore(left,right,strand);
909 }
910 if(m_ascr[strand][i] != BadScore())
911 {
912 const CTerminal& t = m_acceptor;
913 left = i+1-(strand==ePlus ? t.Left():t.Right());
914 right = i+(strand==ePlus ? t.Right():t.Left());
915 left = max(0,left);
916 right = min(len-1,right);
917 m_ascr[strand][i] -= NonCodingScore(left,right,strand);
918 }
919 if(m_sttscr[strand][i] != BadScore())
920 {
921 const CTerminal& t = m_start;
922 left = i+1-(strand==ePlus ? t.Left():t.Right());
923 right = i+(strand==ePlus ? t.Right():t.Left());
924 left = max(0,left);
925 right = min(len-1,right);
926 m_sttscr[strand][i] -= NonCodingScore(left,right,strand);
927 }
928 if(m_stpscr[strand][i] != BadScore())
929 {
930 const CTerminal& t = m_stop;
931 left = i+1-(strand==ePlus ? t.Left():t.Right());
932 right = i+(strand==ePlus ? t.Right():t.Left());
933 left = max(0,left);
934 right = min(len-1,right);
935 m_stpscr[strand][i] -= NonCodingScore(left,right,strand);
936 }
937 }
938 }
939
940 ITERATE(TAlignSet, it, allaligns) {
941 const CGeneModel& algn(*it);
942 if(algn.ReadingFrame().Empty())
943 continue;
944 int strand = algn.Strand();
945
946 if(algn.Exons().front().m_fsplice_sig == "XX" || algn.Exons().front().m_ssplice_sig == "XX") {
947 int p = algn.ReadingFrame().GetFrom()-1;
948 if(strand == ePlus && !algn.HasStart()) {
949 m_sttscr[strand][p] = consensuspenalty;
950 ++m_sttnum[strand];
951 } else if(strand == eMinus && !algn.HasStop()) {
952 m_stpscr[strand][p] = consensuspenalty;
953 ++m_stpnum[strand];
954 }
955 }
956
957 if(algn.Exons().back().m_fsplice_sig == "XX" || algn.Exons().back().m_ssplice_sig == "XX") {
958 int p = algn.ReadingFrame().GetTo();
959 if(strand == ePlus && !algn.HasStop()) {
960 m_stpscr[strand][p] = consensuspenalty;
961 ++m_stpnum[strand];
962 } else if(strand == eMinus && !algn.HasStart()) {
963 m_sttscr[strand][p] = consensuspenalty;
964 ++m_sttnum[strand];
965 }
966 }
967 }
968
969 const int NonConsensusMargin = 50;
970 if (consensuspenalty != BadScore())
971 ITERATE(TAlignSet, it, allaligns) {
972 const CGeneModel& algn(*it);
973 if((algn.Type() & CGeneModel::eProt) == 0) continue;
974
975 int strand = algn.Strand();
976 TSignedSeqRange lim = algn.Limits();
977
978 for(unsigned int k = 1; k < algn.Exons().size(); ++k)
979 {
980 if(algn.Exons()[k-1].m_ssplice && algn.Exons()[k].m_fsplice) continue;
981
982 int a = algn.Exons()[k-1].GetTo();
983 int b = algn.Exons()[k].GetFrom();
984 for(TSignedSeqPos i = a; i <= b; ++i)
985 {
986 if(i <= a+NonConsensusMargin || i >= b-NonConsensusMargin)
987 {
988 if(m_dscr[strand][i] == BadScore())
989 {
990 m_dscr[strand][i] = consensuspenalty;
991 ++m_dnum[strand];
992 }
993
994 if(m_ascr[strand][i] == BadScore())
995 {
996 m_ascr[strand][i] = consensuspenalty;
997 ++m_anum[strand];
998 }
999 }
1000 }
1001 }
1002
1003 if(strand == ePlus)
1004 {
1005 if(!algn.HasStart() && algn.Exons().front().m_ssplice_sig != "XX")
1006 {
1007 int a = max(2,lim.GetFrom()-NonConsensusMargin);
1008 int b = lim.GetFrom()-1;
1009 for(TSignedSeqPos i = a; i <= b; ++i)
1010 {
1011 if(m_sttscr[strand][i] == BadScore())
1012 {
1013 m_sttscr[strand][i] = consensuspenalty;
1014 ++m_sttnum[strand];
1015 }
1016
1017 if(m_dscr[strand][i] == BadScore())
1018 {
1019 m_dscr[strand][i] = consensuspenalty;
1020 ++m_dnum[strand];
1021 }
1022
1023 if(m_ascr[strand][i] == BadScore())
1024 {
1025 m_ascr[strand][i] = consensuspenalty;
1026 ++m_anum[strand];
1027 }
1028 }
1029 }
1030
1031 if(!algn.HasStop() && algn.Exons().back().m_fsplice_sig != "XX")
1032 {
1033 int a = lim.GetTo();
1034 int b = min(len-4,lim.GetTo()+NonConsensusMargin);
1035 for(TSignedSeqPos i = a; i <= b; ++i)
1036 {
1037 if(m_stpscr[strand][i] == BadScore())
1038 {
1039 m_stpscr[strand][i] = consensuspenalty;
1040 ++m_stpnum[strand];
1041 }
1042
1043 if(m_dscr[strand][i] == BadScore())
1044 {
1045 m_dscr[strand][i] = consensuspenalty;
1046 ++m_dnum[strand];
1047 }
1048
1049 if(m_ascr[strand][i] == BadScore())
1050 {
1051 m_ascr[strand][i] = consensuspenalty;
1052 ++m_anum[strand];
1053 }
1054 }
1055 }
1056 }
1057 else
1058 {
1059 if(!algn.HasStart() && algn.Exons().back().m_fsplice_sig != "XX")
1060 {
1061 int a = lim.GetTo();
1062 int b = min(len-4,lim.GetTo()+NonConsensusMargin);
1063 for(TSignedSeqPos i = a; i <= b; ++i)
1064 {
1065 if(m_sttscr[strand][i] == BadScore())
1066 {
1067 m_sttscr[strand][i] = consensuspenalty;
1068 ++m_sttnum[strand];
1069 }
1070
1071 if(m_dscr[strand][i] == BadScore())
1072 {
1073 m_dscr[strand][i] = consensuspenalty;
1074 ++m_dnum[strand];
1075 }
1076
1077 if(m_ascr[strand][i] == BadScore())
1078 {
1079 m_ascr[strand][i] = consensuspenalty;
1080 ++m_anum[strand];
1081 }
1082 }
1083 }
1084
1085 if(!algn.HasStop() && algn.Exons().front().m_ssplice_sig != "XX")
1086 {
1087 int a = max(2,lim.GetFrom()-NonConsensusMargin);
1088 int b = lim.GetFrom()-1;
1089 for(TSignedSeqPos i = a; i <= b; ++i)
1090 {
1091 if(m_stpscr[strand][i] == BadScore())
1092 {
1093 m_stpscr[strand][i] = consensuspenalty;
1094 ++m_stpnum[strand];
1095 }
1096
1097 if(m_dscr[strand][i] == BadScore())
1098 {
1099 m_dscr[strand][i] = consensuspenalty;
1100 ++m_dnum[strand];
1101 }
1102
1103 if(m_ascr[strand][i] == BadScore())
1104 {
1105 m_ascr[strand][i] = consensuspenalty;
1106 ++m_anum[strand];
1107 }
1108 }
1109 }
1110 }
1111 }
1112
1113 const int stpT = 1, stpTA = 2, stpTG = 4;
1114
1115 for(int strand = 0; strand < 2; ++strand)
1116 {
1117 CEResidueVec& s = m_seq[strand];
1118
1119 if(strand == ePlus) {
1120 for(TSignedSeqPos i = 0; i < len; ++i) {
1121 if(m_ascr[strand][i] != BadScore()) {
1122 if(s[i+1] == enA && s[i+2] == enA) m_asplit[strand][0][i] |= stpT;
1123 if(s[i+1] == enA && s[i+2] == enG) m_asplit[strand][0][i] |= stpT;
1124 if(s[i+1] == enG && s[i+2] == enA) m_asplit[strand][0][i] |= stpT;
1125 if(s[i+1] == enA) m_asplit[strand][1][i] |= stpTA|stpTG;
1126 if(s[i+1] == enG) m_asplit[strand][1][i] |= stpTA;
1127 }
1128 if(m_dscr[strand][i] != BadScore()) {
1129 if(s[i] == enT) m_dsplit[strand][0][i] |= stpT;
1130 if(s[i-1] == enT && s[i] == enA) m_dsplit[strand][1][i] |= stpTA;
1131 if(s[i-1] == enT && s[i] == enG) m_dsplit[strand][1][i] |= stpTG;
1132 }
1133 }
1134 } else {
1135 for(TSignedSeqPos i = 0; i < len; ++i) {
1136 int ii = len-2-i; // extra -1 because ii is point on the "right"
1137 if(m_ascr[strand][i] != BadScore()) {
1138 if(s[ii+1] == enA && s[ii+2] == enA) m_asplit[strand][0][i] |= stpT;
1139 if(s[ii+1] == enA && s[ii+2] == enG) m_asplit[strand][0][i] |= stpT;
1140 if(s[ii+1] == enG && s[ii+2] == enA) m_asplit[strand][0][i] |= stpT;
1141 if(s[ii+1] == enA) m_asplit[strand][1][i] |= stpTA|stpTG;
1142 if(s[ii+1] == enG) m_asplit[strand][1][i] |= stpTA;
1143 }
1144 if(m_dscr[strand][i] != BadScore()) {
1145 if(s[ii] == enT) m_dsplit[strand][0][i] |= stpT;
1146 if(s[ii-1] == enT && s[ii] == enA) m_dsplit[strand][1][i] |= stpTA;
1147 if(s[ii-1] == enT && s[ii] == enG) m_dsplit[strand][1][i] |= stpTG;
1148 }
1149 }
1150 }
1151 }
1152
1153 for(TAlignSet::iterator it = allaligns.begin(); it != allaligns.end(); ++it)
1154 {
1155 const CGeneModel& algn(*it);
1156 int strand = algn.Strand();
1157 TSignedSeqRange cds_lim = algn.ReadingFrame();
1158 if(cds_lim.Empty()) continue;
1159
1160 if(strand == ePlus)
1161 {
1162 for(unsigned int k = 0; k < algn.Exons().size()-1; ++k) // ignoring splitted STOPS in CDS
1163 {
1164 TSignedSeqPos b = algn.Exons()[k].GetTo();
1165 if(algn.Exons()[k].m_ssplice && algn.Exons()[k+1].m_fsplice && b >= cds_lim.GetFrom() && b <= cds_lim.GetTo())
1166 {
1167 m_dsplit[strand][0][b] = 0;
1168 m_dsplit[strand][1][b] = 0;
1169 }
1170 }
1171 }
1172 else
1173 {
1174 for(unsigned int k = 1; k < algn.Exons().size(); ++k) // ignoring splitted STOPS in CDS
1175 {
1176 TSignedSeqPos a = algn.Exons()[k].GetFrom();
1177 if(a > 0 && algn.Exons()[k-1].m_ssplice && algn.Exons()[k].m_fsplice && a >= cds_lim.GetFrom() && a <= cds_lim.GetTo())
1178 {
1179 m_dsplit[strand][0][a-1] = 0;
1180 m_dsplit[strand][1][a-1] = 0;
1181 }
1182 }
1183 }
1184 }
1185 }
1186
SelectBestReadingFrame(const CGeneModel & model,const CEResidueVec & mrna,const CAlignMap & mrnamap,TIVec starts[3],TIVec stops[3],int & best_frame,int & best_start,int & best_stop,bool extend5p) const1187 double CGnomonEngine::SelectBestReadingFrame(const CGeneModel& model, const CEResidueVec& mrna, const CAlignMap& mrnamap, TIVec starts[3], TIVec stops[3], int& best_frame, int& best_start, int& best_stop, bool extend5p) const
1188 {
1189 const CTerminal& acceptor = *m_data->m_acceptor;
1190 const CTerminal& donor = *m_data->m_donor;
1191 const CTerminal& stt = *m_data->m_start;
1192 const CTerminal& stp = *m_data->m_stop;
1193 const CCodingRegion& cdr = *m_data->m_cdr;
1194 const CNonCodingRegion& ncdr = *m_data->m_ncdr;
1195 int contig_len = m_data->m_seq.size();
1196 EStrand strand = model.Strand();
1197 const vector<CModelExon>& exons = model.Exons();
1198 int num_exons = model.Exons().size();
1199
1200 const CDoubleStrandSeq& ds = m_data->m_ds;
1201 TDVec splicescr(mrna.size(),0);
1202
1203 if(strand == ePlus) {
1204 int shift = -1;
1205 for(int i = 1; i < num_exons; ++i) {
1206 shift += mrnamap.FShiftedLen(exons[i-1].GetFrom(),exons[i-1].GetTo());
1207
1208 if(exons[i-1].m_ssplice && exons[i-1].m_ssplice_sig != "XX") {
1209 int l = exons[i-1].GetTo();
1210 double scr = donor.Score(ds[ePlus],l);
1211 if(scr == BadScore()) {
1212 scr = 0;
1213 } else {
1214 for(int k = l-donor.Left()+1; k <= l+donor.Right(); ++k) {
1215 double s = ncdr.Score(ds[ePlus],k);
1216 if(s == BadScore()) s = 0;
1217 scr -= s;
1218 }
1219 }
1220 splicescr[shift] = scr;
1221 }
1222
1223 if(exons[i].m_fsplice && exons[i].m_fsplice_sig != "XX") {
1224 int l = exons[i].GetFrom()-1;
1225 double scr = acceptor.Score(ds[ePlus],l);
1226 if(scr == BadScore()) {
1227 scr = 0;
1228 } else {
1229 for(int k = l-acceptor.Left()+1; k <= l+acceptor.Right(); ++k) {
1230 double s = ncdr.Score(ds[ePlus],k);
1231 if(s == BadScore()) s = 0;
1232 scr -= s;
1233 }
1234 }
1235 splicescr[shift] += scr;
1236 }
1237 }
1238 } else {
1239 int shift = mrna.size()-1;
1240 for(int i = 1; i < num_exons; ++i) {
1241 shift -= mrnamap.FShiftedLen(exons[i-1].GetFrom(),exons[i-1].GetTo());
1242
1243 if(exons[i-1].m_ssplice && exons[i-1].m_ssplice_sig != "XX") {
1244 int l = contig_len-2-exons[i-1].GetTo();
1245 double scr = acceptor.Score(ds[eMinus],l);
1246 if(scr == BadScore()) {
1247 scr = 0;
1248 } else {
1249 for(int k = l-acceptor.Left()+1; k <= l+acceptor.Right(); ++k) {
1250 double s = ncdr.Score(ds[eMinus],k);
1251 if(s == BadScore()) s = 0;
1252 scr -= s;
1253 }
1254 }
1255 splicescr[shift] = scr;
1256 }
1257
1258 if(exons[i].m_fsplice && exons[i].m_fsplice_sig != "XX") {
1259 int l = contig_len-1-exons[i].GetFrom();
1260 double scr = donor.Score(ds[eMinus],l);
1261 if(scr == BadScore()) {
1262 scr = 0;
1263 } else {
1264 for(int k = l-donor.Left()+1; k <= l+donor.Right(); ++k) {
1265 double s = ncdr.Score(ds[eMinus],k);
1266 if(s == BadScore()) s = 0;
1267 scr -= s;
1268 }
1269 }
1270 splicescr[shift] += scr;
1271 }
1272 }
1273 }
1274
1275 TDVec cdrscr[3];
1276 for(int frame = 0; frame < 3; ++frame) {
1277 if (frame==best_frame || best_frame==-1) {
1278 cdrscr[frame].resize(mrna.size(),BadScore());
1279 for(int i = 0; i < (int)mrna.size(); ++i) {
1280 int codonshift = (i-frame)%3;
1281 if(codonshift < 0)
1282 codonshift += 3;
1283
1284 double scr = cdr.Score(mrna,i,codonshift);
1285 if(scr == BadScore()) {
1286 scr = 0;
1287 } else {
1288 double s = ncdr.Score(mrna,i);
1289 if(s == BadScore()) s = 0;
1290 scr -= s;
1291 }
1292
1293 cdrscr[frame][i] = scr+splicescr[i];
1294 if(i > 0)
1295 cdrscr[frame][i] += cdrscr[frame][i-1];
1296 }
1297 }
1298 }
1299
1300 double best_score = BadScore();
1301 best_start = -1;
1302 best_stop = -1;
1303 CCDSInfo cds_info = model.GetCdsInfo();
1304
1305 int best_frame_initial = best_frame;
1306
1307 for(int frame = 0; frame < 3; ++frame) {
1308 if (frame==best_frame_initial || best_frame_initial==-1) {
1309 for(int i = 0; i < (int)stops[frame].size(); i++) {
1310 int stop = stops[frame][i];
1311 if(stop < 0) // bogus stop
1312 continue;
1313
1314 int prev_stop = -6;
1315 if(i > 0)
1316 prev_stop = stops[frame][i-1];
1317 TIVec::iterator it_a = lower_bound(starts[frame].begin(),starts[frame].end(),prev_stop+3);
1318 if(it_a == starts[frame].end() || *it_a >= stop) // no start
1319 continue;
1320 TIVec::iterator it_b = it_a+1;
1321 if(*it_a < 0 && it_b != starts[frame].end() && *it_b < stop) // open rf and there is apropriate start at right
1322 ++it_b;
1323 if(!extend5p) {
1324 for( ; it_b != starts[frame].end() && *it_b < stop; ++it_b);
1325 }
1326
1327 for(TIVec::iterator it = it_a; it != it_b; it++) { // if extend5p==true this loop includes only open rf (if exists) and one real start
1328 // otherwise the best start is selected
1329 int start = *it;
1330
1331 if (stop-start-(start>=0?0:3) < 30)
1332 continue;
1333
1334 double s = cdrscr[frame][stop-1]-cdrscr[frame][start+2+(start>=0?0:3)];
1335
1336 double stt_score = BadScore();
1337 if(start >= stt.Left()+2) { // 5 extra bases for ncdr
1338 int pnt = start+2;
1339 stt_score = stt.Score(mrna,pnt);
1340 if(stt_score != BadScore()) {
1341 for(int k = pnt-stt.Left()+1; k <= pnt+stt.Right(); ++k) {
1342 double sn = ncdr.Score(mrna,k);
1343 if(sn != BadScore())
1344 stt_score -= sn;
1345 }
1346 }
1347 } else {
1348 int pnt = mrnamap.MapEditedToOrig(start+(start>=0?0:3));
1349 if(strand == eMinus) pnt = contig_len-1-pnt;
1350 pnt += 2;
1351 stt_score = stt.Score(ds[strand],pnt);
1352 if(stt_score != BadScore()) {
1353 for(int k = pnt-stt.Left()+1; k <= pnt+stt.Right(); ++k) {
1354 double sn = ncdr.Score(ds[strand],k);
1355 if(sn != BadScore())
1356 stt_score -= sn;
1357 }
1358 }
1359 }
1360 if(stt_score != BadScore())
1361 s += stt_score;
1362
1363 double stp_score = stp.Score(mrna,stop-1);
1364 if(stp_score != BadScore()) {
1365 for(int k = stop-stp.Left(); k < stop+stp.Right(); ++k) {
1366 double sn = ncdr.Score(mrna,k);
1367 if(sn != BadScore())
1368 stp_score -= sn;
1369 }
1370 s += stp_score;
1371 }
1372
1373 if(s >= best_score) {
1374 best_frame = frame;
1375 best_score = s;
1376 best_start = start;
1377 best_stop = stop;
1378 }
1379 }
1380 }
1381 }
1382 }
1383
1384 if(cds_info.ConfirmedStart() && cds_info.ConfirmedStop() && model.Continuous()) { // looks like a complete model, will get a permanent boost in score which will improve chanses for the first placement over notcomplete models
1385 best_score += max(1.,0.3*best_score); // in case s negative
1386 }
1387
1388 return best_score;
1389 }
1390
GetScore(CGeneModel & model,bool extend5p,bool obeystart) const1391 void CGnomonEngine::GetScore(CGeneModel& model, bool extend5p, bool obeystart) const
1392 {
1393 CAlignMap mrnamap(model.Exons(),model.FrameShifts(),model.Strand());
1394 CEResidueVec mrna;
1395 mrnamap.EditedSequence(m_data->m_ds[ePlus], mrna);
1396
1397 int frame = -1;
1398 TIVec starts[3], stops[3];
1399
1400 FindStartsStops(model, m_data->m_ds[model.Strand()], mrna, mrnamap, starts, stops, frame, obeystart);
1401
1402 CCDSInfo cds_info = model.GetCdsInfo();
1403 if((cds_info.ReadingFrame().NotEmpty() || !cds_info.PStops().empty()) && cds_info.IsMappedToGenome())
1404 cds_info = cds_info.MapFromOrigToEdited(mrnamap);
1405
1406 // remove legit pstopd from stops[]
1407 CCDSInfo::TPStops pstops = cds_info.PStops();
1408 for(int fr = 0; fr < 3; ++fr) {
1409 ERASE_ITERATE(TIVec, pstp, stops[fr]) {
1410 TSignedSeqRange pstop = TSignedSeqRange(*pstp,*pstp+2);
1411 CCDSInfo::TPStops::iterator it = find(pstops.begin(), pstops.end(), pstop);
1412 if(it != pstops.end())
1413 VECTOR_ERASE(pstp, stops[fr]);
1414 }
1415 }
1416
1417 if(cds_info.ReadingFrame().Empty()) { // we didn't know the frame before
1418 int mrna_len = (int)mrna.size();
1419
1420 for(int fr = 0; fr < 3; ++fr) {
1421
1422 int first_stop = stops[fr][0];
1423 // if(first_stop < 0) // bogus stop for maxcds
1424 // first_stop = stops[fr][1];
1425
1426 if(first_stop >= 3) { // we have 5' reading frame
1427 TSignedSeqRange rframe, stt, stp;
1428
1429 if(first_stop < mrna_len-2) {
1430 stp = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(first_stop,first_stop+2), false);
1431 }
1432
1433 int first_start = -1;
1434 if(!starts[fr].empty()) {
1435 first_start = starts[fr][0];
1436 if(first_start < 0 && starts[fr].size() > 1)
1437 first_start = starts[fr][1];
1438 }
1439 int fivep_rf = fr;
1440 if(first_start >= 0 && first_start <= first_stop-6) {
1441 stt = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(first_start,first_start+2), false);
1442 fivep_rf = first_start+3;
1443 }
1444
1445 if(stt.NotEmpty() || stops[fr][0] >= 0) { // there is a start or no upstream stop
1446 int threep_rf = first_stop-1;
1447 rframe = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(fivep_rf,threep_rf), true);
1448
1449 _ASSERT(rframe.NotEmpty());
1450
1451 CCDSInfo ci;
1452 ci.SetReadingFrame(rframe);
1453 if(stt.NotEmpty()) {
1454 ci.SetStart(stt);
1455 if(stops[fr][0] < 0) {
1456 int bs = mrnamap.MapEditedToOrig(first_start);
1457 _ASSERT( bs >= 0 );
1458 ci.Set5PrimeCdsLimit(bs);
1459 }
1460 }
1461 if(stp.NotEmpty())
1462 ci.SetStop(stp);
1463 ci.SetScore(0, stt.NotEmpty());
1464 model.SetEdgeReadingFrames()->push_back(ci);
1465 }
1466 }
1467
1468 if(first_stop < mrna_len-2) { //there is a stop and possibly 3' reading frame
1469 TSignedSeqRange rframe, stt;
1470 int last_stop = stops[fr][(int)stops[fr].size()-1];
1471 if(last_stop >= mrna_len-2)
1472 last_stop = stops[fr][(int)stops[fr].size()-2]; //garanteed to be present
1473 int start;
1474 ITERATE(TIVec, it, starts[fr]) {
1475 start = *it;
1476 if(start > last_stop && start <= mrna_len-6) {
1477 stt = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(start,start+2), false);
1478 break;
1479 }
1480 }
1481 if(stt.NotEmpty()) { // there is 3' reading frame
1482 int fivep_rf = start+3;
1483 int threep_rf = mrna_len-(mrna_len-fr)%3-1;
1484 rframe = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(fivep_rf,threep_rf), true);
1485
1486 _ASSERT(rframe.NotEmpty());
1487
1488 CCDSInfo ci;
1489 ci.SetReadingFrame(rframe);
1490 ci.SetStart(stt);
1491 ci.Set5PrimeCdsLimit(model.Strand() == ePlus ? stt.GetFrom():stt.GetTo());
1492 ci.SetScore(0,false);
1493 model.SetEdgeReadingFrames()->push_back(ci);
1494 }
1495 }
1496 }
1497 }
1498
1499 int best_start, best_stop;
1500 double best_score = SelectBestReadingFrame(model, mrna, mrnamap, starts, stops, frame, best_start, best_stop, extend5p);
1501
1502 if(cds_info.MaxCdsLimits().NotEmpty())
1503 cds_info.Clear5PrimeCdsLimit();
1504
1505 if (best_score == BadScore()) {
1506 cds_info.Clear();
1507 model.SetCdsInfo( cds_info );
1508 return;
1509 }
1510
1511 bool is_open = false;
1512 if(best_start == 0) {
1513 is_open = !cds_info.ConfirmedStart();
1514 } else if(best_start<0 && starts[frame].size() > 1) {
1515 int new_start = starts[frame][1];
1516 int newlen = best_stop-new_start;
1517 int oldlen = best_stop-best_start;
1518 if(newlen >= max(6,oldlen/2) || cds_info.ConfirmedStart()) {
1519 is_open = !cds_info.ConfirmedStart();
1520 best_start = new_start;
1521 }
1522 }
1523
1524 if (cds_info.ConfirmedStart() && best_start != starts[frame].back()) { // confirmed start was expanded within alignment
1525 model.AddComment("movedconfstart");
1526 }
1527
1528 bool has_start = best_start>=0;
1529 bool confirmed_start = cds_info.ConfirmedStart(); //we wamnt to keep the status even if the actual start moved within alignment, the status will be used in gnomon and will prevent any furher extension
1530 bool confirmed_stop = cds_info.ConfirmedStop();
1531
1532 TSignedSeqRange best_reading_frame = TSignedSeqRange(best_start+3,best_stop-1);
1533
1534 if (Include(best_reading_frame, cds_info.Start()))
1535 cds_info.SetStart(TSignedSeqRange::GetEmpty());
1536 if (Include(best_reading_frame, cds_info.Stop()))
1537 cds_info.SetStop(TSignedSeqRange::GetEmpty());
1538
1539 cds_info.ClearPStops();
1540
1541 if (cds_info.ProtReadingFrame().NotEmpty() && !Include(best_reading_frame, cds_info.ProtReadingFrame())) {
1542 _ASSERT( best_start==0 );
1543 cds_info.SetReadingFrame(best_reading_frame & cds_info.ProtReadingFrame(), true);
1544 }
1545 cds_info.SetReadingFrame(best_reading_frame);
1546
1547 int upstream_stop = frame-3;
1548 if(has_start) {
1549 cds_info.SetStart(TSignedSeqRange(best_start,best_start+2), confirmed_start);
1550 if(FindUpstreamStop(stops[frame],best_start,upstream_stop))
1551 cds_info.Set5PrimeCdsLimit(best_start);
1552 }
1553
1554 if ((int)mrna.size() - best_stop >=3)
1555 cds_info.SetStop(TSignedSeqRange(best_stop,best_stop+2),confirmed_stop);
1556
1557 for(int i = upstream_stop+3; i < best_stop; i += 3) {
1558 if(IsStopCodon(&mrna[i])) {
1559 TSignedSeqRange pstop = TSignedSeqRange(i,i+2);
1560 CCDSInfo::EStatus status = CCDSInfo::eUnknown;
1561 CCDSInfo::TPStops::iterator it = find(pstops.begin(), pstops.end(), pstop);
1562 if(it != pstops.end())
1563 status = it->m_status;
1564 cds_info.AddPStop(pstop, status);
1565 }
1566 }
1567
1568 cds_info.SetScore(best_score, is_open);
1569 cds_info = cds_info.MapFromEditedToOrig(mrnamap); // returns empty cds with badscore if con't map
1570 model.SetCdsInfo(cds_info);
1571 }
1572
1573
1574 END_SCOPE(gnomon)
1575 END_NCBI_SCOPE
1576