1 /*  $Id: score.cpp 626289 2021-02-25 14:09:07Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software / database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software / database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <algo/gnomon/gnomon.hpp>
34 #include <algo/gnomon/chainer.hpp>
35 #include <algo/gnomon/gnomon_exception.hpp>
36 #include "score.hpp"
37 #include "hmm.hpp"
38 #include "hmm_inlines.hpp"
39 #include "gnomon_engine.hpp"
40 
41 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(gnomon)42 BEGIN_SCOPE(gnomon)
43 
44 bool CSeqScores::isStart(int i, int strand) const
45 {
46     const CEResidueVec& ss = m_seq[strand];
47     int ii = (strand == ePlus) ? i : SeqLen()-1-i;
48     if(ii < 0 || ii+2 >= SeqLen()) return false;  //out of range
49     else if(ss[ii] != enA || ss[ii+1] != enT || ss[ii+2] != enG) return false;
50     else return true;
51 }
52 
isStop(int i,int strand) const53 bool CSeqScores::isStop(int i, int strand) const
54 {
55     const CEResidueVec& ss = m_seq[strand];
56     int ii = (strand == ePlus) ? i : SeqLen()-1-i;
57     if(ii < 0 || ii+2 >= SeqLen()) return false;  //out of range
58     if((ss[ii] != enT || ss[ii+1] != enA || ss[ii+2] != enA) &&
59         (ss[ii] != enT || ss[ii+1] != enA || ss[ii+2] != enG) &&
60         (ss[ii] != enT || ss[ii+1] != enG || ss[ii+2] != enA)) return false;
61     else return true;
62 }
63 
isReadingFrameLeftEnd(int i,int strand) const64 bool CSeqScores::isReadingFrameLeftEnd(int i, int strand) const
65 {
66     if(strand == ePlus) {
67         return isStart(i-3, strand);
68     } else {
69         return isStop(i-1,strand);
70     }
71 }
72 
isReadingFrameRightEnd(int i,int strand) const73 bool CSeqScores::isReadingFrameRightEnd(int i, int strand) const
74 {
75     if(strand == ePlus) {
76         return isStop(i+1, strand);
77     } else {
78         return isStart(i+3,strand);
79     }
80 }
81 
isAG(int i,int strand) const82 bool CSeqScores::isAG(int i, int strand) const
83 {
84     const CEResidueVec& ss = m_seq[strand];
85     int ii = (strand == ePlus) ? i : SeqLen()-1-i;
86     if(ii-1 < 0 || ii >= SeqLen()) return false;  //out of range
87     if(ss[ii-1] != enA || ss[ii] != enG) return false;
88     else return true;
89 }
90 
isGT(int i,int strand) const91 bool CSeqScores::isGT(int i, int strand) const
92 {
93     const CEResidueVec& ss = m_seq[strand];
94     int ii = (strand == ePlus) ? i : SeqLen()-1-i;
95     if(ii < 0 || ii+1 >= SeqLen()) return false;  //out of range
96     if(ss[ii] != enG || ss[ii+1] != enT) return false;
97     else return true;
98 }
99 
isConsensusIntron(int i,int j,int strand) const100 bool CSeqScores::isConsensusIntron(int i, int j, int strand) const
101 {
102     if(strand == ePlus) return (m_dscr[ePlus][i-1] != BadScore()) && (m_ascr[ePlus][j] != BadScore());
103     else                return (m_ascr[eMinus][i-1] != BadScore()) && (m_dscr[eMinus][j] != BadScore());
104 //    if(strand == ePlus) return isGT(i,strand) && isAG(j,strand);
105 //    else return isAG(i,strand) && isGT(j,strand);
106 }
107 
SeqPtr(int i,int strand) const108 const EResidue* CSeqScores::SeqPtr(int i, int strand) const
109 {
110     const CEResidueVec& ss = m_seq[strand];
111     int ii = (strand == ePlus) ? i : SeqLen()-1-i;
112     return &ss.front()+ii;
113 }
114 
StopInside(int a,int b,int strand,int frame) const115 bool CSeqScores::StopInside(int a, int b, int strand, int frame) const
116 {
117     return (a <= m_laststop[strand][frame][b]);
118 }
119 
OpenCodingRegion(int a,int b,int strand,int frame) const120 bool CSeqScores::OpenCodingRegion(int a, int b, int strand, int frame) const
121 {
122     return (a > m_notinexon[strand][frame][b]);
123 }
124 
OpenNonCodingRegion(int a,int b,int strand) const125 bool CSeqScores::OpenNonCodingRegion(int a, int b, int strand) const
126 {
127     return (a > m_notinintron[strand][b]);
128 }
129 
CodingScore(int a,int b,int strand,int frame) const130 double CSeqScores::CodingScore(int a, int b, int strand, int frame) const
131 {
132     if(a > b) return 0; // for splitted start/stop
133     double score = m_cdrscr[strand][frame][b];
134     if(a > 0) score -= m_cdrscr[strand][frame][a-1];
135     return score;
136 }
137 
NonCodingScore(int a,int b,int strand) const138 double CSeqScores::NonCodingScore(int a, int b, int strand) const
139 {
140     double score = m_ncdrscr[strand][b];
141     if(a > 0) score -= m_ncdrscr[strand][a-1];
142     return score;
143 }
144 
OpenIntergenicRegion(int a,int b) const145 bool CSeqScores::OpenIntergenicRegion(int a, int b) const
146 {
147     return (a > m_notining[b]);
148 }
149 
IntergenicScore(int a,int b,int strand) const150 double CSeqScores::IntergenicScore(int a, int b, int strand) const
151 {
152     _ASSERT(strand == 0 || strand == 1);
153     _ASSERT(b>= 0 && b < (int)m_ingscr[strand].size());
154     double score = m_ingscr[strand][b];
155     if(a > 0) {
156         _ASSERT(a-1 < (int)m_ingscr[strand].size());
157         score -= m_ingscr[strand][a-1];
158     }
159     return score;
160 }
161 
ConstructSequenceAndMaps(const TGeneModelList & aligns,const CResidueVec & original_sequence)162 CResidueVec CSeqScores::ConstructSequenceAndMaps(const TGeneModelList& aligns, const CResidueVec& original_sequence)
163 {
164     TSignedSeqRange chunk(m_chunk_start, m_chunk_stop);
165     ITERATE(TGeneModelList, it, aligns) {
166         const CGeneModel& align = *it;
167         if (Include(chunk, align.MaxCdsLimits()))
168             m_fshifts.insert(m_fshifts.end(),align.FrameShifts().begin(),align.FrameShifts().end());
169     }
170     sort(m_fshifts.begin(),m_fshifts.end());
171     m_map = CAlignMap(m_chunk_start, m_chunk_stop, m_fshifts.begin(), m_fshifts.end());
172     CResidueVec sequence;
173     m_map.EditedSequence(original_sequence, sequence);
174 
175     _ASSERT( m_map.MapEditedToOrig(0) == From() && m_map.MapOrigToEdited(From()) == 0 );
176     _ASSERT( m_map.MapEditedToOrig((int)sequence.size()-1) == To() && m_map.MapOrigToEdited(To()) == (int)sequence.size()-1 );
177 
178     return sequence;
179 }
180 
181 struct SAlignOrder
182 {
operator ()SAlignOrder183     bool operator() (const CGeneModel& a, const CGeneModel& b) const
184     {
185         TSignedSeqPos x = a.ReadingFrame().NotEmpty() ? a.ReadingFrame().GetFrom() : a.Limits().GetFrom();
186         TSignedSeqPos y = b.ReadingFrame().NotEmpty() ? b.ReadingFrame().GetFrom() : b.Limits().GetFrom();
187 
188         if (x==y)
189             return a.ID()<b.ID();
190         return x < y;
191     }
192 };
193 
194 typedef set<CGeneModel,SAlignOrder> TAlignSet;
195 
196 struct CIndelMapper: public CRangeMapper {
CIndelMapperCIndelMapper197     CIndelMapper(const CAlignMap& seq_map):
198         m_seq_map(seq_map) {}
199 
200     const CAlignMap& m_seq_map;
201 
operator ()CIndelMapper202     TSignedSeqRange operator() (TSignedSeqRange r, bool withextras = true) const
203     {
204         //        if(withextras)
205             //            r = m_seq_map.ShrinkToRealPoints(r); // if exon starts/ends with insertion move to projectable points
206         return m_seq_map.MapRangeOrigToEdited(r, withextras);
207     }
208 };
209 
210 struct Is_ID_equal {
211     Int8 m_id;
Is_ID_equalIs_ID_equal212     Is_ID_equal(Int8 id): m_id(id) {}
operator ()Is_ID_equal213     bool operator()(const CGeneModel& a)
214     {
215         return a.ID()==m_id;
216     }
217 };
218 
AlignLeftLimit(const CGeneModel & a)219 TSignedSeqPos AlignLeftLimit(const CGeneModel& a)
220 {
221     return ( a.MaxCdsLimits().Empty() ? a.Limits().GetFrom() : a.MaxCdsLimits().GetFrom() );
222 }
223 
s_AlignLeftLimitOrder(const CGeneModel & ap,const CGeneModel & bp)224 static bool s_AlignLeftLimitOrder(const CGeneModel& ap, const CGeneModel& bp)
225 {
226     return (AlignLeftLimit(ap) < AlignLeftLimit(bp));
227 }
228 
229 
CSeqScores(const CTerminal & a,const CTerminal & d,const CTerminal & stt,const CTerminal & stp,const CCodingRegion & cr,const CNonCodingRegion & ncr,const CNonCodingRegion & ing,const CIntronParameters & intron_params,TSignedSeqPos from,TSignedSeqPos to,const TGeneModelList & cls,const TInDels & initial_fshifts,double mpp,const CGnomonEngine & gnomon)230 CSeqScores::CSeqScores (const CTerminal& a, const CTerminal& d, const CTerminal& stt, const CTerminal& stp,
231 const CCodingRegion& cr, const CNonCodingRegion& ncr, const CNonCodingRegion& ing,
232                const CIntronParameters&     intron_params,
233                         TSignedSeqPos from, TSignedSeqPos to, const TGeneModelList& cls, const TInDels& initial_fshifts, double mpp, const CGnomonEngine& gnomon)
234 : m_acceptor(a), m_donor(d), m_start(stt), m_stop(stp), m_cdr(cr), m_ncdr(ncr), m_intrg(ing),
235   m_align_list(cls), m_fshifts(initial_fshifts), m_map(from,to), m_chunk_start(from), m_chunk_stop(to), m_mpp(mpp)
236 {
237     m_align_list.sort(s_AlignLeftLimitOrder);
238     NON_CONST_ITERATE(TGeneModelList, it, m_align_list) {
239         CGeneModel& align = *it;
240         CCDSInfo cds_info = align.GetCdsInfo();
241         bool fixed = false;
242         for(unsigned int i = 1; i < align.Exons().size(); ++i) {
243             if (!align.Exons()[i-1].m_ssplice || !align.Exons()[i].m_fsplice) {
244                 int hole_len = align.Exons()[i].GetFrom()-align.Exons()[i-1].GetTo()-1;
245                 if(hole_len <= intron_params.MinLen()) {
246                     fixed = true;
247                     TSignedSeqRange pstop(align.Exons()[i-1].GetTo(),align.Exons()[i].GetFrom());     // to make sure GetScore doesn't complain about "new" pstops
248                     cds_info.AddPStop(pstop, CCDSInfo::eUnknown);
249                     if(hole_len%3 != 0) {
250                         Int8 p = align.Exons()[i-1].GetTo();
251                         p = (p+align.Exons()[i].GetFrom())/2;
252                         align.FrameShifts().push_back(CInDelInfo(p, hole_len%3, CInDelInfo::eIns));
253                     }
254 
255                     CGeneModel a(align.Strand());
256                     a.AddExon(TSignedSeqRange(align.Exons()[i-1].GetTo(),align.Exons()[i].GetFrom()));
257                     align.Extend(a);
258 
259                     --i;
260                 }
261             }
262         }
263         if(fixed) {
264             align.SetCdsInfo(cds_info);
265             sort(align.FrameShifts().begin(),align.FrameShifts().end());
266             gnomon.GetScore(align);                                                        // calculate possible new pstops
267         }
268     }
269 }
270 
271 //void CSeqScores::Init( CResidueVec& original_sequence, bool repeats, bool leftwall, bool rightwall, double consensuspenalty, const CIntergenicParameters& intergenic_params,
Init(CResidueVec & original_sequence,bool leftwall,bool rightwall,double consensuspenalty,const CGnomonAnnotator_Base::TIntMap & notbridgeable_gaps_len,const CGnomonAnnotator_Base::TGgapInfo & ggapinfo)272 void CSeqScores::Init( CResidueVec& original_sequence, bool leftwall, bool rightwall, double consensuspenalty,
273          const CGnomonAnnotator_Base::TIntMap& notbridgeable_gaps_len, const CGnomonAnnotator_Base::TGgapInfo& ggapinfo)
274 {
275     CResidueVec sequence = ConstructSequenceAndMaps(m_align_list,original_sequence);
276 
277     TSignedSeqPos len = sequence.size();
278 
279     try {
280         m_notining.resize(len,-1);
281         m_inalign.resize(len,numeric_limits<int>::max());
282         m_protnum.resize(len,0);
283 
284         for(int strand = 0; strand < 2; ++strand) {
285             m_seq[strand].resize(len);
286             m_ascr[strand].resize(len,BadScore());
287             m_dscr[strand].resize(len,BadScore());
288             m_sttscr[strand].resize(len,BadScore());
289             m_stpscr[strand].resize(len,BadScore());
290             m_ncdrscr[strand].resize(len,BadScore());
291             m_ingscr[strand].resize(len,BadScore());
292             m_notinintron[strand].resize(len,-1);
293             for(int frame = 0; frame < 3; ++frame) {
294                 m_cdrscr[strand][frame].resize(len,BadScore());
295                 m_laststop[strand][frame].resize(len,-1);
296                 m_notinexon[strand][frame].resize(len,-1);
297             }
298             for(int ph = 0; ph < 2; ++ph) {
299                 m_asplit[strand][ph].resize(len,0);
300                 m_dsplit[strand][ph].resize(len,0);
301             }
302         }
303     } catch(bad_alloc) {
304         NCBI_THROW(CGnomonException, eMemoryLimit, "Not enough memory in CSeqScores");
305     }
306 
307 
308     //block not bridgeable gaps
309     ITERATE(CGnomonAnnotator_Base::TIntMap, ig, notbridgeable_gaps_len) {
310         int left = ig->first;
311         int right = ig->first+ig->second-1;
312         if(left > m_chunk_stop || right < m_chunk_start)
313             continue;
314 
315         left = m_map.MapOrigToEdited(max(left,m_chunk_start));
316         _ASSERT(left >= 0);
317         right = m_map.MapOrigToEdited(min(right,m_chunk_stop));
318         _ASSERT(right >= 0);
319         for(int pnt = left; pnt <= right; ++pnt) {
320             m_notinexon[ePlus][0][pnt] = pnt;
321             m_notinexon[ePlus][1][pnt] = pnt;
322             m_notinexon[ePlus][2][pnt] = pnt;
323             m_notinexon[eMinus][0][pnt] = pnt;
324             m_notinexon[eMinus][1][pnt] = pnt;
325             m_notinexon[eMinus][2][pnt] = pnt;
326             m_notinintron[ePlus][pnt] = pnt;
327             m_notinintron[eMinus][pnt] = pnt;
328         }
329     }
330 
331     ITERATE(CGnomonAnnotator_Base::TGgapInfo, ig, ggapinfo) {      // block ab initio in ggaps
332         for(int jj = ig->first; jj <= ig->first + (int)ig->second->GetInDelV().length() - 1; ++jj) {
333             if(jj >= m_chunk_start && jj <= m_chunk_stop) {
334                 int j = m_map.MapOrigToEdited(jj);
335                 m_laststop[ePlus][0][j] = j;
336                 m_laststop[ePlus][1][j] = j;
337                 m_laststop[ePlus][2][j] = j;
338                 m_laststop[eMinus][0][j] = j;
339                 m_laststop[eMinus][1][j] = j;
340                 m_laststop[eMinus][2][j] = j;
341             }
342         }
343     }
344 
345     const int RepeatMargin = 25;
346     for(int rpta = 0; rpta < len; ) {
347         while(rpta < len && isupper(sequence[rpta]))
348             ++rpta;
349         int rptb = rpta;
350         while(rptb+1 < len && islower(sequence[rptb+1]))
351             ++rptb;
352         if(rptb-rpta+1 > 2*RepeatMargin) {
353             for(TSignedSeqPos i = rpta+RepeatMargin; i <= rptb-RepeatMargin; ++i) {  // masking repeats
354                 m_laststop[ePlus][0][i] = i;
355                 m_laststop[ePlus][1][i] = i;
356                 m_laststop[ePlus][2][i] = i;
357                 m_laststop[eMinus][0][i] = i;
358                 m_laststop[eMinus][1][i] = i;
359                 m_laststop[eMinus][2][i] = i;
360             }
361         }
362         rpta = rptb+1;
363     }
364 
365     for(TSignedSeqPos i = 0; i < len; ++i) {
366         char c = sequence[i];
367         EResidue l = fromACGT(c);
368         m_seq[ePlus][i] = l;
369         m_seq[eMinus][len-1-i] = k_toMinus[l];
370     }
371 
372     const int NLimit = 6;
373     CEResidueVec::iterator it = search_n(m_seq[ePlus].begin(),m_seq[ePlus].end(),NLimit,enN);
374     while(it != m_seq[ePlus].end())           // masking big chunks of Ns
375     {
376         for(; it != m_seq[ePlus].end() && *it == enN; ++it)
377         {
378             int j = it-m_seq[ePlus].begin();
379             m_laststop[ePlus][0][j] = j;
380             m_laststop[ePlus][1][j] = j;
381             m_laststop[ePlus][2][j] = j;
382             m_laststop[eMinus][0][j] = j;
383             m_laststop[eMinus][1][j] = j;
384             m_laststop[eMinus][2][j] = j;
385         }
386         it = search_n(it,m_seq[ePlus].end(),NLimit,enN);
387     }
388 
389     CIndelMapper mapper(m_map);
390 
391     TAlignSet allaligns;
392 
393     TSignedSeqRange chunk(m_chunk_start, m_chunk_stop);
394 
395     ITERATE(TGeneModelList, it, m_align_list) {
396         const CGeneModel& origalign = *it;
397         TSignedSeqRange limits = origalign.Limits() & chunk;
398         if (limits.Empty() || (origalign.MaxCdsLimits().NotEmpty() &&  !Include(limits, origalign.MaxCdsLimits())))
399             continue;
400 
401         CGeneModel align = origalign;
402         align.Clip(chunk, CGeneModel::eRemoveExons);
403 
404         bool opposite = false;
405         if((align.Type() & CGeneModel::eNested)!=0 || (align.Type() & CGeneModel::eWall)!=0) {
406             ITERATE(TGeneModelList, jt, m_align_list) {
407                 if(it == jt) continue;
408                 const CGeneModel& aa = *jt;
409                 if((aa.Type() & CGeneModel::eWall)==0 && (aa.Type() & CGeneModel::eNested)==0 &&
410                    aa.MaxCdsLimits().NotEmpty() && aa.MaxCdsLimits().IntersectingWith(align.Limits())) {
411                     opposite = true;
412                     break;
413                 }
414             }
415         }
416 
417         align.Remap(mapper);
418         align.FrameShifts().clear();
419 
420         EStrand strand = align.Strand();
421 
422         if((align.Type()&CGeneModel::eWall)==0 && (align.Type()&CGeneModel::eNested)==0) {
423             CGeneModel al = align;
424             int extrabases = m_start.Left()-3;   // bases before ATG needed for score;
425             int extraNs5p = 0;                   // extra Ns if too close to the end of contig
426             if(extrabases > al.Limits().GetFrom()) {
427                 extraNs5p = extrabases - al.Limits().GetFrom();
428                 if(al.Limits().GetFrom() > 0) al.ExtendLeft(al.Limits().GetFrom());
429             } else {
430                 al.ExtendLeft(extrabases);
431             }
432             int extraNs3p = 0;                   // extra Ns if too close to the end of contig
433             if(al.Limits().GetTo()+extrabases > len-1) {
434                 extraNs3p = al.Limits().GetTo()+extrabases-(len-1);
435                 if(al.Limits().GetTo() < len-1) al.ExtendRight(len-1-al.Limits().GetTo());
436             } else {
437                 al.ExtendRight(extrabases);
438             }
439             if(strand == eMinus) swap(extraNs5p,extraNs3p);
440 
441             CEResidueVec mRNA;
442             CAlignMap mrnamap(al.GetAlignMap());
443             mrnamap.EditedSequence(m_seq[ePlus], mRNA);
444             mRNA.insert(mRNA.begin(),extraNs5p,enN);
445             mRNA.insert(mRNA.end(),extraNs3p,enN);
446 
447             // Here we are after splitted start/stops but this loop will score nonsplitted starts/stops as well
448             // "starts/stops" crossing a hole will get some score also but they will be ignored because a hole is alwais incide CDS
449             for(int k = extraNs5p; k < (int)mRNA.size()-extraNs3p; ++k) {
450                 int i;
451                 if(strand == ePlus) {
452                     i = mrnamap.MapEditedToOrig(k-extraNs5p+1)-1;      // the position of G or right before the first coding exon if start is completely in another exon
453                     if(align.MaxCdsLimits().Empty() || Include(align.MaxCdsLimits(),i))
454                         m_sttscr[strand][i] = m_start.Score(mRNA,k);
455 
456                     i = mrnamap.MapEditedToOrig(k-extraNs5p);             // the last position in the last coding exon (usually right before T)
457                     if(align.MaxCdsLimits().Empty() || Include(align.MaxCdsLimits(),i))
458                         m_stpscr[strand][i] = m_stop.Score(mRNA,k);
459                 } else {
460                     i = mrnamap.MapEditedToOrig(k-extraNs5p+1);           // the last position in the last coding exon (usually right before G)
461                     if(i >= 0 && (align.MaxCdsLimits().Empty() || Include(align.MaxCdsLimits(),i)))
462                         m_sttscr[strand][i] = m_start.Score(mRNA,k);
463 
464                     i = mrnamap.MapEditedToOrig(k-extraNs5p)-1;        // the position of T or right before the first coding exon if stop is completely in another exon
465                     if(i >= 0 && (align.MaxCdsLimits().Empty() || Include(align.MaxCdsLimits(),i)))
466                         m_stpscr[strand][i] = m_stop.Score(mRNA,k);
467                 }
468             }
469         }
470 
471         _ASSERT(align.FShiftedLen(align.ReadingFrame(), false)%3==0);
472 
473         if(align.MaxCdsLimits().NotEmpty()) {
474             limits = align.MaxCdsLimits();
475             align.Clip(limits, CGeneModel::eRemoveExons);
476         } else {
477             limits = align.Limits();
478         }
479 
480         _ASSERT( align.Exons().size() > 0 );
481 
482         if((align.Type() & CGeneModel::eNested)!=0) {
483             //            int a = max(0,int(limits.GetFrom())-intergenic_params.MinLen()+1);
484             //            int b = min(len-1,limits.GetTo()+intergenic_params.MinLen()-1);
485             int a = limits.GetFrom();
486             int b = limits.GetTo();
487 
488             if(opposite) {
489                 for(int pnt = a; pnt <= b; ++pnt) {           // allow prediction on the opposite strand of nested models
490                     m_notinexon[strand][0][pnt] = pnt;
491                     m_notinexon[strand][1][pnt] = pnt;
492                     m_notinexon[strand][2][pnt] = pnt;
493                 }
494             } else {
495                 for(int pnt = a; pnt <= b; ++pnt) {
496                     m_notinexon[ePlus][0][pnt] = pnt;
497                     m_notinexon[ePlus][1][pnt] = pnt;
498                     m_notinexon[ePlus][2][pnt] = pnt;
499                     m_notinexon[eMinus][0][pnt] = pnt;
500                     m_notinexon[eMinus][1][pnt] = pnt;
501                     m_notinexon[eMinus][2][pnt] = pnt;
502                 }
503             }
504             continue;
505         } else if((align.Type() & CGeneModel::eWall)!=0) {
506         //            int a = max(0,int(limits.GetFrom())-intergenic_params.MinLen()+1);
507         //            int b = min(len-1,limits.GetTo()+intergenic_params.MinLen()-1);
508             int a = limits.GetFrom();
509             int b = limits.GetTo();
510 
511             if(opposite) {
512                 for(int pnt = a; pnt <= b; ++pnt) {           // allow prediction on the opposite strand of complete models
513                     m_notinexon[strand][0][pnt] = pnt;
514                     m_notinexon[strand][1][pnt] = pnt;
515                     m_notinexon[strand][2][pnt] = pnt;
516 
517                     m_notinintron[strand][pnt] = pnt;
518                 }
519             } else {
520                 for(int pnt = a; pnt <= b; ++pnt) {
521                     m_notinexon[ePlus][0][pnt] = pnt;
522                     m_notinexon[ePlus][1][pnt] = pnt;
523                     m_notinexon[ePlus][2][pnt] = pnt;
524                     m_notinexon[eMinus][0][pnt] = pnt;
525                     m_notinexon[eMinus][1][pnt] = pnt;
526                     m_notinexon[eMinus][2][pnt] = pnt;
527 
528                     m_notinintron[ePlus][pnt] = pnt;
529                     m_notinintron[eMinus][pnt] = pnt;
530                 }
531             }
532             continue;
533         }
534 
535         allaligns.insert(align);
536 
537         ITERATE(CGeneModel::TExons, e, align.Exons()) {
538             for(TSignedSeqPos i = e->GetFrom(); i <= e->GetTo(); ++i) {  // ignoring repeats and allowing Ns in alignmnets
539                 m_laststop[strand][0][i] = -1;
540                 m_laststop[strand][1][i] = -1;
541                 m_laststop[strand][2][i] = -1;
542             }
543         }
544 
545 
546         if((align.Type() & CGeneModel::eProt)!=0)
547             m_protnum[align.ReadingFrame().GetFrom()] = 1;
548 
549         if(align.ReadingFrame().NotEmpty()) {  // enforcing frames of known CDSes and protein pieces
550             TSignedSeqPos right = align.ReadingFrame().GetTo();
551             int ff = (strand == ePlus) ? 2-right%3 : right%3;               // see cdrscr
552             for(int frame = 0; frame < 3; ++frame) {
553                 if(frame != ff)
554                     m_notinexon[strand][frame][right] = right;
555             }
556 
557             ITERATE(CGeneModel::TExons, e, align.Exons()) {
558                 right = e->GetTo();
559                 if (!e->m_ssplice && right < align.Limits().GetTo()) {
560                     ff = (strand == ePlus) ? 2-right%3 : right%3;
561                     for(int frame = 0; frame < 3; ++frame) {
562                         if(frame != ff)
563                             m_notinexon[strand][frame][right] = right;
564                     }
565                 }
566             }
567         }
568 
569         if(align.ConfirmedStart()) {        // enforcing confirmed start
570             int pnt;
571             if(align.Strand() == ePlus) {
572                 pnt = align.ReadingFrame().GetFrom()-1;
573             } else {
574                 pnt = align.ReadingFrame().GetTo()+1;
575             }
576             m_notinexon[ePlus][0][pnt] = pnt;
577             m_notinexon[ePlus][1][pnt] = pnt;
578             m_notinexon[ePlus][2][pnt] = pnt;
579             m_notinexon[eMinus][0][pnt] = pnt;
580             m_notinexon[eMinus][1][pnt] = pnt;
581             m_notinexon[eMinus][2][pnt] = pnt;
582 
583             m_notinintron[ePlus][pnt] = pnt;
584             m_notinintron[eMinus][pnt] = pnt;
585         }
586 
587         // restricting prediction to MaxCdsLimits if not infinite
588         if(align.GetCdsInfo().MaxCdsLimits().NotEmpty()) {
589             if(TSignedSeqRange::GetWholeFrom() < align.GetCdsInfo().MaxCdsLimits().GetFrom()) {
590                 m_notinexon[ePlus][0][limits.GetFrom()] = limits.GetFrom();
591                 m_notinexon[ePlus][1][limits.GetFrom()] = limits.GetFrom();
592                 m_notinexon[ePlus][2][limits.GetFrom()] = limits.GetFrom();
593                 m_notinintron[ePlus][limits.GetFrom()] = limits.GetFrom();
594                 m_notinexon[eMinus][0][limits.GetFrom()] = limits.GetFrom();
595                 m_notinexon[eMinus][1][limits.GetFrom()] = limits.GetFrom();
596                 m_notinexon[eMinus][2][limits.GetFrom()] = limits.GetFrom();
597                 m_notinintron[eMinus][limits.GetFrom()] = limits.GetFrom();
598             }
599             if(align.GetCdsInfo().MaxCdsLimits().GetTo() < TSignedSeqRange::GetWholeTo()) {
600                 m_notinexon[ePlus][0][limits.GetTo()] = limits.GetTo();
601                 m_notinexon[ePlus][1][limits.GetTo()] = limits.GetTo();
602                 m_notinexon[ePlus][2][limits.GetTo()] = limits.GetTo();
603                 m_notinintron[ePlus][limits.GetTo()] = limits.GetTo();
604                 m_notinexon[eMinus][0][limits.GetTo()] = limits.GetTo();
605                 m_notinexon[eMinus][1][limits.GetTo()] = limits.GetTo();
606                 m_notinexon[eMinus][2][limits.GetTo()] = limits.GetTo();
607                 m_notinintron[eMinus][limits.GetTo()] = limits.GetTo();
608             }
609         }
610     }
611 
612     for(TSignedSeqPos i = 1; i < len; ++i)
613         m_protnum[i] += m_protnum[i-1];
614 
615     TAlignSet::iterator it_prev = allaligns.end();
616 
617     for(TAlignSet::iterator it = allaligns.begin(); it != allaligns.end(); ++it)
618     {
619         const CGeneModel& algn(*it);
620 
621         if (it_prev != allaligns.end()) {
622             const CGeneModel& algn_prev(*it_prev);
623             if(algn.Limits().IntersectingWith(algn_prev.Limits())) {
624                 //we can not handle overlapping alignments - at this point it is input error
625                 CNcbiOstrstream ost;
626                 ost << "MaxCDS of alignment " << algn.ID() << " intersects MaxCDS of alignment " << algn_prev.ID();
627                 NCBI_THROW(CGnomonException, eGenericError, CNcbiOstrstreamToString(ost));
628             }
629         }
630         it_prev = it;
631 
632         int strand = algn.Strand();
633         int otherstrand = (strand == ePlus) ? eMinus : ePlus;
634 
635 
636         for(unsigned int k = 1; k < algn.Exons().size(); ++k)  // accept NONCONSENSUS alignment splices
637         {
638             if(algn.Exons()[k-1].m_ssplice)
639             {
640                 int i = algn.Exons()[k-1].GetTo();
641                 if(strand == ePlus) m_dscr[strand][i] = 0;  // donor score on the last base of exon
642                 else m_ascr[strand][i] = 0;                 // acceptor score on the last base of exon
643             }
644             if(algn.Exons()[k].m_fsplice)
645             {
646                 int i = algn.Exons()[k].GetFrom();
647                 if(strand == ePlus) m_ascr[strand][i-1] = 0; // acceptor score on the last base of intron
648                 else m_dscr[strand][i-1] = 0;                // donor score on the last base of intron
649             }
650         }
651 
652         for(unsigned int k = 1; k < algn.Exons().size(); ++k) // enforsing introns
653         {
654             int introna = algn.Exons()[k-1].GetTo()+1;
655             int intronb = algn.Exons()[k].GetFrom()-1;
656 
657             if(algn.Exons()[k-1].m_ssplice)
658             {
659                 int pnt = introna;
660                 m_notinexon[strand][0][pnt] = pnt;
661                 m_notinexon[strand][1][pnt] = pnt;
662                 m_notinexon[strand][2][pnt] = pnt;
663             }
664 
665             if(algn.Exons()[k].m_fsplice)
666             {
667                 int pnt = intronb;
668                 m_notinexon[strand][0][pnt] = pnt;
669                 m_notinexon[strand][1][pnt] = pnt;
670                 m_notinexon[strand][2][pnt] = pnt;
671             }
672 
673             if(algn.Exons()[k-1].m_ssplice && algn.Exons()[k].m_fsplice)
674             {
675                 for(int pnt = introna; pnt <= intronb; ++pnt)
676                 {
677                     m_notinexon[strand][0][pnt] = pnt;
678                     m_notinexon[strand][1][pnt] = pnt;
679                     m_notinexon[strand][2][pnt] = pnt;
680                 }
681             }
682         }
683 
684         for(unsigned int k = 0; k < algn.Exons().size(); ++k)  // enforcing exons
685         {
686             int exona = algn.Exons()[k].GetFrom();
687             int exonb = algn.Exons()[k].GetTo();
688             for(int pnt = exona; pnt <= exonb; ++pnt)
689             {
690                 m_notinintron[strand][pnt] = pnt;
691             }
692         }
693 
694         TSignedSeqPos a = algn.Limits().GetFrom();
695         TSignedSeqPos b = algn.Limits().GetTo();
696 
697         for(TSignedSeqPos i = a; i <= b && i < len-1; ++i)  // first and last points are conserved to deal with intergenics going outside
698         {
699             m_inalign[i] = max(TSignedSeqPos(1),a);         // one CDS maximum per alignment
700             m_notinintron[otherstrand][i] = i;              // no other strand models
701             m_notinexon[otherstrand][0][i] = i;
702             m_notinexon[otherstrand][1][i] = i;
703             m_notinexon[otherstrand][2][i] = i;
704         }
705 
706         if(strand == ePlus) a += 3;    // start is a part of intergenic!!!!
707         else b -= 3;
708 
709         m_notining[b] = a;                                  // at least one CDS per alignment
710         if(algn.ReadingFrame().NotEmpty()) {
711             for(TSignedSeqPos i = algn.ReadingFrame().GetFrom(); i <= algn.ReadingFrame().GetTo(); ++i)
712                 m_notining[i] = i;
713         }
714     }
715 
716 
717     if(leftwall) m_notinintron[ePlus][0] = m_notinintron[eMinus][0] = 0;                  // no partials
718     if(rightwall) m_notinintron[ePlus][len-1] = m_notinintron[eMinus][len-1] = len-1;
719 
720     for(int strand = 0; strand < 2; ++strand)
721     {
722         TIVec& nin = m_notinintron[strand];
723         for(TSignedSeqPos i = 1; i < len; ++i) nin[i] = max(nin[i-1],nin[i]);
724 
725         for(int frame = 0; frame < 3; ++frame)
726         {
727             TIVec& nex = m_notinexon[strand][frame];
728             for(TSignedSeqPos i = 1; i < len; ++i) nex[i] = max(nex[i-1],nex[i]);
729         }
730     }
731     for(TSignedSeqPos i = 1; i < len; ++i) m_notining[i] = max(m_notining[i-1],m_notining[i]);
732 
733     for(int strand = 0; strand < 2; ++strand)
734     {
735         CEResidueVec& s = m_seq[strand];
736 
737         m_anum[strand] = 0;
738         m_dnum[strand] = 0;
739         m_sttnum[strand] = 0;
740         m_stpnum[strand] = 0;
741 
742         if(strand == ePlus)
743         {
744             for(TSignedSeqPos i = 0; i < len; ++i)
745             {
746                 m_ascr[strand][i] = max(m_ascr[strand][i],m_acceptor.Score(s,i));
747                 m_dscr[strand][i] = max(m_dscr[strand][i],m_donor.Score(s,i));
748                 /*
749                 if(m_ascr[strand][i] != BadScore())
750                     cout << "Acceptor\t" << i+1+chunk.GetFrom() << "\t" << m_ascr[strand][i] << "\t+" << endl;
751                 if(m_dscr[strand][i] != BadScore())
752                     cout << "Donor\t" << i+2+chunk.GetFrom() << "\t" << m_dscr[strand][i] << "\t+" << endl;
753                 */
754                 m_sttscr[strand][i] = max(m_sttscr[strand][i],m_start.Score(s,i));
755                 m_stpscr[strand][i] = max(m_stpscr[strand][i],m_stop.Score(s,i));
756                 if(m_ascr[strand][i] != BadScore()) ++m_anum[strand];
757                 if(m_dscr[strand][i] != BadScore()) ++m_dnum[strand];
758                 if(m_sttscr[strand][i] != BadScore()) ++m_sttnum[strand];
759             }
760         }
761         else
762         {
763             for(TSignedSeqPos i = 0; i < len; ++i)
764             {
765                 int ii = len-2-i;   // extra -1 because ii is point on the "right"
766                 m_ascr[strand][i] = max(m_ascr[strand][i],m_acceptor.Score(s,ii));
767                 m_dscr[strand][i] = max(m_dscr[strand][i],m_donor.Score(s,ii));
768                 /*
769                 if(m_ascr[strand][i] != BadScore())
770                     cout << "Acceptor\t" << i+2+chunk.GetFrom() << "\t" << m_ascr[strand][i] << "\t-" << endl;
771                 if(m_dscr[strand][i] != BadScore())
772                     cout << "Donor\t" << i+1+chunk.GetFrom() << "\t" << m_dscr[strand][i] << "\t-" << endl;
773                 */
774                 m_sttscr[strand][i] = max(m_sttscr[strand][i],m_start.Score(s,ii));
775                 m_stpscr[strand][i] = max(m_stpscr[strand][i],m_stop.Score(s,ii));
776                 if(m_ascr[strand][i] != BadScore()) ++m_anum[strand];
777                 if(m_dscr[strand][i] != BadScore()) ++m_dnum[strand];
778             }
779         }
780     }
781 
782 
783     for(TAlignSet::iterator it = allaligns.begin(); it != allaligns.end(); ++it)
784     {
785         const CGeneModel& algn(*it);
786         int strand = algn.Strand();
787         TSignedSeqRange cds_lim = algn.ReadingFrame();
788         if(cds_lim.Empty()) continue;
789 
790         if(strand == ePlus)
791         {
792             for(unsigned int k = 0; k < algn.Exons().size(); ++k) {   // ignoring STOPS in CDS
793                 TSignedSeqPos a = max(algn.Exons()[k].GetFrom(),cds_lim.GetFrom());
794                 if(a > 0) --a;
795                 TSignedSeqPos b = min(algn.Exons()[k].GetTo(),cds_lim.GetTo())-1;   // -1 don't touch the real stop
796                 for(TSignedSeqPos i = a; i <= b; ++i)
797                     m_stpscr[strand][i] = BadScore();  // score on last coding base
798             }
799         }
800         else
801         {
802             for(unsigned int k = 0; k < algn.Exons().size(); ++k) {   // ignoring STOPS in CDS
803                 TSignedSeqPos a = max(algn.Exons()[k].GetFrom(),cds_lim.GetFrom());
804                 TSignedSeqPos b = min(algn.Exons()[k].GetTo(),cds_lim.GetTo());
805                 for(TSignedSeqPos i = a; i <= b; ++i)
806                     m_stpscr[strand][i] = BadScore();  // score on last intergenic (first stop) base
807             }
808         }
809     }
810 
811     for(int strand = 0; strand < 2; ++strand)
812     {
813         for(TSignedSeqPos i = 0; i < len; ++i)
814         {
815             if(m_sttscr[strand][i] != BadScore()) ++m_sttnum[strand];
816 
817             if(m_stpscr[strand][i] != BadScore())
818             {
819                 if(strand == ePlus)
820                 {
821                     int& lstp = m_laststop[strand][2-i%3][i+3];
822                     lstp = max(int(i)+1,lstp);
823                     ++m_stpnum[strand];
824                 }
825                 else
826                 {
827                     int& lstp = m_laststop[strand][i%3][i];
828                     lstp = max(int(i)-2,lstp);
829                     ++m_stpnum[strand];
830                 }
831             }
832         }
833     }
834 
835 
836     for(int strand = 0; strand < 2; ++strand)
837     {
838         CEResidueVec& s = m_seq[strand];
839 
840         for(TSignedSeqPos i = 0; i < len; ++i)
841         {
842             TSignedSeqPos ii = strand == ePlus ? i : len-1-i;
843 
844             double score = m_ncdr.Score(s,ii);
845             if(score == BadScore()) score = 0;
846             m_ncdrscr[strand][i] = score;
847             if(i > 0) m_ncdrscr[strand][i] += m_ncdrscr[strand][i-1];
848 
849             score = m_intrg.Score(s,ii);
850             if(score == BadScore()) score = 0;
851             m_ingscr[strand][i] = score;
852             if(i > 0) m_ingscr[strand][i] += m_ingscr[strand][i-1];
853         }
854 
855         for(int frame = 0; frame < 3; ++frame)
856         {
857             for(TSignedSeqPos i = 0; i < len; ++i)
858             {
859                 int codonshift,ii;
860                 if(strand == ePlus)     // left end of codon is shifted by frame bases to left
861                 {
862                     codonshift = (frame+i)%3;
863                     ii = i;
864                 }
865                 else                  // right end of codone is shifted by frame bases to right
866                 {
867                     codonshift = (frame-(int)i)%3;
868                     if(codonshift < 0) codonshift += 3;
869                     ii = len-1-i;
870                 }
871 
872                 double score = m_cdr.Score(s,ii,codonshift);
873                 if(score == BadScore()) score = 0;
874 
875                 m_cdrscr[strand][frame][i] = score;
876                 if(i > 0)
877                 {
878                     m_cdrscr[strand][frame][i] += m_cdrscr[strand][frame][i-1];
879 
880                     TIVec& lstp = m_laststop[strand][frame];
881                     lstp[i] = max(lstp[i-1],lstp[i]);
882                 }
883             }
884         }
885     }
886 
887     for(int strand = 0; strand < 2; ++strand)
888     {
889         for(int frame = 0; frame < 3; ++frame)
890         {
891             for(TSignedSeqPos i = 0; i < len; ++i)
892             {
893                 m_cdrscr[strand][frame][i] -= m_ncdrscr[strand][i];
894             }
895         }
896         for(TSignedSeqPos i = 0; i < len; ++i)
897         {
898             m_ingscr[strand][i] -= m_ncdrscr[strand][i];
899 
900             int left, right;
901             if(m_dscr[strand][i] != BadScore())
902             {
903                 const CTerminal& t = m_donor;
904                 left = i+1-(strand==ePlus ? t.Left():t.Right());
905                 right = i+(strand==ePlus ? t.Right():t.Left());
906                 left = max(0,left);
907                 right = min(len-1,right);
908                 m_dscr[strand][i] -= NonCodingScore(left,right,strand);
909             }
910             if(m_ascr[strand][i] != BadScore())
911             {
912                 const CTerminal& t = m_acceptor;
913                 left = i+1-(strand==ePlus ? t.Left():t.Right());
914                 right = i+(strand==ePlus ? t.Right():t.Left());
915                 left = max(0,left);
916                 right = min(len-1,right);
917                 m_ascr[strand][i] -= NonCodingScore(left,right,strand);
918             }
919             if(m_sttscr[strand][i] != BadScore())
920             {
921                 const CTerminal& t = m_start;
922                 left = i+1-(strand==ePlus ? t.Left():t.Right());
923                 right = i+(strand==ePlus ? t.Right():t.Left());
924                 left = max(0,left);
925                 right = min(len-1,right);
926                 m_sttscr[strand][i] -= NonCodingScore(left,right,strand);
927             }
928             if(m_stpscr[strand][i] != BadScore())
929             {
930                 const CTerminal& t = m_stop;
931                 left = i+1-(strand==ePlus ? t.Left():t.Right());
932                 right = i+(strand==ePlus ? t.Right():t.Left());
933                 left = max(0,left);
934                 right = min(len-1,right);
935                 m_stpscr[strand][i] -= NonCodingScore(left,right,strand);
936             }
937         }
938     }
939 
940     ITERATE(TAlignSet, it, allaligns) {
941         const CGeneModel& algn(*it);
942         if(algn.ReadingFrame().Empty())
943             continue;
944         int strand = algn.Strand();
945 
946         if(algn.Exons().front().m_fsplice_sig == "XX" || algn.Exons().front().m_ssplice_sig == "XX") {
947             int p = algn.ReadingFrame().GetFrom()-1;
948             if(strand == ePlus &&  !algn.HasStart()) {
949                 m_sttscr[strand][p] = consensuspenalty;
950                 ++m_sttnum[strand];
951             } else if(strand == eMinus && !algn.HasStop()) {
952                 m_stpscr[strand][p] = consensuspenalty;
953                 ++m_stpnum[strand];
954             }
955         }
956 
957         if(algn.Exons().back().m_fsplice_sig == "XX" || algn.Exons().back().m_ssplice_sig == "XX") {
958             int p = algn.ReadingFrame().GetTo();
959             if(strand == ePlus && !algn.HasStop()) {
960                 m_stpscr[strand][p] = consensuspenalty;
961                 ++m_stpnum[strand];
962             } else if(strand == eMinus && !algn.HasStart()) {
963                 m_sttscr[strand][p] = consensuspenalty;
964                 ++m_sttnum[strand];
965             }
966         }
967     }
968 
969     const int NonConsensusMargin = 50;
970     if (consensuspenalty != BadScore())
971     ITERATE(TAlignSet, it, allaligns) {
972         const CGeneModel& algn(*it);
973         if((algn.Type() & CGeneModel::eProt) == 0) continue;
974 
975         int strand = algn.Strand();
976         TSignedSeqRange lim = algn.Limits();
977 
978         for(unsigned int k = 1; k < algn.Exons().size(); ++k)
979         {
980             if(algn.Exons()[k-1].m_ssplice && algn.Exons()[k].m_fsplice) continue;
981 
982             int a = algn.Exons()[k-1].GetTo();
983             int b = algn.Exons()[k].GetFrom();
984             for(TSignedSeqPos i = a; i <= b; ++i)
985             {
986                 if(i <= a+NonConsensusMargin || i >= b-NonConsensusMargin)
987                 {
988                     if(m_dscr[strand][i] == BadScore())
989                     {
990                         m_dscr[strand][i] = consensuspenalty;
991                         ++m_dnum[strand];
992                     }
993 
994                     if(m_ascr[strand][i] == BadScore())
995                     {
996                         m_ascr[strand][i] = consensuspenalty;
997                         ++m_anum[strand];
998                     }
999                 }
1000             }
1001         }
1002 
1003         if(strand == ePlus)
1004         {
1005             if(!algn.HasStart() && algn.Exons().front().m_ssplice_sig != "XX")
1006             {
1007                 int a = max(2,lim.GetFrom()-NonConsensusMargin);
1008                 int b = lim.GetFrom()-1;
1009                 for(TSignedSeqPos i = a; i <= b; ++i)
1010                 {
1011                     if(m_sttscr[strand][i] == BadScore())
1012                     {
1013                         m_sttscr[strand][i] = consensuspenalty;
1014                         ++m_sttnum[strand];
1015                     }
1016 
1017                     if(m_dscr[strand][i] == BadScore())
1018                     {
1019                         m_dscr[strand][i] = consensuspenalty;
1020                         ++m_dnum[strand];
1021                     }
1022 
1023                     if(m_ascr[strand][i] == BadScore())
1024                     {
1025                         m_ascr[strand][i] = consensuspenalty;
1026                         ++m_anum[strand];
1027                     }
1028                 }
1029             }
1030 
1031             if(!algn.HasStop() && algn.Exons().back().m_fsplice_sig != "XX")
1032             {
1033                 int a = lim.GetTo();
1034                 int b = min(len-4,lim.GetTo()+NonConsensusMargin);
1035                 for(TSignedSeqPos i = a; i <= b; ++i)
1036                 {
1037                     if(m_stpscr[strand][i] == BadScore())
1038                     {
1039                         m_stpscr[strand][i] = consensuspenalty;
1040                         ++m_stpnum[strand];
1041                     }
1042 
1043                     if(m_dscr[strand][i] == BadScore())
1044                     {
1045                         m_dscr[strand][i] = consensuspenalty;
1046                         ++m_dnum[strand];
1047                     }
1048 
1049                     if(m_ascr[strand][i] == BadScore())
1050                     {
1051                         m_ascr[strand][i] = consensuspenalty;
1052                         ++m_anum[strand];
1053                     }
1054                 }
1055             }
1056         }
1057         else
1058         {
1059             if(!algn.HasStart() && algn.Exons().back().m_fsplice_sig != "XX")
1060             {
1061                 int a = lim.GetTo();
1062                 int b = min(len-4,lim.GetTo()+NonConsensusMargin);
1063                 for(TSignedSeqPos i = a; i <= b; ++i)
1064                 {
1065                     if(m_sttscr[strand][i] == BadScore())
1066                     {
1067                         m_sttscr[strand][i] = consensuspenalty;
1068                         ++m_sttnum[strand];
1069                     }
1070 
1071                     if(m_dscr[strand][i] == BadScore())
1072                     {
1073                         m_dscr[strand][i] = consensuspenalty;
1074                         ++m_dnum[strand];
1075                     }
1076 
1077                     if(m_ascr[strand][i] == BadScore())
1078                     {
1079                         m_ascr[strand][i] = consensuspenalty;
1080                         ++m_anum[strand];
1081                     }
1082                 }
1083             }
1084 
1085             if(!algn.HasStop() && algn.Exons().front().m_ssplice_sig != "XX")
1086             {
1087                 int a = max(2,lim.GetFrom()-NonConsensusMargin);
1088                 int b = lim.GetFrom()-1;
1089                 for(TSignedSeqPos i = a; i <= b; ++i)
1090                 {
1091                     if(m_stpscr[strand][i] == BadScore())
1092                     {
1093                         m_stpscr[strand][i] = consensuspenalty;
1094                         ++m_stpnum[strand];
1095                     }
1096 
1097                     if(m_dscr[strand][i] == BadScore())
1098                     {
1099                         m_dscr[strand][i] = consensuspenalty;
1100                         ++m_dnum[strand];
1101                     }
1102 
1103                     if(m_ascr[strand][i] == BadScore())
1104                     {
1105                         m_ascr[strand][i] = consensuspenalty;
1106                         ++m_anum[strand];
1107                     }
1108                 }
1109             }
1110         }
1111     }
1112 
1113     const int stpT = 1, stpTA = 2, stpTG = 4;
1114 
1115     for(int strand = 0; strand < 2; ++strand)
1116     {
1117         CEResidueVec& s = m_seq[strand];
1118 
1119         if(strand == ePlus) {
1120             for(TSignedSeqPos i = 0; i < len; ++i) {
1121                 if(m_ascr[strand][i] != BadScore()) {
1122                     if(s[i+1] == enA && s[i+2] == enA) m_asplit[strand][0][i] |= stpT;
1123                     if(s[i+1] == enA && s[i+2] == enG) m_asplit[strand][0][i] |= stpT;
1124                     if(s[i+1] == enG && s[i+2] == enA) m_asplit[strand][0][i] |= stpT;
1125                     if(s[i+1] == enA) m_asplit[strand][1][i] |= stpTA|stpTG;
1126                     if(s[i+1] == enG) m_asplit[strand][1][i] |= stpTA;
1127                 }
1128                 if(m_dscr[strand][i] != BadScore()) {
1129                     if(s[i] == enT) m_dsplit[strand][0][i] |= stpT;
1130                     if(s[i-1] == enT && s[i] == enA) m_dsplit[strand][1][i] |= stpTA;
1131                     if(s[i-1] == enT && s[i] == enG) m_dsplit[strand][1][i] |= stpTG;
1132                 }
1133             }
1134         } else {
1135             for(TSignedSeqPos i = 0; i < len; ++i) {
1136                 int ii = len-2-i;   // extra -1 because ii is point on the "right"
1137                 if(m_ascr[strand][i] != BadScore()) {
1138                     if(s[ii+1] == enA && s[ii+2] == enA) m_asplit[strand][0][i] |= stpT;
1139                     if(s[ii+1] == enA && s[ii+2] == enG) m_asplit[strand][0][i] |= stpT;
1140                     if(s[ii+1] == enG && s[ii+2] == enA) m_asplit[strand][0][i] |= stpT;
1141                     if(s[ii+1] == enA) m_asplit[strand][1][i] |= stpTA|stpTG;
1142                     if(s[ii+1] == enG) m_asplit[strand][1][i] |= stpTA;
1143                 }
1144                 if(m_dscr[strand][i] != BadScore()) {
1145                     if(s[ii] == enT) m_dsplit[strand][0][i] |= stpT;
1146                     if(s[ii-1] == enT && s[ii] == enA) m_dsplit[strand][1][i] |= stpTA;
1147                     if(s[ii-1] == enT && s[ii] == enG) m_dsplit[strand][1][i] |= stpTG;
1148                 }
1149             }
1150         }
1151     }
1152 
1153     for(TAlignSet::iterator it = allaligns.begin(); it != allaligns.end(); ++it)
1154     {
1155         const CGeneModel& algn(*it);
1156         int strand = algn.Strand();
1157         TSignedSeqRange cds_lim = algn.ReadingFrame();
1158         if(cds_lim.Empty()) continue;
1159 
1160         if(strand == ePlus)
1161         {
1162             for(unsigned int k = 0; k < algn.Exons().size()-1; ++k)   // ignoring splitted STOPS in CDS
1163             {
1164                 TSignedSeqPos b = algn.Exons()[k].GetTo();
1165                 if(algn.Exons()[k].m_ssplice && algn.Exons()[k+1].m_fsplice && b >= cds_lim.GetFrom() && b <= cds_lim.GetTo())
1166                 {
1167                     m_dsplit[strand][0][b] = 0;
1168                     m_dsplit[strand][1][b] = 0;
1169                 }
1170             }
1171         }
1172         else
1173         {
1174             for(unsigned int k = 1; k < algn.Exons().size(); ++k)   // ignoring splitted STOPS in CDS
1175             {
1176                 TSignedSeqPos a = algn.Exons()[k].GetFrom();
1177                 if(a > 0 && algn.Exons()[k-1].m_ssplice && algn.Exons()[k].m_fsplice && a >= cds_lim.GetFrom() && a <= cds_lim.GetTo())
1178                 {
1179                     m_dsplit[strand][0][a-1] = 0;
1180                     m_dsplit[strand][1][a-1] = 0;
1181                 }
1182             }
1183         }
1184     }
1185 }
1186 
SelectBestReadingFrame(const CGeneModel & model,const CEResidueVec & mrna,const CAlignMap & mrnamap,TIVec starts[3],TIVec stops[3],int & best_frame,int & best_start,int & best_stop,bool extend5p) const1187 double CGnomonEngine::SelectBestReadingFrame(const CGeneModel& model, const CEResidueVec& mrna, const CAlignMap& mrnamap, TIVec starts[3],  TIVec stops[3], int& best_frame, int& best_start, int& best_stop, bool extend5p) const
1188 {
1189     const CTerminal& acceptor    = *m_data->m_acceptor;
1190     const CTerminal& donor       = *m_data->m_donor;
1191     const CTerminal& stt         = *m_data->m_start;
1192     const CTerminal& stp         = *m_data->m_stop;
1193     const CCodingRegion& cdr     = *m_data->m_cdr;
1194     const CNonCodingRegion& ncdr = *m_data->m_ncdr;
1195     int contig_len = m_data->m_seq.size();
1196     EStrand strand = model.Strand();
1197     const vector<CModelExon>& exons = model.Exons();
1198     int num_exons = model.Exons().size();
1199 
1200     const CDoubleStrandSeq& ds = m_data->m_ds;
1201     TDVec splicescr(mrna.size(),0);
1202 
1203     if(strand == ePlus) {
1204         int shift = -1;
1205         for(int i = 1; i < num_exons; ++i) {
1206             shift += mrnamap.FShiftedLen(exons[i-1].GetFrom(),exons[i-1].GetTo());
1207 
1208             if(exons[i-1].m_ssplice && exons[i-1].m_ssplice_sig != "XX") {
1209                 int l = exons[i-1].GetTo();
1210                 double scr = donor.Score(ds[ePlus],l);
1211                 if(scr == BadScore()) {
1212                     scr = 0;
1213                 } else {
1214                     for(int k = l-donor.Left()+1; k <= l+donor.Right(); ++k) {
1215                         double s = ncdr.Score(ds[ePlus],k);
1216                         if(s == BadScore()) s = 0;
1217                         scr -= s;
1218                     }
1219                 }
1220                 splicescr[shift] = scr;
1221             }
1222 
1223             if(exons[i].m_fsplice && exons[i].m_fsplice_sig != "XX") {
1224                 int l = exons[i].GetFrom()-1;
1225                 double scr = acceptor.Score(ds[ePlus],l);
1226                 if(scr == BadScore()) {
1227                     scr = 0;
1228                 } else {
1229                     for(int k = l-acceptor.Left()+1; k <= l+acceptor.Right(); ++k) {
1230                         double s = ncdr.Score(ds[ePlus],k);
1231                         if(s == BadScore()) s = 0;
1232                         scr -= s;
1233                     }
1234                 }
1235                 splicescr[shift] += scr;
1236             }
1237         }
1238     } else {
1239         int shift = mrna.size()-1;
1240         for(int i = 1; i < num_exons; ++i) {
1241             shift -= mrnamap.FShiftedLen(exons[i-1].GetFrom(),exons[i-1].GetTo());
1242 
1243             if(exons[i-1].m_ssplice && exons[i-1].m_ssplice_sig != "XX") {
1244                 int l = contig_len-2-exons[i-1].GetTo();
1245                 double scr = acceptor.Score(ds[eMinus],l);
1246                 if(scr == BadScore()) {
1247                     scr = 0;
1248                 } else {
1249                     for(int k = l-acceptor.Left()+1; k <= l+acceptor.Right(); ++k) {
1250                         double s = ncdr.Score(ds[eMinus],k);
1251                         if(s == BadScore()) s = 0;
1252                         scr -= s;
1253                     }
1254                 }
1255                 splicescr[shift] = scr;
1256             }
1257 
1258             if(exons[i].m_fsplice && exons[i].m_fsplice_sig != "XX") {
1259                 int l = contig_len-1-exons[i].GetFrom();
1260                 double scr = donor.Score(ds[eMinus],l);
1261                 if(scr == BadScore()) {
1262                     scr = 0;
1263                 } else {
1264                     for(int k = l-donor.Left()+1; k <= l+donor.Right(); ++k) {
1265                         double s = ncdr.Score(ds[eMinus],k);
1266                         if(s == BadScore()) s = 0;
1267                         scr -= s;
1268                     }
1269                 }
1270                 splicescr[shift] += scr;
1271             }
1272         }
1273     }
1274 
1275     TDVec cdrscr[3];
1276     for(int frame = 0; frame < 3; ++frame) {
1277         if (frame==best_frame || best_frame==-1) {
1278             cdrscr[frame].resize(mrna.size(),BadScore());
1279             for(int i = 0; i < (int)mrna.size(); ++i) {
1280                 int codonshift = (i-frame)%3;
1281                 if(codonshift < 0)
1282                     codonshift += 3;
1283 
1284                 double scr = cdr.Score(mrna,i,codonshift);
1285                 if(scr == BadScore())  {
1286                     scr = 0;
1287                 } else {
1288                     double s = ncdr.Score(mrna,i);
1289                     if(s == BadScore()) s = 0;
1290                     scr -= s;
1291                 }
1292 
1293                 cdrscr[frame][i] = scr+splicescr[i];
1294                 if(i > 0)
1295                     cdrscr[frame][i] += cdrscr[frame][i-1];
1296             }
1297         }
1298     }
1299 
1300     double best_score  = BadScore();
1301     best_start = -1;
1302     best_stop = -1;
1303     CCDSInfo cds_info = model.GetCdsInfo();
1304 
1305     int best_frame_initial = best_frame;
1306 
1307     for(int frame = 0; frame < 3; ++frame) {
1308         if (frame==best_frame_initial || best_frame_initial==-1) {
1309             for(int i = 0; i < (int)stops[frame].size(); i++) {
1310                 int stop = stops[frame][i];
1311                 if(stop < 0)        // bogus stop
1312                     continue;
1313 
1314                 int prev_stop = -6;
1315                 if(i > 0)
1316                     prev_stop = stops[frame][i-1];
1317                 TIVec::iterator it_a = lower_bound(starts[frame].begin(),starts[frame].end(),prev_stop+3);
1318                 if(it_a == starts[frame].end() || *it_a >= stop)    // no start
1319                     continue;
1320                 TIVec::iterator it_b = it_a+1;
1321                 if(*it_a < 0 && it_b != starts[frame].end() && *it_b < stop)   // open rf and there is apropriate start at right
1322                     ++it_b;
1323                 if(!extend5p) {
1324                     for( ; it_b != starts[frame].end() && *it_b < stop; ++it_b);
1325                 }
1326 
1327                 for(TIVec::iterator it = it_a; it != it_b; it++) {             // if extend5p==true this loop includes only open rf (if exists) and one real start
1328                                                                                // otherwise the best start is selected
1329                     int start = *it;
1330 
1331                     if (stop-start-(start>=0?0:3) < 30)
1332                         continue;
1333 
1334                     double s = cdrscr[frame][stop-1]-cdrscr[frame][start+2+(start>=0?0:3)];
1335 
1336                     double stt_score = BadScore();
1337                     if(start >= stt.Left()+2) {    // 5 extra bases for ncdr
1338                         int pnt = start+2;
1339                         stt_score = stt.Score(mrna,pnt);
1340                         if(stt_score != BadScore()) {
1341                             for(int k = pnt-stt.Left()+1; k <= pnt+stt.Right(); ++k) {
1342                                 double sn = ncdr.Score(mrna,k);
1343                                 if(sn != BadScore())
1344                                     stt_score -= sn;
1345                             }
1346                         }
1347                     } else {
1348                         int pnt = mrnamap.MapEditedToOrig(start+(start>=0?0:3));
1349                         if(strand == eMinus) pnt = contig_len-1-pnt;
1350                         pnt += 2;
1351                         stt_score = stt.Score(ds[strand],pnt);
1352                         if(stt_score != BadScore()) {
1353                             for(int k = pnt-stt.Left()+1; k <= pnt+stt.Right(); ++k) {
1354                                 double sn = ncdr.Score(ds[strand],k);
1355                                 if(sn != BadScore())
1356                                     stt_score -= sn;
1357                             }
1358                         }
1359                     }
1360                     if(stt_score != BadScore())
1361                         s += stt_score;
1362 
1363                     double stp_score = stp.Score(mrna,stop-1);
1364                     if(stp_score != BadScore()) {
1365                         for(int k = stop-stp.Left(); k < stop+stp.Right(); ++k) {
1366                             double sn = ncdr.Score(mrna,k);
1367                             if(sn != BadScore())
1368                                 stp_score -= sn;
1369                         }
1370                         s += stp_score;
1371                     }
1372 
1373                     if(s >= best_score) {
1374                         best_frame = frame;
1375                         best_score = s;
1376                         best_start = start;
1377                         best_stop = stop;
1378                     }
1379                 }
1380             }
1381         }
1382     }
1383 
1384     if(cds_info.ConfirmedStart() && cds_info.ConfirmedStop() && model.Continuous()) {  // looks like a complete model, will get a permanent boost in score which will improve chanses for the first placement over notcomplete models
1385         best_score += max(1.,0.3*best_score);        // in case s negative
1386     }
1387 
1388     return best_score;
1389 }
1390 
GetScore(CGeneModel & model,bool extend5p,bool obeystart) const1391 void CGnomonEngine::GetScore(CGeneModel& model, bool extend5p, bool obeystart) const
1392 {
1393     CAlignMap mrnamap(model.Exons(),model.FrameShifts(),model.Strand());
1394     CEResidueVec mrna;
1395     mrnamap.EditedSequence(m_data->m_ds[ePlus], mrna);
1396 
1397     int frame = -1;
1398     TIVec starts[3], stops[3];
1399 
1400     FindStartsStops(model, m_data->m_ds[model.Strand()], mrna, mrnamap, starts, stops, frame, obeystart);
1401 
1402     CCDSInfo cds_info = model.GetCdsInfo();
1403     if((cds_info.ReadingFrame().NotEmpty() || !cds_info.PStops().empty()) && cds_info.IsMappedToGenome())
1404         cds_info = cds_info.MapFromOrigToEdited(mrnamap);
1405 
1406     // remove legit pstopd from stops[]
1407     CCDSInfo::TPStops pstops = cds_info.PStops();
1408     for(int fr = 0; fr < 3; ++fr) {
1409         ERASE_ITERATE(TIVec, pstp, stops[fr]) {
1410             TSignedSeqRange pstop = TSignedSeqRange(*pstp,*pstp+2);
1411             CCDSInfo::TPStops::iterator it = find(pstops.begin(), pstops.end(), pstop);
1412             if(it != pstops.end())
1413                 VECTOR_ERASE(pstp, stops[fr]);
1414         }
1415     }
1416 
1417     if(cds_info.ReadingFrame().Empty()) {   // we didn't know the frame before
1418         int mrna_len = (int)mrna.size();
1419 
1420         for(int fr = 0; fr < 3; ++fr) {
1421 
1422             int first_stop = stops[fr][0];
1423             //            if(first_stop < 0)  // bogus stop for maxcds
1424             //                first_stop = stops[fr][1];
1425 
1426             if(first_stop >= 3) {   // we have 5' reading frame
1427                 TSignedSeqRange rframe, stt, stp;
1428 
1429                 if(first_stop < mrna_len-2) {
1430                     stp = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(first_stop,first_stop+2), false);
1431                 }
1432 
1433                 int first_start = -1;
1434                 if(!starts[fr].empty()) {
1435                     first_start = starts[fr][0];
1436                     if(first_start < 0 && starts[fr].size() > 1)
1437                         first_start = starts[fr][1];
1438                 }
1439                 int fivep_rf = fr;
1440                 if(first_start >= 0 && first_start <= first_stop-6) {
1441                     stt = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(first_start,first_start+2), false);
1442                     fivep_rf = first_start+3;
1443                 }
1444 
1445                 if(stt.NotEmpty() || stops[fr][0] >= 0) {    // there is a start or no upstream stop
1446                     int threep_rf = first_stop-1;
1447                     rframe = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(fivep_rf,threep_rf), true);
1448 
1449                     _ASSERT(rframe.NotEmpty());
1450 
1451                     CCDSInfo ci;
1452                     ci.SetReadingFrame(rframe);
1453                     if(stt.NotEmpty()) {
1454                         ci.SetStart(stt);
1455                         if(stops[fr][0] < 0) {
1456                             int bs  = mrnamap.MapEditedToOrig(first_start);
1457                             _ASSERT( bs >= 0 );
1458                             ci.Set5PrimeCdsLimit(bs);
1459                         }
1460                     }
1461                     if(stp.NotEmpty())
1462                         ci.SetStop(stp);
1463                     ci.SetScore(0, stt.NotEmpty());
1464                     model.SetEdgeReadingFrames()->push_back(ci);
1465                 }
1466             }
1467 
1468             if(first_stop < mrna_len-2) {   //there is a stop and possibly 3' reading frame
1469                 TSignedSeqRange rframe, stt;
1470                 int last_stop = stops[fr][(int)stops[fr].size()-1];
1471                 if(last_stop >= mrna_len-2)
1472                     last_stop = stops[fr][(int)stops[fr].size()-2]; //garanteed to be present
1473                 int start;
1474                 ITERATE(TIVec, it, starts[fr]) {
1475                     start = *it;
1476                     if(start > last_stop && start <= mrna_len-6) {
1477                         stt = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(start,start+2), false);
1478                         break;
1479                     }
1480                 }
1481                 if(stt.NotEmpty()) {    // there is 3' reading frame
1482                     int fivep_rf = start+3;
1483                     int threep_rf = mrna_len-(mrna_len-fr)%3-1;
1484                     rframe = mrnamap.MapRangeEditedToOrig(TSignedSeqRange(fivep_rf,threep_rf), true);
1485 
1486                     _ASSERT(rframe.NotEmpty());
1487 
1488                     CCDSInfo ci;
1489                     ci.SetReadingFrame(rframe);
1490                     ci.SetStart(stt);
1491                     ci.Set5PrimeCdsLimit(model.Strand() == ePlus ? stt.GetFrom():stt.GetTo());
1492                     ci.SetScore(0,false);
1493                     model.SetEdgeReadingFrames()->push_back(ci);
1494                 }
1495             }
1496         }
1497     }
1498 
1499     int best_start, best_stop;
1500     double best_score = SelectBestReadingFrame(model, mrna, mrnamap, starts, stops, frame, best_start, best_stop, extend5p);
1501 
1502     if(cds_info.MaxCdsLimits().NotEmpty())
1503         cds_info.Clear5PrimeCdsLimit();
1504 
1505     if (best_score == BadScore()) {
1506         cds_info.Clear();
1507         model.SetCdsInfo( cds_info );
1508         return;
1509     }
1510 
1511     bool is_open = false;
1512     if(best_start == 0) {
1513         is_open = !cds_info.ConfirmedStart();
1514     } else if(best_start<0 && starts[frame].size() > 1) {
1515         int new_start = starts[frame][1];
1516         int newlen = best_stop-new_start;
1517         int oldlen = best_stop-best_start;
1518         if(newlen >= max(6,oldlen/2) || cds_info.ConfirmedStart()) {
1519             is_open = !cds_info.ConfirmedStart();
1520             best_start = new_start;
1521         }
1522     }
1523 
1524     if (cds_info.ConfirmedStart() && best_start != starts[frame].back()) {  // confirmed start was expanded within alignment
1525         model.AddComment("movedconfstart");
1526     }
1527 
1528     bool has_start = best_start>=0;
1529     bool confirmed_start = cds_info.ConfirmedStart();   //we wamnt to keep the status even if the actual start moved within alignment, the status will be used in gnomon and will prevent any furher extension
1530     bool confirmed_stop = cds_info.ConfirmedStop();
1531 
1532     TSignedSeqRange best_reading_frame = TSignedSeqRange(best_start+3,best_stop-1);
1533 
1534     if (Include(best_reading_frame, cds_info.Start()))
1535         cds_info.SetStart(TSignedSeqRange::GetEmpty());
1536     if (Include(best_reading_frame, cds_info.Stop()))
1537         cds_info.SetStop(TSignedSeqRange::GetEmpty());
1538 
1539     cds_info.ClearPStops();
1540 
1541     if (cds_info.ProtReadingFrame().NotEmpty() && !Include(best_reading_frame, cds_info.ProtReadingFrame())) {
1542         _ASSERT( best_start==0 );
1543         cds_info.SetReadingFrame(best_reading_frame & cds_info.ProtReadingFrame(), true);
1544     }
1545     cds_info.SetReadingFrame(best_reading_frame);
1546 
1547     int upstream_stop = frame-3;
1548     if(has_start) {
1549         cds_info.SetStart(TSignedSeqRange(best_start,best_start+2), confirmed_start);
1550         if(FindUpstreamStop(stops[frame],best_start,upstream_stop))
1551             cds_info.Set5PrimeCdsLimit(best_start);
1552     }
1553 
1554     if ((int)mrna.size() - best_stop >=3)
1555         cds_info.SetStop(TSignedSeqRange(best_stop,best_stop+2),confirmed_stop);
1556 
1557     for(int i = upstream_stop+3; i < best_stop; i += 3) {
1558         if(IsStopCodon(&mrna[i])) {
1559             TSignedSeqRange pstop = TSignedSeqRange(i,i+2);
1560             CCDSInfo::EStatus status = CCDSInfo::eUnknown;
1561             CCDSInfo::TPStops::iterator it = find(pstops.begin(), pstops.end(), pstop);
1562             if(it != pstops.end())
1563                 status = it->m_status;
1564             cds_info.AddPStop(pstop, status);
1565         }
1566     }
1567 
1568     cds_info.SetScore(best_score, is_open);
1569     cds_info = cds_info.MapFromEditedToOrig(mrnamap); // returns empty cds with badscore if con't map
1570     model.SetCdsInfo(cds_info);
1571 }
1572 
1573 
1574 END_SCOPE(gnomon)
1575 END_NCBI_SCOPE
1576