1 #ifndef ALGO_GNOMON___GNOMON_MODEL__HPP
2 #define ALGO_GNOMON___GNOMON_MODEL__HPP
3 
4 /*  $Id: gnomon_model.hpp 626290 2021-02-25 14:09:15Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors:  Alexandre Souvorov
30  *
31  * File Description:
32  *
33  */
34 
35 #include <corelib/ncbiobj.hpp>
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbi_limits.hpp>
38 #include "set.hpp"
39 
40 #include <set>
41 #include <vector>
42 #include <algorithm>
43 #include <math.h>
44 
45 #include <objmgr/seq_vector_ci.hpp> // CSeqVectorTypes::TResidue
46 #include <util/range.hpp>           // TSignedSeqRange
47 
48 BEGIN_NCBI_SCOPE
49 BEGIN_SCOPE(objects)
50 class CSeq_align;
51 class CSeq_id;
52 class CGenetic_code;
53 END_SCOPE(objects)
54 USING_SCOPE(objects);
55 BEGIN_SCOPE(gnomon)
56 
57 class CGnomonEngine;
58 
59 // Making this a constant declaration (kBadScore) would be preferable,
60 // but backfires on WorkShop, where it is implicitly static and hence
61 // unavailable for use in inline functions.
62 inline
BadScore()63 double BadScore() { return -numeric_limits<double>::max(); }
64 
65 enum EStrand { ePlus, eMinus};
OtherStrand(EStrand s)66 inline EStrand OtherStrand(EStrand s) { return (s == ePlus ? eMinus : ePlus); }
67 
68 typedef objects::CSeqVectorTypes::TResidue TResidue; // unsigned char
69 
70 typedef vector<TResidue> CResidueVec;
71 
72 typedef vector<int> TIVec;
73 
Precede(TSignedSeqRange l,TSignedSeqRange r)74 inline bool Precede(TSignedSeqRange l, TSignedSeqRange r) { return l.GetTo() < r.GetFrom(); }
Include(TSignedSeqRange big,TSignedSeqRange small)75 inline bool Include(TSignedSeqRange big, TSignedSeqRange small) { return (big.GetFrom()<=small.GetFrom() && small.GetTo()<=big.GetTo()); }
Include(TSignedSeqRange r,TSignedSeqPos p)76 inline bool Include(TSignedSeqRange r, TSignedSeqPos p) { return (r.GetFrom()<=p && p<=r.GetTo()); }
Enclosed(TSignedSeqRange big,TSignedSeqRange small)77 inline bool Enclosed(TSignedSeqRange big, TSignedSeqRange small) { return (big != small && Include(big, small)); }
78 
79 class CSupportInfo
80 {
81 public:
82     CSupportInfo(Int8 model_id, bool core=false);
83 
84 
85     Int8 GetId() const;
86     void SetCore(bool core);
87     bool IsCore() const;
88     bool operator==(const CSupportInfo& s) const;
89     bool operator<(const CSupportInfo& s) const;
90 
91 private:
92     Int8 m_id;
93     bool m_core_align;
94 };
95 
96 typedef CVectorSet<CSupportInfo> CSupportInfoSet;
97 
98 class CAlignModel;
99 
100 class NCBI_XALGOGNOMON_EXPORT CInDelInfo {
101 public:
102 
103     struct SSource {
SSourceCInDelInfo::SSource104         SSource() : m_strand(ePlus) {}
105         string m_acc;
106         TSignedSeqRange m_range;
107         EStrand m_strand;
108     };
109 
110     enum EType {eDel, eIns, eMism};
111     enum EStatus {eGenomeNotCorrect, eGenomeCorrect, eUnknown};
112 
CInDelInfo(TSignedSeqPos l,int len,EType type,const string & v=kEmptyStr,const SSource & s=SSource ())113     CInDelInfo(TSignedSeqPos l, int len, EType type, const string& v = kEmptyStr, const SSource& s = SSource()) { Init(l, len, type, v, s); }
114 
Loc() const115     TSignedSeqPos Loc() const { return m_loc; }
Len() const116     int Len() const { return m_len; }
InDelEnd() const117     int InDelEnd() const { return ((IsInsertion() || IsMismatch()) ? Loc()+Len() : Loc()); }  // first base "after" correction
IsInsertion() const118     bool IsInsertion() const { return m_type == eIns; }
IsDeletion() const119     bool IsDeletion() const { return m_type == eDel; }
IsMismatch() const120     bool IsMismatch() const { return m_type == eMism; }
IntersectingWith(TSignedSeqPos a,TSignedSeqPos b) const121     bool IntersectingWith(TSignedSeqPos a, TSignedSeqPos b) const    // insertion/mismatch at least partially inside, deletion inside or flanking
122     {
123         return (IsDeletion() && Loc() >= a && Loc() <= b+1) ||
124             ((IsInsertion() || IsMismatch()) && Loc() <= b && a <= Loc()+Len()-1);
125     }
operator <(const CInDelInfo & fsi) const126     bool operator<(const CInDelInfo& fsi) const    // source is ignored!!!!!!!!!!!
127     {
128         if(m_loc != fsi.m_loc)
129             return m_loc < fsi.m_loc;
130         else if(m_type != fsi.m_type)
131             return m_type < fsi.m_type;  // if location is same deletion first
132         else if(m_len != fsi.m_len)
133             return m_len < fsi.m_len;
134         else
135             return m_indelv < fsi.m_indelv;
136     }
operator !=(const CInDelInfo & fsi) const137     bool operator!=(const CInDelInfo& fsi) const { return (*this < fsi || fsi < *this); }
operator ==(const CInDelInfo & fsi) const138     bool operator==(const CInDelInfo& fsi) const { return !(*this != fsi); }
GetInDelV() const139     string GetInDelV() const { return m_indelv; }
GetSource() const140     const SSource& GetSource() const { return m_source; }
GetType() const141     EType GetType() const { return m_type; };
SetStatus(EStatus s)142     void SetStatus(EStatus s) { m_status = s; }
GetStatus() const143     EStatus GetStatus() const { return m_status; }
SetLoc(TSignedSeqPos l)144     void SetLoc(TSignedSeqPos l) { m_loc = l; }
145 
146 private:
Init(TSignedSeqPos l,int len,EType type,const string & v,const SSource & s)147     void Init(TSignedSeqPos l, int len, EType type, const string& v, const SSource& s) {
148         m_loc = l;
149         m_len = len;
150         m_type = type;
151         m_status = eUnknown;
152         m_indelv = v;
153         m_source = s;
154         _ASSERT(m_indelv.empty() || (int)m_indelv.length() == len);
155         _ASSERT(m_indelv.empty() || m_type != eIns);
156         if((IsDeletion() || IsMismatch()) && GetInDelV().empty())
157             m_indelv.insert( m_indelv.end(), Len(),'N');
158     }
159 
160     TSignedSeqPos m_loc;  // left location for insertion, deletion is before m_loc
161                           // insertion - when there are extra bases in the genome
162     int m_len;
163     EType m_type;
164     EStatus m_status;
165     string m_indelv;
166     SSource m_source;
167 };
168 
169 typedef vector<CInDelInfo> TInDels;
170 
171 template <class Res>
172 bool IsStartCodon(const Res * seq, int strand = ePlus);   // seq points to the first base in biological order
173 template <class Res>
174 bool IsStopCodon(const Res * seq, int strand = ePlus);   // seq points to the first base in biological order
175 
176 
177 class CRangeMapper {
178 public:
~CRangeMapper()179     virtual ~CRangeMapper() {}
180     virtual TSignedSeqRange operator()(TSignedSeqRange r, bool withextras = true) const = 0;
181 };
182 
183 class NCBI_XALGOGNOMON_EXPORT CModelExon {
184 public:
CModelExon(TSignedSeqPos f=0,TSignedSeqPos s=0,bool fs=false,bool ss=false,const string & fsig="",const string & ssig="",double ident=0,const string & seq="",const CInDelInfo::SSource & src=CInDelInfo::SSource ())185     CModelExon(TSignedSeqPos f = 0, TSignedSeqPos s = 0, bool fs = false, bool ss = false, const string& fsig = "", const string& ssig = "", double ident = 0, const string& seq = "", const CInDelInfo::SSource& src = CInDelInfo::SSource()) :
186         m_fsplice(fs), m_ssplice(ss), m_fsplice_sig(fsig), m_ssplice_sig(ssig), m_ident(ident), m_seq(seq), m_source(src), m_range(f,s)
187     {
188         _ASSERT(m_seq.empty() || m_range.Empty());
189     };
190 
operator ==(const CModelExon & p) const191     bool operator==(const CModelExon& p) const
192     {
193         return (m_range==p.m_range && m_fsplice == p.m_fsplice && m_ssplice == p.m_ssplice);
194     }
operator !=(const CModelExon & p) const195     bool operator!=(const CModelExon& p) const
196     {
197         return !(*this == p);
198     }
operator <(const CModelExon & p) const199     bool operator<(const CModelExon& p) const { return Precede(Limits(),p.Limits()); }
200 
operator TSignedSeqRange() const201     operator TSignedSeqRange() const { return m_range; }
Limits() const202     const TSignedSeqRange& Limits() const { return m_range; }
Limits()203           TSignedSeqRange& Limits()       { return m_range; }
GetFrom() const204     TSignedSeqPos GetFrom() const { return m_range.GetFrom(); }
GetTo() const205     TSignedSeqPos GetTo() const { return m_range.GetTo(); }
206     void Extend(const CModelExon& e);
AddFrom(int d)207     void AddFrom(int d) { m_range.SetFrom( m_range.GetFrom() +d ); }
AddTo(int d)208     void AddTo(int d) { m_range.SetTo( m_range.GetTo() +d ); }
209 
210     bool m_fsplice, m_ssplice;
211     string m_fsplice_sig, m_ssplice_sig;   // obeys strand
212     double m_ident;
213     string m_seq;   // exon sequence if in gap; obeys strand
214     CInDelInfo::SSource m_source;
215 
Remap(const CRangeMapper & mapper)216     void Remap(const CRangeMapper& mapper) { m_range = mapper(m_range); }
217 private:
218     TSignedSeqRange m_range;
219 };
220 
221 class CAlignMap;
222 
223 class CCDSInfo {
224 public:
CCDSInfo(bool gcoords=true)225     CCDSInfo(bool gcoords = true): m_confirmed_start(false), m_confirmed_stop(false), m_open(false), m_score(BadScore()), m_genomic_coordinates(gcoords) {}
226 
227     bool operator== (const CCDSInfo& another) const;
228 
229     //CDS mapped to transcript should be used only for for final models (not alignments)
230     //Change in indels or 5' UTR will invalidate the cooordinates (in particular conversion from CAlignModel to CGeneModel);
IsMappedToGenome() const231     bool IsMappedToGenome() const { return m_genomic_coordinates; }
232     CCDSInfo MapFromOrigToEdited(const CAlignMap& amap) const;
233     CCDSInfo MapFromEditedToOrig(const CAlignMap& amap) const;  // returns 'empty' CDS if can't map
234 
ReadingFrame() const235     TSignedSeqRange ReadingFrame() const { return m_reading_frame; }
ProtReadingFrame() const236     TSignedSeqRange ProtReadingFrame() const { return m_reading_frame_from_proteins; }
Cds() const237     TSignedSeqRange Cds() const { return Start()+ReadingFrame()+Stop(); }
MaxCdsLimits() const238     TSignedSeqRange MaxCdsLimits() const { return m_max_cds_limits; }
239 
Start() const240     TSignedSeqRange Start() const {return m_start; }
Stop() const241     TSignedSeqRange Stop() const {return m_stop; }
HasStart() const242     bool HasStart() const { return Start().NotEmpty(); }
HasStop() const243     bool HasStop () const { return Stop().NotEmpty();  }
ConfirmedStart() const244     bool ConfirmedStart() const { return m_confirmed_start; }  // start is confirmed by protein alignment
ConfirmedStop() const245     bool ConfirmedStop() const { return m_confirmed_stop; }  // stop is confirmed by protein alignment
246 
OpenCds() const247     bool OpenCds() const { return m_open; }  // "optimal" CDS is not internal
Score() const248     double Score() const { return m_score; }
249 
250     void SetReadingFrame(TSignedSeqRange r, bool protein = false);
251     void SetStart(TSignedSeqRange r, bool confirmed = false);
252     void SetStop(TSignedSeqRange r, bool confirmed = false);
253     void Set5PrimeCdsLimit(TSignedSeqPos p);
254     void Clear5PrimeCdsLimit();
255     void SetScore(double score, bool open=false);
256 
257     void CombineWith(const CCDSInfo& another_cds_info);
258     void Remap(const CRangeMapper& mapper);
259     void Clip(TSignedSeqRange limits);
260     void Cut(TSignedSeqRange hole);
261     void Clear();
262 
263     int Strand() const; // -1 (minus), 0 (unknown), 1 (plus)
264 
265     enum EStatus {eSelenocysteine, eGenomeNotCorrect, eGenomeCorrect, eUnknown};
266     struct SPStop : public TSignedSeqRange {
SPStopCCDSInfo::SPStop267         SPStop(TSignedSeqRange r, EStatus s) : TSignedSeqRange(r), m_status(s) {};
268 
269         //not overloaded == is used for uniquing and finding intervals
270         //overloaded < is used for sorting before uniquing
operator <CCDSInfo::SPStop271         bool operator<(const SPStop& stp) const {
272             if(operator==(stp))                // == is not overloaded
273                 return m_status < stp.m_status;
274             else
275                 return TSignedSeqRange::operator<(stp);
276         }
277 
278         EStatus m_status;
279     };
280 
281     typedef vector<SPStop> TPStops;
PStops() const282     const TPStops& PStops() const { return m_p_stops; }
283     bool PStop(bool includeall = true) const;                  // has premature stop(s)
AddPStop(SPStop stp)284     void AddPStop(SPStop stp) { m_p_stops.push_back(stp); _ASSERT( Invariant() ); }
285     void AddPStop(TSignedSeqRange r, EStatus status);
ClearPStops()286     void ClearPStops() { m_p_stops.clear(); }
287 
Invariant() const288     bool Invariant() const
289     {
290 #ifdef _DEBUG
291         if (ReadingFrame().Empty()) {
292             _ASSERT( !HasStop() && !HasStart() );
293             _ASSERT( ProtReadingFrame().Empty() && MaxCdsLimits().Empty() );
294             _ASSERT( !ConfirmedStart() );
295             _ASSERT( !ConfirmedStop() );
296             //            _ASSERT( !PStop() );
297             _ASSERT( !OpenCds() );
298             _ASSERT( Score()==BadScore() );
299             return true;
300         }
301 
302         _ASSERT( !Start().IntersectingWith(ReadingFrame()) );
303         _ASSERT( !Stop().IntersectingWith(ReadingFrame()) );
304         _ASSERT( ProtReadingFrame().Empty() || Include(ReadingFrame(), ProtReadingFrame()) );
305         _ASSERT( Include( MaxCdsLimits(), Cds() ) );
306 
307         if (!HasStop() && !HasStart()) {
308             _ASSERT(MaxCdsLimits().GetFrom()==TSignedSeqRange::GetWholeFrom() && MaxCdsLimits().GetTo()==TSignedSeqRange::GetWholeTo());
309         } else if (HasStart() && !HasStop()) {
310             if (Precede(Start(), ReadingFrame())) {
311                 _ASSERT( MaxCdsLimits().GetTo()==TSignedSeqRange::GetWholeTo() );
312             } else {
313                 _ASSERT( MaxCdsLimits().GetFrom()==TSignedSeqRange::GetWholeFrom() );
314             }
315         } else if (HasStart() && HasStop()) {
316             _ASSERT( Include(Start()+Stop(),ReadingFrame()) );
317         }
318         if (HasStop()) {
319             if (Precede(ReadingFrame(),Stop())) {
320                 _ASSERT( MaxCdsLimits().GetTo()==Stop().GetTo() );
321             } else {
322                 _ASSERT( MaxCdsLimits().GetFrom()==Stop().GetFrom() );
323             }
324         }
325 
326         if (ConfirmedStart()) {
327             _ASSERT( HasStart() );
328         }
329 
330         if (ConfirmedStop()) {
331             _ASSERT( HasStop() );
332         }
333 
334         //       ITERATE(TPStops, s, PStops())
335         //            _ASSERT( Include(MaxCdsLimits(), *s) );
336 #endif
337 
338         return true;
339     }
340 
341 private:
342     TSignedSeqRange m_start, m_stop;
343     TSignedSeqRange m_reading_frame;
344     TSignedSeqRange m_reading_frame_from_proteins;
345     TSignedSeqRange m_max_cds_limits;
346 
347     bool m_confirmed_start, m_confirmed_stop;
348     TPStops m_p_stops;
349 
350     bool m_open;
351     double m_score;
352 
353     bool m_genomic_coordinates;
354 };
355 
356 
357 class NCBI_XALGOGNOMON_EXPORT CGeneModel
358 {
359 public:
360     enum EType {
361         eWall = 1,
362         eNested = 2,
363         eSR = 4,
364         eEST = 8,
365         emRNA = 16,
366         eProt = 32,
367         eNotForChaining = 64,
368         eChain = 128,
369         eGnomon = 256
370     };
371     static string TypeToString(int type);
372 
373     enum EStatus {
374         ecDNAIntrons = 1,
375         eReversed = 2,
376         eSkipped = 4,
377         eLeftTrimmed = 8,
378         eRightTrimmed = 16,
379         eFullSupCDS = 32,
380         ePseudo = 64,
381         ePolyA = 128,
382         eCap = 256,
383         eBestPlacement = 512,
384         eUnknownOrientation = 1024,
385         eConsistentCoverage = 2048,
386         eGapFiller = 4096,
387         eUnmodifiedAlign = 8192,
388         eChangedByFilter = 16384,
389         eTSA = 32768,
390         eLeftConfirmed = 65536,
391         eRightConfirmed = 131072,
392         eLeftFlexible = 262144,
393         eRightFlexible = 524288
394     };
395 
CGeneModel(EStrand s=ePlus,Int8 id=0,int type=0)396     CGeneModel(EStrand s = ePlus, Int8 id = 0, int type = 0) :
397         m_type(type), m_id(id), m_status(0), m_ident(0), m_weight(1), m_expecting_hole(false), m_strand(s), m_geneid(0), m_rank_in_gene(0) {}
~CGeneModel()398     virtual ~CGeneModel() {}
399 
400     void AddExon(TSignedSeqRange exon, const string& fs = "", const string& ss = "", double ident = 0, const string& seq = "", const CInDelInfo::SSource& src = CInDelInfo::SSource());
401     void AddHole(); // between model and next exons
402     void AddGgapExon(double ident, const string& seq, const CInDelInfo::SSource& src, bool infront);
403     void AddNormalExon(TSignedSeqRange exon, const string& fs, const string& ss, double ident, bool infront);
404 
405     typedef vector<CModelExon> TExons;
Exons() const406     const TExons& Exons() const { return m_exons; }
ClearExons()407     void ClearExons() {
408         m_exons.clear();
409         m_fshifts.clear();
410         m_range = TSignedSeqRange::GetEmpty();
411         m_cds_info = CCDSInfo();
412         m_edge_reading_frames.clear();
413     }
SetSplices(int i,const string & f_sig,const string & s_sig)414     void SetSplices(int i, const string& f_sig, const string& s_sig) { m_exons[i].m_fsplice_sig = f_sig; m_exons[i].m_ssplice_sig = s_sig; }
415 
416     void ReverseComplementModel();
417 
418     void Remap(const CRangeMapper& mapper);
419     enum EClipMode { eRemoveExons, eDontRemoveExons };
420     virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant = true);  // drops the score!!!!!!!!!
421     virtual void CutExons(TSignedSeqRange hole); // clip or remove exons, dangerous, should be completely in or outside the cds, should not cut an exon in two
422     void ExtendLeft(int amount);
423     void ExtendRight(int amount);
424     void Extend(const CGeneModel& a, bool ensure_cds_invariant = true);
425     void RemoveShortHolesAndRescore(const CGnomonEngine& gnomon);   // removes holes shorter than min intron (may add frameshifts/stops)
426 
427     TSignedSeqRange TranscriptExon(int i) const;
428 
Limits() const429     TSignedSeqRange Limits() const { return m_range; }
430     TSignedSeqRange TranscriptLimits() const;
431     int AlignLen() const ;
432     void RecalculateLimits();
433 
434    // ReadingFrame doesn't include start/stop. It's always on codon boundaries
ReadingFrame() const435     TSignedSeqRange ReadingFrame() const { return m_cds_info.ReadingFrame(); }
436     // CdsLimits include start/stop if any, goes to model limit if no start/stop
437     TSignedSeqRange RealCdsLimits() const;
438     int RealCdsLen() const ;              // %3!=0 is possible
439     // MaxCdsLimits - longest cds. include start/stop if any,
440     // goes to 5' limit if no upstream stop, goes to 3' limit if no stop
441     TSignedSeqRange MaxCdsLimits() const;
442 
GetCdsInfo() const443     const CCDSInfo& GetCdsInfo() const { return m_cds_info; }
444     void SetCdsInfo(const CCDSInfo& cds_info);
445     void SetCdsInfo(const CGeneModel& a);
446     void CombineCdsInfo(const CGeneModel& a, bool ensure_cds_invariant = true);
447     void CombineCdsInfo(const CCDSInfo& cds_info, bool ensure_cds_invariant = true);
448 
IntersectingWith(const CGeneModel & a) const449     bool IntersectingWith(const CGeneModel& a) const
450     {
451         return Limits().IntersectingWith(a.Limits());
452     }
453 
Ident() const454     double Ident() const { return m_ident; }
SetIdent(double i)455     void SetIdent(double i) { m_ident = i; }
456 
Weight() const457     double Weight() const { return m_weight; }
SetWeight(double w)458     void SetWeight(double w) { m_weight = w; }
459 
SetStrand(EStrand s)460     void SetStrand(EStrand s) { m_strand = s; }
Strand() const461     EStrand Strand() const { return m_strand; }
Orientation() const462     EStrand Orientation() const {
463         bool notreversed = (Status()&CGeneModel::eReversed) == 0;
464         bool plusstrand = Strand() == ePlus;
465         return (notreversed == plusstrand) ? ePlus : eMinus;
466     }
467 
SetType(int t)468     void SetType(int t) { m_type = t; }
Type() const469     int Type() const { return m_type; }
GeneID() const470     Int8 GeneID() const { return m_geneid; }
SetGeneID(Int8 id)471     void SetGeneID(Int8 id) { m_geneid = id; }
RankInGene() const472     int RankInGene() const { return m_rank_in_gene; }
SetRankInGene(int rank)473     void SetRankInGene(int rank) { m_rank_in_gene = rank; }
ID() const474     Int8 ID() const { return m_id; }
SetID(Int8 id)475     void SetID(Int8 id) { m_id = id; }
Support() const476     const CSupportInfoSet& Support() const { return m_support; }
AddSupport(const CSupportInfo & support)477     bool AddSupport(const CSupportInfo& support) { return m_support.insert(support); }
ReplaceSupport(const CSupportInfoSet & support_set)478     void ReplaceSupport(const CSupportInfoSet& support_set) {  m_support = support_set; }
ProteinHit() const479     const string& ProteinHit() const { return  m_protein_hit; }
ProteinHit()480           string& ProteinHit()       { return  m_protein_hit; }
481 
Status()482     unsigned int& Status() { return m_status; }
Status() const483     const unsigned int& Status() const { return m_status; }
ClearStatus()484     void ClearStatus() { m_status = 0; }
485 
GetComment() const486     const string& GetComment() const { return m_comment; }
SetComment(const string & comment)487     void SetComment(const string& comment) { m_comment = comment; }
AddComment(const string & comment)488     void AddComment(const string& comment) { m_comment += " " + comment; }
489 
operator <(const CGeneModel & a) const490     bool operator<(const CGeneModel& a) const { return Precede(Limits(),a.Limits()); }
491 
Score() const492     double Score() const { return m_cds_info.Score(); }
493 
Continuous() const494     bool Continuous() const           //  no "holes" in alignment
495     {
496         for(unsigned int i = 1; i < Exons().size(); ++i)
497             if (!Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice)
498                 return false;
499         return true;
500     }
HasStart() const501     bool HasStart() const { return m_cds_info.HasStart(); }
HasStop() const502     bool HasStop () const { return m_cds_info.HasStop ();  }
LeftComplete() const503     bool LeftComplete()  const { return Strand() == ePlus ? HasStart() : HasStop();  }
RightComplete() const504     bool RightComplete() const { return Strand() == ePlus ? HasStop()  : HasStart(); }
FullCds() const505     bool FullCds() const { return HasStart() && HasStop() && Continuous(); }
CompleteCds() const506     bool CompleteCds() const { return FullCds() && (!Open5primeEnd() || ConfirmedStart()); }
507 
GoodEnoughToBeAnnotation() const508     bool GoodEnoughToBeAnnotation() const
509     {
510         _ASSERT( !(OpenCds()&&ConfirmedStart()) );
511         return (ReadingFrame().Empty() || (!OpenCds() && FullCds()));
512     }
513 
Open5primeEnd() const514     bool Open5primeEnd() const
515     {
516         return (Strand() == ePlus ? OpenLeftEnd() : OpenRightEnd());
517     }
OpenLeftEnd() const518     bool OpenLeftEnd() const
519     {
520         return ReadingFrame().NotEmpty() && GetCdsInfo().MaxCdsLimits().GetFrom()==TSignedSeqRange::GetWholeFrom();
521     }
OpenRightEnd() const522     bool OpenRightEnd() const
523     {
524         return ReadingFrame().NotEmpty() && GetCdsInfo().MaxCdsLimits().GetTo()==TSignedSeqRange::GetWholeTo();
525     }
526 
OpenCds() const527     bool OpenCds() const { return m_cds_info.OpenCds(); }  // "optimal" CDS is not internal
PStop(bool includeall=true) const528     bool PStop(bool includeall = true) const { return m_cds_info.PStop(includeall); }  // has premature stop(s)
529 
ConfirmedStart() const530     bool ConfirmedStart() const { return m_cds_info.ConfirmedStart(); }  // start is confirmed by protein alignment
ConfirmedStop() const531     bool ConfirmedStop() const { return m_cds_info.ConfirmedStop(); }  // stop is confirmed by protein alignment
532 
533     bool isNMD(int limit = 50) const;
534 
FrameShifts()535     TInDels& FrameShifts() { return m_fshifts; }
FrameShifts() const536     const TInDels& FrameShifts() const { return m_fshifts; }
537     TInDels FrameShifts(TSignedSeqPos a, TSignedSeqPos b) const;
538     TInDels GetInDels(bool fs_only) const;
539     TInDels GetInDels(TSignedSeqPos a, TSignedSeqPos b, bool fs_only) const;
540 
541     int FShiftedLen(TSignedSeqRange ab, bool withextras = true) const;    // won't work if a/b is insertion
FShiftedLen(TSignedSeqPos a,TSignedSeqPos b,bool withextras=true) const542     int FShiftedLen(TSignedSeqPos a, TSignedSeqPos b, bool withextras = true) const  { return FShiftedLen(TSignedSeqRange(a,b),withextras); }
543 
544     // move along mrna skipping introns
545     TSignedSeqPos FShiftedMove(TSignedSeqPos pos, int len) const;  // may retun <0 if hits a deletion at the end of move
546 
547     virtual CAlignMap GetAlignMap() const;
548 
549     string GetCdsDnaSequence (const CResidueVec& contig_sequence) const;
550     string GetProtein (const CResidueVec& contig_sequence) const;
551     string GetProtein (const CResidueVec& contig_sequence, const CGenetic_code* gencode) const;
552 
553     // Below comparisons ignore CDS completely, first 3 assume that alignments are the same strand
554 
555     int isCompatible(const CGeneModel& a) const;  // returns 0 for notcompatible or (number of common splices)+1
556     bool IsSubAlignOf(const CGeneModel& a) const;
557     int MutualExtension(const CGeneModel& a) const;  // returns 0 for notcompatible or (number of introns) + 1
558 
IdenticalAlign(const CGeneModel & a) const559     bool IdenticalAlign(const CGeneModel& a) const
560     { return Strand()==a.Strand() && Limits()==a.Limits() && Exons() == a.Exons() && FrameShifts()==a.FrameShifts() &&
561             GetCdsInfo().PStops() == a.GetCdsInfo().PStops() && Type() == a.Type() && Status() == a.Status(); }
operator ==(const CGeneModel & a) const562     bool operator==(const CGeneModel& a) const
563     {
564         return IdenticalAlign(a) && Type()==a.Type() && m_id==a.m_id && m_support==a.m_support;
565     }
566 
TrustedmRNA() const567     const list< CRef<CSeq_id> >& TrustedmRNA() const { return m_trusted_mrna; }
InsertTrustedmRNA(CRef<CSeq_id> g)568     void InsertTrustedmRNA(CRef<CSeq_id> g) { m_trusted_mrna.push_back(g); };
ClearTrustedmRNA()569     void ClearTrustedmRNA() { m_trusted_mrna.clear(); };
570 
TrustedProt() const571     const list< CRef<CSeq_id> >& TrustedProt() const { return m_trusted_prot; }
InsertTrustedProt(CRef<CSeq_id> g)572     void InsertTrustedProt(CRef<CSeq_id> g) { m_trusted_prot.push_back(g); };
ClearTrustedProt()573     void ClearTrustedProt() { m_trusted_prot.clear(); };
574 
GetEdgeReadingFrames() const575     const vector<CCDSInfo>* GetEdgeReadingFrames() const { return &m_edge_reading_frames; }
SetEdgeReadingFrames()576     vector<CCDSInfo>* SetEdgeReadingFrames() { return &m_edge_reading_frames; }
577 
578 
579 #ifdef _DEBUG
580     int oid;
581 #endif
582 
583 private:
584     void RemoveExtraFShifts(int left, int right);
585     void TrimEdgesToFrameInOtherAlignGaps(const TExons& exons_with_gaps);
586 
587     int m_type;
588     Int8 m_id;
589     unsigned int m_status;
590 
591     double m_ident;
592     double m_weight;
593 
594     TExons m_exons;
MyExons()595     TExons& MyExons() { return m_exons; }
596     bool m_expecting_hole;
597 
598     TSignedSeqRange m_range;
599     EStrand m_strand;
600     TInDels m_fshifts;
601 
602     CCDSInfo m_cds_info;
603     bool CdsInvariant(bool check_start_stop = true) const;
604 
605     Int8 m_geneid;
606     int m_rank_in_gene;
607     CSupportInfoSet m_support;
608     string m_protein_hit;
609     string m_comment;
610     list< CRef<CSeq_id> > m_trusted_prot;
611     list< CRef<CSeq_id> > m_trusted_mrna;
612 
613 
614     vector<CCDSInfo> m_edge_reading_frames;
615 
616     friend class CChain;
617 };
618 
619 
620 class CAlignMap {
621 public:
622     enum EEdgeType { eBoundary, eSplice, eInDel, eGgap };
623     enum ERangeEnd{ eLeftEnd, eRightEnd, eSinglePoint };
624 
CAlignMap()625     CAlignMap() {};
CAlignMap(TSignedSeqPos orig_a,TSignedSeqPos orig_b)626     CAlignMap(TSignedSeqPos orig_a, TSignedSeqPos orig_b) : m_orientation(ePlus) {
627         m_orig_ranges.push_back(SMapRange(SMapRangeEdge(orig_a), SMapRangeEdge(orig_b), kEmptyStr));
628         m_edited_ranges = m_orig_ranges;
629         m_target_len = FShiftedLen(orig_a, orig_b);
630     }
CAlignMap(TSignedSeqPos orig_a,TSignedSeqPos orig_b,TInDels::const_iterator fsi_begin,const TInDels::const_iterator fsi_end)631     CAlignMap(TSignedSeqPos orig_a, TSignedSeqPos orig_b, TInDels::const_iterator fsi_begin, const TInDels::const_iterator fsi_end) : m_orientation(ePlus) {
632         EEdgeType atype = eBoundary;
633         EEdgeType btype = eBoundary;
634         if(fsi_begin != fsi_end) {
635             if(fsi_begin->Loc() == orig_a && !fsi_begin->IsMismatch()) {
636                 _ASSERT(!fsi_begin->IsInsertion());   // no reason to have insertion
637                 atype = eInDel;
638             }
639             TInDels::const_iterator fs = fsi_end-1;
640             if(fs->Loc() == orig_b+1 && fs->IsDeletion())
641                 btype = eInDel;
642         }
643         InsertIndelRangesForInterval(orig_a, orig_b, 0, fsi_begin, fsi_end, atype, btype, "", "");
644         m_target_len = FShiftedLen(orig_a, orig_b);
645     }
646     CAlignMap(const CGeneModel::TExons& exons, const vector<TSignedSeqRange>& transcript_exons, const TInDels& indels, EStrand orientation, int targetlen );     //orientation == strand if not Reversed
647     CAlignMap(const CGeneModel::TExons& exons, const TInDels& frameshifts, EStrand strand, TSignedSeqRange lim = TSignedSeqRange::GetWhole(), int holelen = 0, int polyalen = 0);
648     TSignedSeqPos MapOrigToEdited(TSignedSeqPos orig_pos) const;
649     TSignedSeqPos MapEditedToOrig(TSignedSeqPos edited_pos) const;
650     TSignedSeqRange MapRangeOrigToEdited(TSignedSeqRange orig_range, ERangeEnd lend, ERangeEnd rend) const;
MapRangeOrigToEdited(TSignedSeqRange orig_range,bool withextras=true) const651     TSignedSeqRange MapRangeOrigToEdited(TSignedSeqRange orig_range, bool withextras = true) const { return MapRangeOrigToEdited(orig_range, withextras?eLeftEnd:eSinglePoint, withextras?eRightEnd:eSinglePoint); }
652     TSignedSeqRange MapRangeEditedToOrig(TSignedSeqRange edited_range, bool withextras = true) const;
653     template <class Vec>
654     void EditedSequence(const Vec& original_sequence, Vec& edited_sequence, bool includeholes = false) const;
655     int FShiftedLen(TSignedSeqRange ab, ERangeEnd lend, ERangeEnd rend) const;
656     int FShiftedLen(TSignedSeqRange ab, bool withextras = true) const;
FShiftedLen(TSignedSeqPos a,TSignedSeqPos b,bool withextras=true) const657     int FShiftedLen(TSignedSeqPos a, TSignedSeqPos b, bool withextras = true) const { return FShiftedLen(TSignedSeqRange(a,b), withextras); }
658     //snap to codons works by analising transcript coordinates (MUST be a protein or reading frame cutout)
659     TSignedSeqRange ShrinkToRealPoints(TSignedSeqRange orig_range, bool snap_to_codons = false) const;
660     TSignedSeqRange ShrinkToRealPointsOnEdited(TSignedSeqRange edited_range) const;
661     TSignedSeqPos FShiftedMove(TSignedSeqPos orig_pos, int len) const;   // may reurn < 0 if hits a gap
662     //    TInDels GetInDels(bool fs_only) const;
663     //    TInDels GetAllCorrections() const;
TargetLen() const664     int TargetLen() const { return m_target_len; }
Orientation() const665     EStrand Orientation() const { return m_orientation; }
MoveOrigin(TSignedSeqPos shift)666     void MoveOrigin(TSignedSeqPos shift) {
667         for(auto& mrange : m_orig_ranges)
668             mrange.MoveOrigin(shift);
669     }
670 
671 // private: // breaks SMapRange on WorkShop. :-/
672     struct SMapRangeEdge {
SMapRangeEdgeCAlignMap::SMapRangeEdge673         SMapRangeEdge(TSignedSeqPos p, TSignedSeqPos e = 0, EEdgeType t = eBoundary, const string& seq = kEmptyStr) : m_pos(p), m_extra(e), m_edge_type(t), m_extra_seq(seq) {}
operator <CAlignMap::SMapRangeEdge674         bool operator<(const SMapRangeEdge& mre) const { return m_pos < mre.m_pos; }
operator ==CAlignMap::SMapRangeEdge675         bool operator==(const SMapRangeEdge& mre) const { return m_pos == mre.m_pos; }
676 
677         TSignedSeqPos m_pos, m_extra;
678         EEdgeType m_edge_type;
679         string m_extra_seq;
680     };
681 
682     class SMapRange {
683     public:
SMapRange(SMapRangeEdge from,SMapRangeEdge to,const string & mism)684         SMapRange(SMapRangeEdge from, SMapRangeEdge to, const string& mism) : m_from(from), m_to(to), m_mism_seq(mism) {}
GetEdgeFrom() const685         SMapRangeEdge GetEdgeFrom() const { return m_from; }
GetEdgeTo() const686         SMapRangeEdge GetEdgeTo() const { return m_to; }
SetEdgeFrom(SMapRangeEdge from)687         void SetEdgeFrom(SMapRangeEdge from) { m_from = from; }
SetEdgeTo(SMapRangeEdge to)688         void SetEdgeTo(SMapRangeEdge to) { m_to = to; }
MoveOrigin(TSignedSeqPos shift)689         void MoveOrigin(TSignedSeqPos shift) {
690             m_from.m_pos -= shift;
691             m_to.m_pos -= shift;
692         }
GetFrom() const693         TSignedSeqPos GetFrom() const { return m_from.m_pos; }
GetTo() const694         TSignedSeqPos GetTo() const { return m_to.m_pos; }
GetExtendedFrom() const695         TSignedSeqPos GetExtendedFrom() const { return m_from.m_pos - m_from.m_extra; }
GetExtendedTo() const696         TSignedSeqPos GetExtendedTo() const { return m_to.m_pos+m_to.m_extra; }
GetExtraFrom() const697         TSignedSeqPos GetExtraFrom() const { return m_from.m_extra; }
GetExtraSeqFrom() const698         string GetExtraSeqFrom() const { return m_from.m_extra_seq; }
GetExtraTo() const699         TSignedSeqPos GetExtraTo() const { return m_to.m_extra; }
GetExtraSeqTo() const700         string GetExtraSeqTo() const { return m_to.m_extra_seq; }
GetTypeFrom() const701         EEdgeType GetTypeFrom() const { return m_from.m_edge_type; }
GetTypeTo() const702         EEdgeType GetTypeTo() const { return m_to.m_edge_type; }
GetMismatch() const703         const string& GetMismatch() const { return m_mism_seq; }
operator <(const SMapRange & mr) const704         bool operator<(const SMapRange& mr) const {
705             if(m_from.m_pos == mr.m_from.m_pos) return m_to.m_pos < mr.m_to.m_pos;
706             else return m_from.m_pos < mr.m_from.m_pos;
707         }
708 
709     private:
710         SMapRangeEdge m_from, m_to;
711         string m_mism_seq;
712     };
713 
714     //    static TInDels RemoveExtraIndels(const TInDels& indels, const CGeneModel::TExons& exons);
715 
716 private:
717     static TSignedSeqPos MapAtoB(const vector<CAlignMap::SMapRange>& a, const vector<CAlignMap::SMapRange>& b, TSignedSeqPos p, ERangeEnd move_mode);
718     static TSignedSeqRange MapRangeAtoB(const vector<CAlignMap::SMapRange>& a, const vector<CAlignMap::SMapRange>& b, TSignedSeqRange r, ERangeEnd lend, ERangeEnd rend);
MapRangeAtoB(const vector<CAlignMap::SMapRange> & a,const vector<CAlignMap::SMapRange> & b,TSignedSeqRange r,bool withextras)719     static TSignedSeqRange MapRangeAtoB(const vector<CAlignMap::SMapRange>& a, const vector<CAlignMap::SMapRange>& b, TSignedSeqRange r, bool withextras ) {
720         return MapRangeAtoB(a, b, r, withextras?eLeftEnd:eSinglePoint, withextras?eRightEnd:eSinglePoint);
721     };
722     static int FindLowerRange(const vector<CAlignMap::SMapRange>& a,  TSignedSeqPos p);
723 
724     void InsertOneToOneRange(TSignedSeqPos orig_start, TSignedSeqPos edited_start, int len, const string& mism, int left_orige, int left_edite, int right_orige, int right_edite, EEdgeType left_type, EEdgeType right_type, const string& left_edit_extra_seq, const string& right_edit_extra_seq);
725     TSignedSeqPos InsertIndelRangesForInterval(TSignedSeqPos orig_a, TSignedSeqPos orig_b, TSignedSeqPos edit_a, TInDels::const_iterator fsi_begin, TInDels::const_iterator fsi_end, EEdgeType type_a, EEdgeType type_b, const string& gseq_a, const string& gseq_b);
726 
727     vector<SMapRange> m_orig_ranges, m_edited_ranges;
728     EStrand m_orientation;
729     int m_target_len;
730 };
731 
732 
733 
734 
735 class NCBI_XALGOGNOMON_EXPORT CAlignModel : public CGeneModel {
736 public:
CAlignModel()737     CAlignModel() {}
738     CAlignModel(const objects::CSeq_align& seq_align);
739     CAlignModel(const CGeneModel& g, const CAlignMap& a);
GetAlignMap() const740     virtual CAlignMap GetAlignMap() const { return m_alignmap; }
741     void ResetAlignMap();
742 
Clip(TSignedSeqRange limits,EClipMode mode,bool ensure_cds_invariant=true)743     virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant = true) { // drops the score!!!!!!!!!
744         CGeneModel::Clip(limits,mode,ensure_cds_invariant);
745         RecalculateAlignMap(limits.GetFrom(), limits.GetTo());
746     }
CutExons(TSignedSeqRange hole)747     virtual void CutExons(TSignedSeqRange hole) { // clip or remove exons, dangerous, should be completely in or outside the cds, should not cut an exon in two
748         CGeneModel::CutExons(hole);
749         RecalculateAlignMap(hole.GetTo()+1, hole.GetFrom()-1);
750     }
751 
752     string TargetAccession() const;
SetTargetId(const objects::CSeq_id & id)753     void SetTargetId(const objects::CSeq_id& id) { m_target_id.Reset(&id); }
GetTargetId() const754     CConstRef<objects::CSeq_id> GetTargetId() const { return m_target_id; }
TargetLen() const755     int TargetLen() const { return m_alignmap.TargetLen(); }
756     int PolyALen() const;
757     CRef<objects::CSeq_align> MakeSeqAlign(const string& contig) const;   // should be used for alignments only; for chains and models will produce a Splign alignment of mRNA
758 
759 private:
760     void RecalculateAlignMap(int left, int right);
761     CAlignMap m_alignmap;
762     CConstRef<objects::CSeq_id> m_target_id;
763 };
764 
765 
766 
767 
768 struct NCBI_XALGOGNOMON_EXPORT setcontig {
769     const string& m_contig;
setcontigsetcontig770     explicit setcontig(const string& cntg) : m_contig(cntg) {}
771 };
772 struct NCBI_XALGOGNOMON_EXPORT getcontig {
773     string& m_contig;
getcontiggetcontig774     explicit getcontig(string& cntg) : m_contig(cntg) {}
775 };
776 NCBI_XALGOGNOMON_EXPORT CNcbiOstream& operator<<(CNcbiOstream& s, const setcontig& c);
777 NCBI_XALGOGNOMON_EXPORT CNcbiIstream& operator>>(CNcbiIstream& s, const getcontig& c);
778 
779 NCBI_XALGOGNOMON_EXPORT CNcbiIstream& operator>>(CNcbiIstream& s, CAlignModel& a);
780 NCBI_XALGOGNOMON_EXPORT CNcbiOstream& operator<<(CNcbiOstream& s, const CAlignModel& a);
781 NCBI_XALGOGNOMON_EXPORT CNcbiOstream& operator<<(CNcbiOstream& s, const CGeneModel& a);
782 
783 
784 template<class Model>
785 class NCBI_XALGOGNOMON_EXPORT CModelCluster : public list<Model> {
786 public:
787     typedef Model TModel;
CModelCluster(int f=numeric_limits<int>::max (),int s=0)788     CModelCluster(int f = numeric_limits<int>::max(), int s = 0) : m_limits(f,s) {}
CModelCluster(TSignedSeqRange limits)789     CModelCluster(TSignedSeqRange limits) : m_limits(limits) {}
Insert(const Model & a)790     void Insert(const Model& a) {
791         m_limits.CombineWith(a.Limits());
792         this->push_back(a);
793     }
Splice(CModelCluster & c)794     void Splice(CModelCluster& c) { // elements removed from c and inserted into *this
795         m_limits.CombineWith(c.Limits());
796         this->splice(list<Model>::end(),c);
797     }
Limits() const798     TSignedSeqRange Limits() const { return m_limits; }
operator <(const CModelCluster & c) const799     bool operator<(const CModelCluster& c) const { return Precede(m_limits, c.m_limits); }
Init(TSignedSeqPos first,TSignedSeqPos second)800     void Init(TSignedSeqPos first, TSignedSeqPos second) {
801         list<Model>::clear();
802         m_limits.SetFrom( first );
803         m_limits.SetTo( second );
804     }
805 
806 private:
807     TSignedSeqRange m_limits;
808 };
809 
810 typedef CModelCluster<CGeneModel> TGeneModelCluster;
811 typedef CModelCluster<CAlignModel> TAlignModelCluster;
812 
813 typedef list<CGeneModel> TGeneModelList;
814 typedef list<CAlignModel> TAlignModelList;
815 
816 
817 template<class Cluster>
818 class NCBI_XALGOGNOMON_EXPORT CModelClusterSet : public set<Cluster> {
819  public:
820     typedef typename set<Cluster>::iterator Titerator;
CModelClusterSet()821     CModelClusterSet() {}
Insert(const typename Cluster::TModel & a)822     void Insert(const typename Cluster::TModel& a) {
823         Cluster clust;
824         clust.Insert(a);
825         Titerator first = set<Cluster>::lower_bound(Cluster(a.Limits().GetFrom(),a.Limits().GetFrom()));
826         Titerator second = set<Cluster>::upper_bound(Cluster(a.Limits().GetTo(),a.Limits().GetTo()));
827         for(Titerator it = first; it != second;) {
828             clust.Splice(const_cast<Cluster&>(*it));
829             this->erase(it++);
830         }
831         const_cast<Cluster&>(*this->insert(second,Cluster(clust.Limits()))).Splice(clust);
832     }
833 };
834 
835 typedef CModelClusterSet<TGeneModelCluster> TGeneModelClusterSet;
836 typedef CModelClusterSet<TAlignModelCluster> TAlignModelClusterSet;
837 
838 
839 enum EResidueNames { enA, enC, enG, enT, enN };
840 
841 class EResidue {
842 public :
EResidue()843     EResidue() : data(enN) {}
EResidue(EResidueNames e)844     EResidue(EResidueNames e) : data(e) {}
845 
operator int() const846     operator int() const { return int(data); }
847 
848 private:
849     unsigned char data;
850 };
851 
Complement(TResidue c)852 inline TResidue Complement(TResidue c)
853 {
854     switch(c)
855     {
856         case 'A':
857             return 'T';
858         case 'a':
859             return 't';
860         case 'C':
861             return 'G';
862         case 'c':
863             return 'g';
864         case 'G':
865             return 'C';
866         case 'g':
867             return 'c';
868         case 'T':
869             return 'A';
870         case 't':
871             return 'a';
872         default:
873             return 'N';
874     }
875 }
876 
877 extern const EResidue    k_toMinus[5];
878 extern const char *const k_aa_table;
879 
Complement(EResidue c)880 inline EResidue Complement(EResidue c)
881 {
882     return k_toMinus[c];
883 }
884 
885 template <class BidirectionalIterator>
ReverseComplement(const BidirectionalIterator & first,const BidirectionalIterator & last)886 void ReverseComplement(const BidirectionalIterator& first, const BidirectionalIterator& last)
887 {
888     for (BidirectionalIterator i( first ); i != last; ++i)
889         *i = Complement(*i);
890     reverse(first, last);
891 }
892 
893 template<class Model>
GetAlignParts(const Model & algn,bool settrimflags)894 list<Model> GetAlignParts(const Model& algn, bool settrimflags) {
895     list<Model> parts;
896     int left = algn.Limits().GetFrom();
897     for(unsigned int i = 1; i < algn.Exons().size(); ++i) {
898         if (!algn.Exons()[i-1].m_ssplice || !algn.Exons()[i].m_fsplice) {
899             Model m = algn;
900             m.Status() &= ~CGeneModel::ePolyA;
901             m.Status() &= ~CGeneModel::eCap;
902             m.Clip(TSignedSeqRange(left,algn.Exons()[i-1].GetTo()),CGeneModel::eRemoveExons);
903             if(!parts.empty() && settrimflags) {
904                 parts.back().Status() &= ~CGeneModel::eRightTrimmed;
905                 m.Status() &= ~CGeneModel::eLeftTrimmed;
906             }
907             parts.push_back(m);
908             left = algn.Exons()[i].GetFrom();
909         }
910     }
911     if(!parts.empty()) {
912         Model m = algn;
913         m.Clip(TSignedSeqRange(left,algn.Limits().GetTo()),CGeneModel::eRemoveExons);
914         m.Status() &= ~CGeneModel::ePolyA;
915         m.Status() &= ~CGeneModel::eCap;
916         if(settrimflags) {
917             parts.back().Status() &= ~CGeneModel::eRightTrimmed;
918             m.Status() &= ~CGeneModel::eLeftTrimmed;
919         }
920         parts.push_back(m);
921 
922         if(algn.Status()&CGeneModel::ePolyA) {
923             if(algn.Strand() == ePlus)
924                 parts.back().Status() |= CGeneModel::ePolyA;
925             else
926                 parts.front().Status() |= CGeneModel::ePolyA;
927         }
928         if(algn.Status()&CGeneModel::eCap) {
929             if(algn.Strand() == ePlus)
930                 parts.front().Status() |= CGeneModel::eCap;
931             else
932                 parts.back().Status() |= CGeneModel::eCap;
933         }
934     }
935 
936     return parts;
937 }
938 
939 void MapAlignsToOrigContig(TAlignModelList& aligns, const TInDels& corrections, int contig_size);
940 
941 
942 
943 END_SCOPE(gnomon)
944 END_NCBI_SCOPE
945 
946 #endif  // ALGO_GNOMON___GNOMON_MODEL__HPP
947