1 /*  $Id: gnomon_model.cpp 626290 2021-02-25 14:09:15Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <algo/gnomon/gnomon_model.hpp>
34 #include <algo/gnomon/gnomon.hpp>
35 #include "gnomon_seq.hpp"
36 #include <algo/gnomon/id_handler.hpp>
37 #include <set>
38 #include <functional>
39 #include <corelib/ncbiutil.hpp>
40 #include <objects/general/Object_id.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42 
43 BEGIN_NCBI_SCOPE
44 BEGIN_SCOPE(gnomon)
45 
46 USING_SCOPE(objects);
47 
GetInDels(TSignedSeqPos a,TSignedSeqPos b,bool fs_only) const48 TInDels CGeneModel::GetInDels(TSignedSeqPos a, TSignedSeqPos b, bool fs_only) const {
49 
50     TInDels indels = GetInDels(fs_only);
51     TInDels selected_indels;
52 
53     ITERATE(TInDels, i, indels) {
54         if(i->IntersectingWith(a,b))
55             selected_indels.push_back( *i );
56     }
57 
58     return selected_indels;
59 }
60 
GetInDels(bool fs_only) const61 TInDels CGeneModel::GetInDels(bool fs_only) const {
62 
63     TInDels selected_indels;
64     if(!fs_only) {
65         ITERATE(TInDels, i, FrameShifts()) {
66             if(!i->IsMismatch())
67                 selected_indels.push_back(*i);
68         }
69     } else {
70         TExons::const_iterator e = Exons().begin();
71         ITERATE(TInDels, i, FrameShifts()) {
72             if(i->IsMismatch() || i->Len()%3 == 0)  // skip mismatches and full codons
73                 continue;
74 
75             for( ;e != Exons().end() && (e->Limits().Empty() || !i->IntersectingWith(e->GetFrom(),e->GetTo())); ++e); // harboring exon
76             if(i->InDelEnd() > e->GetTo() && e != Exons().end()) {                                                   // skip full codons splitted by intron
77                 TInDels::const_iterator next = i;
78                 if(++next != FrameShifts().end() && (++e)->Limits().NotEmpty() && next->Loc() == e->GetFrom() && !next->IsMismatch()) {
79                     int len = i->Len() + (i->GetType() == next->GetType() ? next->Len() : -next->Len());
80                     if(len%3 == 0) {
81                         i = next;
82                         continue;
83                     }
84                 }
85             }
86 
87             selected_indels.push_back(*i);
88         }
89     }
90 
91     return selected_indels;
92 }
93 
94 
TranscriptExon(int i) const95 TSignedSeqRange CGeneModel::TranscriptExon(int i) const {
96     CAlignMap amap = GetAlignMap();
97 
98     if(Exons()[i].Limits().NotEmpty()) {
99         return amap.MapRangeOrigToEdited(Exons()[i].Limits(), true);
100     } else if(i > 0) {  // there is real exon on the left
101         int p = amap.MapOrigToEdited(Exons()[i-1].GetTo());
102         if(Orientation() == ePlus)
103             return TSignedSeqRange(p+1,p+Exons()[i].m_seq.size());
104         else
105             return TSignedSeqRange(p-Exons()[i].m_seq.size(),p-1);
106     } else {  // there is a real exon on the right
107         _ASSERT(i < (int)Exons().size()-1);
108         int p = amap.MapOrigToEdited(Exons()[i+1].GetFrom());
109         if(Orientation() == ePlus)
110             return TSignedSeqRange(p-Exons()[i].m_seq.size(),p-1);
111         else
112             return TSignedSeqRange(p+1,p+Exons()[i].m_seq.size());
113     }
114 }
115 
TranscriptLimits() const116 TSignedSeqRange CGeneModel::TranscriptLimits() const {
117     CAlignMap amap = GetAlignMap();
118 
119     int l,r;
120 
121     if(Orientation() == ePlus) {
122         if(Exons().front().Limits().NotEmpty()) {
123             l = amap.MapRangeOrigToEdited(Exons().front().Limits(),true).GetFrom();
124         } else {
125             _ASSERT(Exons().size() > 1 && Exons()[1].Limits().NotEmpty());
126             l = amap.MapOrigToEdited(Exons()[1].GetFrom())-Exons().front().m_seq.length();
127         }
128         if(Exons().back().Limits().NotEmpty()) {
129             r = amap.MapRangeOrigToEdited(Exons().back().Limits(),true).GetTo();
130         } else {
131             _ASSERT(Exons().size() > 1 && Exons()[Exons().size()-2].Limits().NotEmpty());
132             r = amap.MapOrigToEdited(Exons()[Exons().size()-2].GetTo()) + Exons().back().m_seq.length();
133         }
134     } else {
135         if(Exons().front().Limits().NotEmpty()) {
136             r = amap.MapRangeOrigToEdited(Exons().front().Limits(),true).GetTo();
137         } else {
138             _ASSERT(Exons().size() > 1 && Exons()[1].Limits().NotEmpty());
139             r = amap.MapOrigToEdited(Exons()[1].GetFrom())+Exons().front().m_seq.length();
140         }
141         if(Exons().back().Limits().NotEmpty()) {
142             l = amap.MapRangeOrigToEdited(Exons().back().Limits(),true).GetFrom();
143         } else {
144             _ASSERT(Exons().size() > 1 && Exons()[Exons().size()-2].Limits().NotEmpty());
145             l = amap.MapOrigToEdited(Exons()[Exons().size()-2].GetTo()) - Exons().back().m_seq.length();
146         }
147     }
148 
149     return TSignedSeqRange(l,r);
150     }
151 
isNMD(int limit) const152 bool CGeneModel::isNMD(int limit) const
153 {
154     if (GetCdsInfo().ReadingFrame().Empty() || Exons().size() <= 1)
155         return false;
156 
157     TSignedSeqRange cds = GetCdsInfo().Start()+GetCdsInfo().ReadingFrame()+GetCdsInfo().Stop();
158     CAlignMap amap = GetAlignMap();
159     if(GetCdsInfo().IsMappedToGenome()) {
160         cds = amap.MapRangeOrigToEdited(cds);
161     }
162 
163     int l = -1;
164     if (Strand() == ePlus) {
165         for(int i = (int)Exons().size()-1; i > 0; --i) {
166             if(Exons()[i].m_fsplice && Exons()[i-1].m_ssplice && Exons()[i].m_fsplice_sig != "XX" && Exons()[i-1].m_ssplice_sig != "XX") {
167                 l = amap.MapRangeOrigToEdited(Exons()[i],true).GetFrom();
168                 break;
169             }
170         }
171     } else {
172         for(int i = 0; i < (int)Exons().size()-1; ++i) {
173             if(Exons()[i].m_ssplice && Exons()[i+1].m_fsplice && Exons()[i].m_ssplice_sig != "XX" && Exons()[i+1].m_fsplice_sig != "XX") {
174                 l = amap.MapRangeOrigToEdited(Exons()[i],true).GetFrom();
175                 break;
176             }
177         }
178     }
179     return l > cds.GetTo()+limit;
180 }
181 
182 
ReverseComplementModel()183 void CGeneModel::ReverseComplementModel() {
184     SetStrand(Strand() == ePlus ? eMinus : ePlus);
185     Status() ^= eReversed;
186     NON_CONST_ITERATE(TExons, it, MyExons()) {
187         if(it->m_fsplice_sig != "XX")
188             ReverseComplement(it->m_fsplice_sig.begin(),it->m_fsplice_sig.end());
189         if(it->m_ssplice_sig != "XX")
190             ReverseComplement(it->m_ssplice_sig.begin(),it->m_ssplice_sig.end());
191     }
192 }
193 
GetAlignMap() const194 CAlignMap CGeneModel::GetAlignMap() const { return CAlignMap(Exons(), FrameShifts(), Strand()); }
195 
CAlignModel(const CGeneModel & g,const CAlignMap & a)196 CAlignModel::CAlignModel(const CGeneModel& g, const CAlignMap& a)
197     : CGeneModel(g), m_alignmap(a)
198 {
199 
200 #ifdef _DEBUG
201     int diff = 0;
202     for(int ie = 0; ie < (int)Exons().size(); ++ie) {
203         if(Exons()[ie].Limits().Empty())
204             diff += Exons()[ie].m_seq.size();
205         else
206             diff += Exons()[ie].Limits().GetLength();
207 
208         diff -= TranscriptExon(ie).GetLength();
209     }
210     ITERATE(TInDels, i, FrameShifts()) {
211         if(i->IsDeletion())
212             diff += i->Len();
213         else if(i->IsInsertion())
214             diff -= i->Len();
215     }
216     _ASSERT(diff == 0);
217 #endif
218 
219     SetTargetId(*CIdHandler::GnomonMRNA(ID()));
220     if(a.Orientation() != g.Strand())
221         Status() |= CGeneModel::eReversed;
222 }
223 
ResetAlignMap()224 void CAlignModel::ResetAlignMap()
225 {
226     Status() &= ~CGeneModel::eReversed;
227     m_alignmap = CGeneModel::GetAlignMap();
228 }
229 
RecalculateAlignMap(int left,int right)230 void CAlignModel::RecalculateAlignMap(int left, int right) {
231     if(Exons().empty()) {
232         m_alignmap = CAlignMap();
233         return;
234     }
235 
236     vector<TSignedSeqRange> transcript_exons;
237     for(int ie = 0; ie < (int)Exons().size(); ++ie) {
238         const CModelExon& e = Exons()[ie];
239         if(e.Limits().Empty()) {
240             transcript_exons.push_back(TranscriptExon(ie));
241         } else {
242             CAlignMap::ERangeEnd l = e.GetFrom() == left ? CAlignMap::eSinglePoint : CAlignMap::eLeftEnd;
243             CAlignMap::ERangeEnd r = e.GetTo() == right ? CAlignMap::eSinglePoint : CAlignMap::eRightEnd;
244             TSignedSeqRange texon = m_alignmap.MapRangeOrigToEdited(e.Limits(), l, r);
245             transcript_exons.push_back(texon);
246         }
247     }
248     m_alignmap = CAlignMap(Exons(), transcript_exons, FrameShifts(), Orientation(), m_alignmap.TargetLen());
249 }
250 
TargetAccession() const251 string CAlignModel::TargetAccession() const {
252     _ASSERT( !GetTargetId().Empty() );
253     if (GetTargetId().Empty())
254         return "UnknownTarget";
255     return CIdHandler::ToString(*GetTargetId());
256 }
257 
Remap(const CRangeMapper & mapper)258 void CGeneModel::Remap(const CRangeMapper& mapper)
259 {
260     NON_CONST_ITERATE(TExons, e, MyExons()) {
261         e->Remap(mapper);
262     }
263     RecalculateLimits();
264 
265     if(ReadingFrame().NotEmpty())
266         m_cds_info.Remap(mapper);
267 }
268 
operator ==(const CCDSInfo & a) const269 bool CCDSInfo::operator== (const CCDSInfo& a) const
270 {
271     return
272         m_start==a.m_start &&
273         m_stop==a.m_stop &&
274         m_reading_frame==a.m_reading_frame &&
275         m_reading_frame_from_proteins==a.m_reading_frame_from_proteins &&
276         m_max_cds_limits==a.m_max_cds_limits &&
277         m_confirmed_start==a.m_confirmed_start &&
278         m_confirmed_stop==a.m_confirmed_stop &&
279         m_p_stops==a.m_p_stops &&
280         m_open==a.m_open &&
281         m_score==a.m_score &&
282         m_genomic_coordinates == a.m_genomic_coordinates;
283 }
284 
MapFromEditedToOrig(const CAlignMap & amap) const285 CCDSInfo CCDSInfo::MapFromEditedToOrig(const CAlignMap& amap) const
286 {
287     CCDSInfo empty_cds;
288     CCDSInfo new_cds(true);
289 
290     if(ProtReadingFrame().NotEmpty()) {
291         TSignedSeqRange prot_rf = amap.MapRangeEditedToOrig(ProtReadingFrame(), true);
292         if(prot_rf.NotEmpty())
293             new_cds.SetReadingFrame(prot_rf, true);
294         else
295             return empty_cds;
296     }
297     if(ReadingFrame().NotEmpty()) {
298         TSignedSeqRange rf = amap.MapRangeEditedToOrig(ReadingFrame(), true);
299         if(rf.NotEmpty() && amap.FShiftedLen(rf, true)%3 == 0)
300             new_cds.SetReadingFrame(rf, false);
301         else
302             return empty_cds;
303     }
304     if(HasStart()) {
305         TSignedSeqRange start = amap.MapRangeEditedToOrig(Start(), false);
306         if(start.NotEmpty())
307             new_cds.SetStart(start, ConfirmedStart());
308         else
309             return empty_cds;
310     }
311     if(HasStop()) {
312         TSignedSeqRange stop = amap.MapRangeEditedToOrig(Stop(), false);
313         if(stop.NotEmpty())
314             new_cds.SetStop(stop, ConfirmedStop());
315         else
316             return empty_cds;
317     }
318     ITERATE(TPStops, i, PStops()) {
319         TSignedSeqRange pstop = amap.MapRangeEditedToOrig(*i, false);
320         if(pstop.NotEmpty())
321             new_cds.AddPStop(pstop, i->m_status);
322         else
323             return empty_cds;
324     }
325     if(HasStart() && MaxCdsLimits().GetFrom() == Start().GetFrom()) {
326         new_cds.Set5PrimeCdsLimit(amap.MapEditedToOrig(Start().GetFrom()));
327     } else if(HasStart() &&  MaxCdsLimits().GetTo() == Start().GetTo()) {
328         new_cds.Set5PrimeCdsLimit(amap.MapEditedToOrig(Start().GetTo()));
329     }
330     new_cds.SetScore(Score(), OpenCds());
331 
332     return new_cds;
333 }
334 
MapFromOrigToEdited(const CAlignMap & amap) const335 CCDSInfo CCDSInfo::MapFromOrigToEdited(const CAlignMap& amap) const
336 {
337     CCDSInfo new_cds(false);
338 
339     if(ProtReadingFrame().NotEmpty()) {
340         new_cds.SetReadingFrame(amap.MapRangeOrigToEdited(ProtReadingFrame(), true), true);
341         _ASSERT(new_cds.ProtReadingFrame().NotEmpty());
342     }
343     if(ReadingFrame().NotEmpty()) {
344         new_cds.SetReadingFrame(amap.MapRangeOrigToEdited(ReadingFrame(), true), false);
345         _ASSERT(new_cds.ReadingFrame().NotEmpty());
346     }
347     if(HasStart()) {
348         new_cds.SetStart(amap.MapRangeOrigToEdited(Start(), false), ConfirmedStart());
349         _ASSERT(new_cds.HasStart());
350     }
351     if(HasStop()) {
352         new_cds.SetStop(amap.MapRangeOrigToEdited(Stop(), false), ConfirmedStop());
353         _ASSERT(new_cds.HasStop());
354     }
355     ITERATE(TPStops, i, PStops()) {
356         TSignedSeqRange p = amap.MapRangeOrigToEdited(*i, false);
357         _ASSERT(p.NotEmpty());
358         new_cds.AddPStop(p, i->m_status);
359     }
360     if(HasStart() && MaxCdsLimits().GetFrom() == Start().GetFrom()) {
361         new_cds.Set5PrimeCdsLimit(amap.MapOrigToEdited(Start().GetFrom()));
362     } else if(HasStart() &&  MaxCdsLimits().GetTo() == Start().GetTo()) {
363         new_cds.Set5PrimeCdsLimit(amap.MapOrigToEdited(Start().GetTo()));
364     }
365     new_cds.SetScore(Score(), OpenCds());
366 
367     return new_cds;
368 }
369 
SetReadingFrame(TSignedSeqRange r,bool protein)370 void CCDSInfo::SetReadingFrame(TSignedSeqRange r, bool protein)
371 {
372     if (r.Empty()) {
373         if(protein)
374             m_reading_frame_from_proteins = r;
375         else
376             Clear();
377     } else {
378         m_reading_frame = r;
379         if (protein)
380             m_reading_frame_from_proteins = r;
381         if (m_max_cds_limits.Empty()) {
382             m_max_cds_limits = TSignedSeqRange::GetWhole();
383         }
384     }
385     _ASSERT( Invariant() );
386 }
387 
SetStart(TSignedSeqRange r,bool confirmed)388 void CCDSInfo::SetStart(TSignedSeqRange r, bool confirmed)
389 {
390     if (confirmed) {
391         m_confirmed_start = true;
392         m_open = false;
393     } else if (m_confirmed_start && r != m_start) {
394         m_confirmed_start = false;
395     }
396     m_start = r;
397     _ASSERT( Invariant() );
398 }
399 
SetStop(TSignedSeqRange r,bool confirmed)400 void CCDSInfo::SetStop(TSignedSeqRange r, bool confirmed)
401 {
402     if(m_stop.NotEmpty()) {
403         if(m_max_cds_limits.GetFrom() == m_stop.GetFrom())
404             m_max_cds_limits.SetFrom( TSignedSeqRange::GetWholeFrom());
405         if(m_max_cds_limits.GetTo() == m_stop.GetTo())
406             m_max_cds_limits.SetTo( TSignedSeqRange::GetWholeTo());
407     }
408 
409     if (confirmed)
410         m_confirmed_stop = true;
411     else if (m_confirmed_stop && r != m_stop) {
412         m_confirmed_stop = false;
413     }
414     m_stop = r;
415     if (r.NotEmpty()) {
416         if (Precede(m_reading_frame,r))
417             m_max_cds_limits.SetTo(r.GetTo());
418         else
419             m_max_cds_limits.SetFrom(r.GetFrom());
420     }
421     if(!m_p_stops.empty() && m_p_stops.back() == m_stop)
422         m_p_stops.pop_back();
423     if(!m_p_stops.empty() && m_p_stops.front() == m_stop)
424         m_p_stops.erase(m_p_stops.begin());
425 
426     _ASSERT( Invariant() );
427 }
428 
Clear5PrimeCdsLimit()429 void CCDSInfo::Clear5PrimeCdsLimit()
430 {
431     if (HasStop()) {
432         if (Precede(ReadingFrame(),Stop()))
433             m_max_cds_limits.SetFrom( TSignedSeqRange::GetWholeFrom());
434         else
435             m_max_cds_limits.SetTo( TSignedSeqRange::GetWholeTo());
436     } else {
437         m_max_cds_limits = TSignedSeqRange::GetWhole();
438     }
439     _ASSERT( Invariant() );
440 }
441 
Set5PrimeCdsLimit(TSignedSeqPos p)442 void CCDSInfo::Set5PrimeCdsLimit(TSignedSeqPos p)
443 {
444     if (p <= m_reading_frame.GetFrom())
445         m_max_cds_limits.SetFrom(p);
446     else
447         m_max_cds_limits.SetTo(p);
448 
449     _ASSERT( Invariant() );
450 }
451 
452 
AddPStop(TSignedSeqRange r,EStatus status)453 void CCDSInfo::AddPStop(TSignedSeqRange r, EStatus status)
454 {
455     m_p_stops.push_back(SPStop(r, status));
456     _ASSERT( Invariant() );
457 }
458 
459 
SetScore(double score,bool open)460 void CCDSInfo::SetScore(double score, bool open)
461 {
462     m_score = score;
463     _ASSERT( !(open && !HasStart()) );
464     m_open = open;
465     _ASSERT( Invariant() );
466 }
467 
Remap(const CRangeMapper & mapper)468 void CCDSInfo::Remap(const CRangeMapper& mapper)
469 {
470     if(m_start.NotEmpty())
471         m_start = mapper(m_start, false);
472     if(m_stop.NotEmpty())
473         m_stop = mapper(m_stop, false);
474     if(m_reading_frame.NotEmpty())
475         m_reading_frame = mapper(m_reading_frame, true);
476     if(m_reading_frame_from_proteins.NotEmpty())
477         m_reading_frame_from_proteins = mapper(m_reading_frame_from_proteins, true);
478     if(m_max_cds_limits.NotEmpty())
479         m_max_cds_limits = mapper(m_max_cds_limits, false);
480     NON_CONST_ITERATE(TPStops, s, m_p_stops) {
481         *s = SPStop(mapper(*s, false), s->m_status);
482     }
483     _ASSERT(Invariant());
484 }
485 
CombineWith(const CCDSInfo & another_cds_info)486 void CCDSInfo::CombineWith(const CCDSInfo& another_cds_info)
487 {
488     if (another_cds_info.m_reading_frame.Empty())
489         return;
490 
491     CCDSInfo new_cds_info;
492 
493     if (m_reading_frame.Empty()) {
494         new_cds_info = another_cds_info;
495     } else {
496         new_cds_info = *this;
497 
498         new_cds_info.m_p_stops.insert(new_cds_info.m_p_stops.end(),another_cds_info.m_p_stops.begin(),another_cds_info.m_p_stops.end());
499         sort(new_cds_info.m_p_stops.begin(),new_cds_info.m_p_stops.end());
500         new_cds_info.m_p_stops.erase( unique(new_cds_info.m_p_stops.begin(),new_cds_info.m_p_stops.end()), new_cds_info.m_p_stops.end() ) ;
501 
502         new_cds_info.m_reading_frame               += another_cds_info.m_reading_frame;
503         new_cds_info.m_reading_frame_from_proteins += another_cds_info.m_reading_frame_from_proteins;
504         new_cds_info.m_max_cds_limits        &= another_cds_info.m_max_cds_limits;
505 
506         if (new_cds_info.m_confirmed_start && Include(new_cds_info.m_reading_frame_from_proteins,new_cds_info.m_start)) {
507             new_cds_info.m_start = TSignedSeqRange::GetEmpty();
508             new_cds_info.m_confirmed_start = false;
509         }
510         if (another_cds_info.m_confirmed_start && !Include(new_cds_info.m_reading_frame_from_proteins,another_cds_info.m_start)) {
511             new_cds_info.m_start = another_cds_info.m_start;
512             new_cds_info.m_confirmed_start = true;
513         }
514         if (new_cds_info.m_confirmed_start) {
515             if (Include(new_cds_info.m_reading_frame,new_cds_info.m_start)) {
516                 if (Precede(new_cds_info.m_start,new_cds_info.m_reading_frame_from_proteins))
517                     new_cds_info.m_reading_frame.SetFrom( new_cds_info.m_reading_frame_from_proteins.GetFrom() );
518                 else
519                     new_cds_info.m_reading_frame.SetTo( new_cds_info.m_reading_frame_from_proteins.GetTo() );
520             }
521         } else {
522             if (new_cds_info.HasStart() && Include(new_cds_info.m_reading_frame,new_cds_info.m_start)) {
523                 new_cds_info.m_start = TSignedSeqRange::GetEmpty();
524             }
525             if (another_cds_info.HasStart() && !Include(new_cds_info.m_reading_frame,another_cds_info.m_start)) {
526                 new_cds_info.m_start = another_cds_info.m_start;
527             }
528         }
529 
530         _ASSERT( !new_cds_info.HasStop() || !another_cds_info.HasStop() || new_cds_info.Stop() == another_cds_info.Stop() );
531         new_cds_info.m_stop += another_cds_info.m_stop;
532         if (another_cds_info.m_confirmed_stop)
533             new_cds_info.m_confirmed_stop = true;
534 
535         new_cds_info.SetScore( BadScore(), false );
536     }
537 
538     _ASSERT( new_cds_info.Invariant() );
539     *this = new_cds_info;
540 }
541 
Strand() const542 int CCDSInfo::Strand() const
543 {
544     int strand = 0; //unknown
545     if (HasStart())
546         strand = Precede(Start(), ReadingFrame()) ? 1 : -1;
547     else if (HasStop())
548         strand = Precede(ReadingFrame(), Stop()) ? 1 : -1;
549     else if (m_max_cds_limits.GetFrom() != TSignedSeqRange::GetWholeFrom())
550         strand = 1;
551     else if (m_max_cds_limits.GetTo() != TSignedSeqRange::GetWholeTo())
552         strand = -1;
553     return strand;
554 }
555 
Clip(TSignedSeqRange limits)556 void CCDSInfo::Clip(TSignedSeqRange limits)
557 {
558     if (ReadingFrame().Empty())
559         return;
560 
561     m_reading_frame &= limits;
562 
563     if (m_reading_frame.Empty()) {
564         Clear();
565         return;
566     }
567 
568     m_start  &= limits;
569     m_confirmed_start &= HasStart();
570     m_stop  &= limits;
571     m_confirmed_stop &= HasStop();
572     m_reading_frame_from_proteins &= limits;
573 
574     if (m_max_cds_limits.GetFrom() < limits.GetFrom())
575         m_max_cds_limits.SetFrom(TSignedSeqRange::GetWholeFrom());
576     if (limits.GetTo() < m_max_cds_limits.GetTo())
577         m_max_cds_limits.SetTo(TSignedSeqRange::GetWholeTo());
578 
579     for(TPStops::iterator s = m_p_stops.begin(); s != m_p_stops.end(); ) {
580         *s &= limits;
581         if (s->NotEmpty())
582             ++s;
583         else
584             s = m_p_stops.erase(s);
585     }
586 
587     SetScore( BadScore(), false );
588 
589     _ASSERT( Invariant() );
590 }
591 
Cut(TSignedSeqRange hole)592 void CCDSInfo::Cut(TSignedSeqRange hole) {
593     if((Cds() & hole).Empty())
594         return;
595 
596     if(Include(hole, Cds())) {
597         Clear();
598         return;
599     }
600 
601     if(hole.IntersectingWith(m_start)) {
602         _ASSERT(Include(hole, m_start));
603         m_start = TSignedSeqRange::GetEmpty();
604         m_confirmed_start = false;
605     }
606     if(hole.IntersectingWith(m_stop)) {
607         _ASSERT(Include(hole, m_stop));
608         m_stop = TSignedSeqRange::GetEmpty();
609         m_confirmed_stop = false;
610     }
611 
612     if(Include(hole, m_max_cds_limits.GetFrom()))
613         m_max_cds_limits.SetFrom(TSignedSeqRange::GetWholeFrom());
614     if(Include(hole, m_max_cds_limits.GetTo()))
615         m_max_cds_limits.SetTo(TSignedSeqRange::GetWholeTo());
616 
617     if(hole.IntersectingWith(m_reading_frame_from_proteins)) {
618         if(hole.GetFrom() <= m_reading_frame_from_proteins.GetFrom())
619             m_reading_frame_from_proteins.SetFrom(hole.GetTo()+1);
620         if(hole.GetTo() >= m_reading_frame_from_proteins.GetTo())
621             m_reading_frame_from_proteins.SetTo(hole.GetFrom()-1);
622     }
623 
624     if(hole.IntersectingWith(m_reading_frame)) {
625         if(hole.GetFrom() <= m_reading_frame.GetFrom())
626             m_reading_frame.SetFrom(hole.GetTo()+1);
627         if(hole.GetTo() >= m_reading_frame.GetTo())
628             m_reading_frame.SetTo(hole.GetFrom()-1);
629     }
630 
631     ERASE_ITERATE(TPStops, s, m_p_stops) {
632         if(hole.IntersectingWith(*s)) {
633             _ASSERT(Include(hole, *s));
634             VECTOR_ERASE(s, m_p_stops);
635         }
636     }
637 
638     SetScore( BadScore(), false );
639     _ASSERT( Invariant() );
640 }
641 
Clear()642 void CCDSInfo::Clear()
643 {
644     m_start = m_stop = m_reading_frame = m_reading_frame_from_proteins = m_max_cds_limits = TSignedSeqRange::GetEmpty();
645     m_confirmed_start = false;
646     m_confirmed_stop = false;
647     m_p_stops.clear();
648     SetScore( BadScore(), false );
649 }
650 
PStop(bool includeall) const651 bool CCDSInfo::PStop(bool includeall) const
652 {
653     if(includeall)
654         return !m_p_stops.empty();
655 
656     ITERATE(TPStops, stp, m_p_stops) {
657         if(stp->m_status != eGenomeNotCorrect && stp->m_status != eSelenocysteine)
658             return true;
659     }
660 
661     return false;
662 }
663 
RemoveShortHolesAndRescore(const CGnomonEngine & gnomon)664 void CGeneModel::RemoveShortHolesAndRescore(const CGnomonEngine& gnomon) {
665     TExons new_exons;
666     new_exons.push_back(Exons().front());
667     bool modified = false;
668     for(unsigned int i = 1; i < Exons().size(); ++i) {
669         bool hole = !Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice;
670         int intron = Exons()[i].GetFrom()-Exons()[i-1].GetTo()-1;
671         if (hole && intron < gnomon.GetMinIntronLen()) {
672             modified = true;
673             new_exons.back().m_ssplice = Exons()[i].m_ssplice;
674             new_exons.back().AddTo(Exons()[i].GetTo()-Exons()[i-1].GetTo());
675             if(intron%3 != 0) {
676                 int len = intron%3;
677                 int loc = Exons()[i-1].GetTo() + 1 + (intron-len)/2;
678                 FrameShifts().push_back(CInDelInfo(loc, len, CInDelInfo::eIns));
679             }
680         } else {
681             new_exons.push_back(Exons()[i]);
682         }
683     }
684 
685     if(modified) {
686         MyExons() = new_exons;
687         sort(FrameShifts().begin(),FrameShifts().end());
688         gnomon.GetScore(*this);
689     }
690 }
691 
CdsInvariant(bool check_start_stop) const692 bool CGeneModel::CdsInvariant(bool check_start_stop) const
693 {
694     if (ReadingFrame().Empty())
695         return true;
696 
697 
698 #ifdef _DEBUG
699     CAlignMap mrnamap(Exons(),FrameShifts(),Strand());
700     CCDSInfo cds_info = GetCdsInfo();
701     if(cds_info.IsMappedToGenome())
702         cds_info = cds_info.MapFromOrigToEdited(mrnamap);
703 
704     _ASSERT( !(ConfirmedStart() && OpenCds()) );
705 
706     _ASSERT(Include(cds_info.MaxCdsLimits(), cds_info.Cds()));
707     _ASSERT(cds_info.MaxCdsLimits().GetFrom() == TSignedSeqRange::GetWholeFrom() || cds_info.MaxCdsLimits().GetFrom() == cds_info.Cds().GetFrom());
708     _ASSERT(cds_info.MaxCdsLimits().GetTo() == TSignedSeqRange::GetWholeTo() || cds_info.MaxCdsLimits().GetTo() == cds_info.Cds().GetTo());
709 
710     if (check_start_stop && Score() != BadScore()) {
711         _ASSERT(cds_info.ReadingFrame().GetFrom() < 3 || (cds_info.HasStart() && cds_info.Start().GetTo()+1 == cds_info.ReadingFrame().GetFrom()));
712         _ASSERT(cds_info.ReadingFrame().GetTo() >= mrnamap.TargetLen()-3 || (cds_info.HasStop() && cds_info.Stop().GetFrom()-1 == cds_info.ReadingFrame().GetTo()));
713     }
714 
715     _ASSERT(!cds_info.HasStart() || cds_info.Start().GetLength()==3);
716     _ASSERT(!cds_info.HasStop() || cds_info.Stop().GetLength()==3);
717     _ASSERT(cds_info.Cds().GetLength()%3==0);
718 #endif
719 
720     return true;
721 }
722 
SetCdsInfo(const CCDSInfo & cds_info)723 void CGeneModel::SetCdsInfo(const CCDSInfo& cds_info)
724 {
725     m_cds_info = cds_info;
726     _ASSERT( CdsInvariant() );
727 }
728 
SetCdsInfo(const CGeneModel & a)729 void CGeneModel::SetCdsInfo(const CGeneModel& a)
730 {
731     m_cds_info = a.m_cds_info;
732     _ASSERT( CdsInvariant() );
733 }
734 
CombineCdsInfo(const CGeneModel & a,bool ensure_cds_invariant)735 void CGeneModel::CombineCdsInfo(const CGeneModel& a, bool ensure_cds_invariant)
736 {
737     CombineCdsInfo(a.m_cds_info, ensure_cds_invariant);
738 }
739 
CombineCdsInfo(const CCDSInfo & cds_info,bool ensure_cds_invariant)740 void CGeneModel::CombineCdsInfo(const CCDSInfo& cds_info, bool ensure_cds_invariant)
741 {
742     _ASSERT( cds_info.ReadingFrame().Empty() || Include(Limits(),cds_info.ReadingFrame()) );
743     //This assert is problem when when cds_info.ReadingFrame() touches a fs  ????
744     _ASSERT( cds_info.ReadingFrame().Empty() || FShiftedLen(cds_info.Start()+cds_info.ReadingFrame()+cds_info.Stop(), false)%3==0 );
745 
746     m_cds_info.CombineWith(cds_info);
747 
748     _ASSERT( !ensure_cds_invariant || CdsInvariant(false) );
749 }
750 
CutExons(TSignedSeqRange hole)751 void CGeneModel::CutExons(TSignedSeqRange hole)
752 {
753     if (ReadingFrame().NotEmpty()) {
754         TSignedSeqRange cds_hole = hole;
755         for(int i = 0; i < (int)MyExons().size(); ++i) {
756             if(i > 0 && hole.GetFrom() == MyExons()[i].GetFrom())
757                 cds_hole.SetFrom(MyExons()[i].GetTo()+1);
758             if(i < (int)MyExons().size()-1 && hole.GetTo() ==  MyExons()[i].GetTo())
759                 cds_hole.SetTo(MyExons()[i].GetFrom()-1);
760         }
761         m_cds_info.Cut(cds_hole);
762     }
763 
764     for(int i = 0; i < (int)MyExons().size(); ++i) {
765         TSignedSeqRange intersection = MyExons()[i].Limits() & hole;
766         if (intersection.Empty())
767             continue;
768         if (MyExons()[i].GetFrom()<hole.GetFrom()) {
769             MyExons()[i].Limits().SetTo(hole.GetFrom()-1);
770             MyExons()[i].m_ssplice = false;
771             MyExons()[i].m_ssplice_sig.clear();
772             if (i+1 < (int)MyExons().size()) {
773                 MyExons()[i+1].m_fsplice=false;
774             }
775         } else if (hole.GetTo()<MyExons()[i].GetTo()) {
776             MyExons()[i].Limits().SetFrom(hole.GetTo()+1);
777             MyExons()[i].m_fsplice = false;
778             MyExons()[i].m_fsplice_sig.clear();
779             if (0<i) {
780                 MyExons()[i-1].m_ssplice=false;
781             }
782         } else {
783             if (0<i) {
784                 MyExons()[i-1].m_ssplice=false;
785                 //                MyExons()[i-1].m_ssplice_sig.clear();
786            }
787             if (i+1 < (int)MyExons().size()) {
788                 MyExons()[i+1].m_fsplice=false;
789                 //                MyExons()[i+1].m_fsplice_sig.clear();
790             }
791             MyExons().erase( MyExons().begin()+i);
792             --i;
793         }
794     }
795     RecalculateLimits();
796     RemoveExtraFShifts(hole.GetTo()+1, hole.GetFrom()-1);
797 }
798 
Clip(TSignedSeqRange clip_limits,EClipMode mode,bool ensure_cds_invariant)799 void CGeneModel::Clip(TSignedSeqRange clip_limits, EClipMode mode, bool ensure_cds_invariant)
800 {
801     if (ReadingFrame().NotEmpty()) {
802         TSignedSeqRange cds_clip_limits = Limits();
803         if (mode == eRemoveExons)
804             cds_clip_limits &= clip_limits;
805         else {
806             TSignedSeqRange intersection;
807             intersection = Exons().front().Limits() & clip_limits;
808             if (intersection.NotEmpty())
809                 cds_clip_limits.SetFrom(intersection.GetFrom());
810             intersection = Exons().back().Limits() & clip_limits;
811             if (intersection.NotEmpty())
812                 cds_clip_limits.SetTo(intersection.GetTo());
813         }
814 
815         TSignedSeqRange precise_cds_clip_limits;
816         ITERATE(TExons, e, Exons())
817             precise_cds_clip_limits += e->Limits() & cds_clip_limits;
818         cds_clip_limits = precise_cds_clip_limits;
819 
820         CAlignMap mrnamap(Exons(),FrameShifts(),Strand(),GetCdsInfo().Cds());
821 
822         if (RealCdsLimits().GetFrom() < cds_clip_limits.GetFrom() && cds_clip_limits.GetFrom() < ReadingFrame().GetFrom())
823             cds_clip_limits.SetFrom(ReadingFrame().GetFrom());
824         else if (ReadingFrame().GetFrom() < cds_clip_limits.GetFrom() && cds_clip_limits.GetFrom() <= RealCdsLimits().GetTo()) {
825             TSignedSeqRange tmp = mrnamap.ShrinkToRealPoints(TSignedSeqRange(cds_clip_limits.GetFrom(), RealCdsLimits().GetTo()),true);
826             if(tmp.Empty() || tmp.GetFrom() >= ReadingFrame().GetTo())
827                 cds_clip_limits = TSignedSeqRange::GetEmpty();
828             else
829                 cds_clip_limits.SetFrom(tmp.GetFrom());
830         }
831 
832         if( cds_clip_limits.NotEmpty()) {
833             if (RealCdsLimits().GetFrom() <= cds_clip_limits.GetTo() && cds_clip_limits.GetTo() < ReadingFrame().GetTo()) {
834                 TSignedSeqRange tmp = mrnamap.ShrinkToRealPoints(TSignedSeqRange(RealCdsLimits().GetFrom(),cds_clip_limits.GetTo()),true);
835                 if(tmp.Empty() || ReadingFrame().GetFrom() >= tmp.GetTo())
836                     cds_clip_limits = TSignedSeqRange::GetEmpty();
837                 else
838                     cds_clip_limits.SetTo(tmp.GetTo());
839             } else if (ReadingFrame().GetTo() < cds_clip_limits.GetTo() && cds_clip_limits.GetTo() < RealCdsLimits().GetTo())
840                 cds_clip_limits.SetTo(ReadingFrame().GetTo());
841         }
842 
843         m_cds_info.Clip(cds_clip_limits);
844     }
845 
846     for (TExons::iterator e = MyExons().begin(); e != MyExons().end();) {
847         TSignedSeqRange clip = e->Limits() & clip_limits;
848         if (clip.NotEmpty()) {
849             if(e->GetFrom() <= clip_limits.GetFrom())
850                 e->m_fsplice = false;
851             if(e->GetFrom() < clip_limits.GetFrom())
852                 e->m_fsplice_sig.clear();
853             if(e->GetTo() >= clip_limits.GetTo())
854                 e->m_ssplice = false;
855             if(e->GetTo() > clip_limits.GetTo())
856                 e->m_ssplice_sig.clear();
857 
858             e++->Limits() = clip;
859         } else if (mode == eRemoveExons)
860             e = MyExons().erase(e);
861         else
862             ++e;
863     }
864 
865     if (Exons().size()>0) {
866         MyExons().front().m_fsplice = false;
867         //        MyExons().front().m_fsplice_sig.clear();
868         MyExons().back().m_ssplice = false;
869         //        MyExons().back().m_ssplice_sig.clear();
870     }
871 
872     RecalculateLimits();
873     RemoveExtraFShifts(clip_limits.GetFrom(), clip_limits.GetTo());
874 
875     _ASSERT( CdsInvariant(ensure_cds_invariant) );
876 }
877 
RemoveExtraFShifts(int left,int right)878 void CGeneModel::RemoveExtraFShifts(int left, int right)
879 {
880     TInDels::const_iterator indl = m_fshifts.begin();
881     TInDels indels;
882     for(CGeneModel::TExons::const_iterator e = Exons().begin(); e != Exons().end() && indl != m_fshifts.end(); ++e) {
883         if(e->Limits().Empty())
884             continue;
885 
886         for( ;indl != m_fshifts.end() && indl->InDelEnd() < (indl->IsDeletion() ? e->GetFrom() : e->GetFrom()+1); ++indl);  // skip indels on the left
887         for( ;indl != m_fshifts.end() && e->GetFrom() == left && indl->IsDeletion() && indl->Loc() == left; ++indl);        // skip flanking deletion if clipped
888 
889         if(indl != m_fshifts.end() && indl->Loc() < e->GetFrom()) {                                                         // clip long mismatch
890             _ASSERT(indl->IsMismatch());
891             int extra = e->GetFrom()-indl->Loc();
892             indels.push_back(CInDelInfo(e->GetFrom(),indl->Len()-extra,CInDelInfo::eMism,indl->GetInDelV().substr(extra)));
893             ++indl;
894         }
895 
896         for( ; indl != m_fshifts.end() && indl->Loc() <= e->GetTo(); ++indl)                                                // include all internal indels
897             indels.push_back(*indl);
898 
899         for( ; indl != m_fshifts.end() && e->GetTo() != right && indl->IsDeletion() && indl->Loc() == e->GetTo()+1; ++indl) // include flanking deletion if not cliped
900              indels.push_back(*indl);
901 
902         if(!indels.empty() && indels.back().InDelEnd() > e->GetTo()+1) {      // clip long mismatch
903             _ASSERT(indels.back().IsMismatch());
904             int extra = indels.back().InDelEnd()-e->GetTo()-1;
905             indels.back() = CInDelInfo(indels.back().Loc(),indels.back().Len()-extra,CInDelInfo::eMism,indels.back().GetInDelV().substr(0,indels.back().Len()-extra));
906         }
907     }
908 
909     m_fshifts = indels;
910 }
911 
912 
FrameShifts(TSignedSeqPos a,TSignedSeqPos b) const913 TInDels  CGeneModel::FrameShifts(TSignedSeqPos a, TSignedSeqPos b) const
914 {
915     TInDels fshifts;
916 
917     ITERATE(TInDels, i, m_fshifts) {
918         if(i->IntersectingWith(a,b))
919             fshifts.push_back( *i );
920     }
921 
922     return fshifts;
923 }
924 
925 
FShiftedLen(TSignedSeqRange ab,bool withextras) const926 int CGeneModel::FShiftedLen(TSignedSeqRange ab, bool withextras) const
927 {
928     if (ab.Empty())
929         return 0;
930 
931     _ASSERT(Include(Limits(),ab));
932 
933     return GetAlignMap().FShiftedLen(ab, withextras);
934 }
935 
FShiftedMove(TSignedSeqPos pos,int len) const936 TSignedSeqPos CGeneModel::FShiftedMove(TSignedSeqPos pos, int len) const
937 {
938     return GetAlignMap().FShiftedMove(pos, len);
939 }
940 
941 
MutualExtension(const CGeneModel & a) const942 int CGeneModel::MutualExtension(const CGeneModel& a) const
943 {
944     if(Strand() != a.Strand())
945         return 0;
946     const TSignedSeqRange limits = Limits();
947     const TSignedSeqRange alimits = a.Limits();
948 
949     if((Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) == 0 && (a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) == 0) {
950         const int intersection = (limits & alimits).GetLength();
951         if(intersection==0 || intersection==limits.GetLength() || intersection==alimits.GetLength())
952             return 0;
953         return isCompatible(a);
954     } else if((Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) == 0) {
955         if(a.Status()&CGeneModel::eRightFlexible)
956             return alimits.GetFrom() < limits.GetFrom() && alimits.GetTo() > limits.GetFrom();
957         else
958             return alimits.GetTo() > limits.GetTo() && alimits.GetFrom() < limits.GetTo();
959     } else if((a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) == 0) {
960         if(Status()&CGeneModel::eRightFlexible)
961             return limits.GetFrom() < alimits.GetFrom() && limits.GetTo() > alimits.GetFrom();
962         else
963             return limits.GetTo() > alimits.GetTo() && limits.GetFrom() < alimits.GetTo();
964     } else if((Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) != (a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))) {
965         return 0;
966     } else {
967         return limits.IntersectingWith(alimits) && limits != alimits;
968     }
969 }
970 
isCompatible(const CGeneModel & a) const971 int CGeneModel::isCompatible(const CGeneModel& a) const
972 {
973     const CGeneModel& b = *this;  // shortcut to this alignment
974 
975     _ASSERT( b.Strand() == a.Strand() || ((a.Status()&CGeneModel::eUnknownOrientation) != 0 && (b.Status()&CGeneModel::eUnknownOrientation) != 0));
976 
977     TSignedSeqRange intersect(a.Limits() & b.Limits());
978     if(intersect.GetLength() <= 1) return 0;     // intersection with 1 base is not legit
979 
980     int anum = a.Exons().size()-1;   // exon containing left point or first exon on the left
981     for(; intersect.GetFrom() < a.Exons()[anum].GetFrom() ; --anum);
982     bool aexon = intersect.GetFrom() <= a.Exons()[anum].GetTo();
983     if(!aexon && intersect.GetTo() < a.Exons()[anum+1].GetFrom()) return 0;    // b is in intron
984 
985     int bnum = b.Exons().size()-1;   // exon containing left point or first exon on the left
986     for(; intersect.GetFrom() < b.Exons()[bnum].GetFrom(); --bnum);
987     bool bexon = intersect.GetFrom() <= b.Exons()[bnum].GetTo();
988     if(!bexon && intersect.GetTo() < b.Exons()[bnum+1].GetFrom()) return 0;    // a is in intron
989 
990     TSignedSeqPos left = intersect.GetFrom();
991     TSignedSeqPos afirst = -1;
992     TSignedSeqPos bfirst = -1;
993     int commonspl = 0;
994     int firstcommonpoint = -1;
995 
996     auto_ptr<CAlignMap> pmapa(0), pmapb(0);
997 
998     while(left <= intersect.GetTo())
999     {
1000         TSignedSeqPos aright = aexon ? a.Exons()[anum].GetTo() : a.Exons()[anum+1].GetFrom()-1;
1001         TSignedSeqPos bright = bexon ? b.Exons()[bnum].GetTo() : b.Exons()[bnum+1].GetFrom()-1;
1002         TSignedSeqPos right = min(aright,bright);
1003 
1004         if(!aexon && bexon)
1005         {
1006             if(a.Exons()[anum].m_ssplice && a.Exons()[anum+1].m_fsplice) return 0;        // intron has both splices
1007             if(left == a.Exons()[anum].GetTo()+1 && a.Exons()[anum].m_ssplice) return 0;   // intron has left splice and == left
1008             if(right == aright && a.Exons()[anum+1].m_fsplice) return 0;          // intron has right splice and == right
1009         }
1010         if(aexon && !bexon)
1011         {
1012             if(b.Exons()[bnum].m_ssplice && b.Exons()[bnum+1].m_fsplice) return 0;
1013             if(left == b.Exons()[bnum].GetTo()+1 && b.Exons()[bnum].m_ssplice) return 0;
1014             if(right == bright && b.Exons()[bnum+1].m_fsplice) return 0;
1015         }
1016 
1017         if(aexon && bexon) {
1018             if(firstcommonpoint < 0) {
1019                 firstcommonpoint = left;
1020             } else if((anum > 0 && left == a.Exons()[anum].GetFrom() && !a.Exons()[anum].m_fsplice) ||   // end of a-hole
1021                       (bnum > 0 && left == b.Exons()[bnum].GetFrom() && !b.Exons()[bnum].m_fsplice))  {  // end of b-hole
1022 
1023                 if(afirst < 0) {
1024                     pmapa.reset(new CAlignMap(a.Exons(),a.FrameShifts(),a.Strand()));
1025                     pmapb.reset(new CAlignMap(b.Exons(),b.FrameShifts(),b.Strand()));
1026                     afirst = pmapa->MapOrigToEdited(firstcommonpoint);
1027                     bfirst = pmapb->MapOrigToEdited(firstcommonpoint);
1028                     if(afirst < 0 || bfirst < 0) return 0;
1029                 }
1030 
1031                 TSignedSeqPos asecond = pmapa->MapOrigToEdited(left);
1032                 TSignedSeqPos bsecond = pmapb->MapOrigToEdited(left);
1033                 if(asecond < 0 || bsecond < 0 || (asecond-afirst)%3 != (bsecond-bfirst)%3) return 0;
1034             }
1035         }
1036 
1037         if(aexon && aright == right && a.Exons()[anum].m_ssplice &&
1038            bexon && bright == right && b.Exons()[bnum].m_ssplice) ++commonspl;
1039         if(!aexon && aright == right && a.Exons()[anum+1].m_fsplice &&
1040            !bexon && bright == right && b.Exons()[bnum+1].m_fsplice) ++commonspl;
1041 
1042         left = right+1;
1043 
1044         if(right == aright)
1045         {
1046             aexon = !aexon;
1047             if(aexon) ++anum;
1048         }
1049 
1050         if(right == bright)
1051         {
1052             bexon = !bexon;
1053             if(bexon) ++bnum;
1054         }
1055     }
1056 
1057     return firstcommonpoint >= 0 ? commonspl+1 : 0;
1058 }
1059 
IsSubAlignOf(const CGeneModel & a) const1060 bool CGeneModel::IsSubAlignOf(const CGeneModel& a) const
1061 {
1062     if(!Include(a.Limits(),Limits()) || !isCompatible(a))
1063         return false;
1064 
1065     for(unsigned int i = 1; i < a.Exons().size(); ++i) {
1066         if (!a.Exons()[i-1].m_ssplice || !a.Exons()[i].m_fsplice){
1067             TSignedSeqRange hole(a.Exons()[i-1].GetTo()+1, a.Exons()[i].GetFrom()-1);
1068             ITERATE(CGeneModel::TExons, k, Exons()) {
1069                 if(k->Limits().IntersectingWith(hole)) {
1070                     return false;
1071                 }
1072             }
1073         }
1074     }
1075 
1076     return true;
1077 }
1078 
1079 
AddExon(TSignedSeqRange exon_range,const string & fs,const string & ss,double ident,const string & seq,const CInDelInfo::SSource & src)1080 void CGeneModel::AddExon(TSignedSeqRange exon_range, const string& fs, const string& ss, double ident, const string& seq, const CInDelInfo::SSource& src)
1081 {
1082     _ASSERT( (m_range & exon_range).Empty() );
1083     m_range += exon_range;
1084 
1085     CModelExon e(exon_range.GetFrom(),exon_range.GetTo(),false,false,fs,ss,ident,seq,src);
1086     if (MyExons().empty())
1087         MyExons().push_back(e);
1088     else if ((exon_range.Empty() || MyExons().back().Limits().Empty()) || MyExons().back().GetTo() < exon_range.GetFrom()) {
1089         if (!m_expecting_hole) {
1090             MyExons().back().m_ssplice = true;
1091             e.m_fsplice = true;
1092         }
1093         MyExons().push_back(e);
1094     } else {
1095         if (!m_expecting_hole) {
1096             MyExons().front().m_fsplice = true;
1097             e.m_ssplice = true;
1098         }
1099         MyExons().insert(MyExons().begin(),e);
1100     }
1101     m_expecting_hole = false;
1102 }
1103 
AddHole()1104 void CGeneModel::AddHole()
1105 {
1106     m_expecting_hole = true;
1107 }
1108 
AddGgapExon(double ident,const string & seq,const CInDelInfo::SSource & src,bool infront)1109 void CGeneModel::AddGgapExon(double ident, const string& seq, const CInDelInfo::SSource& src, bool infront)
1110 {
1111     _ASSERT(MyExons().empty() || !m_expecting_hole);
1112 
1113     TSignedSeqRange exon_range(TSignedSeqRange::GetEmpty());
1114     CModelExon e(exon_range.GetFrom(),exon_range.GetTo(),false,false,"","",ident,seq,src);
1115     if (MyExons().empty())
1116         MyExons().push_back(e);
1117     else if (!infront) {
1118         _ASSERT(MyExons().back().Limits().NotEmpty());
1119         MyExons().back().m_ssplice = true;
1120         e.m_fsplice = true;
1121         e.m_fsplice_sig = "XX";
1122         MyExons().push_back(e);
1123     } else {
1124         _ASSERT(MyExons().front().Limits().NotEmpty());
1125         MyExons().front().m_fsplice = true;
1126         e.m_ssplice = true;
1127         e.m_ssplice_sig = "XX";
1128         MyExons().insert(MyExons().begin(),e);
1129     }
1130     m_expecting_hole = false;
1131 }
1132 
AddNormalExon(TSignedSeqRange exon_range,const string & fs,const string & ss,double ident,bool infront)1133 void CGeneModel::AddNormalExon(TSignedSeqRange exon_range, const string& fs, const string& ss, double ident, bool infront)
1134 {
1135     _ASSERT( (m_range & exon_range).Empty() );
1136     m_range += exon_range;
1137 
1138     CModelExon e(exon_range.GetFrom(),exon_range.GetTo(),false,false,fs,ss,ident);
1139     if (MyExons().empty())
1140         MyExons().push_back(e);
1141     else if (!infront) {
1142         if (!m_expecting_hole) {
1143             MyExons().back().m_ssplice = true;
1144             if(MyExons().back().Limits().Empty())
1145                 MyExons().back().m_ssplice_sig = "XX";
1146             e.m_fsplice = true;
1147         }
1148         MyExons().push_back(e);
1149     } else {
1150         if (!m_expecting_hole) {
1151             MyExons().front().m_fsplice = true;
1152             if(MyExons().front().Limits().Empty())
1153                 MyExons().front().m_fsplice_sig = "XX";
1154             e.m_ssplice = true;
1155         }
1156         MyExons().insert(MyExons().begin(),e);
1157     }
1158     m_expecting_hole = false;
1159 }
1160 
Extend(const CModelExon & e)1161 void CModelExon::Extend(const CModelExon& e)
1162 {
1163     // spliced should include the other
1164     _ASSERT(
1165             (!m_fsplice || GetFrom() <= e.GetFrom()) &&
1166             (!e.m_fsplice || GetFrom() >= e.GetFrom()) &&
1167             (!m_ssplice || e.GetTo() <= GetTo()) &&
1168             (!e.m_ssplice || e.GetTo() >= GetTo())
1169             );
1170 
1171     Limits().CombineWith(e.Limits());
1172     m_fsplice =m_fsplice || e.m_fsplice;
1173     m_ssplice =m_ssplice || e.m_ssplice;
1174     if(e.m_fsplice && !e.m_fsplice_sig.empty())
1175         m_fsplice_sig = e.m_fsplice_sig;
1176     if(e.m_ssplice && !e.m_ssplice_sig.empty())
1177         m_ssplice_sig = e.m_ssplice_sig;
1178 }
1179 
TrimEdgesToFrameInOtherAlignGaps(const TExons & exons_with_gaps)1180 void CGeneModel::TrimEdgesToFrameInOtherAlignGaps(const TExons& exons_with_gaps)
1181 {
1182     if(Exons().empty())
1183         return;
1184 
1185     TSignedSeqPos left_edge = Limits().GetFrom();
1186     TSignedSeqPos right_edge = Limits().GetTo();
1187     CAlignMap mrnamap(Exons(), FrameShifts(), ePlus);   // 'positive' alignmap
1188 
1189     for (int i = 0; i<int(exons_with_gaps.size())-1; ++i) {
1190         if (exons_with_gaps[i].GetTo() < left_edge && left_edge < exons_with_gaps[i+1].GetFrom()) {
1191             _ASSERT( !exons_with_gaps[i].m_ssplice && !exons_with_gaps[i+1].m_fsplice );
1192             _ASSERT( right_edge >= exons_with_gaps[i+1].GetFrom() );
1193             TSignedSeqRange tlim = mrnamap.MapRangeOrigToEdited(TSignedSeqRange(left_edge,exons_with_gaps[i+1].GetFrom()), CAlignMap::eLeftEnd, CAlignMap::eSinglePoint);
1194             _ASSERT(tlim.NotEmpty());
1195             int del = (tlim.GetLength()-1)%3;
1196             if(del > 0) {
1197                 left_edge = -1;
1198                 for(int tpos = tlim.GetFrom()+del; left_edge < 0 && tpos <= tlim.GetTo(); tpos += 3) {
1199                     left_edge = mrnamap.MapEditedToOrig(tpos);
1200                 }
1201                 _ASSERT(left_edge > 0);
1202                 CutExons(TSignedSeqRange(Limits().GetFrom(),left_edge-1));
1203             }
1204         }
1205 
1206         if (exons_with_gaps[i].GetTo() < right_edge && right_edge < exons_with_gaps[i+1].GetFrom()) {
1207             _ASSERT( !exons_with_gaps[i].m_ssplice && !exons_with_gaps[i+1].m_fsplice );
1208             _ASSERT( left_edge <= exons_with_gaps[i].GetTo() );
1209             TSignedSeqRange tlim = mrnamap.MapRangeOrigToEdited(TSignedSeqRange(exons_with_gaps[i].GetTo(),right_edge), CAlignMap::eSinglePoint, CAlignMap::eRightEnd);
1210             _ASSERT(tlim.NotEmpty());
1211             int del = (tlim.GetLength()-1)%3;
1212             if(del > 0) {
1213                 right_edge = -1;
1214                 for(int tpos = tlim.GetTo()-del; right_edge < 0 && tpos >= tlim.GetFrom(); tpos -= 3) {
1215                     right_edge = mrnamap.MapEditedToOrig(tpos);
1216                 }
1217                 _ASSERT(right_edge > 0);
1218                 CutExons(TSignedSeqRange(right_edge+1, Limits().GetTo()));
1219             }
1220         }
1221     }
1222 }
1223 
Extend(const CGeneModel & align,bool ensure_cds_invariant)1224 void CGeneModel::Extend(const CGeneModel& align, bool ensure_cds_invariant)
1225 {
1226     _ASSERT( align.Strand() == Strand() );
1227 
1228     CGeneModel other_align = align;
1229 
1230     if ( !other_align.Continuous() )
1231         this->TrimEdgesToFrameInOtherAlignGaps(other_align.Exons());
1232     if ( !this->Continuous() )
1233         other_align.TrimEdgesToFrameInOtherAlignGaps(this->Exons());
1234 
1235     TExons a = MyExons();
1236     TExons b = other_align.Exons();
1237 
1238     MyExons().clear();
1239 
1240     size_t i,j;
1241     for ( i=0,j=0; i<a.size() || j<b.size(); ) {
1242         if (i==a.size())
1243             MyExons().push_back(b[j++]);
1244         else if (j==b.size())
1245             MyExons().push_back(a[i++]);
1246         else if (a[i].GetTo()+1<b[j].GetFrom())
1247             MyExons().push_back(a[i++]);
1248         else if (b[j].GetTo()+1<a[i].GetFrom())
1249             MyExons().push_back(b[j++]);
1250         else {
1251             b[j].Extend(a[i++]);
1252             while (j+1<b.size() && b[j].GetTo()+1>=b[j+1].GetFrom()) {
1253                 b[j+1].Extend(b[j]);
1254                 ++j;
1255             }
1256         }
1257         if(!MyExons().empty())
1258             MyExons().back().m_ident = 0;
1259     }
1260 
1261     RecalculateLimits();
1262 
1263     m_fshifts.insert(m_fshifts.end(),align.m_fshifts.begin(),align.m_fshifts.end());
1264     if(!m_fshifts.empty()) {
1265         sort(m_fshifts.begin(),m_fshifts.end());
1266         m_fshifts.erase( unique(m_fshifts.begin(),m_fshifts.end()), m_fshifts.end() );
1267     }
1268 #ifdef _DEBUG
1269     ITERATE(TInDels, indl, m_fshifts) {
1270         if(indl->IsDeletion())
1271             continue;
1272         TInDels::const_iterator next = indl;
1273         if(++next !=  m_fshifts.end())
1274             _ASSERT(indl->InDelEnd() <= next->Loc());
1275     }
1276 #endif
1277 
1278     SetType(Type() | ((CGeneModel::eProt|CGeneModel::eSR|CGeneModel::eEST|CGeneModel::emRNA|CGeneModel::eNotForChaining)&align.Type()));
1279     if(align.ReadingFrame().NotEmpty())
1280         CombineCdsInfo(align, ensure_cds_invariant);
1281 }
1282 
RecalculateLimits()1283 void CGeneModel::RecalculateLimits()
1284 {
1285     if (Exons().empty()) {
1286         m_range = TSignedSeqRange::GetEmpty();
1287     } else {
1288         int num = Exons().size();
1289         if(Exons()[0].Limits().NotEmpty()) {
1290             m_range.SetFrom(Exons()[0].GetFrom());
1291         } else {
1292             _ASSERT(num > 1 && Exons()[1].Limits().NotEmpty());
1293             m_range.SetFrom(Exons()[1].GetFrom());
1294         }
1295         if(Exons()[num-1].Limits().NotEmpty()) {
1296             m_range.SetTo(Exons()[num-1].GetTo());
1297         } else {
1298            _ASSERT(num > 1 && Exons()[num-2].Limits().NotEmpty());
1299             m_range.SetTo(Exons()[num-2].GetTo());
1300         }
1301     }
1302 }
1303 
1304 
ExtendLeft(int amount)1305 void CGeneModel::ExtendLeft(int amount)
1306 {
1307     _ASSERT(amount>=0);
1308     MyExons().front().AddFrom(-amount);
1309     RecalculateLimits();
1310 }
1311 
ExtendRight(int amount)1312 void CGeneModel::ExtendRight(int amount)
1313 {
1314     _ASSERT(amount>=0);
1315     MyExons().back().AddTo(amount);
1316     RecalculateLimits();
1317 }
1318 
AlignLen() const1319 int CGeneModel::AlignLen() const
1320 {
1321     return FShiftedLen(Limits(), false);
1322 }
1323 
RealCdsLimits() const1324 TSignedSeqRange CGeneModel::RealCdsLimits() const
1325 {
1326     TSignedSeqRange cds = MaxCdsLimits();
1327     if (HasStart()) {
1328         if (Strand()==ePlus)
1329             cds.SetFrom(GetCdsInfo().Start().GetFrom());
1330         else
1331             cds.SetTo(GetCdsInfo().Start().GetTo());
1332     }
1333     return cds;
1334 }
1335 
RealCdsLen() const1336 int CGeneModel::RealCdsLen() const
1337 {
1338     return FShiftedLen(RealCdsLimits(), false);
1339 }
1340 
MaxCdsLimits() const1341 TSignedSeqRange CGeneModel::MaxCdsLimits() const
1342 {
1343     if (ReadingFrame().Empty())
1344         return TSignedSeqRange::GetEmpty();
1345 
1346     return GetCdsInfo().MaxCdsLimits() & Limits();
1347 }
1348 
1349 template< class T>
1350 class CStreamState {
1351 #ifdef HAVE_IOS_REGISTER_CALLBACK
1352 public:
CStreamState(const T & deflt)1353     CStreamState(const T& deflt) : m_deflt(deflt), m_index( ios_base::xalloc() ) {}
1354 
slot(ios_base & iob)1355     T& slot(ios_base& iob)
1356     {
1357         void *&p = iob.pword(m_index);
1358         T *c = static_cast<T*>(p);
1359         if (c == NULL) {
1360             p = c = new T(m_deflt);
1361             iob.register_callback(ios_callback, m_index);
1362         }
1363         return *c;
1364     }
1365 
1366 private:
ios_callback(ios_base::event e,ios_base & iob,int index)1367     static void ios_callback(ios_base::event e, ios_base& iob, int index)
1368     {
1369         if (e == ios_base::erase_event) {
1370             delete static_cast<T*>(iob.pword(index));
1371         } else if (e == ios_base::copyfmt_event) {
1372             void *&p = iob.pword(index);
1373             try {
1374                 T *old = static_cast<T*>(p);
1375                 p = new T(*old);
1376             } catch (...) {
1377                 p = NULL;
1378             }
1379         }
1380     }
1381 
1382     T   m_deflt;
1383     int m_index;
1384 #else
1385     // Crude approximation for use by older compilers (notably GCC 2.95).
1386     // In particular, this implementation has no way to find out when
1387     // anything happens to the streams it tracks, so it leaks some
1388     // amount of memory and disregards the possibility that multiple
1389     // distinct tracked streams have the same address in sequence. :-/
1390 public:
1391     CStreamState(const T& deflt) : m_deflt(deflt) {}
1392 
1393     T& slot(IOS_BASE& iob)
1394     {
1395         TMap::iterator it = m_map.find(&iob);
1396         if (it == m_map.end()) {
1397             it = m_map.insert(make_pair(&iob, m_deflt)).first;
1398         }
1399         return it->second;
1400     }
1401 
1402 private:
1403     typedef map<IOS_BASE*, T> TMap;
1404 
1405     T    m_deflt;
1406     TMap m_map;
1407 #endif
1408 };
1409 
PolyALen() const1410 int CAlignModel::PolyALen() const {
1411     if((Status()&ePolyA) == 0)
1412         return 0;
1413 
1414     TSignedSeqRange lim = GetAlignMap().MapRangeOrigToEdited(GetAlignMap().ShrinkToRealPoints(Limits()),false);
1415     if((Status()&eReversed) == 0) {
1416         return TargetLen()-lim.GetTo()-1;
1417     } else {
1418         return lim.GetFrom();
1419     }
1420 }
1421 
1422 CStreamState<pair<string,string> > line_buffer(make_pair(kEmptyStr,kEmptyStr));
1423 
Getline(CNcbiIstream & is,string & line)1424 CNcbiIstream& Getline(CNcbiIstream& is, string& line)
1425 {
1426     if (!line_buffer.slot(is).first.empty()) {
1427         line = line_buffer.slot(is).first;
1428         line_buffer.slot(is).first.erase();
1429     } else {
1430         NcbiGetlineEOL(is, line);
1431     }
1432     line_buffer.slot(is).second = line;
1433     return is;
1434 }
1435 
Ungetline(CNcbiIstream & is)1436 void Ungetline(CNcbiIstream& is)
1437 {
1438     line_buffer.slot(is).first = line_buffer.slot(is).second;
1439     is.clear();
1440 }
1441 
InputError(CNcbiIstream & is)1442 CNcbiIstream& InputError(CNcbiIstream& is)
1443 {
1444     is.clear();
1445     ERR_POST( Error << "Input error. Last line: " <<  line_buffer.slot(is).second );
1446     Ungetline(is);
1447     is.setstate(ios::failbit);
1448     return is;
1449 }
1450 
1451 CStreamState<string> contig_stream_state(kEmptyStr);
1452 
operator <<(CNcbiOstream & s,const setcontig & c)1453 CNcbiOstream& operator<<(CNcbiOstream& s, const setcontig& c)
1454 {
1455     contig_stream_state.slot(s) = c.m_contig; return s;
1456 }
operator >>(CNcbiIstream & is,const getcontig & c)1457 CNcbiIstream& operator>>(CNcbiIstream& is, const getcontig& c)
1458 {
1459     c.m_contig = contig_stream_state.slot(is); return is;
1460 }
1461 
1462 enum EModelFileFormat { eGnomonFileFormat, eGFF3FileFormat, eASNFileFormat };
1463 NCBI_XALGOGNOMON_EXPORT CNcbiOstream& operator<<(CNcbiOstream& s, EModelFileFormat f);
1464 
1465 CStreamState<EModelFileFormat> model_file_format_state(eGFF3FileFormat);
1466 
operator <<(CNcbiOstream & s,EModelFileFormat f)1467 CNcbiOstream& operator<<(CNcbiOstream& s, EModelFileFormat f)
1468 {
1469     model_file_format_state.slot(s) = f; return s;
1470 }
1471 
1472 struct SGFFrec {
SGFFrecSGFFrec1473     SGFFrec() : start(-1), end(-1), score(BadScore()), strand('.'), phase(-1), model(0), tstart(-1), tend(-2), tstrand('+') {}
1474     string seqid;
1475     string source;
1476     string type;
1477     int start;
1478     int end;
1479     double score;
1480     char strand;
1481     int phase;
1482     Int8 model;
1483     int tstart;
1484     int tend;
1485     char tstrand;
1486     map<string,string> attributes;
1487 
1488     void print(CNcbiOstream& os) const;
1489 };
1490 
operator <<(CNcbiOstream & os,const SGFFrec & r)1491 CNcbiOstream& operator<<(CNcbiOstream& os, const SGFFrec& r)
1492 {
1493     r.print(os); return os;
1494 }
1495 
1496 namespace {
1497 const char* dot = ".";
1498 }
1499 
print(CNcbiOstream & os) const1500 void SGFFrec::print(CNcbiOstream& os) const
1501 {
1502 
1503     os << (seqid.empty()?dot:seqid) << '\t';
1504     os << (source.empty()?dot:source) << '\t';
1505     os << (type.empty()?dot:type) << '\t';
1506     if(start >= 0)
1507         os << (start+1) << '\t';
1508     else
1509         os << "-\t";
1510     if(end >= 0)
1511         os << (end+1) << '\t';
1512     else
1513         os << "-\t";
1514     if (score == BadScore()) os << dot; else os << score; os << '\t';
1515     os << strand << '\t';
1516     if (phase < 0) os << dot; else os << phase; os << '\t';
1517 
1518     os << "model=" << model;
1519     for (map<string,string>::const_iterator i = attributes.begin(); i != attributes.end(); ++i ) {
1520         if (!i->second.empty())
1521             os << ';' << i->first << '=' << i->second;
1522     }
1523 
1524     os << '\n';
1525 }
1526 
operator >>(CNcbiIstream & is,SGFFrec & res)1527 CNcbiIstream& operator>>(CNcbiIstream& is, SGFFrec& res)
1528 {
1529     string line;
1530     do {
1531         Getline(is, line);
1532     } while (is && (line.empty() || NStr::StartsWith(line, "#")));
1533     if (!is) {
1534         return is;
1535     }
1536 
1537     vector<string> v;
1538     NStr::Split(line, "\t", v);
1539     if (v.size()!=9)
1540         return InputError(is);
1541     SGFFrec rec;
1542     try {
1543         rec.seqid = v[0];
1544         rec.source = v[1];
1545         rec.type = v[2];
1546         rec.start = -1;
1547         if(v[3] != "-")
1548             rec.start = NStr::StringToNumeric<int>(v[3])-1;
1549         rec.end = -1;
1550         if(v[4] != "-")
1551             rec.end = NStr::StringToNumeric<int>(v[4])-1;
1552         rec.score = v[5]==dot?BadScore():NStr::StringToNumeric<double>(v[5]);
1553         rec.strand = v[6][0];
1554         rec.phase = v[7]==dot?-1:NStr::StringToNumeric<int>(v[7]);
1555     } catch (...) {
1556         return InputError(is);
1557     }
1558     bool model_id_present = false;
1559     vector<string> attributes;
1560     NStr::Split(v[8], ";", attributes, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1561     for (size_t i = 0; i < attributes.size(); ++i) {
1562         string key, value;
1563         if (NStr::SplitInTwo(attributes[i], "=", key, value) && !value.empty()) {
1564             if (key == "model") {
1565                 rec.model = NStr::StringToNumeric<Int8>(value);
1566                 model_id_present = true;
1567             } else if(key == "Target") {
1568                 vector<string> tt;
1569                 NStr::Split(value, " \t", tt, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1570                 rec.tstart = NStr::StringToNumeric<int>(tt[1])-1;
1571                 rec.tend = NStr::StringToNumeric<int>(tt[2])-1;
1572                 if(tt.size() > 3 && tt[3] == "-")
1573                     rec.tstrand = '-';
1574                 rec.attributes[key] = tt[0];
1575             } else
1576                 rec.attributes[key] = value;
1577         }
1578     }
1579     if (!model_id_present)
1580         return InputError(is);
1581     res = rec;
1582     return is;
1583 }
1584 
BuildGFF3Gap(int & prev_pos,const CInDelInfo & indel)1585 string BuildGFF3Gap(int& prev_pos, const CInDelInfo& indel) {
1586     string gap;
1587     string status;
1588     if(indel.GetStatus() == CInDelInfo::eGenomeCorrect)
1589         status = "c";
1590     else if(indel.GetStatus() == CInDelInfo::eGenomeNotCorrect)
1591         status = "n";
1592     if (prev_pos < indel.Loc())
1593         gap += " M"+NStr::NumericToString(indel.Loc()-prev_pos);
1594     if(indel.IsInsertion()) {
1595         gap += string(" ")+status+"D"+NStr::NumericToString(indel.Len());
1596     } else if(indel.IsDeletion()) {
1597         gap += string(" ")+status+"I"+indel.GetInDelV();
1598     } else {
1599         gap += string(" ")+status+"R"+indel.GetInDelV();
1600     }
1601 
1602     prev_pos = indel.InDelEnd();
1603 
1604     return gap;
1605 }
1606 
BuildGFF3Gap(int start,int end,const TInDels & indels)1607 string BuildGFF3Gap(int start, int end, const TInDels& indels)
1608 {
1609     string gap;
1610 
1611     int prev_pos = start;
1612     ITERATE (TInDels, indelp, indels) {
1613         const CInDelInfo& indel = *indelp;
1614         if(indel.Loc() < start)
1615             continue;
1616         if(indel.InDelEnd() > end+1)
1617             break;
1618 
1619         _ASSERT( indel.IsDeletion() || (start <=indel.Loc() && indel.Loc()+indel.Len()-1 <= end));
1620         gap += BuildGFF3Gap(prev_pos, indel);
1621     }
1622     if (!gap.empty()) {
1623         gap.erase(0,1);
1624         if (prev_pos < end+1)
1625             gap += " M"+NStr::NumericToString(end+1-prev_pos);
1626     }
1627 
1628     return gap;
1629 }
1630 
readGFF3Gap(const string & gap,int start,int end,TInDels & indels)1631 void readGFF3Gap(const string& gap, int start, int end, TInDels& indels)
1632 {
1633     if (gap.empty())
1634         return;
1635     vector<string> operations;
1636     NStr::Split(gap, " ", operations, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1637     TSignedSeqPos loc = start;
1638     NON_CONST_ITERATE(vector<string>, o, operations) {
1639         CInDelInfo::EStatus status = CInDelInfo::eUnknown;
1640         if((*o)[0] == 'c') {
1641             status = CInDelInfo::eGenomeCorrect;
1642             o->erase(o->begin());
1643         } else if((*o)[0] == 'n') {
1644             status = CInDelInfo::eGenomeNotCorrect;
1645             o->erase(o->begin());
1646         }
1647         int len = NStr::StringToNumeric<int>(*o,NStr::fConvErr_NoThrow|NStr::fAllowLeadingSymbols);
1648         if ((*o)[0] == 'M') {
1649             loc += len;
1650         } else if ((*o)[0] == 'D') {
1651             indels.push_back(CInDelInfo(loc,len,CInDelInfo::eIns));
1652             indels.back().SetStatus(status);
1653             loc += len;
1654         } else if ((*o)[0] == 'I') {
1655             string seq;
1656             len = o->length()-1;
1657             seq = o->substr(1);
1658             indels.push_back(CInDelInfo(loc,len,CInDelInfo::eDel,seq));
1659             indels.back().SetStatus(status);
1660         } else {
1661             _ASSERT((*o)[0] == 'R');
1662             string seq;
1663             len = o->length()-1;
1664             seq = o->substr(1);
1665             indels.push_back(CInDelInfo(loc,len,CInDelInfo::eMism,seq));
1666             indels.back().SetStatus(status);
1667             loc += len;
1668         }
1669     }
1670     _ASSERT( loc == end+1 );
1671 }
1672 
TypeToString(int type)1673 string CGeneModel::TypeToString(int type)
1674 {
1675     if ((type & eGnomon)!=0) return "Gnomon";
1676     if ((type & eChain)!=0) return  "Chainer";
1677     if ((type & eProt)!=0) return  "ProSplign";
1678     if ((type & (eSR|eEST|emRNA|eNotForChaining))!=0) return  "Splign";
1679     return "Unknown";
1680 }
1681 
CollectAttributes(const CAlignModel & a,map<string,string> & attributes)1682 void CollectAttributes(const CAlignModel& a, map<string,string>& attributes)
1683 {
1684     attributes["ID"] = NStr::NumericToString(a.ID());
1685     if (a.GeneID()!=0)
1686         attributes["Parent"] = "gene"+NStr::NumericToString(a.GeneID());
1687     if (a.RankInGene()!=0)
1688         attributes["rankInGene"] = NStr::NumericToString(a.RankInGene());
1689 
1690     ITERATE(CSupportInfoSet, i, a.Support()) {
1691         attributes["support"] += ",";
1692         if(i->IsCore())
1693             attributes["support"] += "*";
1694 
1695         attributes["support"] += NStr::NumericToString(i->GetId());
1696     }
1697     attributes["support"].erase(0,1);
1698 
1699     list<string> tp;
1700     ITERATE(list< CRef<CSeq_id> >, i, a.TrustedProt()) {
1701         tp.push_back(CIdHandler::ToString(**i));
1702     }
1703     tp.sort();
1704     attributes["TrustedProt"] = NStr::Join(tp,",");
1705 
1706     list<string> tm;
1707     ITERATE(list< CRef<CSeq_id> >, i, a.TrustedmRNA()) {
1708         tm.push_back(CIdHandler::ToString(**i));
1709     }
1710     tm.sort();
1711     attributes["TrustedmRNA"] = NStr::Join(tm,",");
1712 
1713     if(a.TargetLen() > 0)
1714         attributes["TargetLen"] = NStr::NumericToString(a.TargetLen());
1715 
1716     if(a.Ident() > 0)
1717         attributes["Ident"] = NStr::DoubleToString(a.Ident());
1718 
1719     if(a.Weight() > 1)
1720         attributes["Weight"] = NStr::DoubleToString(a.Weight());
1721 
1722     if (!a.ProteinHit().empty())
1723         attributes["protein_hit"] = a.ProteinHit();
1724 
1725     if ((a.Type()  &CGeneModel::eWall)!=0)       attributes["flags"] += ",Wall";
1726     if ((a.Type()  &CGeneModel::eNested)!=0)     attributes["flags"] += ",Nested";
1727     if ((a.Type()  &CGeneModel::eNotForChaining)!=0)         attributes["flags"] += ",NotForChaining";
1728     if ((a.Type()  &CGeneModel::eSR)!=0)         attributes["flags"] += ",SR";
1729     if ((a.Type()  &CGeneModel::eEST)!=0)        attributes["flags"] += ",EST";
1730     if ((a.Type()  &CGeneModel::emRNA)!=0)       attributes["flags"] += ",mRNA";
1731     if ((a.Type()  &CGeneModel::eProt)!=0)       attributes["flags"] += ",Prot";
1732 
1733     if ((a.Status()&CGeneModel::eCap)!=0)      attributes["flags"] += ",Cap";
1734     if ((a.Status()&CGeneModel::ePolyA)!=0)      attributes["flags"] += ",PolyA";
1735     if ((a.Status()&CGeneModel::eSkipped)!=0)    attributes["flags"] += ",Skip";
1736     if ((a.Status()&CGeneModel::eBestPlacement)!=0)    attributes["flags"] += ",BestPlacement";
1737     if ((a.Status()&CGeneModel::eConsistentCoverage)!=0)    attributes["flags"] += ",ConsistentCoverage";
1738     if ((a.Status()&CGeneModel::eUnknownOrientation)!=0)    attributes["flags"] += ",UnknownOrientation";
1739     if ((a.Status()&CGeneModel::eGapFiller)!=0)    attributes["flags"] += ",GapFiller";
1740     if ((a.Status()&CGeneModel::ecDNAIntrons)!=0)    attributes["flags"] += ",cDNAIntrons";
1741     if ((a.Status()&CGeneModel::eChangedByFilter)!=0)    attributes["flags"] += ",ChangedByFilter";
1742     if ((a.Status()&CGeneModel::eTSA)!=0)    attributes["flags"] += ",TSA";
1743     if ((a.Status()&CGeneModel::eLeftConfirmed)!=0)    attributes["flags"] += ",LeftConfirmed";
1744     if ((a.Status()&CGeneModel::eRightConfirmed)!=0)    attributes["flags"] += ",RightConfirmed";
1745 
1746     if (a.GetCdsInfo().ReadingFrame().NotEmpty()) {
1747 
1748         CCDSInfo cds_info = a.GetCdsInfo();
1749         if(cds_info.IsMappedToGenome()) {    // convert local copy to tcoords
1750             cds_info = cds_info.MapFromOrigToEdited(a.GetAlignMap());
1751         }
1752 
1753         TSignedSeqRange tlim = a.TranscriptLimits();
1754         TSignedSeqRange cds = cds_info.Cds();  // cdsinfo already in tcoords
1755         TSignedSeqRange start = cds_info.Start();
1756         TSignedSeqRange maxcdslim = tlim & cds_info.MaxCdsLimits();
1757         TSignedSeqRange protcds = cds_info.ProtReadingFrame();
1758 
1759         TSignedSeqRange rf = cds;
1760         if(cds_info.HasStart())
1761             rf.SetFrom(rf.GetFrom()+3);
1762         if(cds_info.HasStop())
1763             rf.SetTo(rf.GetTo()-3);
1764 
1765         bool tCoords = false;
1766 
1767         if(start.NotEmpty() && (start.GetFrom() != maxcdslim.GetFrom() && start.GetTo() != maxcdslim.GetTo())) {
1768             attributes["maxCDS"] = NStr::NumericToString(maxcdslim.GetFrom()+1)+" "+NStr::NumericToString(maxcdslim.GetTo()+1);
1769             tCoords = true;
1770         }
1771         if(protcds.NotEmpty() && protcds != rf) {
1772             attributes["protCDS"] = NStr::NumericToString(protcds.GetFrom()+1)+" "+NStr::NumericToString(protcds.GetTo()  +1);
1773             tCoords = true;
1774         }
1775         if(cds_info.HasStart()) {
1776             _ASSERT( !(cds_info.ConfirmedStart() && cds_info.OpenCds()) );
1777             string adj;
1778             if (cds_info.ConfirmedStart())
1779                 adj = "Confirmed";
1780             else if (cds_info.OpenCds())
1781                 adj = "Putative";
1782             attributes["flags"] += ","+adj+"Start";
1783         }
1784         if (cds_info.HasStop()) {
1785             string adj;
1786             if (cds_info.ConfirmedStop())
1787                 adj = "Confirmed";
1788             attributes["flags"] += ","+adj+"Stop";
1789         }
1790 
1791         if ((a.Status()&CGeneModel::eFullSupCDS)!=0) attributes["flags"] += ",FullSupCDS";
1792         if ((a.Status()&CGeneModel::ePseudo)!=0)     attributes["flags"] += ",Pseudo";
1793         TInDels fs = a.GetInDels(true);
1794         ITERATE(TInDels, indl, fs) {
1795             if(indl->GetStatus() != CInDelInfo::eGenomeNotCorrect) {
1796                 attributes["flags"] += ",Frameshifts";
1797                 break;
1798             }
1799         }
1800         if(a.isNMD(50))                              attributes["flags"] += ",NMD";
1801 
1802         ITERATE(CCDSInfo::TPStops, s, cds_info.PStops()) {
1803             TSignedSeqRange pstop = *s;
1804             attributes["pstop"] += ","+NStr::NumericToString(pstop.GetFrom()+1)+" "+NStr::NumericToString(pstop.GetTo()+1);
1805             if(s->m_status == CCDSInfo::eGenomeCorrect)
1806                 attributes["pstop"] += " C";
1807             else if(s->m_status == CCDSInfo::eGenomeNotCorrect)
1808                 attributes["pstop"] += " N";
1809             else if(s->m_status == CCDSInfo::eSelenocysteine)
1810                 attributes["pstop"] += " S";
1811 
1812             tCoords = true;
1813         }
1814         attributes["pstop"].erase(0,1);
1815 
1816         if(tCoords)
1817            attributes["flags"] += ",tCoords";
1818     }
1819 
1820     attributes["flags"].erase(0,1);
1821 
1822     attributes["note"] = a.GetComment();
1823 }
1824 
StringToRange(const string & s)1825 TSignedSeqRange StringToRange(const string& s)
1826 {
1827     string start, stop;
1828     NStr::SplitInTwo(s, " ", start, stop);
1829     return TSignedSeqRange(NStr::StringToNumeric<TSignedSeqPos>(start)-1,NStr::StringToNumeric<TSignedSeqPos>(stop)-1);
1830 }
1831 
ParseAttributes(map<string,string> & attributes,CAlignModel & a)1832 void ParseAttributes(map<string,string>& attributes, CAlignModel& a)
1833 {
1834     bool open_cds = false;
1835     bool confirmed_start = false;
1836     bool confirmed_stop = false;
1837     bool has_start = false;
1838     bool has_stop = false;
1839     bool tcoords = false;
1840 
1841     vector<string> flags;
1842     NStr::Split(attributes["flags"], ",", flags, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1843     ITERATE(vector<string>, f, flags) {
1844         if (*f == "Wall")       a.SetType(a.Type()|  CGeneModel::eWall);
1845         else if (*f == "Nested")     a.SetType(a.Type()|  CGeneModel::eNested);
1846         else if (*f == "NotForChaining")         a.SetType(a.Type()|  CGeneModel::eNotForChaining);
1847         else if (*f == "SR")         a.SetType(a.Type()|  CGeneModel::eSR);
1848         else if (*f == "EST")        a.SetType(a.Type()|  CGeneModel::eEST);
1849         else if (*f == "mRNA")       a.SetType(a.Type()|  CGeneModel::emRNA);
1850         else if (*f == "Prot")       a.SetType(a.Type()|  CGeneModel::eProt);
1851         else if (*f == "Skip")       a.Status()        |= CGeneModel::eSkipped;
1852         else if (*f == "FullSupCDS") a.Status()        |= CGeneModel::eFullSupCDS;
1853         else if (*f == "Pseudo")     a.Status()        |= CGeneModel::ePseudo;
1854         else if (*f == "PolyA")      a.Status()        |= CGeneModel::ePolyA;
1855         else if (*f == "Cap")        a.Status()        |= CGeneModel::eCap;
1856         else if (*f == "BestPlacement") a.Status()     |= CGeneModel::eBestPlacement;
1857         else if (*f == "ConsistentCoverage") a.Status()     |= CGeneModel::eConsistentCoverage;
1858         else if (*f == "UnknownOrientation") a.Status()|= CGeneModel::eUnknownOrientation;
1859         else if (*f == "GapFiller") a.Status()|= CGeneModel::eGapFiller;
1860         else if (*f == "cDNAIntrons") a.Status()       |= CGeneModel::ecDNAIntrons;
1861         else if (*f == "TSA") a.Status()       |= CGeneModel::eTSA;
1862         else if (*f == "LeftConfirmed") a.Status()       |= CGeneModel::eLeftConfirmed;
1863         else if (*f == "RightConfirmed") a.Status()       |= CGeneModel::eRightConfirmed;
1864         else if (*f == "ChangedByFilter") a.Status()       |= CGeneModel::eChangedByFilter;
1865         else if (*f == "ConfirmedStart")   { confirmed_start = true; has_start = true; }
1866         else if (*f == "PutativeStart")   { open_cds = true; has_start = true; }
1867         else if (*f == "Start") has_start = true;
1868         else if (*f == "ConfirmedStop")   { confirmed_stop = true; has_stop = true; }
1869         else if (*f == "Stop")  has_stop = true;
1870         else if (*f == "tCoords") tcoords = true;
1871     }
1872 
1873     if (NStr::StartsWith(attributes["Parent"],"gene"))
1874         a.SetGeneID(NStr::StringToNumeric<Int8>(attributes["Parent"],NStr::fConvErr_NoThrow|NStr::fAllowLeadingSymbols));
1875     if (!attributes["rankInGene"].empty()) {
1876         a.SetRankInGene(NStr::StringToNumeric<int>(attributes["rankInGene"]));
1877     }
1878 
1879     if (!attributes["Target"].empty()) {
1880         string target = NStr::Replace(attributes["Target"], "%20", " ");
1881 
1882         CRef<CSeq_id> target_id;
1883         try {
1884             target_id = CIdHandler::ToSeq_id(target);
1885         } catch(CException) {
1886             if(((a.Type() & CGeneModel::eGnomon) != 0 || (a.Type() & CGeneModel::eChain) != 0) && target.substr(0,4) == "hmm.") { // handles legacy files
1887                 target = "gnl|GNOMON|"+target.substr(4);
1888             } else if((a.Type() & CGeneModel::eProt) != 0 && target.length() == 6 && isalpha(target[0])) {    // probably PIR
1889                 target = "pir||"+target;
1890             }
1891             target_id = CIdHandler::ToSeq_id(target);
1892         }
1893         a.SetTargetId(*target_id);
1894     }
1895 
1896     if((a.Type() & CGeneModel::eGnomon) != 0 || (a.Type() & CGeneModel::eChain) != 0) { // handles legacy files
1897         vector<string> support;
1898         NStr::Split(attributes["support"], ",", support, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1899         ITERATE(vector<string>, s, support) {
1900             bool core = (*s)[0] == '*';
1901             Int8 id = NStr::StringToNumeric<Int8>(core ? s->substr(1) : *s);
1902             a.AddSupport(CSupportInfo(id, core));
1903         }
1904     }
1905 
1906     vector<string> trustedmrna;
1907     NStr::Split(attributes["TrustedmRNA"], ",", trustedmrna, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1908     ITERATE(vector<string>, s, trustedmrna) {
1909         a.InsertTrustedmRNA(CIdHandler::ToSeq_id(*s));
1910     }
1911 
1912     vector<string> trustedprot;
1913     NStr::Split(attributes["TrustedProt"], ",", trustedprot, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1914     ITERATE(vector<string>, s, trustedprot) {
1915         a.InsertTrustedProt(CIdHandler::ToSeq_id(*s));
1916     }
1917 
1918     a.SetComment(attributes["note"]);
1919 
1920     if (!attributes["Ident"].empty())
1921         a.SetIdent(NStr::StringToNumeric<double>(attributes["Ident"]));
1922 
1923     if (!attributes["Weight"].empty())
1924         a.SetWeight(NStr::StringToNumeric<double>(attributes["Weight"]));
1925 
1926     if (!attributes["protein_hit"].empty())
1927         a.ProteinHit() = attributes["protein_hit"];
1928 
1929     CCDSInfo cds_info = a.GetCdsInfo();
1930     TSignedSeqRange reading_frame = cds_info.ReadingFrame();
1931 
1932     if (reading_frame.NotEmpty()) {
1933 
1934         if (!attributes["protCDS"].empty()) {
1935             TSignedSeqRange protcds = StringToRange(attributes["protCDS"]);
1936             if(!tcoords)
1937                 protcds = a.GetAlignMap().MapRangeOrigToEdited(protcds, false);
1938             cds_info.SetReadingFrame(protcds, true );
1939         }
1940 
1941         bool reversed = (a.Status()&CGeneModel::eReversed) != 0;
1942 
1943         if(reversed)
1944             swap(has_start, has_stop);
1945 
1946         TSignedSeqRange start, stop;
1947 
1948         if(has_start) {
1949             start = TSignedSeqRange(reading_frame.GetFrom(),reading_frame.GetFrom()+2);
1950             reading_frame.SetFrom(reading_frame.GetFrom()+3);
1951         }
1952 
1953         if(has_stop) {
1954             stop = TSignedSeqRange(reading_frame.GetTo()-2,reading_frame.GetTo());
1955             reading_frame.SetTo(reading_frame.GetTo()-3);
1956         }
1957 
1958         if(reversed) {
1959             swap(has_start, has_stop);
1960             swap(start, stop);
1961         }
1962 
1963         cds_info.SetReadingFrame(reading_frame, (a.Type()&CGeneModel::eProt)!=0 && cds_info.ProtReadingFrame().Empty());
1964         cds_info.SetStart(start,confirmed_start);
1965 
1966         cds_info.SetStop(stop,confirmed_stop);
1967 
1968         if(!attributes["pstop"].empty()) {
1969             vector<string> stops;
1970             NStr::Split(attributes["pstop"], ",", stops, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1971             ITERATE(vector<string>, s, stops) {
1972                 vector<string> parts;
1973                 NStr::Split(*s, " ", parts, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1974                 TSignedSeqRange pstop = StringToRange(parts[0]+" "+parts[1]);
1975                 if(!tcoords)
1976                     pstop = a.GetAlignMap().MapRangeOrigToEdited(pstop, false);
1977                 CCDSInfo::EStatus status = CCDSInfo::eUnknown;
1978                 if(parts.size() > 2) {
1979                     if(parts[2] == "C")
1980                         status = CCDSInfo::eGenomeCorrect;
1981                     else if(parts[2] == "N")
1982                         status = CCDSInfo::eGenomeNotCorrect;
1983                     else if(parts[2] == "S")
1984                         status = CCDSInfo::eSelenocysteine;
1985                     else
1986                         _ASSERT(false);
1987                 }
1988                 cds_info.AddPStop(pstop, status);
1989             }
1990         }
1991 
1992         TSignedSeqRange max_cds_limits;
1993         TSignedSeqRange tlim = a.TranscriptLimits();
1994         if (attributes["maxCDS"].empty()) {
1995             max_cds_limits = tlim;
1996             if(reversed)
1997                 swap(start, stop);
1998             if(start.NotEmpty())
1999                 max_cds_limits.SetFrom(start.GetFrom());
2000             if(stop.NotEmpty())
2001                 max_cds_limits.SetTo(stop.GetTo());
2002             if(reversed)
2003                 swap(start, stop);
2004         } else {
2005             max_cds_limits = StringToRange(attributes["maxCDS"]);
2006             if(!tcoords)
2007                 max_cds_limits = a.GetAlignMap().MapRangeOrigToEdited(max_cds_limits, false);
2008         }
2009 
2010         if(reversed) {
2011             if(max_cds_limits.GetTo() < tlim.GetTo())
2012                 cds_info.Set5PrimeCdsLimit(max_cds_limits.GetTo());
2013         } else {
2014             if(max_cds_limits.GetFrom() > tlim.GetFrom())
2015                 cds_info.Set5PrimeCdsLimit(max_cds_limits.GetFrom());
2016         }
2017     }
2018 
2019     cds_info.SetScore(cds_info.Score(), open_cds);
2020 
2021     a.SetCdsInfo(cds_info);
2022 }
2023 
printGFF3(CNcbiOstream & os,CAlignModel a)2024 CNcbiOstream& printGFF3(CNcbiOstream& os, CAlignModel a)
2025 {
2026     CCDSInfo cds_info = a.GetCdsInfo();
2027     if(cds_info.IsMappedToGenome()) {    // convert local copy to tcoords
2028         cds_info = cds_info.MapFromOrigToEdited(a.GetAlignMap());
2029         a.SetCdsInfo(cds_info);
2030     }
2031 
2032     SGFFrec templ;
2033     templ.model = a.ID();
2034     templ.seqid = contig_stream_state.slot(os);
2035 
2036     templ.source = CGeneModel::TypeToString(a.Type());
2037 
2038     if(a.Strand() == ePlus)
2039         templ.strand = '+';
2040     else if(a.Strand() == eMinus)
2041         templ.strand = '-';
2042 
2043     SGFFrec mrna = templ;
2044     mrna.type = "mRNA";
2045     mrna.start = a.Limits().GetFrom();
2046     mrna.end = a.Limits().GetTo();
2047     mrna.score = a.Score();
2048 
2049     CollectAttributes(a, mrna.attributes);
2050 
2051     os << mrna;
2052 
2053     int part = 0;
2054     vector<SGFFrec> exons,cdss;
2055 
2056     templ.attributes["Parent"] = NStr::NumericToString(a.ID());
2057     SGFFrec exon = templ;
2058     exon.type = "exon";
2059     SGFFrec cds = templ;
2060     cds.type = "CDS";
2061 
2062     TSignedSeqRange tlim = a.TranscriptLimits();
2063     TSignedSeqRange rcds = tlim & cds_info.MaxCdsLimits();   // realcdslimits in tcoords
2064     if(cds_info.HasStart()) {
2065         if(a.Status()&CGeneModel::eReversed)
2066             rcds.SetTo(cds_info.Start().GetTo());
2067         else
2068             rcds.SetFrom(cds_info.Start().GetFrom());
2069     }
2070 
2071     int phase = 0;
2072     if(cds_info.ReadingFrame().NotEmpty() && ((a.Strand() == ePlus && !cds_info.HasStart()) || (a.Strand() == eMinus && !cds_info.HasStop()))) {   // may have extra bases on the left end
2073         phase = (a.Orientation() == ePlus) ? cds_info.ReadingFrame().GetFrom() - tlim.GetFrom() : tlim.GetTo()-cds_info.ReadingFrame().GetTo();
2074         _ASSERT(phase < 3);
2075     }
2076 
2077     TInDels corrections = a.FrameShifts();
2078     for(unsigned int i = 0; i < a.Exons().size(); ++i) {
2079         const CModelExon *e = &a.Exons()[i];
2080 
2081         if(e->Limits().NotEmpty()) {
2082             exon.start = e->GetFrom();
2083             exon.end = e->GetTo();
2084             exon.attributes["Gap"] = BuildGFF3Gap(exon.start, exon.end, corrections);
2085         } else {
2086             exon.start = -1;
2087             exon.end = -1;
2088             exon.attributes["Seq"] = e->m_seq;
2089             if(e->m_source.m_range.NotEmpty()) {
2090                 exon.attributes["Source"] = e->m_source.m_acc + ":"+NStr::NumericToString(e->m_source.m_range.GetFrom()+1) + ":"+NStr::NumericToString(e->m_source.m_range.GetTo()+1) + ":";
2091                 if(e->m_source.m_strand == ePlus)
2092                     exon.attributes["Source"] += "+";
2093                 else
2094                     exon.attributes["Source"] += "-";
2095             }
2096         }
2097 
2098         string targetid = NStr::Replace(CIdHandler::ToString(*a.GetTargetId()), " ", "%20");
2099 
2100         TSignedSeqRange transcript_exon = a.TranscriptExon(i);
2101         string target = " "+NStr::NumericToString(transcript_exon.GetFrom()+1)+" "+NStr::NumericToString(transcript_exon.GetTo()+1);
2102         if((a.Status() & CGeneModel::eReversed)!=0) {
2103             target += " -";
2104         } else {
2105             target += " +";
2106         }
2107         exon.attributes["Target"] = targetid+target;
2108 
2109         if(e->m_ident > 0) {
2110             exon.attributes["Ident"] = NStr::DoubleToString(e->m_ident);
2111         }
2112 
2113         if(!e->m_fsplice_sig.empty() || !e->m_ssplice_sig.empty()) {
2114             string fs;
2115             if(!e->m_fsplice_sig.empty())
2116                 fs = e->m_fsplice_sig;
2117             string ss;
2118             if(!e->m_ssplice_sig.empty())
2119                 ss = e->m_ssplice_sig;
2120             if(a.Strand() == ePlus) {
2121                 exon.attributes["Splices"] = fs+".."+ss;
2122             } else {
2123                 exon.attributes["Splices"] = ss+".."+fs;
2124             }
2125         }
2126 
2127         exons.push_back(exon);
2128         exon.attributes["Gap"].erase();
2129         exon.attributes["Ident"].erase();
2130         exon.attributes["Splices"].erase();
2131         exon.attributes["Seq"].erase();
2132         exon.attributes["Source"].erase();
2133 
2134         TSignedSeqRange tcds = transcript_exon & rcds;
2135         if(tcds.NotEmpty()) {
2136             cds.tstart = tcds.GetFrom();
2137             cds.tend = tcds.GetTo();
2138             cds.start = -1;
2139             cds.end = -1;
2140             TSignedSeqRange cds_limits = a.GetAlignMap().MapRangeEditedToOrig(tcds, true);
2141             if(cds_limits.NotEmpty()) {
2142                 cds.start = cds_limits.GetFrom();
2143                 cds.end = cds_limits.GetTo();
2144             }
2145             string ctarget = " "+NStr::NumericToString(tcds.GetFrom()+1)+" "+NStr::NumericToString(tcds.GetTo()+1);
2146             if((a.Status() & CGeneModel::eReversed)!=0) {
2147                 ctarget += " -";
2148             } else {
2149                 ctarget += " +";
2150             }
2151             cds.attributes["Target"] = targetid+ctarget;
2152             cdss.push_back(cds);
2153         }
2154 
2155         if (!e->m_ssplice) {
2156             string suffix;
2157             if (!a.Continuous()) {
2158                 SGFFrec rec = templ;
2159                 suffix= "."+NStr::NumericToString(++part);
2160                 if(exons.front().start >= 0) {
2161                     rec.start = exons.front().start;
2162                 } else {
2163                     _ASSERT(exons.size() > 1 && exons[1].start >= 0);
2164                     rec.start = exons[1].start;
2165                 }
2166                 if(exons.back().end >= 0) {
2167                     rec.end = exons.back().end;
2168                 } else {
2169                     _ASSERT(exons.size() > 1 && exons[exons.size()-2].end >= 0);
2170                     rec.end = exons[exons.size()-2].end;
2171                 }
2172                 rec.type = "mRNA_part";
2173                 rec.attributes["ID"] = rec.attributes["Parent"]+suffix;
2174                 os << rec;
2175             }
2176             NON_CONST_ITERATE(vector<SGFFrec>, e, exons) {
2177                 e->attributes["Parent"] += suffix;
2178                 os << *e;
2179             }
2180             exons.clear();
2181 
2182             NON_CONST_ITERATE(vector<SGFFrec>, e, cdss) {
2183                 if (a.Strand()==ePlus)
2184                     e->phase = phase;
2185 
2186                 phase = (e->tend - e->tstart+1 - phase)%3;
2187                 if (a.Strand()==eMinus)
2188                     e->phase = phase;
2189                 phase = (3-phase)%3;
2190 
2191                 e->attributes["Parent"] += suffix;
2192                 os << *e;
2193             }
2194             cdss.clear();
2195         }
2196     }
2197 
2198     return os;
2199 }
2200 
2201 struct Precedence : public binary_function<TSignedSeqRange, TSignedSeqRange, bool>
2202 {
operator ()Precedence2203     bool operator()(const TSignedSeqRange& __x, const TSignedSeqRange& __y) const
2204     { return Precede( __x, __y ); }
2205 };
2206 
operator -(TSignedSeqRange a,TSignedSeqRange b)2207 TSignedSeqRange operator- (TSignedSeqRange a, TSignedSeqRange b)
2208 {
2209     b &= a;
2210     if (b.Empty())
2211         return a;
2212     if (a.GetFrom()==b.GetFrom())
2213         a.SetFrom(b.GetTo()+1);
2214     else if (a.GetTo()==b.GetTo())
2215         a.SetTo(b.GetFrom()-1);
2216     return a;
2217 }
2218 
2219 
readGFF3(CNcbiIstream & is,CAlignModel & align)2220 CNcbiIstream& readGFF3(CNcbiIstream& is, CAlignModel& align)
2221 {
2222     SGFFrec rec;
2223     is >> rec;
2224     if (!is) {
2225          return is;
2226     }
2227 
2228     Int8 id = rec.model;
2229 
2230     if (id==0)
2231         return InputError(is);
2232 
2233     vector<SGFFrec> recs;
2234 
2235     do {
2236         recs.push_back(rec);
2237     } while (is >> rec && rec.seqid == recs.front().seqid && rec.model == id && rec.type != "mRNA");
2238 
2239     Ungetline(is);
2240 
2241     CGeneModel a;
2242 
2243     a.SetID(id);
2244 #ifdef _DEBUG
2245     a.oid = id;
2246 #endif
2247 
2248     TSignedSeqRange cds;
2249     int phase = 0;
2250     bool tcoords = false;
2251 
2252     vector<CModelExon> exons;
2253     vector<TSignedSeqRange> transcript_exons;
2254     TInDels indels;
2255     set<TSignedSeqRange,Precedence> mrna_parts;
2256 
2257     rec = recs.front();
2258 
2259     if (rec.source == "Gnomon") a.SetType(a.Type() | CGeneModel::eGnomon);
2260     else if (rec.source == "Chainer") a.SetType(a.Type() | CGeneModel::eChain);
2261 
2262     if(rec.strand=='+')
2263         a.SetStrand(ePlus);
2264     else if(rec.strand=='-')
2265         a.SetStrand(eMinus);
2266 
2267     double score = BadScore();
2268     map<string, string> attributes;
2269 
2270     NON_CONST_ITERATE(vector<SGFFrec>, r, recs) {
2271         if (r->type == "mRNA") {
2272             attributes = r->attributes;
2273             score = r->score;
2274         } else if (r->type == "exon") {
2275             if(r->tstart >= 0 && r->tstrand == '-') {
2276                 a.Status() |= CGeneModel::eReversed;
2277             }
2278 
2279             string fs, ss;
2280             if(!r->attributes["Splices"].empty()) {
2281                 string splices = r->attributes["Splices"];
2282                 auto ispl = splices.find("..");
2283                 if(ispl != string::npos) {
2284                     if(ispl == 2)
2285                         fs = splices.substr(0,2);
2286                     if(ispl+4 == splices.length())
2287                         ss = splices.substr(ispl+2,2);
2288                 }
2289                 if(a.Strand() == eMinus)
2290                     swap(fs,ss);
2291             }
2292 
2293             double eident = 0;
2294             if(!r->attributes["Ident"].empty()) {
2295                 eident = NStr::StringToNumeric<double>(r->attributes["Ident"]);
2296             }
2297 
2298             if(r->start >= 0 && r->end >= 0)
2299                 exons.push_back(CModelExon(r->start,r->end,false,false,fs,ss,eident));
2300             else {
2301                 CInDelInfo::SSource src;
2302                 if(!r->attributes["Source"].empty()) {
2303                     vector<string> v;
2304                     NStr::Split(r->attributes["Source"], ":", v, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
2305                     _ASSERT((int)v.size() == 4);
2306                     src.m_acc = v[0];
2307                     src.m_range.SetFrom(NStr::StringToNumeric<TSignedSeqPos>(v[1])-1);
2308                     src.m_range.SetTo(NStr::StringToNumeric<TSignedSeqPos>(v[2])-1);
2309                     if(v[3] == "+")
2310                         src.m_strand = ePlus;
2311                     else
2312                         src.m_strand = eMinus;
2313                }
2314                 exons.push_back(CModelExon(1,-1,false,false,fs,ss,eident,r->attributes["Seq"],src));
2315             }
2316             TSignedSeqRange texon(r->tstart,r->tend);
2317             if(texon.NotEmpty())
2318                 transcript_exons.push_back(texon);
2319             readGFF3Gap(r->attributes["Gap"],r->start,r->end, indels);
2320             if(!r->attributes["Target"].empty())
2321                 attributes["Target"] = r->attributes["Target"];
2322         } else if (r->type == "CDS") {
2323             _ASSERT((a.Status()&CGeneModel::eUnknownOrientation) == 0);
2324 
2325             if (r->strand=='+') {
2326                 if (cds.Empty())
2327                     phase = r->phase;
2328             } else {
2329                 phase = r->phase;
2330             }
2331             if(r->tstart < 0) {   // old format before ggaps
2332                 TSignedSeqRange cds_exon(r->start,r->end);
2333                 cds.CombineWith(cds_exon);
2334                 tcoords = false;
2335             } else {
2336                 TSignedSeqRange cds_exon(r->tstart,r->tend);
2337                 cds.CombineWith(cds_exon);
2338                 tcoords = true;
2339             }
2340         } else if (r->type == "mRNA_part") {
2341             mrna_parts.insert(TSignedSeqRange(r->start,r->end));
2342         }
2343     }
2344 
2345     for(int i = 0; i < (int)exons.size(); ++i) {
2346         const CModelExon& e = exons[i];
2347         if (a.Limits().IntersectingWith(e.Limits())) {
2348             return  InputError(is);
2349         }
2350         if(i > 0 && exons[i-1].Limits().NotEmpty()) {    // there can't be ggap adjacent to 'hole'
2351             set<TSignedSeqRange,Precedence>::iterator p = mrna_parts.lower_bound(e);
2352             if (p != mrna_parts.end() && p->GetFrom()==e.Limits().GetFrom())
2353                 a.AddHole();
2354         }
2355         a.AddExon(e.Limits(),e.m_fsplice_sig,e.m_ssplice_sig,e.m_ident,e.m_seq,e.m_source);
2356     }
2357 
2358     sort(indels.begin(),indels.end());
2359     CAlignMap amap;
2360     if(!transcript_exons.empty())
2361         amap = CAlignMap(a.Exons(), transcript_exons, indels, a.Orientation(), NStr::StringToNumeric<int>(attributes["TargetLen"]));
2362     else
2363         amap = CAlignMap(a.Exons(), indels, a.Strand());
2364 
2365     a.FrameShifts() = indels;
2366     align = CAlignModel(a, amap);
2367 
2368     TSignedSeqRange reading_frame;
2369     CCDSInfo cds_info;
2370 
2371     if (cds.NotEmpty()) {
2372         cds_info = CCDSInfo(false);
2373 
2374         //        align.FrameShifts() = amap.GetInDels(true);
2375 
2376         TSignedSeqRange rf = cds;
2377         if(!tcoords)
2378             rf = amap.MapRangeOrigToEdited(cds, false);
2379 
2380         if(!(a.Status() & CGeneModel::eReversed)) {
2381             rf.SetFrom(rf.GetFrom()+phase);
2382             rf.SetTo(rf.GetTo()-rf.GetLength()%3);
2383         } else {
2384             rf.SetTo(rf.GetTo()-phase);
2385             rf.SetFrom(rf.GetFrom()+rf.GetLength()%3);
2386         }
2387 
2388         cds_info.SetReadingFrame(rf, false);
2389     }
2390 
2391     align.SetCdsInfo(cds_info);
2392 
2393     ParseAttributes(attributes, align);
2394 
2395     if(cds.NotEmpty()) {
2396         CCDSInfo cds_info_t = align.GetCdsInfo();
2397         cds_info_t.SetScore(score, cds_info_t.OpenCds());
2398         CCDSInfo cds_info_g = cds_info_t.MapFromEditedToOrig(amap);
2399         if(cds_info_g.ReadingFrame().NotEmpty())   // successful projection
2400             align.SetCdsInfo(cds_info_g);
2401         else
2402             align.SetCdsInfo(cds_info_t);
2403     }
2404 
2405     contig_stream_state.slot(is) = rec.seqid;
2406     return is;
2407 }
2408 
operator <<(CNcbiOstream & s,const CGeneModel & a)2409 CNcbiOstream& operator<<(CNcbiOstream& s, const CGeneModel& a)
2410 {
2411     return operator<<(s, CAlignModel(a, a.GetAlignMap()));
2412 }
2413 
operator <<(CNcbiOstream & os,const CAlignModel & a)2414 CNcbiOstream& operator<<(CNcbiOstream& os, const CAlignModel& a)
2415 {
2416     switch (model_file_format_state.slot(os)) {
2417     case eGFF3FileFormat:
2418         return printGFF3(os,a);
2419     default:
2420         os.setstate(ios::failbit);
2421         return os;
2422     }
2423 }
2424 
operator >>(CNcbiIstream & is,CAlignModel & align)2425 CNcbiIstream& operator>>(CNcbiIstream& is, CAlignModel& align)
2426 {
2427     switch (model_file_format_state.slot(is)) {
2428     case eGFF3FileFormat:
2429         return readGFF3(is, align);
2430     default:
2431         is.setstate(ios::failbit);
2432         return is;
2433     }
2434 }
2435 
CSupportInfo(Int8 model_id,bool core)2436 CSupportInfo::CSupportInfo(Int8 model_id, bool core)
2437     : m_id( model_id), m_core_align(core)
2438 {
2439 }
2440 
GetId() const2441 Int8 CSupportInfo::GetId() const
2442 {
2443     return m_id;
2444 }
2445 
SetCore(bool core)2446 void CSupportInfo::SetCore(bool core)
2447 {
2448     m_core_align = core;
2449 }
2450 
IsCore() const2451 bool CSupportInfo::IsCore() const
2452 {
2453     return m_core_align;
2454 }
2455 
operator ==(const CSupportInfo & s) const2456 bool CSupportInfo::operator==(const CSupportInfo& s) const
2457 {
2458     return IsCore() == s.IsCore() && GetId() == s.GetId();
2459 }
2460 
operator <(const CSupportInfo & s) const2461 bool CSupportInfo::operator<(const CSupportInfo& s) const
2462 {
2463     return GetId() == s.GetId() ? IsCore() < s.IsCore() : GetId() < s.GetId();
2464 }
2465 
2466 
2467 END_SCOPE(gnomon)
2468 END_NCBI_SCOPE
2469