1 /* $Id: gnomon_model.cpp 626290 2021-02-25 14:09:15Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alexandre Souvorov
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <algo/gnomon/gnomon_model.hpp>
34 #include <algo/gnomon/gnomon.hpp>
35 #include "gnomon_seq.hpp"
36 #include <algo/gnomon/id_handler.hpp>
37 #include <set>
38 #include <functional>
39 #include <corelib/ncbiutil.hpp>
40 #include <objects/general/Object_id.hpp>
41 #include <objects/seqloc/Seq_id.hpp>
42
43 BEGIN_NCBI_SCOPE
44 BEGIN_SCOPE(gnomon)
45
46 USING_SCOPE(objects);
47
GetInDels(TSignedSeqPos a,TSignedSeqPos b,bool fs_only) const48 TInDels CGeneModel::GetInDels(TSignedSeqPos a, TSignedSeqPos b, bool fs_only) const {
49
50 TInDels indels = GetInDels(fs_only);
51 TInDels selected_indels;
52
53 ITERATE(TInDels, i, indels) {
54 if(i->IntersectingWith(a,b))
55 selected_indels.push_back( *i );
56 }
57
58 return selected_indels;
59 }
60
GetInDels(bool fs_only) const61 TInDels CGeneModel::GetInDels(bool fs_only) const {
62
63 TInDels selected_indels;
64 if(!fs_only) {
65 ITERATE(TInDels, i, FrameShifts()) {
66 if(!i->IsMismatch())
67 selected_indels.push_back(*i);
68 }
69 } else {
70 TExons::const_iterator e = Exons().begin();
71 ITERATE(TInDels, i, FrameShifts()) {
72 if(i->IsMismatch() || i->Len()%3 == 0) // skip mismatches and full codons
73 continue;
74
75 for( ;e != Exons().end() && (e->Limits().Empty() || !i->IntersectingWith(e->GetFrom(),e->GetTo())); ++e); // harboring exon
76 if(i->InDelEnd() > e->GetTo() && e != Exons().end()) { // skip full codons splitted by intron
77 TInDels::const_iterator next = i;
78 if(++next != FrameShifts().end() && (++e)->Limits().NotEmpty() && next->Loc() == e->GetFrom() && !next->IsMismatch()) {
79 int len = i->Len() + (i->GetType() == next->GetType() ? next->Len() : -next->Len());
80 if(len%3 == 0) {
81 i = next;
82 continue;
83 }
84 }
85 }
86
87 selected_indels.push_back(*i);
88 }
89 }
90
91 return selected_indels;
92 }
93
94
TranscriptExon(int i) const95 TSignedSeqRange CGeneModel::TranscriptExon(int i) const {
96 CAlignMap amap = GetAlignMap();
97
98 if(Exons()[i].Limits().NotEmpty()) {
99 return amap.MapRangeOrigToEdited(Exons()[i].Limits(), true);
100 } else if(i > 0) { // there is real exon on the left
101 int p = amap.MapOrigToEdited(Exons()[i-1].GetTo());
102 if(Orientation() == ePlus)
103 return TSignedSeqRange(p+1,p+Exons()[i].m_seq.size());
104 else
105 return TSignedSeqRange(p-Exons()[i].m_seq.size(),p-1);
106 } else { // there is a real exon on the right
107 _ASSERT(i < (int)Exons().size()-1);
108 int p = amap.MapOrigToEdited(Exons()[i+1].GetFrom());
109 if(Orientation() == ePlus)
110 return TSignedSeqRange(p-Exons()[i].m_seq.size(),p-1);
111 else
112 return TSignedSeqRange(p+1,p+Exons()[i].m_seq.size());
113 }
114 }
115
TranscriptLimits() const116 TSignedSeqRange CGeneModel::TranscriptLimits() const {
117 CAlignMap amap = GetAlignMap();
118
119 int l,r;
120
121 if(Orientation() == ePlus) {
122 if(Exons().front().Limits().NotEmpty()) {
123 l = amap.MapRangeOrigToEdited(Exons().front().Limits(),true).GetFrom();
124 } else {
125 _ASSERT(Exons().size() > 1 && Exons()[1].Limits().NotEmpty());
126 l = amap.MapOrigToEdited(Exons()[1].GetFrom())-Exons().front().m_seq.length();
127 }
128 if(Exons().back().Limits().NotEmpty()) {
129 r = amap.MapRangeOrigToEdited(Exons().back().Limits(),true).GetTo();
130 } else {
131 _ASSERT(Exons().size() > 1 && Exons()[Exons().size()-2].Limits().NotEmpty());
132 r = amap.MapOrigToEdited(Exons()[Exons().size()-2].GetTo()) + Exons().back().m_seq.length();
133 }
134 } else {
135 if(Exons().front().Limits().NotEmpty()) {
136 r = amap.MapRangeOrigToEdited(Exons().front().Limits(),true).GetTo();
137 } else {
138 _ASSERT(Exons().size() > 1 && Exons()[1].Limits().NotEmpty());
139 r = amap.MapOrigToEdited(Exons()[1].GetFrom())+Exons().front().m_seq.length();
140 }
141 if(Exons().back().Limits().NotEmpty()) {
142 l = amap.MapRangeOrigToEdited(Exons().back().Limits(),true).GetFrom();
143 } else {
144 _ASSERT(Exons().size() > 1 && Exons()[Exons().size()-2].Limits().NotEmpty());
145 l = amap.MapOrigToEdited(Exons()[Exons().size()-2].GetTo()) - Exons().back().m_seq.length();
146 }
147 }
148
149 return TSignedSeqRange(l,r);
150 }
151
isNMD(int limit) const152 bool CGeneModel::isNMD(int limit) const
153 {
154 if (GetCdsInfo().ReadingFrame().Empty() || Exons().size() <= 1)
155 return false;
156
157 TSignedSeqRange cds = GetCdsInfo().Start()+GetCdsInfo().ReadingFrame()+GetCdsInfo().Stop();
158 CAlignMap amap = GetAlignMap();
159 if(GetCdsInfo().IsMappedToGenome()) {
160 cds = amap.MapRangeOrigToEdited(cds);
161 }
162
163 int l = -1;
164 if (Strand() == ePlus) {
165 for(int i = (int)Exons().size()-1; i > 0; --i) {
166 if(Exons()[i].m_fsplice && Exons()[i-1].m_ssplice && Exons()[i].m_fsplice_sig != "XX" && Exons()[i-1].m_ssplice_sig != "XX") {
167 l = amap.MapRangeOrigToEdited(Exons()[i],true).GetFrom();
168 break;
169 }
170 }
171 } else {
172 for(int i = 0; i < (int)Exons().size()-1; ++i) {
173 if(Exons()[i].m_ssplice && Exons()[i+1].m_fsplice && Exons()[i].m_ssplice_sig != "XX" && Exons()[i+1].m_fsplice_sig != "XX") {
174 l = amap.MapRangeOrigToEdited(Exons()[i],true).GetFrom();
175 break;
176 }
177 }
178 }
179 return l > cds.GetTo()+limit;
180 }
181
182
ReverseComplementModel()183 void CGeneModel::ReverseComplementModel() {
184 SetStrand(Strand() == ePlus ? eMinus : ePlus);
185 Status() ^= eReversed;
186 NON_CONST_ITERATE(TExons, it, MyExons()) {
187 if(it->m_fsplice_sig != "XX")
188 ReverseComplement(it->m_fsplice_sig.begin(),it->m_fsplice_sig.end());
189 if(it->m_ssplice_sig != "XX")
190 ReverseComplement(it->m_ssplice_sig.begin(),it->m_ssplice_sig.end());
191 }
192 }
193
GetAlignMap() const194 CAlignMap CGeneModel::GetAlignMap() const { return CAlignMap(Exons(), FrameShifts(), Strand()); }
195
CAlignModel(const CGeneModel & g,const CAlignMap & a)196 CAlignModel::CAlignModel(const CGeneModel& g, const CAlignMap& a)
197 : CGeneModel(g), m_alignmap(a)
198 {
199
200 #ifdef _DEBUG
201 int diff = 0;
202 for(int ie = 0; ie < (int)Exons().size(); ++ie) {
203 if(Exons()[ie].Limits().Empty())
204 diff += Exons()[ie].m_seq.size();
205 else
206 diff += Exons()[ie].Limits().GetLength();
207
208 diff -= TranscriptExon(ie).GetLength();
209 }
210 ITERATE(TInDels, i, FrameShifts()) {
211 if(i->IsDeletion())
212 diff += i->Len();
213 else if(i->IsInsertion())
214 diff -= i->Len();
215 }
216 _ASSERT(diff == 0);
217 #endif
218
219 SetTargetId(*CIdHandler::GnomonMRNA(ID()));
220 if(a.Orientation() != g.Strand())
221 Status() |= CGeneModel::eReversed;
222 }
223
ResetAlignMap()224 void CAlignModel::ResetAlignMap()
225 {
226 Status() &= ~CGeneModel::eReversed;
227 m_alignmap = CGeneModel::GetAlignMap();
228 }
229
RecalculateAlignMap(int left,int right)230 void CAlignModel::RecalculateAlignMap(int left, int right) {
231 if(Exons().empty()) {
232 m_alignmap = CAlignMap();
233 return;
234 }
235
236 vector<TSignedSeqRange> transcript_exons;
237 for(int ie = 0; ie < (int)Exons().size(); ++ie) {
238 const CModelExon& e = Exons()[ie];
239 if(e.Limits().Empty()) {
240 transcript_exons.push_back(TranscriptExon(ie));
241 } else {
242 CAlignMap::ERangeEnd l = e.GetFrom() == left ? CAlignMap::eSinglePoint : CAlignMap::eLeftEnd;
243 CAlignMap::ERangeEnd r = e.GetTo() == right ? CAlignMap::eSinglePoint : CAlignMap::eRightEnd;
244 TSignedSeqRange texon = m_alignmap.MapRangeOrigToEdited(e.Limits(), l, r);
245 transcript_exons.push_back(texon);
246 }
247 }
248 m_alignmap = CAlignMap(Exons(), transcript_exons, FrameShifts(), Orientation(), m_alignmap.TargetLen());
249 }
250
TargetAccession() const251 string CAlignModel::TargetAccession() const {
252 _ASSERT( !GetTargetId().Empty() );
253 if (GetTargetId().Empty())
254 return "UnknownTarget";
255 return CIdHandler::ToString(*GetTargetId());
256 }
257
Remap(const CRangeMapper & mapper)258 void CGeneModel::Remap(const CRangeMapper& mapper)
259 {
260 NON_CONST_ITERATE(TExons, e, MyExons()) {
261 e->Remap(mapper);
262 }
263 RecalculateLimits();
264
265 if(ReadingFrame().NotEmpty())
266 m_cds_info.Remap(mapper);
267 }
268
operator ==(const CCDSInfo & a) const269 bool CCDSInfo::operator== (const CCDSInfo& a) const
270 {
271 return
272 m_start==a.m_start &&
273 m_stop==a.m_stop &&
274 m_reading_frame==a.m_reading_frame &&
275 m_reading_frame_from_proteins==a.m_reading_frame_from_proteins &&
276 m_max_cds_limits==a.m_max_cds_limits &&
277 m_confirmed_start==a.m_confirmed_start &&
278 m_confirmed_stop==a.m_confirmed_stop &&
279 m_p_stops==a.m_p_stops &&
280 m_open==a.m_open &&
281 m_score==a.m_score &&
282 m_genomic_coordinates == a.m_genomic_coordinates;
283 }
284
MapFromEditedToOrig(const CAlignMap & amap) const285 CCDSInfo CCDSInfo::MapFromEditedToOrig(const CAlignMap& amap) const
286 {
287 CCDSInfo empty_cds;
288 CCDSInfo new_cds(true);
289
290 if(ProtReadingFrame().NotEmpty()) {
291 TSignedSeqRange prot_rf = amap.MapRangeEditedToOrig(ProtReadingFrame(), true);
292 if(prot_rf.NotEmpty())
293 new_cds.SetReadingFrame(prot_rf, true);
294 else
295 return empty_cds;
296 }
297 if(ReadingFrame().NotEmpty()) {
298 TSignedSeqRange rf = amap.MapRangeEditedToOrig(ReadingFrame(), true);
299 if(rf.NotEmpty() && amap.FShiftedLen(rf, true)%3 == 0)
300 new_cds.SetReadingFrame(rf, false);
301 else
302 return empty_cds;
303 }
304 if(HasStart()) {
305 TSignedSeqRange start = amap.MapRangeEditedToOrig(Start(), false);
306 if(start.NotEmpty())
307 new_cds.SetStart(start, ConfirmedStart());
308 else
309 return empty_cds;
310 }
311 if(HasStop()) {
312 TSignedSeqRange stop = amap.MapRangeEditedToOrig(Stop(), false);
313 if(stop.NotEmpty())
314 new_cds.SetStop(stop, ConfirmedStop());
315 else
316 return empty_cds;
317 }
318 ITERATE(TPStops, i, PStops()) {
319 TSignedSeqRange pstop = amap.MapRangeEditedToOrig(*i, false);
320 if(pstop.NotEmpty())
321 new_cds.AddPStop(pstop, i->m_status);
322 else
323 return empty_cds;
324 }
325 if(HasStart() && MaxCdsLimits().GetFrom() == Start().GetFrom()) {
326 new_cds.Set5PrimeCdsLimit(amap.MapEditedToOrig(Start().GetFrom()));
327 } else if(HasStart() && MaxCdsLimits().GetTo() == Start().GetTo()) {
328 new_cds.Set5PrimeCdsLimit(amap.MapEditedToOrig(Start().GetTo()));
329 }
330 new_cds.SetScore(Score(), OpenCds());
331
332 return new_cds;
333 }
334
MapFromOrigToEdited(const CAlignMap & amap) const335 CCDSInfo CCDSInfo::MapFromOrigToEdited(const CAlignMap& amap) const
336 {
337 CCDSInfo new_cds(false);
338
339 if(ProtReadingFrame().NotEmpty()) {
340 new_cds.SetReadingFrame(amap.MapRangeOrigToEdited(ProtReadingFrame(), true), true);
341 _ASSERT(new_cds.ProtReadingFrame().NotEmpty());
342 }
343 if(ReadingFrame().NotEmpty()) {
344 new_cds.SetReadingFrame(amap.MapRangeOrigToEdited(ReadingFrame(), true), false);
345 _ASSERT(new_cds.ReadingFrame().NotEmpty());
346 }
347 if(HasStart()) {
348 new_cds.SetStart(amap.MapRangeOrigToEdited(Start(), false), ConfirmedStart());
349 _ASSERT(new_cds.HasStart());
350 }
351 if(HasStop()) {
352 new_cds.SetStop(amap.MapRangeOrigToEdited(Stop(), false), ConfirmedStop());
353 _ASSERT(new_cds.HasStop());
354 }
355 ITERATE(TPStops, i, PStops()) {
356 TSignedSeqRange p = amap.MapRangeOrigToEdited(*i, false);
357 _ASSERT(p.NotEmpty());
358 new_cds.AddPStop(p, i->m_status);
359 }
360 if(HasStart() && MaxCdsLimits().GetFrom() == Start().GetFrom()) {
361 new_cds.Set5PrimeCdsLimit(amap.MapOrigToEdited(Start().GetFrom()));
362 } else if(HasStart() && MaxCdsLimits().GetTo() == Start().GetTo()) {
363 new_cds.Set5PrimeCdsLimit(amap.MapOrigToEdited(Start().GetTo()));
364 }
365 new_cds.SetScore(Score(), OpenCds());
366
367 return new_cds;
368 }
369
SetReadingFrame(TSignedSeqRange r,bool protein)370 void CCDSInfo::SetReadingFrame(TSignedSeqRange r, bool protein)
371 {
372 if (r.Empty()) {
373 if(protein)
374 m_reading_frame_from_proteins = r;
375 else
376 Clear();
377 } else {
378 m_reading_frame = r;
379 if (protein)
380 m_reading_frame_from_proteins = r;
381 if (m_max_cds_limits.Empty()) {
382 m_max_cds_limits = TSignedSeqRange::GetWhole();
383 }
384 }
385 _ASSERT( Invariant() );
386 }
387
SetStart(TSignedSeqRange r,bool confirmed)388 void CCDSInfo::SetStart(TSignedSeqRange r, bool confirmed)
389 {
390 if (confirmed) {
391 m_confirmed_start = true;
392 m_open = false;
393 } else if (m_confirmed_start && r != m_start) {
394 m_confirmed_start = false;
395 }
396 m_start = r;
397 _ASSERT( Invariant() );
398 }
399
SetStop(TSignedSeqRange r,bool confirmed)400 void CCDSInfo::SetStop(TSignedSeqRange r, bool confirmed)
401 {
402 if(m_stop.NotEmpty()) {
403 if(m_max_cds_limits.GetFrom() == m_stop.GetFrom())
404 m_max_cds_limits.SetFrom( TSignedSeqRange::GetWholeFrom());
405 if(m_max_cds_limits.GetTo() == m_stop.GetTo())
406 m_max_cds_limits.SetTo( TSignedSeqRange::GetWholeTo());
407 }
408
409 if (confirmed)
410 m_confirmed_stop = true;
411 else if (m_confirmed_stop && r != m_stop) {
412 m_confirmed_stop = false;
413 }
414 m_stop = r;
415 if (r.NotEmpty()) {
416 if (Precede(m_reading_frame,r))
417 m_max_cds_limits.SetTo(r.GetTo());
418 else
419 m_max_cds_limits.SetFrom(r.GetFrom());
420 }
421 if(!m_p_stops.empty() && m_p_stops.back() == m_stop)
422 m_p_stops.pop_back();
423 if(!m_p_stops.empty() && m_p_stops.front() == m_stop)
424 m_p_stops.erase(m_p_stops.begin());
425
426 _ASSERT( Invariant() );
427 }
428
Clear5PrimeCdsLimit()429 void CCDSInfo::Clear5PrimeCdsLimit()
430 {
431 if (HasStop()) {
432 if (Precede(ReadingFrame(),Stop()))
433 m_max_cds_limits.SetFrom( TSignedSeqRange::GetWholeFrom());
434 else
435 m_max_cds_limits.SetTo( TSignedSeqRange::GetWholeTo());
436 } else {
437 m_max_cds_limits = TSignedSeqRange::GetWhole();
438 }
439 _ASSERT( Invariant() );
440 }
441
Set5PrimeCdsLimit(TSignedSeqPos p)442 void CCDSInfo::Set5PrimeCdsLimit(TSignedSeqPos p)
443 {
444 if (p <= m_reading_frame.GetFrom())
445 m_max_cds_limits.SetFrom(p);
446 else
447 m_max_cds_limits.SetTo(p);
448
449 _ASSERT( Invariant() );
450 }
451
452
AddPStop(TSignedSeqRange r,EStatus status)453 void CCDSInfo::AddPStop(TSignedSeqRange r, EStatus status)
454 {
455 m_p_stops.push_back(SPStop(r, status));
456 _ASSERT( Invariant() );
457 }
458
459
SetScore(double score,bool open)460 void CCDSInfo::SetScore(double score, bool open)
461 {
462 m_score = score;
463 _ASSERT( !(open && !HasStart()) );
464 m_open = open;
465 _ASSERT( Invariant() );
466 }
467
Remap(const CRangeMapper & mapper)468 void CCDSInfo::Remap(const CRangeMapper& mapper)
469 {
470 if(m_start.NotEmpty())
471 m_start = mapper(m_start, false);
472 if(m_stop.NotEmpty())
473 m_stop = mapper(m_stop, false);
474 if(m_reading_frame.NotEmpty())
475 m_reading_frame = mapper(m_reading_frame, true);
476 if(m_reading_frame_from_proteins.NotEmpty())
477 m_reading_frame_from_proteins = mapper(m_reading_frame_from_proteins, true);
478 if(m_max_cds_limits.NotEmpty())
479 m_max_cds_limits = mapper(m_max_cds_limits, false);
480 NON_CONST_ITERATE(TPStops, s, m_p_stops) {
481 *s = SPStop(mapper(*s, false), s->m_status);
482 }
483 _ASSERT(Invariant());
484 }
485
CombineWith(const CCDSInfo & another_cds_info)486 void CCDSInfo::CombineWith(const CCDSInfo& another_cds_info)
487 {
488 if (another_cds_info.m_reading_frame.Empty())
489 return;
490
491 CCDSInfo new_cds_info;
492
493 if (m_reading_frame.Empty()) {
494 new_cds_info = another_cds_info;
495 } else {
496 new_cds_info = *this;
497
498 new_cds_info.m_p_stops.insert(new_cds_info.m_p_stops.end(),another_cds_info.m_p_stops.begin(),another_cds_info.m_p_stops.end());
499 sort(new_cds_info.m_p_stops.begin(),new_cds_info.m_p_stops.end());
500 new_cds_info.m_p_stops.erase( unique(new_cds_info.m_p_stops.begin(),new_cds_info.m_p_stops.end()), new_cds_info.m_p_stops.end() ) ;
501
502 new_cds_info.m_reading_frame += another_cds_info.m_reading_frame;
503 new_cds_info.m_reading_frame_from_proteins += another_cds_info.m_reading_frame_from_proteins;
504 new_cds_info.m_max_cds_limits &= another_cds_info.m_max_cds_limits;
505
506 if (new_cds_info.m_confirmed_start && Include(new_cds_info.m_reading_frame_from_proteins,new_cds_info.m_start)) {
507 new_cds_info.m_start = TSignedSeqRange::GetEmpty();
508 new_cds_info.m_confirmed_start = false;
509 }
510 if (another_cds_info.m_confirmed_start && !Include(new_cds_info.m_reading_frame_from_proteins,another_cds_info.m_start)) {
511 new_cds_info.m_start = another_cds_info.m_start;
512 new_cds_info.m_confirmed_start = true;
513 }
514 if (new_cds_info.m_confirmed_start) {
515 if (Include(new_cds_info.m_reading_frame,new_cds_info.m_start)) {
516 if (Precede(new_cds_info.m_start,new_cds_info.m_reading_frame_from_proteins))
517 new_cds_info.m_reading_frame.SetFrom( new_cds_info.m_reading_frame_from_proteins.GetFrom() );
518 else
519 new_cds_info.m_reading_frame.SetTo( new_cds_info.m_reading_frame_from_proteins.GetTo() );
520 }
521 } else {
522 if (new_cds_info.HasStart() && Include(new_cds_info.m_reading_frame,new_cds_info.m_start)) {
523 new_cds_info.m_start = TSignedSeqRange::GetEmpty();
524 }
525 if (another_cds_info.HasStart() && !Include(new_cds_info.m_reading_frame,another_cds_info.m_start)) {
526 new_cds_info.m_start = another_cds_info.m_start;
527 }
528 }
529
530 _ASSERT( !new_cds_info.HasStop() || !another_cds_info.HasStop() || new_cds_info.Stop() == another_cds_info.Stop() );
531 new_cds_info.m_stop += another_cds_info.m_stop;
532 if (another_cds_info.m_confirmed_stop)
533 new_cds_info.m_confirmed_stop = true;
534
535 new_cds_info.SetScore( BadScore(), false );
536 }
537
538 _ASSERT( new_cds_info.Invariant() );
539 *this = new_cds_info;
540 }
541
Strand() const542 int CCDSInfo::Strand() const
543 {
544 int strand = 0; //unknown
545 if (HasStart())
546 strand = Precede(Start(), ReadingFrame()) ? 1 : -1;
547 else if (HasStop())
548 strand = Precede(ReadingFrame(), Stop()) ? 1 : -1;
549 else if (m_max_cds_limits.GetFrom() != TSignedSeqRange::GetWholeFrom())
550 strand = 1;
551 else if (m_max_cds_limits.GetTo() != TSignedSeqRange::GetWholeTo())
552 strand = -1;
553 return strand;
554 }
555
Clip(TSignedSeqRange limits)556 void CCDSInfo::Clip(TSignedSeqRange limits)
557 {
558 if (ReadingFrame().Empty())
559 return;
560
561 m_reading_frame &= limits;
562
563 if (m_reading_frame.Empty()) {
564 Clear();
565 return;
566 }
567
568 m_start &= limits;
569 m_confirmed_start &= HasStart();
570 m_stop &= limits;
571 m_confirmed_stop &= HasStop();
572 m_reading_frame_from_proteins &= limits;
573
574 if (m_max_cds_limits.GetFrom() < limits.GetFrom())
575 m_max_cds_limits.SetFrom(TSignedSeqRange::GetWholeFrom());
576 if (limits.GetTo() < m_max_cds_limits.GetTo())
577 m_max_cds_limits.SetTo(TSignedSeqRange::GetWholeTo());
578
579 for(TPStops::iterator s = m_p_stops.begin(); s != m_p_stops.end(); ) {
580 *s &= limits;
581 if (s->NotEmpty())
582 ++s;
583 else
584 s = m_p_stops.erase(s);
585 }
586
587 SetScore( BadScore(), false );
588
589 _ASSERT( Invariant() );
590 }
591
Cut(TSignedSeqRange hole)592 void CCDSInfo::Cut(TSignedSeqRange hole) {
593 if((Cds() & hole).Empty())
594 return;
595
596 if(Include(hole, Cds())) {
597 Clear();
598 return;
599 }
600
601 if(hole.IntersectingWith(m_start)) {
602 _ASSERT(Include(hole, m_start));
603 m_start = TSignedSeqRange::GetEmpty();
604 m_confirmed_start = false;
605 }
606 if(hole.IntersectingWith(m_stop)) {
607 _ASSERT(Include(hole, m_stop));
608 m_stop = TSignedSeqRange::GetEmpty();
609 m_confirmed_stop = false;
610 }
611
612 if(Include(hole, m_max_cds_limits.GetFrom()))
613 m_max_cds_limits.SetFrom(TSignedSeqRange::GetWholeFrom());
614 if(Include(hole, m_max_cds_limits.GetTo()))
615 m_max_cds_limits.SetTo(TSignedSeqRange::GetWholeTo());
616
617 if(hole.IntersectingWith(m_reading_frame_from_proteins)) {
618 if(hole.GetFrom() <= m_reading_frame_from_proteins.GetFrom())
619 m_reading_frame_from_proteins.SetFrom(hole.GetTo()+1);
620 if(hole.GetTo() >= m_reading_frame_from_proteins.GetTo())
621 m_reading_frame_from_proteins.SetTo(hole.GetFrom()-1);
622 }
623
624 if(hole.IntersectingWith(m_reading_frame)) {
625 if(hole.GetFrom() <= m_reading_frame.GetFrom())
626 m_reading_frame.SetFrom(hole.GetTo()+1);
627 if(hole.GetTo() >= m_reading_frame.GetTo())
628 m_reading_frame.SetTo(hole.GetFrom()-1);
629 }
630
631 ERASE_ITERATE(TPStops, s, m_p_stops) {
632 if(hole.IntersectingWith(*s)) {
633 _ASSERT(Include(hole, *s));
634 VECTOR_ERASE(s, m_p_stops);
635 }
636 }
637
638 SetScore( BadScore(), false );
639 _ASSERT( Invariant() );
640 }
641
Clear()642 void CCDSInfo::Clear()
643 {
644 m_start = m_stop = m_reading_frame = m_reading_frame_from_proteins = m_max_cds_limits = TSignedSeqRange::GetEmpty();
645 m_confirmed_start = false;
646 m_confirmed_stop = false;
647 m_p_stops.clear();
648 SetScore( BadScore(), false );
649 }
650
PStop(bool includeall) const651 bool CCDSInfo::PStop(bool includeall) const
652 {
653 if(includeall)
654 return !m_p_stops.empty();
655
656 ITERATE(TPStops, stp, m_p_stops) {
657 if(stp->m_status != eGenomeNotCorrect && stp->m_status != eSelenocysteine)
658 return true;
659 }
660
661 return false;
662 }
663
RemoveShortHolesAndRescore(const CGnomonEngine & gnomon)664 void CGeneModel::RemoveShortHolesAndRescore(const CGnomonEngine& gnomon) {
665 TExons new_exons;
666 new_exons.push_back(Exons().front());
667 bool modified = false;
668 for(unsigned int i = 1; i < Exons().size(); ++i) {
669 bool hole = !Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice;
670 int intron = Exons()[i].GetFrom()-Exons()[i-1].GetTo()-1;
671 if (hole && intron < gnomon.GetMinIntronLen()) {
672 modified = true;
673 new_exons.back().m_ssplice = Exons()[i].m_ssplice;
674 new_exons.back().AddTo(Exons()[i].GetTo()-Exons()[i-1].GetTo());
675 if(intron%3 != 0) {
676 int len = intron%3;
677 int loc = Exons()[i-1].GetTo() + 1 + (intron-len)/2;
678 FrameShifts().push_back(CInDelInfo(loc, len, CInDelInfo::eIns));
679 }
680 } else {
681 new_exons.push_back(Exons()[i]);
682 }
683 }
684
685 if(modified) {
686 MyExons() = new_exons;
687 sort(FrameShifts().begin(),FrameShifts().end());
688 gnomon.GetScore(*this);
689 }
690 }
691
CdsInvariant(bool check_start_stop) const692 bool CGeneModel::CdsInvariant(bool check_start_stop) const
693 {
694 if (ReadingFrame().Empty())
695 return true;
696
697
698 #ifdef _DEBUG
699 CAlignMap mrnamap(Exons(),FrameShifts(),Strand());
700 CCDSInfo cds_info = GetCdsInfo();
701 if(cds_info.IsMappedToGenome())
702 cds_info = cds_info.MapFromOrigToEdited(mrnamap);
703
704 _ASSERT( !(ConfirmedStart() && OpenCds()) );
705
706 _ASSERT(Include(cds_info.MaxCdsLimits(), cds_info.Cds()));
707 _ASSERT(cds_info.MaxCdsLimits().GetFrom() == TSignedSeqRange::GetWholeFrom() || cds_info.MaxCdsLimits().GetFrom() == cds_info.Cds().GetFrom());
708 _ASSERT(cds_info.MaxCdsLimits().GetTo() == TSignedSeqRange::GetWholeTo() || cds_info.MaxCdsLimits().GetTo() == cds_info.Cds().GetTo());
709
710 if (check_start_stop && Score() != BadScore()) {
711 _ASSERT(cds_info.ReadingFrame().GetFrom() < 3 || (cds_info.HasStart() && cds_info.Start().GetTo()+1 == cds_info.ReadingFrame().GetFrom()));
712 _ASSERT(cds_info.ReadingFrame().GetTo() >= mrnamap.TargetLen()-3 || (cds_info.HasStop() && cds_info.Stop().GetFrom()-1 == cds_info.ReadingFrame().GetTo()));
713 }
714
715 _ASSERT(!cds_info.HasStart() || cds_info.Start().GetLength()==3);
716 _ASSERT(!cds_info.HasStop() || cds_info.Stop().GetLength()==3);
717 _ASSERT(cds_info.Cds().GetLength()%3==0);
718 #endif
719
720 return true;
721 }
722
SetCdsInfo(const CCDSInfo & cds_info)723 void CGeneModel::SetCdsInfo(const CCDSInfo& cds_info)
724 {
725 m_cds_info = cds_info;
726 _ASSERT( CdsInvariant() );
727 }
728
SetCdsInfo(const CGeneModel & a)729 void CGeneModel::SetCdsInfo(const CGeneModel& a)
730 {
731 m_cds_info = a.m_cds_info;
732 _ASSERT( CdsInvariant() );
733 }
734
CombineCdsInfo(const CGeneModel & a,bool ensure_cds_invariant)735 void CGeneModel::CombineCdsInfo(const CGeneModel& a, bool ensure_cds_invariant)
736 {
737 CombineCdsInfo(a.m_cds_info, ensure_cds_invariant);
738 }
739
CombineCdsInfo(const CCDSInfo & cds_info,bool ensure_cds_invariant)740 void CGeneModel::CombineCdsInfo(const CCDSInfo& cds_info, bool ensure_cds_invariant)
741 {
742 _ASSERT( cds_info.ReadingFrame().Empty() || Include(Limits(),cds_info.ReadingFrame()) );
743 //This assert is problem when when cds_info.ReadingFrame() touches a fs ????
744 _ASSERT( cds_info.ReadingFrame().Empty() || FShiftedLen(cds_info.Start()+cds_info.ReadingFrame()+cds_info.Stop(), false)%3==0 );
745
746 m_cds_info.CombineWith(cds_info);
747
748 _ASSERT( !ensure_cds_invariant || CdsInvariant(false) );
749 }
750
CutExons(TSignedSeqRange hole)751 void CGeneModel::CutExons(TSignedSeqRange hole)
752 {
753 if (ReadingFrame().NotEmpty()) {
754 TSignedSeqRange cds_hole = hole;
755 for(int i = 0; i < (int)MyExons().size(); ++i) {
756 if(i > 0 && hole.GetFrom() == MyExons()[i].GetFrom())
757 cds_hole.SetFrom(MyExons()[i].GetTo()+1);
758 if(i < (int)MyExons().size()-1 && hole.GetTo() == MyExons()[i].GetTo())
759 cds_hole.SetTo(MyExons()[i].GetFrom()-1);
760 }
761 m_cds_info.Cut(cds_hole);
762 }
763
764 for(int i = 0; i < (int)MyExons().size(); ++i) {
765 TSignedSeqRange intersection = MyExons()[i].Limits() & hole;
766 if (intersection.Empty())
767 continue;
768 if (MyExons()[i].GetFrom()<hole.GetFrom()) {
769 MyExons()[i].Limits().SetTo(hole.GetFrom()-1);
770 MyExons()[i].m_ssplice = false;
771 MyExons()[i].m_ssplice_sig.clear();
772 if (i+1 < (int)MyExons().size()) {
773 MyExons()[i+1].m_fsplice=false;
774 }
775 } else if (hole.GetTo()<MyExons()[i].GetTo()) {
776 MyExons()[i].Limits().SetFrom(hole.GetTo()+1);
777 MyExons()[i].m_fsplice = false;
778 MyExons()[i].m_fsplice_sig.clear();
779 if (0<i) {
780 MyExons()[i-1].m_ssplice=false;
781 }
782 } else {
783 if (0<i) {
784 MyExons()[i-1].m_ssplice=false;
785 // MyExons()[i-1].m_ssplice_sig.clear();
786 }
787 if (i+1 < (int)MyExons().size()) {
788 MyExons()[i+1].m_fsplice=false;
789 // MyExons()[i+1].m_fsplice_sig.clear();
790 }
791 MyExons().erase( MyExons().begin()+i);
792 --i;
793 }
794 }
795 RecalculateLimits();
796 RemoveExtraFShifts(hole.GetTo()+1, hole.GetFrom()-1);
797 }
798
Clip(TSignedSeqRange clip_limits,EClipMode mode,bool ensure_cds_invariant)799 void CGeneModel::Clip(TSignedSeqRange clip_limits, EClipMode mode, bool ensure_cds_invariant)
800 {
801 if (ReadingFrame().NotEmpty()) {
802 TSignedSeqRange cds_clip_limits = Limits();
803 if (mode == eRemoveExons)
804 cds_clip_limits &= clip_limits;
805 else {
806 TSignedSeqRange intersection;
807 intersection = Exons().front().Limits() & clip_limits;
808 if (intersection.NotEmpty())
809 cds_clip_limits.SetFrom(intersection.GetFrom());
810 intersection = Exons().back().Limits() & clip_limits;
811 if (intersection.NotEmpty())
812 cds_clip_limits.SetTo(intersection.GetTo());
813 }
814
815 TSignedSeqRange precise_cds_clip_limits;
816 ITERATE(TExons, e, Exons())
817 precise_cds_clip_limits += e->Limits() & cds_clip_limits;
818 cds_clip_limits = precise_cds_clip_limits;
819
820 CAlignMap mrnamap(Exons(),FrameShifts(),Strand(),GetCdsInfo().Cds());
821
822 if (RealCdsLimits().GetFrom() < cds_clip_limits.GetFrom() && cds_clip_limits.GetFrom() < ReadingFrame().GetFrom())
823 cds_clip_limits.SetFrom(ReadingFrame().GetFrom());
824 else if (ReadingFrame().GetFrom() < cds_clip_limits.GetFrom() && cds_clip_limits.GetFrom() <= RealCdsLimits().GetTo()) {
825 TSignedSeqRange tmp = mrnamap.ShrinkToRealPoints(TSignedSeqRange(cds_clip_limits.GetFrom(), RealCdsLimits().GetTo()),true);
826 if(tmp.Empty() || tmp.GetFrom() >= ReadingFrame().GetTo())
827 cds_clip_limits = TSignedSeqRange::GetEmpty();
828 else
829 cds_clip_limits.SetFrom(tmp.GetFrom());
830 }
831
832 if( cds_clip_limits.NotEmpty()) {
833 if (RealCdsLimits().GetFrom() <= cds_clip_limits.GetTo() && cds_clip_limits.GetTo() < ReadingFrame().GetTo()) {
834 TSignedSeqRange tmp = mrnamap.ShrinkToRealPoints(TSignedSeqRange(RealCdsLimits().GetFrom(),cds_clip_limits.GetTo()),true);
835 if(tmp.Empty() || ReadingFrame().GetFrom() >= tmp.GetTo())
836 cds_clip_limits = TSignedSeqRange::GetEmpty();
837 else
838 cds_clip_limits.SetTo(tmp.GetTo());
839 } else if (ReadingFrame().GetTo() < cds_clip_limits.GetTo() && cds_clip_limits.GetTo() < RealCdsLimits().GetTo())
840 cds_clip_limits.SetTo(ReadingFrame().GetTo());
841 }
842
843 m_cds_info.Clip(cds_clip_limits);
844 }
845
846 for (TExons::iterator e = MyExons().begin(); e != MyExons().end();) {
847 TSignedSeqRange clip = e->Limits() & clip_limits;
848 if (clip.NotEmpty()) {
849 if(e->GetFrom() <= clip_limits.GetFrom())
850 e->m_fsplice = false;
851 if(e->GetFrom() < clip_limits.GetFrom())
852 e->m_fsplice_sig.clear();
853 if(e->GetTo() >= clip_limits.GetTo())
854 e->m_ssplice = false;
855 if(e->GetTo() > clip_limits.GetTo())
856 e->m_ssplice_sig.clear();
857
858 e++->Limits() = clip;
859 } else if (mode == eRemoveExons)
860 e = MyExons().erase(e);
861 else
862 ++e;
863 }
864
865 if (Exons().size()>0) {
866 MyExons().front().m_fsplice = false;
867 // MyExons().front().m_fsplice_sig.clear();
868 MyExons().back().m_ssplice = false;
869 // MyExons().back().m_ssplice_sig.clear();
870 }
871
872 RecalculateLimits();
873 RemoveExtraFShifts(clip_limits.GetFrom(), clip_limits.GetTo());
874
875 _ASSERT( CdsInvariant(ensure_cds_invariant) );
876 }
877
RemoveExtraFShifts(int left,int right)878 void CGeneModel::RemoveExtraFShifts(int left, int right)
879 {
880 TInDels::const_iterator indl = m_fshifts.begin();
881 TInDels indels;
882 for(CGeneModel::TExons::const_iterator e = Exons().begin(); e != Exons().end() && indl != m_fshifts.end(); ++e) {
883 if(e->Limits().Empty())
884 continue;
885
886 for( ;indl != m_fshifts.end() && indl->InDelEnd() < (indl->IsDeletion() ? e->GetFrom() : e->GetFrom()+1); ++indl); // skip indels on the left
887 for( ;indl != m_fshifts.end() && e->GetFrom() == left && indl->IsDeletion() && indl->Loc() == left; ++indl); // skip flanking deletion if clipped
888
889 if(indl != m_fshifts.end() && indl->Loc() < e->GetFrom()) { // clip long mismatch
890 _ASSERT(indl->IsMismatch());
891 int extra = e->GetFrom()-indl->Loc();
892 indels.push_back(CInDelInfo(e->GetFrom(),indl->Len()-extra,CInDelInfo::eMism,indl->GetInDelV().substr(extra)));
893 ++indl;
894 }
895
896 for( ; indl != m_fshifts.end() && indl->Loc() <= e->GetTo(); ++indl) // include all internal indels
897 indels.push_back(*indl);
898
899 for( ; indl != m_fshifts.end() && e->GetTo() != right && indl->IsDeletion() && indl->Loc() == e->GetTo()+1; ++indl) // include flanking deletion if not cliped
900 indels.push_back(*indl);
901
902 if(!indels.empty() && indels.back().InDelEnd() > e->GetTo()+1) { // clip long mismatch
903 _ASSERT(indels.back().IsMismatch());
904 int extra = indels.back().InDelEnd()-e->GetTo()-1;
905 indels.back() = CInDelInfo(indels.back().Loc(),indels.back().Len()-extra,CInDelInfo::eMism,indels.back().GetInDelV().substr(0,indels.back().Len()-extra));
906 }
907 }
908
909 m_fshifts = indels;
910 }
911
912
FrameShifts(TSignedSeqPos a,TSignedSeqPos b) const913 TInDels CGeneModel::FrameShifts(TSignedSeqPos a, TSignedSeqPos b) const
914 {
915 TInDels fshifts;
916
917 ITERATE(TInDels, i, m_fshifts) {
918 if(i->IntersectingWith(a,b))
919 fshifts.push_back( *i );
920 }
921
922 return fshifts;
923 }
924
925
FShiftedLen(TSignedSeqRange ab,bool withextras) const926 int CGeneModel::FShiftedLen(TSignedSeqRange ab, bool withextras) const
927 {
928 if (ab.Empty())
929 return 0;
930
931 _ASSERT(Include(Limits(),ab));
932
933 return GetAlignMap().FShiftedLen(ab, withextras);
934 }
935
FShiftedMove(TSignedSeqPos pos,int len) const936 TSignedSeqPos CGeneModel::FShiftedMove(TSignedSeqPos pos, int len) const
937 {
938 return GetAlignMap().FShiftedMove(pos, len);
939 }
940
941
MutualExtension(const CGeneModel & a) const942 int CGeneModel::MutualExtension(const CGeneModel& a) const
943 {
944 if(Strand() != a.Strand())
945 return 0;
946 const TSignedSeqRange limits = Limits();
947 const TSignedSeqRange alimits = a.Limits();
948
949 if((Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) == 0 && (a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) == 0) {
950 const int intersection = (limits & alimits).GetLength();
951 if(intersection==0 || intersection==limits.GetLength() || intersection==alimits.GetLength())
952 return 0;
953 return isCompatible(a);
954 } else if((Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) == 0) {
955 if(a.Status()&CGeneModel::eRightFlexible)
956 return alimits.GetFrom() < limits.GetFrom() && alimits.GetTo() > limits.GetFrom();
957 else
958 return alimits.GetTo() > limits.GetTo() && alimits.GetFrom() < limits.GetTo();
959 } else if((a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) == 0) {
960 if(Status()&CGeneModel::eRightFlexible)
961 return limits.GetFrom() < alimits.GetFrom() && limits.GetTo() > alimits.GetFrom();
962 else
963 return limits.GetTo() > alimits.GetTo() && limits.GetFrom() < alimits.GetTo();
964 } else if((Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) != (a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))) {
965 return 0;
966 } else {
967 return limits.IntersectingWith(alimits) && limits != alimits;
968 }
969 }
970
isCompatible(const CGeneModel & a) const971 int CGeneModel::isCompatible(const CGeneModel& a) const
972 {
973 const CGeneModel& b = *this; // shortcut to this alignment
974
975 _ASSERT( b.Strand() == a.Strand() || ((a.Status()&CGeneModel::eUnknownOrientation) != 0 && (b.Status()&CGeneModel::eUnknownOrientation) != 0));
976
977 TSignedSeqRange intersect(a.Limits() & b.Limits());
978 if(intersect.GetLength() <= 1) return 0; // intersection with 1 base is not legit
979
980 int anum = a.Exons().size()-1; // exon containing left point or first exon on the left
981 for(; intersect.GetFrom() < a.Exons()[anum].GetFrom() ; --anum);
982 bool aexon = intersect.GetFrom() <= a.Exons()[anum].GetTo();
983 if(!aexon && intersect.GetTo() < a.Exons()[anum+1].GetFrom()) return 0; // b is in intron
984
985 int bnum = b.Exons().size()-1; // exon containing left point or first exon on the left
986 for(; intersect.GetFrom() < b.Exons()[bnum].GetFrom(); --bnum);
987 bool bexon = intersect.GetFrom() <= b.Exons()[bnum].GetTo();
988 if(!bexon && intersect.GetTo() < b.Exons()[bnum+1].GetFrom()) return 0; // a is in intron
989
990 TSignedSeqPos left = intersect.GetFrom();
991 TSignedSeqPos afirst = -1;
992 TSignedSeqPos bfirst = -1;
993 int commonspl = 0;
994 int firstcommonpoint = -1;
995
996 auto_ptr<CAlignMap> pmapa(0), pmapb(0);
997
998 while(left <= intersect.GetTo())
999 {
1000 TSignedSeqPos aright = aexon ? a.Exons()[anum].GetTo() : a.Exons()[anum+1].GetFrom()-1;
1001 TSignedSeqPos bright = bexon ? b.Exons()[bnum].GetTo() : b.Exons()[bnum+1].GetFrom()-1;
1002 TSignedSeqPos right = min(aright,bright);
1003
1004 if(!aexon && bexon)
1005 {
1006 if(a.Exons()[anum].m_ssplice && a.Exons()[anum+1].m_fsplice) return 0; // intron has both splices
1007 if(left == a.Exons()[anum].GetTo()+1 && a.Exons()[anum].m_ssplice) return 0; // intron has left splice and == left
1008 if(right == aright && a.Exons()[anum+1].m_fsplice) return 0; // intron has right splice and == right
1009 }
1010 if(aexon && !bexon)
1011 {
1012 if(b.Exons()[bnum].m_ssplice && b.Exons()[bnum+1].m_fsplice) return 0;
1013 if(left == b.Exons()[bnum].GetTo()+1 && b.Exons()[bnum].m_ssplice) return 0;
1014 if(right == bright && b.Exons()[bnum+1].m_fsplice) return 0;
1015 }
1016
1017 if(aexon && bexon) {
1018 if(firstcommonpoint < 0) {
1019 firstcommonpoint = left;
1020 } else if((anum > 0 && left == a.Exons()[anum].GetFrom() && !a.Exons()[anum].m_fsplice) || // end of a-hole
1021 (bnum > 0 && left == b.Exons()[bnum].GetFrom() && !b.Exons()[bnum].m_fsplice)) { // end of b-hole
1022
1023 if(afirst < 0) {
1024 pmapa.reset(new CAlignMap(a.Exons(),a.FrameShifts(),a.Strand()));
1025 pmapb.reset(new CAlignMap(b.Exons(),b.FrameShifts(),b.Strand()));
1026 afirst = pmapa->MapOrigToEdited(firstcommonpoint);
1027 bfirst = pmapb->MapOrigToEdited(firstcommonpoint);
1028 if(afirst < 0 || bfirst < 0) return 0;
1029 }
1030
1031 TSignedSeqPos asecond = pmapa->MapOrigToEdited(left);
1032 TSignedSeqPos bsecond = pmapb->MapOrigToEdited(left);
1033 if(asecond < 0 || bsecond < 0 || (asecond-afirst)%3 != (bsecond-bfirst)%3) return 0;
1034 }
1035 }
1036
1037 if(aexon && aright == right && a.Exons()[anum].m_ssplice &&
1038 bexon && bright == right && b.Exons()[bnum].m_ssplice) ++commonspl;
1039 if(!aexon && aright == right && a.Exons()[anum+1].m_fsplice &&
1040 !bexon && bright == right && b.Exons()[bnum+1].m_fsplice) ++commonspl;
1041
1042 left = right+1;
1043
1044 if(right == aright)
1045 {
1046 aexon = !aexon;
1047 if(aexon) ++anum;
1048 }
1049
1050 if(right == bright)
1051 {
1052 bexon = !bexon;
1053 if(bexon) ++bnum;
1054 }
1055 }
1056
1057 return firstcommonpoint >= 0 ? commonspl+1 : 0;
1058 }
1059
IsSubAlignOf(const CGeneModel & a) const1060 bool CGeneModel::IsSubAlignOf(const CGeneModel& a) const
1061 {
1062 if(!Include(a.Limits(),Limits()) || !isCompatible(a))
1063 return false;
1064
1065 for(unsigned int i = 1; i < a.Exons().size(); ++i) {
1066 if (!a.Exons()[i-1].m_ssplice || !a.Exons()[i].m_fsplice){
1067 TSignedSeqRange hole(a.Exons()[i-1].GetTo()+1, a.Exons()[i].GetFrom()-1);
1068 ITERATE(CGeneModel::TExons, k, Exons()) {
1069 if(k->Limits().IntersectingWith(hole)) {
1070 return false;
1071 }
1072 }
1073 }
1074 }
1075
1076 return true;
1077 }
1078
1079
AddExon(TSignedSeqRange exon_range,const string & fs,const string & ss,double ident,const string & seq,const CInDelInfo::SSource & src)1080 void CGeneModel::AddExon(TSignedSeqRange exon_range, const string& fs, const string& ss, double ident, const string& seq, const CInDelInfo::SSource& src)
1081 {
1082 _ASSERT( (m_range & exon_range).Empty() );
1083 m_range += exon_range;
1084
1085 CModelExon e(exon_range.GetFrom(),exon_range.GetTo(),false,false,fs,ss,ident,seq,src);
1086 if (MyExons().empty())
1087 MyExons().push_back(e);
1088 else if ((exon_range.Empty() || MyExons().back().Limits().Empty()) || MyExons().back().GetTo() < exon_range.GetFrom()) {
1089 if (!m_expecting_hole) {
1090 MyExons().back().m_ssplice = true;
1091 e.m_fsplice = true;
1092 }
1093 MyExons().push_back(e);
1094 } else {
1095 if (!m_expecting_hole) {
1096 MyExons().front().m_fsplice = true;
1097 e.m_ssplice = true;
1098 }
1099 MyExons().insert(MyExons().begin(),e);
1100 }
1101 m_expecting_hole = false;
1102 }
1103
AddHole()1104 void CGeneModel::AddHole()
1105 {
1106 m_expecting_hole = true;
1107 }
1108
AddGgapExon(double ident,const string & seq,const CInDelInfo::SSource & src,bool infront)1109 void CGeneModel::AddGgapExon(double ident, const string& seq, const CInDelInfo::SSource& src, bool infront)
1110 {
1111 _ASSERT(MyExons().empty() || !m_expecting_hole);
1112
1113 TSignedSeqRange exon_range(TSignedSeqRange::GetEmpty());
1114 CModelExon e(exon_range.GetFrom(),exon_range.GetTo(),false,false,"","",ident,seq,src);
1115 if (MyExons().empty())
1116 MyExons().push_back(e);
1117 else if (!infront) {
1118 _ASSERT(MyExons().back().Limits().NotEmpty());
1119 MyExons().back().m_ssplice = true;
1120 e.m_fsplice = true;
1121 e.m_fsplice_sig = "XX";
1122 MyExons().push_back(e);
1123 } else {
1124 _ASSERT(MyExons().front().Limits().NotEmpty());
1125 MyExons().front().m_fsplice = true;
1126 e.m_ssplice = true;
1127 e.m_ssplice_sig = "XX";
1128 MyExons().insert(MyExons().begin(),e);
1129 }
1130 m_expecting_hole = false;
1131 }
1132
AddNormalExon(TSignedSeqRange exon_range,const string & fs,const string & ss,double ident,bool infront)1133 void CGeneModel::AddNormalExon(TSignedSeqRange exon_range, const string& fs, const string& ss, double ident, bool infront)
1134 {
1135 _ASSERT( (m_range & exon_range).Empty() );
1136 m_range += exon_range;
1137
1138 CModelExon e(exon_range.GetFrom(),exon_range.GetTo(),false,false,fs,ss,ident);
1139 if (MyExons().empty())
1140 MyExons().push_back(e);
1141 else if (!infront) {
1142 if (!m_expecting_hole) {
1143 MyExons().back().m_ssplice = true;
1144 if(MyExons().back().Limits().Empty())
1145 MyExons().back().m_ssplice_sig = "XX";
1146 e.m_fsplice = true;
1147 }
1148 MyExons().push_back(e);
1149 } else {
1150 if (!m_expecting_hole) {
1151 MyExons().front().m_fsplice = true;
1152 if(MyExons().front().Limits().Empty())
1153 MyExons().front().m_fsplice_sig = "XX";
1154 e.m_ssplice = true;
1155 }
1156 MyExons().insert(MyExons().begin(),e);
1157 }
1158 m_expecting_hole = false;
1159 }
1160
Extend(const CModelExon & e)1161 void CModelExon::Extend(const CModelExon& e)
1162 {
1163 // spliced should include the other
1164 _ASSERT(
1165 (!m_fsplice || GetFrom() <= e.GetFrom()) &&
1166 (!e.m_fsplice || GetFrom() >= e.GetFrom()) &&
1167 (!m_ssplice || e.GetTo() <= GetTo()) &&
1168 (!e.m_ssplice || e.GetTo() >= GetTo())
1169 );
1170
1171 Limits().CombineWith(e.Limits());
1172 m_fsplice =m_fsplice || e.m_fsplice;
1173 m_ssplice =m_ssplice || e.m_ssplice;
1174 if(e.m_fsplice && !e.m_fsplice_sig.empty())
1175 m_fsplice_sig = e.m_fsplice_sig;
1176 if(e.m_ssplice && !e.m_ssplice_sig.empty())
1177 m_ssplice_sig = e.m_ssplice_sig;
1178 }
1179
TrimEdgesToFrameInOtherAlignGaps(const TExons & exons_with_gaps)1180 void CGeneModel::TrimEdgesToFrameInOtherAlignGaps(const TExons& exons_with_gaps)
1181 {
1182 if(Exons().empty())
1183 return;
1184
1185 TSignedSeqPos left_edge = Limits().GetFrom();
1186 TSignedSeqPos right_edge = Limits().GetTo();
1187 CAlignMap mrnamap(Exons(), FrameShifts(), ePlus); // 'positive' alignmap
1188
1189 for (int i = 0; i<int(exons_with_gaps.size())-1; ++i) {
1190 if (exons_with_gaps[i].GetTo() < left_edge && left_edge < exons_with_gaps[i+1].GetFrom()) {
1191 _ASSERT( !exons_with_gaps[i].m_ssplice && !exons_with_gaps[i+1].m_fsplice );
1192 _ASSERT( right_edge >= exons_with_gaps[i+1].GetFrom() );
1193 TSignedSeqRange tlim = mrnamap.MapRangeOrigToEdited(TSignedSeqRange(left_edge,exons_with_gaps[i+1].GetFrom()), CAlignMap::eLeftEnd, CAlignMap::eSinglePoint);
1194 _ASSERT(tlim.NotEmpty());
1195 int del = (tlim.GetLength()-1)%3;
1196 if(del > 0) {
1197 left_edge = -1;
1198 for(int tpos = tlim.GetFrom()+del; left_edge < 0 && tpos <= tlim.GetTo(); tpos += 3) {
1199 left_edge = mrnamap.MapEditedToOrig(tpos);
1200 }
1201 _ASSERT(left_edge > 0);
1202 CutExons(TSignedSeqRange(Limits().GetFrom(),left_edge-1));
1203 }
1204 }
1205
1206 if (exons_with_gaps[i].GetTo() < right_edge && right_edge < exons_with_gaps[i+1].GetFrom()) {
1207 _ASSERT( !exons_with_gaps[i].m_ssplice && !exons_with_gaps[i+1].m_fsplice );
1208 _ASSERT( left_edge <= exons_with_gaps[i].GetTo() );
1209 TSignedSeqRange tlim = mrnamap.MapRangeOrigToEdited(TSignedSeqRange(exons_with_gaps[i].GetTo(),right_edge), CAlignMap::eSinglePoint, CAlignMap::eRightEnd);
1210 _ASSERT(tlim.NotEmpty());
1211 int del = (tlim.GetLength()-1)%3;
1212 if(del > 0) {
1213 right_edge = -1;
1214 for(int tpos = tlim.GetTo()-del; right_edge < 0 && tpos >= tlim.GetFrom(); tpos -= 3) {
1215 right_edge = mrnamap.MapEditedToOrig(tpos);
1216 }
1217 _ASSERT(right_edge > 0);
1218 CutExons(TSignedSeqRange(right_edge+1, Limits().GetTo()));
1219 }
1220 }
1221 }
1222 }
1223
Extend(const CGeneModel & align,bool ensure_cds_invariant)1224 void CGeneModel::Extend(const CGeneModel& align, bool ensure_cds_invariant)
1225 {
1226 _ASSERT( align.Strand() == Strand() );
1227
1228 CGeneModel other_align = align;
1229
1230 if ( !other_align.Continuous() )
1231 this->TrimEdgesToFrameInOtherAlignGaps(other_align.Exons());
1232 if ( !this->Continuous() )
1233 other_align.TrimEdgesToFrameInOtherAlignGaps(this->Exons());
1234
1235 TExons a = MyExons();
1236 TExons b = other_align.Exons();
1237
1238 MyExons().clear();
1239
1240 size_t i,j;
1241 for ( i=0,j=0; i<a.size() || j<b.size(); ) {
1242 if (i==a.size())
1243 MyExons().push_back(b[j++]);
1244 else if (j==b.size())
1245 MyExons().push_back(a[i++]);
1246 else if (a[i].GetTo()+1<b[j].GetFrom())
1247 MyExons().push_back(a[i++]);
1248 else if (b[j].GetTo()+1<a[i].GetFrom())
1249 MyExons().push_back(b[j++]);
1250 else {
1251 b[j].Extend(a[i++]);
1252 while (j+1<b.size() && b[j].GetTo()+1>=b[j+1].GetFrom()) {
1253 b[j+1].Extend(b[j]);
1254 ++j;
1255 }
1256 }
1257 if(!MyExons().empty())
1258 MyExons().back().m_ident = 0;
1259 }
1260
1261 RecalculateLimits();
1262
1263 m_fshifts.insert(m_fshifts.end(),align.m_fshifts.begin(),align.m_fshifts.end());
1264 if(!m_fshifts.empty()) {
1265 sort(m_fshifts.begin(),m_fshifts.end());
1266 m_fshifts.erase( unique(m_fshifts.begin(),m_fshifts.end()), m_fshifts.end() );
1267 }
1268 #ifdef _DEBUG
1269 ITERATE(TInDels, indl, m_fshifts) {
1270 if(indl->IsDeletion())
1271 continue;
1272 TInDels::const_iterator next = indl;
1273 if(++next != m_fshifts.end())
1274 _ASSERT(indl->InDelEnd() <= next->Loc());
1275 }
1276 #endif
1277
1278 SetType(Type() | ((CGeneModel::eProt|CGeneModel::eSR|CGeneModel::eEST|CGeneModel::emRNA|CGeneModel::eNotForChaining)&align.Type()));
1279 if(align.ReadingFrame().NotEmpty())
1280 CombineCdsInfo(align, ensure_cds_invariant);
1281 }
1282
RecalculateLimits()1283 void CGeneModel::RecalculateLimits()
1284 {
1285 if (Exons().empty()) {
1286 m_range = TSignedSeqRange::GetEmpty();
1287 } else {
1288 int num = Exons().size();
1289 if(Exons()[0].Limits().NotEmpty()) {
1290 m_range.SetFrom(Exons()[0].GetFrom());
1291 } else {
1292 _ASSERT(num > 1 && Exons()[1].Limits().NotEmpty());
1293 m_range.SetFrom(Exons()[1].GetFrom());
1294 }
1295 if(Exons()[num-1].Limits().NotEmpty()) {
1296 m_range.SetTo(Exons()[num-1].GetTo());
1297 } else {
1298 _ASSERT(num > 1 && Exons()[num-2].Limits().NotEmpty());
1299 m_range.SetTo(Exons()[num-2].GetTo());
1300 }
1301 }
1302 }
1303
1304
ExtendLeft(int amount)1305 void CGeneModel::ExtendLeft(int amount)
1306 {
1307 _ASSERT(amount>=0);
1308 MyExons().front().AddFrom(-amount);
1309 RecalculateLimits();
1310 }
1311
ExtendRight(int amount)1312 void CGeneModel::ExtendRight(int amount)
1313 {
1314 _ASSERT(amount>=0);
1315 MyExons().back().AddTo(amount);
1316 RecalculateLimits();
1317 }
1318
AlignLen() const1319 int CGeneModel::AlignLen() const
1320 {
1321 return FShiftedLen(Limits(), false);
1322 }
1323
RealCdsLimits() const1324 TSignedSeqRange CGeneModel::RealCdsLimits() const
1325 {
1326 TSignedSeqRange cds = MaxCdsLimits();
1327 if (HasStart()) {
1328 if (Strand()==ePlus)
1329 cds.SetFrom(GetCdsInfo().Start().GetFrom());
1330 else
1331 cds.SetTo(GetCdsInfo().Start().GetTo());
1332 }
1333 return cds;
1334 }
1335
RealCdsLen() const1336 int CGeneModel::RealCdsLen() const
1337 {
1338 return FShiftedLen(RealCdsLimits(), false);
1339 }
1340
MaxCdsLimits() const1341 TSignedSeqRange CGeneModel::MaxCdsLimits() const
1342 {
1343 if (ReadingFrame().Empty())
1344 return TSignedSeqRange::GetEmpty();
1345
1346 return GetCdsInfo().MaxCdsLimits() & Limits();
1347 }
1348
1349 template< class T>
1350 class CStreamState {
1351 #ifdef HAVE_IOS_REGISTER_CALLBACK
1352 public:
CStreamState(const T & deflt)1353 CStreamState(const T& deflt) : m_deflt(deflt), m_index( ios_base::xalloc() ) {}
1354
slot(ios_base & iob)1355 T& slot(ios_base& iob)
1356 {
1357 void *&p = iob.pword(m_index);
1358 T *c = static_cast<T*>(p);
1359 if (c == NULL) {
1360 p = c = new T(m_deflt);
1361 iob.register_callback(ios_callback, m_index);
1362 }
1363 return *c;
1364 }
1365
1366 private:
ios_callback(ios_base::event e,ios_base & iob,int index)1367 static void ios_callback(ios_base::event e, ios_base& iob, int index)
1368 {
1369 if (e == ios_base::erase_event) {
1370 delete static_cast<T*>(iob.pword(index));
1371 } else if (e == ios_base::copyfmt_event) {
1372 void *&p = iob.pword(index);
1373 try {
1374 T *old = static_cast<T*>(p);
1375 p = new T(*old);
1376 } catch (...) {
1377 p = NULL;
1378 }
1379 }
1380 }
1381
1382 T m_deflt;
1383 int m_index;
1384 #else
1385 // Crude approximation for use by older compilers (notably GCC 2.95).
1386 // In particular, this implementation has no way to find out when
1387 // anything happens to the streams it tracks, so it leaks some
1388 // amount of memory and disregards the possibility that multiple
1389 // distinct tracked streams have the same address in sequence. :-/
1390 public:
1391 CStreamState(const T& deflt) : m_deflt(deflt) {}
1392
1393 T& slot(IOS_BASE& iob)
1394 {
1395 TMap::iterator it = m_map.find(&iob);
1396 if (it == m_map.end()) {
1397 it = m_map.insert(make_pair(&iob, m_deflt)).first;
1398 }
1399 return it->second;
1400 }
1401
1402 private:
1403 typedef map<IOS_BASE*, T> TMap;
1404
1405 T m_deflt;
1406 TMap m_map;
1407 #endif
1408 };
1409
PolyALen() const1410 int CAlignModel::PolyALen() const {
1411 if((Status()&ePolyA) == 0)
1412 return 0;
1413
1414 TSignedSeqRange lim = GetAlignMap().MapRangeOrigToEdited(GetAlignMap().ShrinkToRealPoints(Limits()),false);
1415 if((Status()&eReversed) == 0) {
1416 return TargetLen()-lim.GetTo()-1;
1417 } else {
1418 return lim.GetFrom();
1419 }
1420 }
1421
1422 CStreamState<pair<string,string> > line_buffer(make_pair(kEmptyStr,kEmptyStr));
1423
Getline(CNcbiIstream & is,string & line)1424 CNcbiIstream& Getline(CNcbiIstream& is, string& line)
1425 {
1426 if (!line_buffer.slot(is).first.empty()) {
1427 line = line_buffer.slot(is).first;
1428 line_buffer.slot(is).first.erase();
1429 } else {
1430 NcbiGetlineEOL(is, line);
1431 }
1432 line_buffer.slot(is).second = line;
1433 return is;
1434 }
1435
Ungetline(CNcbiIstream & is)1436 void Ungetline(CNcbiIstream& is)
1437 {
1438 line_buffer.slot(is).first = line_buffer.slot(is).second;
1439 is.clear();
1440 }
1441
InputError(CNcbiIstream & is)1442 CNcbiIstream& InputError(CNcbiIstream& is)
1443 {
1444 is.clear();
1445 ERR_POST( Error << "Input error. Last line: " << line_buffer.slot(is).second );
1446 Ungetline(is);
1447 is.setstate(ios::failbit);
1448 return is;
1449 }
1450
1451 CStreamState<string> contig_stream_state(kEmptyStr);
1452
operator <<(CNcbiOstream & s,const setcontig & c)1453 CNcbiOstream& operator<<(CNcbiOstream& s, const setcontig& c)
1454 {
1455 contig_stream_state.slot(s) = c.m_contig; return s;
1456 }
operator >>(CNcbiIstream & is,const getcontig & c)1457 CNcbiIstream& operator>>(CNcbiIstream& is, const getcontig& c)
1458 {
1459 c.m_contig = contig_stream_state.slot(is); return is;
1460 }
1461
1462 enum EModelFileFormat { eGnomonFileFormat, eGFF3FileFormat, eASNFileFormat };
1463 NCBI_XALGOGNOMON_EXPORT CNcbiOstream& operator<<(CNcbiOstream& s, EModelFileFormat f);
1464
1465 CStreamState<EModelFileFormat> model_file_format_state(eGFF3FileFormat);
1466
operator <<(CNcbiOstream & s,EModelFileFormat f)1467 CNcbiOstream& operator<<(CNcbiOstream& s, EModelFileFormat f)
1468 {
1469 model_file_format_state.slot(s) = f; return s;
1470 }
1471
1472 struct SGFFrec {
SGFFrecSGFFrec1473 SGFFrec() : start(-1), end(-1), score(BadScore()), strand('.'), phase(-1), model(0), tstart(-1), tend(-2), tstrand('+') {}
1474 string seqid;
1475 string source;
1476 string type;
1477 int start;
1478 int end;
1479 double score;
1480 char strand;
1481 int phase;
1482 Int8 model;
1483 int tstart;
1484 int tend;
1485 char tstrand;
1486 map<string,string> attributes;
1487
1488 void print(CNcbiOstream& os) const;
1489 };
1490
operator <<(CNcbiOstream & os,const SGFFrec & r)1491 CNcbiOstream& operator<<(CNcbiOstream& os, const SGFFrec& r)
1492 {
1493 r.print(os); return os;
1494 }
1495
1496 namespace {
1497 const char* dot = ".";
1498 }
1499
print(CNcbiOstream & os) const1500 void SGFFrec::print(CNcbiOstream& os) const
1501 {
1502
1503 os << (seqid.empty()?dot:seqid) << '\t';
1504 os << (source.empty()?dot:source) << '\t';
1505 os << (type.empty()?dot:type) << '\t';
1506 if(start >= 0)
1507 os << (start+1) << '\t';
1508 else
1509 os << "-\t";
1510 if(end >= 0)
1511 os << (end+1) << '\t';
1512 else
1513 os << "-\t";
1514 if (score == BadScore()) os << dot; else os << score; os << '\t';
1515 os << strand << '\t';
1516 if (phase < 0) os << dot; else os << phase; os << '\t';
1517
1518 os << "model=" << model;
1519 for (map<string,string>::const_iterator i = attributes.begin(); i != attributes.end(); ++i ) {
1520 if (!i->second.empty())
1521 os << ';' << i->first << '=' << i->second;
1522 }
1523
1524 os << '\n';
1525 }
1526
operator >>(CNcbiIstream & is,SGFFrec & res)1527 CNcbiIstream& operator>>(CNcbiIstream& is, SGFFrec& res)
1528 {
1529 string line;
1530 do {
1531 Getline(is, line);
1532 } while (is && (line.empty() || NStr::StartsWith(line, "#")));
1533 if (!is) {
1534 return is;
1535 }
1536
1537 vector<string> v;
1538 NStr::Split(line, "\t", v);
1539 if (v.size()!=9)
1540 return InputError(is);
1541 SGFFrec rec;
1542 try {
1543 rec.seqid = v[0];
1544 rec.source = v[1];
1545 rec.type = v[2];
1546 rec.start = -1;
1547 if(v[3] != "-")
1548 rec.start = NStr::StringToNumeric<int>(v[3])-1;
1549 rec.end = -1;
1550 if(v[4] != "-")
1551 rec.end = NStr::StringToNumeric<int>(v[4])-1;
1552 rec.score = v[5]==dot?BadScore():NStr::StringToNumeric<double>(v[5]);
1553 rec.strand = v[6][0];
1554 rec.phase = v[7]==dot?-1:NStr::StringToNumeric<int>(v[7]);
1555 } catch (...) {
1556 return InputError(is);
1557 }
1558 bool model_id_present = false;
1559 vector<string> attributes;
1560 NStr::Split(v[8], ";", attributes, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1561 for (size_t i = 0; i < attributes.size(); ++i) {
1562 string key, value;
1563 if (NStr::SplitInTwo(attributes[i], "=", key, value) && !value.empty()) {
1564 if (key == "model") {
1565 rec.model = NStr::StringToNumeric<Int8>(value);
1566 model_id_present = true;
1567 } else if(key == "Target") {
1568 vector<string> tt;
1569 NStr::Split(value, " \t", tt, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1570 rec.tstart = NStr::StringToNumeric<int>(tt[1])-1;
1571 rec.tend = NStr::StringToNumeric<int>(tt[2])-1;
1572 if(tt.size() > 3 && tt[3] == "-")
1573 rec.tstrand = '-';
1574 rec.attributes[key] = tt[0];
1575 } else
1576 rec.attributes[key] = value;
1577 }
1578 }
1579 if (!model_id_present)
1580 return InputError(is);
1581 res = rec;
1582 return is;
1583 }
1584
BuildGFF3Gap(int & prev_pos,const CInDelInfo & indel)1585 string BuildGFF3Gap(int& prev_pos, const CInDelInfo& indel) {
1586 string gap;
1587 string status;
1588 if(indel.GetStatus() == CInDelInfo::eGenomeCorrect)
1589 status = "c";
1590 else if(indel.GetStatus() == CInDelInfo::eGenomeNotCorrect)
1591 status = "n";
1592 if (prev_pos < indel.Loc())
1593 gap += " M"+NStr::NumericToString(indel.Loc()-prev_pos);
1594 if(indel.IsInsertion()) {
1595 gap += string(" ")+status+"D"+NStr::NumericToString(indel.Len());
1596 } else if(indel.IsDeletion()) {
1597 gap += string(" ")+status+"I"+indel.GetInDelV();
1598 } else {
1599 gap += string(" ")+status+"R"+indel.GetInDelV();
1600 }
1601
1602 prev_pos = indel.InDelEnd();
1603
1604 return gap;
1605 }
1606
BuildGFF3Gap(int start,int end,const TInDels & indels)1607 string BuildGFF3Gap(int start, int end, const TInDels& indels)
1608 {
1609 string gap;
1610
1611 int prev_pos = start;
1612 ITERATE (TInDels, indelp, indels) {
1613 const CInDelInfo& indel = *indelp;
1614 if(indel.Loc() < start)
1615 continue;
1616 if(indel.InDelEnd() > end+1)
1617 break;
1618
1619 _ASSERT( indel.IsDeletion() || (start <=indel.Loc() && indel.Loc()+indel.Len()-1 <= end));
1620 gap += BuildGFF3Gap(prev_pos, indel);
1621 }
1622 if (!gap.empty()) {
1623 gap.erase(0,1);
1624 if (prev_pos < end+1)
1625 gap += " M"+NStr::NumericToString(end+1-prev_pos);
1626 }
1627
1628 return gap;
1629 }
1630
readGFF3Gap(const string & gap,int start,int end,TInDels & indels)1631 void readGFF3Gap(const string& gap, int start, int end, TInDels& indels)
1632 {
1633 if (gap.empty())
1634 return;
1635 vector<string> operations;
1636 NStr::Split(gap, " ", operations, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1637 TSignedSeqPos loc = start;
1638 NON_CONST_ITERATE(vector<string>, o, operations) {
1639 CInDelInfo::EStatus status = CInDelInfo::eUnknown;
1640 if((*o)[0] == 'c') {
1641 status = CInDelInfo::eGenomeCorrect;
1642 o->erase(o->begin());
1643 } else if((*o)[0] == 'n') {
1644 status = CInDelInfo::eGenomeNotCorrect;
1645 o->erase(o->begin());
1646 }
1647 int len = NStr::StringToNumeric<int>(*o,NStr::fConvErr_NoThrow|NStr::fAllowLeadingSymbols);
1648 if ((*o)[0] == 'M') {
1649 loc += len;
1650 } else if ((*o)[0] == 'D') {
1651 indels.push_back(CInDelInfo(loc,len,CInDelInfo::eIns));
1652 indels.back().SetStatus(status);
1653 loc += len;
1654 } else if ((*o)[0] == 'I') {
1655 string seq;
1656 len = o->length()-1;
1657 seq = o->substr(1);
1658 indels.push_back(CInDelInfo(loc,len,CInDelInfo::eDel,seq));
1659 indels.back().SetStatus(status);
1660 } else {
1661 _ASSERT((*o)[0] == 'R');
1662 string seq;
1663 len = o->length()-1;
1664 seq = o->substr(1);
1665 indels.push_back(CInDelInfo(loc,len,CInDelInfo::eMism,seq));
1666 indels.back().SetStatus(status);
1667 loc += len;
1668 }
1669 }
1670 _ASSERT( loc == end+1 );
1671 }
1672
TypeToString(int type)1673 string CGeneModel::TypeToString(int type)
1674 {
1675 if ((type & eGnomon)!=0) return "Gnomon";
1676 if ((type & eChain)!=0) return "Chainer";
1677 if ((type & eProt)!=0) return "ProSplign";
1678 if ((type & (eSR|eEST|emRNA|eNotForChaining))!=0) return "Splign";
1679 return "Unknown";
1680 }
1681
CollectAttributes(const CAlignModel & a,map<string,string> & attributes)1682 void CollectAttributes(const CAlignModel& a, map<string,string>& attributes)
1683 {
1684 attributes["ID"] = NStr::NumericToString(a.ID());
1685 if (a.GeneID()!=0)
1686 attributes["Parent"] = "gene"+NStr::NumericToString(a.GeneID());
1687 if (a.RankInGene()!=0)
1688 attributes["rankInGene"] = NStr::NumericToString(a.RankInGene());
1689
1690 ITERATE(CSupportInfoSet, i, a.Support()) {
1691 attributes["support"] += ",";
1692 if(i->IsCore())
1693 attributes["support"] += "*";
1694
1695 attributes["support"] += NStr::NumericToString(i->GetId());
1696 }
1697 attributes["support"].erase(0,1);
1698
1699 list<string> tp;
1700 ITERATE(list< CRef<CSeq_id> >, i, a.TrustedProt()) {
1701 tp.push_back(CIdHandler::ToString(**i));
1702 }
1703 tp.sort();
1704 attributes["TrustedProt"] = NStr::Join(tp,",");
1705
1706 list<string> tm;
1707 ITERATE(list< CRef<CSeq_id> >, i, a.TrustedmRNA()) {
1708 tm.push_back(CIdHandler::ToString(**i));
1709 }
1710 tm.sort();
1711 attributes["TrustedmRNA"] = NStr::Join(tm,",");
1712
1713 if(a.TargetLen() > 0)
1714 attributes["TargetLen"] = NStr::NumericToString(a.TargetLen());
1715
1716 if(a.Ident() > 0)
1717 attributes["Ident"] = NStr::DoubleToString(a.Ident());
1718
1719 if(a.Weight() > 1)
1720 attributes["Weight"] = NStr::DoubleToString(a.Weight());
1721
1722 if (!a.ProteinHit().empty())
1723 attributes["protein_hit"] = a.ProteinHit();
1724
1725 if ((a.Type() &CGeneModel::eWall)!=0) attributes["flags"] += ",Wall";
1726 if ((a.Type() &CGeneModel::eNested)!=0) attributes["flags"] += ",Nested";
1727 if ((a.Type() &CGeneModel::eNotForChaining)!=0) attributes["flags"] += ",NotForChaining";
1728 if ((a.Type() &CGeneModel::eSR)!=0) attributes["flags"] += ",SR";
1729 if ((a.Type() &CGeneModel::eEST)!=0) attributes["flags"] += ",EST";
1730 if ((a.Type() &CGeneModel::emRNA)!=0) attributes["flags"] += ",mRNA";
1731 if ((a.Type() &CGeneModel::eProt)!=0) attributes["flags"] += ",Prot";
1732
1733 if ((a.Status()&CGeneModel::eCap)!=0) attributes["flags"] += ",Cap";
1734 if ((a.Status()&CGeneModel::ePolyA)!=0) attributes["flags"] += ",PolyA";
1735 if ((a.Status()&CGeneModel::eSkipped)!=0) attributes["flags"] += ",Skip";
1736 if ((a.Status()&CGeneModel::eBestPlacement)!=0) attributes["flags"] += ",BestPlacement";
1737 if ((a.Status()&CGeneModel::eConsistentCoverage)!=0) attributes["flags"] += ",ConsistentCoverage";
1738 if ((a.Status()&CGeneModel::eUnknownOrientation)!=0) attributes["flags"] += ",UnknownOrientation";
1739 if ((a.Status()&CGeneModel::eGapFiller)!=0) attributes["flags"] += ",GapFiller";
1740 if ((a.Status()&CGeneModel::ecDNAIntrons)!=0) attributes["flags"] += ",cDNAIntrons";
1741 if ((a.Status()&CGeneModel::eChangedByFilter)!=0) attributes["flags"] += ",ChangedByFilter";
1742 if ((a.Status()&CGeneModel::eTSA)!=0) attributes["flags"] += ",TSA";
1743 if ((a.Status()&CGeneModel::eLeftConfirmed)!=0) attributes["flags"] += ",LeftConfirmed";
1744 if ((a.Status()&CGeneModel::eRightConfirmed)!=0) attributes["flags"] += ",RightConfirmed";
1745
1746 if (a.GetCdsInfo().ReadingFrame().NotEmpty()) {
1747
1748 CCDSInfo cds_info = a.GetCdsInfo();
1749 if(cds_info.IsMappedToGenome()) { // convert local copy to tcoords
1750 cds_info = cds_info.MapFromOrigToEdited(a.GetAlignMap());
1751 }
1752
1753 TSignedSeqRange tlim = a.TranscriptLimits();
1754 TSignedSeqRange cds = cds_info.Cds(); // cdsinfo already in tcoords
1755 TSignedSeqRange start = cds_info.Start();
1756 TSignedSeqRange maxcdslim = tlim & cds_info.MaxCdsLimits();
1757 TSignedSeqRange protcds = cds_info.ProtReadingFrame();
1758
1759 TSignedSeqRange rf = cds;
1760 if(cds_info.HasStart())
1761 rf.SetFrom(rf.GetFrom()+3);
1762 if(cds_info.HasStop())
1763 rf.SetTo(rf.GetTo()-3);
1764
1765 bool tCoords = false;
1766
1767 if(start.NotEmpty() && (start.GetFrom() != maxcdslim.GetFrom() && start.GetTo() != maxcdslim.GetTo())) {
1768 attributes["maxCDS"] = NStr::NumericToString(maxcdslim.GetFrom()+1)+" "+NStr::NumericToString(maxcdslim.GetTo()+1);
1769 tCoords = true;
1770 }
1771 if(protcds.NotEmpty() && protcds != rf) {
1772 attributes["protCDS"] = NStr::NumericToString(protcds.GetFrom()+1)+" "+NStr::NumericToString(protcds.GetTo() +1);
1773 tCoords = true;
1774 }
1775 if(cds_info.HasStart()) {
1776 _ASSERT( !(cds_info.ConfirmedStart() && cds_info.OpenCds()) );
1777 string adj;
1778 if (cds_info.ConfirmedStart())
1779 adj = "Confirmed";
1780 else if (cds_info.OpenCds())
1781 adj = "Putative";
1782 attributes["flags"] += ","+adj+"Start";
1783 }
1784 if (cds_info.HasStop()) {
1785 string adj;
1786 if (cds_info.ConfirmedStop())
1787 adj = "Confirmed";
1788 attributes["flags"] += ","+adj+"Stop";
1789 }
1790
1791 if ((a.Status()&CGeneModel::eFullSupCDS)!=0) attributes["flags"] += ",FullSupCDS";
1792 if ((a.Status()&CGeneModel::ePseudo)!=0) attributes["flags"] += ",Pseudo";
1793 TInDels fs = a.GetInDels(true);
1794 ITERATE(TInDels, indl, fs) {
1795 if(indl->GetStatus() != CInDelInfo::eGenomeNotCorrect) {
1796 attributes["flags"] += ",Frameshifts";
1797 break;
1798 }
1799 }
1800 if(a.isNMD(50)) attributes["flags"] += ",NMD";
1801
1802 ITERATE(CCDSInfo::TPStops, s, cds_info.PStops()) {
1803 TSignedSeqRange pstop = *s;
1804 attributes["pstop"] += ","+NStr::NumericToString(pstop.GetFrom()+1)+" "+NStr::NumericToString(pstop.GetTo()+1);
1805 if(s->m_status == CCDSInfo::eGenomeCorrect)
1806 attributes["pstop"] += " C";
1807 else if(s->m_status == CCDSInfo::eGenomeNotCorrect)
1808 attributes["pstop"] += " N";
1809 else if(s->m_status == CCDSInfo::eSelenocysteine)
1810 attributes["pstop"] += " S";
1811
1812 tCoords = true;
1813 }
1814 attributes["pstop"].erase(0,1);
1815
1816 if(tCoords)
1817 attributes["flags"] += ",tCoords";
1818 }
1819
1820 attributes["flags"].erase(0,1);
1821
1822 attributes["note"] = a.GetComment();
1823 }
1824
StringToRange(const string & s)1825 TSignedSeqRange StringToRange(const string& s)
1826 {
1827 string start, stop;
1828 NStr::SplitInTwo(s, " ", start, stop);
1829 return TSignedSeqRange(NStr::StringToNumeric<TSignedSeqPos>(start)-1,NStr::StringToNumeric<TSignedSeqPos>(stop)-1);
1830 }
1831
ParseAttributes(map<string,string> & attributes,CAlignModel & a)1832 void ParseAttributes(map<string,string>& attributes, CAlignModel& a)
1833 {
1834 bool open_cds = false;
1835 bool confirmed_start = false;
1836 bool confirmed_stop = false;
1837 bool has_start = false;
1838 bool has_stop = false;
1839 bool tcoords = false;
1840
1841 vector<string> flags;
1842 NStr::Split(attributes["flags"], ",", flags, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1843 ITERATE(vector<string>, f, flags) {
1844 if (*f == "Wall") a.SetType(a.Type()| CGeneModel::eWall);
1845 else if (*f == "Nested") a.SetType(a.Type()| CGeneModel::eNested);
1846 else if (*f == "NotForChaining") a.SetType(a.Type()| CGeneModel::eNotForChaining);
1847 else if (*f == "SR") a.SetType(a.Type()| CGeneModel::eSR);
1848 else if (*f == "EST") a.SetType(a.Type()| CGeneModel::eEST);
1849 else if (*f == "mRNA") a.SetType(a.Type()| CGeneModel::emRNA);
1850 else if (*f == "Prot") a.SetType(a.Type()| CGeneModel::eProt);
1851 else if (*f == "Skip") a.Status() |= CGeneModel::eSkipped;
1852 else if (*f == "FullSupCDS") a.Status() |= CGeneModel::eFullSupCDS;
1853 else if (*f == "Pseudo") a.Status() |= CGeneModel::ePseudo;
1854 else if (*f == "PolyA") a.Status() |= CGeneModel::ePolyA;
1855 else if (*f == "Cap") a.Status() |= CGeneModel::eCap;
1856 else if (*f == "BestPlacement") a.Status() |= CGeneModel::eBestPlacement;
1857 else if (*f == "ConsistentCoverage") a.Status() |= CGeneModel::eConsistentCoverage;
1858 else if (*f == "UnknownOrientation") a.Status()|= CGeneModel::eUnknownOrientation;
1859 else if (*f == "GapFiller") a.Status()|= CGeneModel::eGapFiller;
1860 else if (*f == "cDNAIntrons") a.Status() |= CGeneModel::ecDNAIntrons;
1861 else if (*f == "TSA") a.Status() |= CGeneModel::eTSA;
1862 else if (*f == "LeftConfirmed") a.Status() |= CGeneModel::eLeftConfirmed;
1863 else if (*f == "RightConfirmed") a.Status() |= CGeneModel::eRightConfirmed;
1864 else if (*f == "ChangedByFilter") a.Status() |= CGeneModel::eChangedByFilter;
1865 else if (*f == "ConfirmedStart") { confirmed_start = true; has_start = true; }
1866 else if (*f == "PutativeStart") { open_cds = true; has_start = true; }
1867 else if (*f == "Start") has_start = true;
1868 else if (*f == "ConfirmedStop") { confirmed_stop = true; has_stop = true; }
1869 else if (*f == "Stop") has_stop = true;
1870 else if (*f == "tCoords") tcoords = true;
1871 }
1872
1873 if (NStr::StartsWith(attributes["Parent"],"gene"))
1874 a.SetGeneID(NStr::StringToNumeric<Int8>(attributes["Parent"],NStr::fConvErr_NoThrow|NStr::fAllowLeadingSymbols));
1875 if (!attributes["rankInGene"].empty()) {
1876 a.SetRankInGene(NStr::StringToNumeric<int>(attributes["rankInGene"]));
1877 }
1878
1879 if (!attributes["Target"].empty()) {
1880 string target = NStr::Replace(attributes["Target"], "%20", " ");
1881
1882 CRef<CSeq_id> target_id;
1883 try {
1884 target_id = CIdHandler::ToSeq_id(target);
1885 } catch(CException) {
1886 if(((a.Type() & CGeneModel::eGnomon) != 0 || (a.Type() & CGeneModel::eChain) != 0) && target.substr(0,4) == "hmm.") { // handles legacy files
1887 target = "gnl|GNOMON|"+target.substr(4);
1888 } else if((a.Type() & CGeneModel::eProt) != 0 && target.length() == 6 && isalpha(target[0])) { // probably PIR
1889 target = "pir||"+target;
1890 }
1891 target_id = CIdHandler::ToSeq_id(target);
1892 }
1893 a.SetTargetId(*target_id);
1894 }
1895
1896 if((a.Type() & CGeneModel::eGnomon) != 0 || (a.Type() & CGeneModel::eChain) != 0) { // handles legacy files
1897 vector<string> support;
1898 NStr::Split(attributes["support"], ",", support, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1899 ITERATE(vector<string>, s, support) {
1900 bool core = (*s)[0] == '*';
1901 Int8 id = NStr::StringToNumeric<Int8>(core ? s->substr(1) : *s);
1902 a.AddSupport(CSupportInfo(id, core));
1903 }
1904 }
1905
1906 vector<string> trustedmrna;
1907 NStr::Split(attributes["TrustedmRNA"], ",", trustedmrna, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1908 ITERATE(vector<string>, s, trustedmrna) {
1909 a.InsertTrustedmRNA(CIdHandler::ToSeq_id(*s));
1910 }
1911
1912 vector<string> trustedprot;
1913 NStr::Split(attributes["TrustedProt"], ",", trustedprot, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1914 ITERATE(vector<string>, s, trustedprot) {
1915 a.InsertTrustedProt(CIdHandler::ToSeq_id(*s));
1916 }
1917
1918 a.SetComment(attributes["note"]);
1919
1920 if (!attributes["Ident"].empty())
1921 a.SetIdent(NStr::StringToNumeric<double>(attributes["Ident"]));
1922
1923 if (!attributes["Weight"].empty())
1924 a.SetWeight(NStr::StringToNumeric<double>(attributes["Weight"]));
1925
1926 if (!attributes["protein_hit"].empty())
1927 a.ProteinHit() = attributes["protein_hit"];
1928
1929 CCDSInfo cds_info = a.GetCdsInfo();
1930 TSignedSeqRange reading_frame = cds_info.ReadingFrame();
1931
1932 if (reading_frame.NotEmpty()) {
1933
1934 if (!attributes["protCDS"].empty()) {
1935 TSignedSeqRange protcds = StringToRange(attributes["protCDS"]);
1936 if(!tcoords)
1937 protcds = a.GetAlignMap().MapRangeOrigToEdited(protcds, false);
1938 cds_info.SetReadingFrame(protcds, true );
1939 }
1940
1941 bool reversed = (a.Status()&CGeneModel::eReversed) != 0;
1942
1943 if(reversed)
1944 swap(has_start, has_stop);
1945
1946 TSignedSeqRange start, stop;
1947
1948 if(has_start) {
1949 start = TSignedSeqRange(reading_frame.GetFrom(),reading_frame.GetFrom()+2);
1950 reading_frame.SetFrom(reading_frame.GetFrom()+3);
1951 }
1952
1953 if(has_stop) {
1954 stop = TSignedSeqRange(reading_frame.GetTo()-2,reading_frame.GetTo());
1955 reading_frame.SetTo(reading_frame.GetTo()-3);
1956 }
1957
1958 if(reversed) {
1959 swap(has_start, has_stop);
1960 swap(start, stop);
1961 }
1962
1963 cds_info.SetReadingFrame(reading_frame, (a.Type()&CGeneModel::eProt)!=0 && cds_info.ProtReadingFrame().Empty());
1964 cds_info.SetStart(start,confirmed_start);
1965
1966 cds_info.SetStop(stop,confirmed_stop);
1967
1968 if(!attributes["pstop"].empty()) {
1969 vector<string> stops;
1970 NStr::Split(attributes["pstop"], ",", stops, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1971 ITERATE(vector<string>, s, stops) {
1972 vector<string> parts;
1973 NStr::Split(*s, " ", parts, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1974 TSignedSeqRange pstop = StringToRange(parts[0]+" "+parts[1]);
1975 if(!tcoords)
1976 pstop = a.GetAlignMap().MapRangeOrigToEdited(pstop, false);
1977 CCDSInfo::EStatus status = CCDSInfo::eUnknown;
1978 if(parts.size() > 2) {
1979 if(parts[2] == "C")
1980 status = CCDSInfo::eGenomeCorrect;
1981 else if(parts[2] == "N")
1982 status = CCDSInfo::eGenomeNotCorrect;
1983 else if(parts[2] == "S")
1984 status = CCDSInfo::eSelenocysteine;
1985 else
1986 _ASSERT(false);
1987 }
1988 cds_info.AddPStop(pstop, status);
1989 }
1990 }
1991
1992 TSignedSeqRange max_cds_limits;
1993 TSignedSeqRange tlim = a.TranscriptLimits();
1994 if (attributes["maxCDS"].empty()) {
1995 max_cds_limits = tlim;
1996 if(reversed)
1997 swap(start, stop);
1998 if(start.NotEmpty())
1999 max_cds_limits.SetFrom(start.GetFrom());
2000 if(stop.NotEmpty())
2001 max_cds_limits.SetTo(stop.GetTo());
2002 if(reversed)
2003 swap(start, stop);
2004 } else {
2005 max_cds_limits = StringToRange(attributes["maxCDS"]);
2006 if(!tcoords)
2007 max_cds_limits = a.GetAlignMap().MapRangeOrigToEdited(max_cds_limits, false);
2008 }
2009
2010 if(reversed) {
2011 if(max_cds_limits.GetTo() < tlim.GetTo())
2012 cds_info.Set5PrimeCdsLimit(max_cds_limits.GetTo());
2013 } else {
2014 if(max_cds_limits.GetFrom() > tlim.GetFrom())
2015 cds_info.Set5PrimeCdsLimit(max_cds_limits.GetFrom());
2016 }
2017 }
2018
2019 cds_info.SetScore(cds_info.Score(), open_cds);
2020
2021 a.SetCdsInfo(cds_info);
2022 }
2023
printGFF3(CNcbiOstream & os,CAlignModel a)2024 CNcbiOstream& printGFF3(CNcbiOstream& os, CAlignModel a)
2025 {
2026 CCDSInfo cds_info = a.GetCdsInfo();
2027 if(cds_info.IsMappedToGenome()) { // convert local copy to tcoords
2028 cds_info = cds_info.MapFromOrigToEdited(a.GetAlignMap());
2029 a.SetCdsInfo(cds_info);
2030 }
2031
2032 SGFFrec templ;
2033 templ.model = a.ID();
2034 templ.seqid = contig_stream_state.slot(os);
2035
2036 templ.source = CGeneModel::TypeToString(a.Type());
2037
2038 if(a.Strand() == ePlus)
2039 templ.strand = '+';
2040 else if(a.Strand() == eMinus)
2041 templ.strand = '-';
2042
2043 SGFFrec mrna = templ;
2044 mrna.type = "mRNA";
2045 mrna.start = a.Limits().GetFrom();
2046 mrna.end = a.Limits().GetTo();
2047 mrna.score = a.Score();
2048
2049 CollectAttributes(a, mrna.attributes);
2050
2051 os << mrna;
2052
2053 int part = 0;
2054 vector<SGFFrec> exons,cdss;
2055
2056 templ.attributes["Parent"] = NStr::NumericToString(a.ID());
2057 SGFFrec exon = templ;
2058 exon.type = "exon";
2059 SGFFrec cds = templ;
2060 cds.type = "CDS";
2061
2062 TSignedSeqRange tlim = a.TranscriptLimits();
2063 TSignedSeqRange rcds = tlim & cds_info.MaxCdsLimits(); // realcdslimits in tcoords
2064 if(cds_info.HasStart()) {
2065 if(a.Status()&CGeneModel::eReversed)
2066 rcds.SetTo(cds_info.Start().GetTo());
2067 else
2068 rcds.SetFrom(cds_info.Start().GetFrom());
2069 }
2070
2071 int phase = 0;
2072 if(cds_info.ReadingFrame().NotEmpty() && ((a.Strand() == ePlus && !cds_info.HasStart()) || (a.Strand() == eMinus && !cds_info.HasStop()))) { // may have extra bases on the left end
2073 phase = (a.Orientation() == ePlus) ? cds_info.ReadingFrame().GetFrom() - tlim.GetFrom() : tlim.GetTo()-cds_info.ReadingFrame().GetTo();
2074 _ASSERT(phase < 3);
2075 }
2076
2077 TInDels corrections = a.FrameShifts();
2078 for(unsigned int i = 0; i < a.Exons().size(); ++i) {
2079 const CModelExon *e = &a.Exons()[i];
2080
2081 if(e->Limits().NotEmpty()) {
2082 exon.start = e->GetFrom();
2083 exon.end = e->GetTo();
2084 exon.attributes["Gap"] = BuildGFF3Gap(exon.start, exon.end, corrections);
2085 } else {
2086 exon.start = -1;
2087 exon.end = -1;
2088 exon.attributes["Seq"] = e->m_seq;
2089 if(e->m_source.m_range.NotEmpty()) {
2090 exon.attributes["Source"] = e->m_source.m_acc + ":"+NStr::NumericToString(e->m_source.m_range.GetFrom()+1) + ":"+NStr::NumericToString(e->m_source.m_range.GetTo()+1) + ":";
2091 if(e->m_source.m_strand == ePlus)
2092 exon.attributes["Source"] += "+";
2093 else
2094 exon.attributes["Source"] += "-";
2095 }
2096 }
2097
2098 string targetid = NStr::Replace(CIdHandler::ToString(*a.GetTargetId()), " ", "%20");
2099
2100 TSignedSeqRange transcript_exon = a.TranscriptExon(i);
2101 string target = " "+NStr::NumericToString(transcript_exon.GetFrom()+1)+" "+NStr::NumericToString(transcript_exon.GetTo()+1);
2102 if((a.Status() & CGeneModel::eReversed)!=0) {
2103 target += " -";
2104 } else {
2105 target += " +";
2106 }
2107 exon.attributes["Target"] = targetid+target;
2108
2109 if(e->m_ident > 0) {
2110 exon.attributes["Ident"] = NStr::DoubleToString(e->m_ident);
2111 }
2112
2113 if(!e->m_fsplice_sig.empty() || !e->m_ssplice_sig.empty()) {
2114 string fs;
2115 if(!e->m_fsplice_sig.empty())
2116 fs = e->m_fsplice_sig;
2117 string ss;
2118 if(!e->m_ssplice_sig.empty())
2119 ss = e->m_ssplice_sig;
2120 if(a.Strand() == ePlus) {
2121 exon.attributes["Splices"] = fs+".."+ss;
2122 } else {
2123 exon.attributes["Splices"] = ss+".."+fs;
2124 }
2125 }
2126
2127 exons.push_back(exon);
2128 exon.attributes["Gap"].erase();
2129 exon.attributes["Ident"].erase();
2130 exon.attributes["Splices"].erase();
2131 exon.attributes["Seq"].erase();
2132 exon.attributes["Source"].erase();
2133
2134 TSignedSeqRange tcds = transcript_exon & rcds;
2135 if(tcds.NotEmpty()) {
2136 cds.tstart = tcds.GetFrom();
2137 cds.tend = tcds.GetTo();
2138 cds.start = -1;
2139 cds.end = -1;
2140 TSignedSeqRange cds_limits = a.GetAlignMap().MapRangeEditedToOrig(tcds, true);
2141 if(cds_limits.NotEmpty()) {
2142 cds.start = cds_limits.GetFrom();
2143 cds.end = cds_limits.GetTo();
2144 }
2145 string ctarget = " "+NStr::NumericToString(tcds.GetFrom()+1)+" "+NStr::NumericToString(tcds.GetTo()+1);
2146 if((a.Status() & CGeneModel::eReversed)!=0) {
2147 ctarget += " -";
2148 } else {
2149 ctarget += " +";
2150 }
2151 cds.attributes["Target"] = targetid+ctarget;
2152 cdss.push_back(cds);
2153 }
2154
2155 if (!e->m_ssplice) {
2156 string suffix;
2157 if (!a.Continuous()) {
2158 SGFFrec rec = templ;
2159 suffix= "."+NStr::NumericToString(++part);
2160 if(exons.front().start >= 0) {
2161 rec.start = exons.front().start;
2162 } else {
2163 _ASSERT(exons.size() > 1 && exons[1].start >= 0);
2164 rec.start = exons[1].start;
2165 }
2166 if(exons.back().end >= 0) {
2167 rec.end = exons.back().end;
2168 } else {
2169 _ASSERT(exons.size() > 1 && exons[exons.size()-2].end >= 0);
2170 rec.end = exons[exons.size()-2].end;
2171 }
2172 rec.type = "mRNA_part";
2173 rec.attributes["ID"] = rec.attributes["Parent"]+suffix;
2174 os << rec;
2175 }
2176 NON_CONST_ITERATE(vector<SGFFrec>, e, exons) {
2177 e->attributes["Parent"] += suffix;
2178 os << *e;
2179 }
2180 exons.clear();
2181
2182 NON_CONST_ITERATE(vector<SGFFrec>, e, cdss) {
2183 if (a.Strand()==ePlus)
2184 e->phase = phase;
2185
2186 phase = (e->tend - e->tstart+1 - phase)%3;
2187 if (a.Strand()==eMinus)
2188 e->phase = phase;
2189 phase = (3-phase)%3;
2190
2191 e->attributes["Parent"] += suffix;
2192 os << *e;
2193 }
2194 cdss.clear();
2195 }
2196 }
2197
2198 return os;
2199 }
2200
2201 struct Precedence : public binary_function<TSignedSeqRange, TSignedSeqRange, bool>
2202 {
operator ()Precedence2203 bool operator()(const TSignedSeqRange& __x, const TSignedSeqRange& __y) const
2204 { return Precede( __x, __y ); }
2205 };
2206
operator -(TSignedSeqRange a,TSignedSeqRange b)2207 TSignedSeqRange operator- (TSignedSeqRange a, TSignedSeqRange b)
2208 {
2209 b &= a;
2210 if (b.Empty())
2211 return a;
2212 if (a.GetFrom()==b.GetFrom())
2213 a.SetFrom(b.GetTo()+1);
2214 else if (a.GetTo()==b.GetTo())
2215 a.SetTo(b.GetFrom()-1);
2216 return a;
2217 }
2218
2219
readGFF3(CNcbiIstream & is,CAlignModel & align)2220 CNcbiIstream& readGFF3(CNcbiIstream& is, CAlignModel& align)
2221 {
2222 SGFFrec rec;
2223 is >> rec;
2224 if (!is) {
2225 return is;
2226 }
2227
2228 Int8 id = rec.model;
2229
2230 if (id==0)
2231 return InputError(is);
2232
2233 vector<SGFFrec> recs;
2234
2235 do {
2236 recs.push_back(rec);
2237 } while (is >> rec && rec.seqid == recs.front().seqid && rec.model == id && rec.type != "mRNA");
2238
2239 Ungetline(is);
2240
2241 CGeneModel a;
2242
2243 a.SetID(id);
2244 #ifdef _DEBUG
2245 a.oid = id;
2246 #endif
2247
2248 TSignedSeqRange cds;
2249 int phase = 0;
2250 bool tcoords = false;
2251
2252 vector<CModelExon> exons;
2253 vector<TSignedSeqRange> transcript_exons;
2254 TInDels indels;
2255 set<TSignedSeqRange,Precedence> mrna_parts;
2256
2257 rec = recs.front();
2258
2259 if (rec.source == "Gnomon") a.SetType(a.Type() | CGeneModel::eGnomon);
2260 else if (rec.source == "Chainer") a.SetType(a.Type() | CGeneModel::eChain);
2261
2262 if(rec.strand=='+')
2263 a.SetStrand(ePlus);
2264 else if(rec.strand=='-')
2265 a.SetStrand(eMinus);
2266
2267 double score = BadScore();
2268 map<string, string> attributes;
2269
2270 NON_CONST_ITERATE(vector<SGFFrec>, r, recs) {
2271 if (r->type == "mRNA") {
2272 attributes = r->attributes;
2273 score = r->score;
2274 } else if (r->type == "exon") {
2275 if(r->tstart >= 0 && r->tstrand == '-') {
2276 a.Status() |= CGeneModel::eReversed;
2277 }
2278
2279 string fs, ss;
2280 if(!r->attributes["Splices"].empty()) {
2281 string splices = r->attributes["Splices"];
2282 auto ispl = splices.find("..");
2283 if(ispl != string::npos) {
2284 if(ispl == 2)
2285 fs = splices.substr(0,2);
2286 if(ispl+4 == splices.length())
2287 ss = splices.substr(ispl+2,2);
2288 }
2289 if(a.Strand() == eMinus)
2290 swap(fs,ss);
2291 }
2292
2293 double eident = 0;
2294 if(!r->attributes["Ident"].empty()) {
2295 eident = NStr::StringToNumeric<double>(r->attributes["Ident"]);
2296 }
2297
2298 if(r->start >= 0 && r->end >= 0)
2299 exons.push_back(CModelExon(r->start,r->end,false,false,fs,ss,eident));
2300 else {
2301 CInDelInfo::SSource src;
2302 if(!r->attributes["Source"].empty()) {
2303 vector<string> v;
2304 NStr::Split(r->attributes["Source"], ":", v, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
2305 _ASSERT((int)v.size() == 4);
2306 src.m_acc = v[0];
2307 src.m_range.SetFrom(NStr::StringToNumeric<TSignedSeqPos>(v[1])-1);
2308 src.m_range.SetTo(NStr::StringToNumeric<TSignedSeqPos>(v[2])-1);
2309 if(v[3] == "+")
2310 src.m_strand = ePlus;
2311 else
2312 src.m_strand = eMinus;
2313 }
2314 exons.push_back(CModelExon(1,-1,false,false,fs,ss,eident,r->attributes["Seq"],src));
2315 }
2316 TSignedSeqRange texon(r->tstart,r->tend);
2317 if(texon.NotEmpty())
2318 transcript_exons.push_back(texon);
2319 readGFF3Gap(r->attributes["Gap"],r->start,r->end, indels);
2320 if(!r->attributes["Target"].empty())
2321 attributes["Target"] = r->attributes["Target"];
2322 } else if (r->type == "CDS") {
2323 _ASSERT((a.Status()&CGeneModel::eUnknownOrientation) == 0);
2324
2325 if (r->strand=='+') {
2326 if (cds.Empty())
2327 phase = r->phase;
2328 } else {
2329 phase = r->phase;
2330 }
2331 if(r->tstart < 0) { // old format before ggaps
2332 TSignedSeqRange cds_exon(r->start,r->end);
2333 cds.CombineWith(cds_exon);
2334 tcoords = false;
2335 } else {
2336 TSignedSeqRange cds_exon(r->tstart,r->tend);
2337 cds.CombineWith(cds_exon);
2338 tcoords = true;
2339 }
2340 } else if (r->type == "mRNA_part") {
2341 mrna_parts.insert(TSignedSeqRange(r->start,r->end));
2342 }
2343 }
2344
2345 for(int i = 0; i < (int)exons.size(); ++i) {
2346 const CModelExon& e = exons[i];
2347 if (a.Limits().IntersectingWith(e.Limits())) {
2348 return InputError(is);
2349 }
2350 if(i > 0 && exons[i-1].Limits().NotEmpty()) { // there can't be ggap adjacent to 'hole'
2351 set<TSignedSeqRange,Precedence>::iterator p = mrna_parts.lower_bound(e);
2352 if (p != mrna_parts.end() && p->GetFrom()==e.Limits().GetFrom())
2353 a.AddHole();
2354 }
2355 a.AddExon(e.Limits(),e.m_fsplice_sig,e.m_ssplice_sig,e.m_ident,e.m_seq,e.m_source);
2356 }
2357
2358 sort(indels.begin(),indels.end());
2359 CAlignMap amap;
2360 if(!transcript_exons.empty())
2361 amap = CAlignMap(a.Exons(), transcript_exons, indels, a.Orientation(), NStr::StringToNumeric<int>(attributes["TargetLen"]));
2362 else
2363 amap = CAlignMap(a.Exons(), indels, a.Strand());
2364
2365 a.FrameShifts() = indels;
2366 align = CAlignModel(a, amap);
2367
2368 TSignedSeqRange reading_frame;
2369 CCDSInfo cds_info;
2370
2371 if (cds.NotEmpty()) {
2372 cds_info = CCDSInfo(false);
2373
2374 // align.FrameShifts() = amap.GetInDels(true);
2375
2376 TSignedSeqRange rf = cds;
2377 if(!tcoords)
2378 rf = amap.MapRangeOrigToEdited(cds, false);
2379
2380 if(!(a.Status() & CGeneModel::eReversed)) {
2381 rf.SetFrom(rf.GetFrom()+phase);
2382 rf.SetTo(rf.GetTo()-rf.GetLength()%3);
2383 } else {
2384 rf.SetTo(rf.GetTo()-phase);
2385 rf.SetFrom(rf.GetFrom()+rf.GetLength()%3);
2386 }
2387
2388 cds_info.SetReadingFrame(rf, false);
2389 }
2390
2391 align.SetCdsInfo(cds_info);
2392
2393 ParseAttributes(attributes, align);
2394
2395 if(cds.NotEmpty()) {
2396 CCDSInfo cds_info_t = align.GetCdsInfo();
2397 cds_info_t.SetScore(score, cds_info_t.OpenCds());
2398 CCDSInfo cds_info_g = cds_info_t.MapFromEditedToOrig(amap);
2399 if(cds_info_g.ReadingFrame().NotEmpty()) // successful projection
2400 align.SetCdsInfo(cds_info_g);
2401 else
2402 align.SetCdsInfo(cds_info_t);
2403 }
2404
2405 contig_stream_state.slot(is) = rec.seqid;
2406 return is;
2407 }
2408
operator <<(CNcbiOstream & s,const CGeneModel & a)2409 CNcbiOstream& operator<<(CNcbiOstream& s, const CGeneModel& a)
2410 {
2411 return operator<<(s, CAlignModel(a, a.GetAlignMap()));
2412 }
2413
operator <<(CNcbiOstream & os,const CAlignModel & a)2414 CNcbiOstream& operator<<(CNcbiOstream& os, const CAlignModel& a)
2415 {
2416 switch (model_file_format_state.slot(os)) {
2417 case eGFF3FileFormat:
2418 return printGFF3(os,a);
2419 default:
2420 os.setstate(ios::failbit);
2421 return os;
2422 }
2423 }
2424
operator >>(CNcbiIstream & is,CAlignModel & align)2425 CNcbiIstream& operator>>(CNcbiIstream& is, CAlignModel& align)
2426 {
2427 switch (model_file_format_state.slot(is)) {
2428 case eGFF3FileFormat:
2429 return readGFF3(is, align);
2430 default:
2431 is.setstate(ios::failbit);
2432 return is;
2433 }
2434 }
2435
CSupportInfo(Int8 model_id,bool core)2436 CSupportInfo::CSupportInfo(Int8 model_id, bool core)
2437 : m_id( model_id), m_core_align(core)
2438 {
2439 }
2440
GetId() const2441 Int8 CSupportInfo::GetId() const
2442 {
2443 return m_id;
2444 }
2445
SetCore(bool core)2446 void CSupportInfo::SetCore(bool core)
2447 {
2448 m_core_align = core;
2449 }
2450
IsCore() const2451 bool CSupportInfo::IsCore() const
2452 {
2453 return m_core_align;
2454 }
2455
operator ==(const CSupportInfo & s) const2456 bool CSupportInfo::operator==(const CSupportInfo& s) const
2457 {
2458 return IsCore() == s.IsCore() && GetId() == s.GetId();
2459 }
2460
operator <(const CSupportInfo & s) const2461 bool CSupportInfo::operator<(const CSupportInfo& s) const
2462 {
2463 return GetId() == s.GetId() ? IsCore() < s.IsCore() : GetId() < s.GetId();
2464 }
2465
2466
2467 END_SCOPE(gnomon)
2468 END_NCBI_SCOPE
2469