1 /*  $Id: annot.cpp 620759 2020-11-30 16:00:57Z souvorov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Vyacheslav Chetvernin
27  *
28  * File Description:
29  *
30  * Builds annotation models out of chained alignments:
31  * selects good chains as alternatively spliced genes,
32  * selects good chains inside other chains introns,
33  * other chains filtered to leave one chain per placement,
34  * gnomon is run to improve chains and predict models in regions w/o chains
35  *
36  */
37 
38 #include <ncbi_pch.hpp>
39 #include <corelib/ncbiapp.hpp>
40 #include <corelib/ncbienv.hpp>
41 #include <corelib/ncbiargs.hpp>
42 
43 #include <algo/gnomon/annot.hpp>
44 #include "gnomon_engine.hpp"
45 #include <algo/gnomon/gnomon_model.hpp>
46 #include <algo/gnomon/gnomon.hpp>
47 #include <algo/gnomon/id_handler.hpp>
48 
49 #include <objects/seqloc/Seq_loc.hpp>
50 
51 BEGIN_NCBI_SCOPE
52 USING_SCOPE(objects);
BEGIN_SCOPE(gnomon)53 BEGIN_SCOPE(gnomon)
54 
55 CGeneSelector::CGeneSelector()
56 {
57 }
58 
CGnomonAnnotator()59 CGnomonAnnotator::CGnomonAnnotator()
60 {
61 }
62 
~CGnomonAnnotator()63 CGnomonAnnotator::~CGnomonAnnotator()
64 {
65 }
66 
s_AlignScoreOrder(const CGeneModel & ap,const CGeneModel & bp)67 bool s_AlignScoreOrder(const CGeneModel& ap, const CGeneModel& bp)
68 {
69     return (ap.Score() < bp.Score());
70 }
71 
RemoveShortHolesAndRescore(TGeneModelList chains)72 void CGnomonAnnotator::RemoveShortHolesAndRescore(TGeneModelList chains)
73 {
74     NON_CONST_ITERATE(TGeneModelList, it, chains) {
75         it->RemoveShortHolesAndRescore(*m_gnomon);
76     }
77 }
78 
ExtendJustThisChain(CGeneModel & chain,TSignedSeqPos left,TSignedSeqPos right)79 double CGnomonAnnotator::ExtendJustThisChain(CGeneModel& chain,
80                                              TSignedSeqPos left, TSignedSeqPos right)
81 {
82     TGeneModelList test_align;
83     test_align.push_back(chain);
84     int l = max((int)left,(int)chain.Limits().GetFrom()-10000);
85     int r = min(right,chain.Limits().GetTo()+10000);
86     cerr << "Testing alignment " << chain.ID() << " in fragment " << l << ' ' << r << endl;
87 
88     m_gnomon->ResetRange(l,r);
89     return m_gnomon->Run(test_align, false, false, false, false, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
90 }
91 
TryWithoutObviouslyBadAlignments(TGeneModelList & aligns,TGeneModelList & suspect_aligns,TGeneModelList & bad_aligns,bool leftwall,bool rightwall,bool leftanchor,bool rightanchor,TSignedSeqPos left,TSignedSeqPos right,TSignedSeqRange & tested_range)92 double CGnomonAnnotator::TryWithoutObviouslyBadAlignments(TGeneModelList& aligns, TGeneModelList& suspect_aligns, TGeneModelList& bad_aligns,
93                                                           bool leftwall, bool rightwall, bool leftanchor, bool rightanchor,
94                                                           TSignedSeqPos left, TSignedSeqPos right,
95                                                           TSignedSeqRange& tested_range)
96 {
97     bool already_tested = Include(tested_range, TSignedSeqRange(left,right));
98 
99     if (already_tested) {
100         for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); it++) {
101             if(left <= it->Limits().GetTo() && it->Limits().GetFrom() <= right)
102                 suspect_aligns.push_back(*it);
103         }
104     } else {
105         tested_range = TSignedSeqRange(left,right);
106 
107         bool found_bad_cluster = false;
108         for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); ) {
109             if(it->Limits().GetTo() < left || it->Limits().GetFrom() > right) {
110                 ++it;
111                 continue;
112             }
113 
114             if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))==0 &&
115                 ExtendJustThisChain(*it, left, right) == BadScore()) {
116                 found_bad_cluster = true;
117                 cerr << "Deleting alignment " << it->ID() << endl;
118                 it->Status() |= CGeneModel::eSkipped;
119                 it->AddComment("Bad score prediction alone");
120                 bad_aligns.push_back(*it);
121 
122                 it = aligns.erase(it);
123                 continue;
124             }
125             suspect_aligns.push_back(*it++);
126         }
127 
128         m_gnomon->ResetRange(left, right);
129         if(found_bad_cluster) {
130             cerr << "Testing w/o bad alignments in fragment " << left << ' ' << right << endl;
131             return m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
132         }
133     }
134     return BadScore();
135 }
136 
TryToEliminateOneAlignment(TGeneModelList & suspect_aligns,TGeneModelList & bad_aligns,bool leftwall,bool rightwall,bool leftanchor,bool rightanchor)137 double CGnomonAnnotator::TryToEliminateOneAlignment(TGeneModelList& suspect_aligns, TGeneModelList& bad_aligns,
138                                       bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
139 {
140     double score = BadScore();
141     for(TGeneModelList::iterator it = suspect_aligns.begin(); it != suspect_aligns.end();) {
142         if((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0) {
143             ++it;
144             continue;
145         }
146         CGeneModel algn = *it;
147         it = suspect_aligns.erase(it);
148 
149         cerr << "Testing w/o " << algn.ID();
150         score = m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
151         if (score != BadScore()) {
152             cerr << "- Good. Deleting alignment " << algn.ID() << endl;
153             algn.Status() |= CGeneModel::eSkipped;
154             algn.AddComment("Good score prediction without");
155             bad_aligns.push_back(algn);
156             break;
157         } else {
158             cerr << " - Still bad." << endl;
159         }
160         suspect_aligns.insert(it,algn);
161     }
162     return score;
163 }
164 
TryToEliminateAlignmentsFromTail(TGeneModelList & suspect_aligns,TGeneModelList & bad_aligns,bool leftwall,bool rightwall,bool leftanchor,bool rightanchor)165 double CGnomonAnnotator::TryToEliminateAlignmentsFromTail(TGeneModelList& suspect_aligns, TGeneModelList& bad_aligns,
166                                       bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
167 {
168     double score = BadScore();
169     for (TGeneModelList::iterator it = suspect_aligns.begin(); score == BadScore() && it != suspect_aligns.end(); ) {
170         if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0 || it->GoodEnoughToBeAnnotation()) {
171             ++it;
172             continue;
173         }
174         cerr << "Deleting alignment " << it->ID() << endl;
175         it->Status() |= CGeneModel::eSkipped;
176         it->AddComment("Bad score prediction in combination");
177         bad_aligns.push_back(*it);
178         it = suspect_aligns.erase(it);
179 
180         cerr << "Testing fragment " << left << ' ' << right << endl;
181         score = m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
182     }
183     return score;
184 }
185 
Predict(TSignedSeqPos llimit,TSignedSeqPos rlimit,TGeneModelList::const_iterator il,TGeneModelList::const_iterator ir,TGeneModelList & models,bool leftmostwall,bool rightmostwall,bool leftmostanchor,bool rightmostanchor,TGeneModelList & bad_aligns)186 void CGnomonAnnotator::Predict(TSignedSeqPos llimit, TSignedSeqPos rlimit, TGeneModelList::const_iterator il, TGeneModelList::const_iterator ir, TGeneModelList& models,
187              bool leftmostwall, bool rightmostwall, bool leftmostanchor, bool rightmostanchor, TGeneModelList& bad_aligns)
188 {
189     TGeneModelList aligns(il, ir);
190 
191     Int8 left = llimit;
192     bool leftwall = leftmostwall;
193     bool leftanchor = leftmostanchor;
194 
195     Int8 right = llimit+window;
196     bool rightwall = false;
197     bool rightanchor = false;
198 
199     Int8 prev_bad_right = rlimit+1;
200     bool do_it_again = false;
201 
202     m_gnomon->ResetRange(left, right);
203 
204     RemoveShortHolesAndRescore(aligns);
205 
206     TGeneModelList suspect_aligns;
207     TSignedSeqRange tested_range;
208 
209     TIVec busy_spots(rlimit+1,0);
210     ITERATE(TGeneModelList, it_c, aligns) {
211         int a = max(0,it_c->Limits().GetFrom()-margin);
212         int b = min(rlimit,it_c->Limits().GetTo()+margin);
213         for(int i = a; i<=b; ++i)
214             busy_spots[i] = 1;
215     }
216 
217     do {
218         for( ; right < rlimit && busy_spots[right] != 0; ++right);
219 
220         if (right + (right-left)/2 >= rlimit) {
221             right = rlimit;
222             rightwall = rightmostwall;
223             rightanchor = rightmostanchor;
224         } else {
225             rightwall = false;
226             rightanchor = false;
227         }
228 
229         if (do_it_again)
230             rightwall = true;
231 
232         double score = BadScore();
233 
234         if (right < prev_bad_right) {
235             suspect_aligns.clear();
236 
237             m_gnomon->ResetRange(left,right);
238 
239             cerr << left << ' ' << right << ' ' << m_gnomon->GetGCcontent() << endl;
240 
241             score = m_gnomon->Run(aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
242 
243             if(score == BadScore()) {
244                 cerr << "Inconsistent alignments in fragment " << left << ' ' << right << '\n';
245 
246                 score = TryWithoutObviouslyBadAlignments(aligns, suspect_aligns, bad_aligns,
247                                                          leftwall, rightwall, leftanchor, rightanchor,
248                                                          left, right, tested_range);
249             }
250 
251             if(score == BadScore()) {
252 
253                 prev_bad_right = right;
254                 right = (left+right)/2;
255 
256                 continue;
257             }
258         } else {
259             suspect_aligns.sort(s_AlignScoreOrder);
260 
261             score = TryToEliminateOneAlignment(suspect_aligns, bad_aligns,
262                                                leftwall, rightwall, leftanchor, rightanchor);
263             if (score == BadScore())
264                 score = TryToEliminateAlignmentsFromTail(suspect_aligns, bad_aligns,
265                                                          leftwall, rightwall, leftanchor, rightanchor);
266             if(score == BadScore()) {
267                 cerr << "!!! BAD SCORE EVEN WITH FINISHED ALIGNMENTS !!! " << endl;
268                 ITERATE(TGeneModelList, it, suspect_aligns) {
269                     if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))==0 && it->GoodEnoughToBeAnnotation())
270                        models.push_back(*it);
271                 }
272             }
273         }
274         prev_bad_right = rlimit+1;
275 
276         list<CGeneModel> genes = m_gnomon->GetGenes();
277 
278         TSignedSeqPos partial_start = right;
279 
280         if (right < rlimit && !genes.empty() && !genes.back().RightComplete() && !do_it_again) {
281             partial_start = genes.back().LeftComplete() ? genes.back().RealCdsLimits().GetFrom() : left;
282             _ASSERT ( partial_start < right );
283             genes.pop_back();
284         }
285 
286         do_it_again = false;
287 
288         if (!genes.empty()) {
289             left = genes.back().ReadingFrame().GetTo()+1;
290             leftanchor = true;
291         } else if (partial_start < left+1000) {
292             do_it_again=true;
293         } else if (partial_start < right) {
294             int new_left = partial_start-100;
295             for( ; new_left > left && busy_spots[new_left] != 0; --new_left);
296             if(new_left > left+1000) {
297                 left = new_left;
298                 leftanchor = false;
299             } else {
300                 do_it_again=true;
301             }
302         } else {
303             left = (left+right)/2+1;
304             leftanchor = false;
305         }
306 
307         models.splice(models.end(), genes);
308 
309         if (right >= rlimit)
310             break;
311 
312         if (!do_it_again)
313             leftwall = true;
314 
315         right = left + window;
316 
317     } while(left <= rlimit);
318 }
319 
WalledCdsLimits(const CGeneModel & a)320 TSignedSeqRange WalledCdsLimits(const CGeneModel& a)
321 {
322     return ((a.Type() & CGeneModel::eWall)!=0) ? a.Limits() : a.MaxCdsLimits();
323 }
324 
GetWallLimits(const CGeneModel & m)325 TSignedSeqRange GetWallLimits(const CGeneModel& m)
326 {
327     return m.RealCdsLimits().Empty() ? m.Limits() : m.RealCdsLimits();
328 }
329 
s_AlignSeqOrder(const CGeneModel & ap,const CGeneModel & bp)330 bool s_AlignSeqOrder(const CGeneModel& ap, const CGeneModel& bp)
331 {
332     TSignedSeqRange a = GetWallLimits(ap);
333     TSignedSeqRange b = GetWallLimits(bp);
334 
335     return (a.GetFrom() != b.GetFrom() ?
336             a.GetFrom() < b.GetFrom() :
337             a.GetTo() > b.GetTo()
338             );
339 }
340 
341 typedef map<int,CGeneModel> TNestedGenes;
342 
SaveWallModel(auto_ptr<CGeneModel> & wall_model,TNestedGenes & nested_genes,TGeneModelList & aligns)343 void SaveWallModel(auto_ptr<CGeneModel>& wall_model, TNestedGenes& nested_genes, TGeneModelList& aligns)
344 {
345     if (wall_model.get() != 0 && wall_model->Type() == CGeneModel::eWall+CGeneModel::eGnomon) {
346         aligns.push_back(*wall_model);
347     }
348     ITERATE(TNestedGenes, nested, nested_genes) {
349         aligns.push_back(nested->second);
350     }
351     nested_genes.clear();
352 }
353 
FindPartials(TGeneModelList & models,TGeneModelList & aligns,EStrand strand)354 void FindPartials(TGeneModelList& models, TGeneModelList& aligns, EStrand strand)
355 {
356     TSignedSeqPos right = -1;
357     auto_ptr<CGeneModel> wall_model;
358     TNestedGenes nested_genes;
359 
360     for (TGeneModelList::iterator loop_it = models.begin(); loop_it != models.end();) {
361         TGeneModelList::iterator ir = loop_it;
362         ++loop_it;
363 
364         if (ir->Strand() != strand) {
365             continue;
366         }
367 
368         TSignedSeqRange limits = GetWallLimits(*ir);
369 
370         if ( right < limits.GetFrom() ) { // new cluster
371             SaveWallModel(wall_model, nested_genes, aligns);
372         }
373 
374         if ((ir->Type() & CGeneModel::eNested) !=0) {
375             //            ir->SetType( ir->Type() - CGeneModel::eNested );
376             TNestedGenes::iterator nested_wall = nested_genes.find(ir->GeneID());
377             if (nested_wall == nested_genes.end()) {
378                 CGeneModel new_nested_wall(ir->Strand(),ir->ID(),CGeneModel::eNested+CGeneModel::eGnomon);
379                 new_nested_wall.SetGeneID(ir->GeneID());
380                 new_nested_wall.AddExon(limits);
381                 nested_genes[ir->GeneID()] = new_nested_wall;
382             } else if (limits.GetTo() - nested_wall->second.Limits().GetTo() > 0) {
383                     nested_wall->second.ExtendRight(limits.GetTo()- nested_wall->second.Limits().GetTo());
384             }
385             continue;
386         }
387 
388         if ( right < limits.GetFrom() ) { // new cluster
389             wall_model.reset( new CGeneModel(ir->Strand(),ir->ID(),CGeneModel::eWall+CGeneModel::eGnomon));
390             wall_model->SetGeneID(ir->GeneID());
391             wall_model->AddExon(limits);
392         }
393         /* doesn't work for genes which onclude coding and non coding together
394         else { // same cluster
395             _ASSERT( wall_model.get() != 0 && wall_model->GeneID()==ir->GeneID() );
396         }
397         */
398 
399         right = max(right, limits.GetTo());
400         if (ir->RankInGene() == 1 && !ir->GoodEnoughToBeAnnotation()) {
401             ir->Status() &= ~CGeneModel::eFullSupCDS;
402             aligns.splice(aligns.end(), models, ir);
403             wall_model->SetType(CGeneModel::eGnomon);
404         } else if (limits.GetTo()- wall_model->Limits().GetTo() > 0)  {
405             wall_model->ExtendRight(limits.GetTo() - wall_model->Limits().GetTo());
406         }
407     }
408     SaveWallModel(wall_model, nested_genes, aligns);
409 }
410 
Predict(TGeneModelList & models,TGeneModelList & bad_aligns)411 void CGnomonAnnotator::Predict(TGeneModelList& models, TGeneModelList& bad_aligns)
412 {
413     if (models.empty() && int(m_gnomon->GetSeq().size()) < mincontig)
414         return;
415 
416     if (GnomonNeeded()) {
417 
418         //extend partial nested
419 
420         typedef list<TGeneModelList::iterator> TIterList;
421         typedef map<Int8,TIterList> TGIDIterlist;
422         TGIDIterlist genes;
423         NON_CONST_ITERATE(TGeneModelList, im, models) {
424             if(im->Type()&CGeneModel::eNested)
425                 im->SetType(im->Type()-CGeneModel::eNested);  // ignore flag set in chainer
426             genes[im->GeneID()].push_back(im);
427         }
428 
429         set<TSignedSeqRange> hosting_intervals;
430         ITERATE(TGIDIterlist, ig, genes) {  // first - ID; second - list
431             bool coding_gene = false;
432             ITERATE(TIterList, im, ig->second) {
433                 if((*im)->ReadingFrame().NotEmpty()) {
434                     coding_gene = true;
435                     break;
436                 }
437             }
438 
439             TSignedSeqRange gene_lim_for_nested;
440             ITERATE(TIterList, im, ig->second) {
441                 const CGeneModel& ai = **im;
442                 if(coding_gene && ai.ReadingFrame().Empty())
443                     continue;
444                 TSignedSeqRange model_lim_for_nested = ai.Limits();
445                 if(ai.ReadingFrame().NotEmpty())
446                     model_lim_for_nested = ai.OpenCds() ? ai.MaxCdsLimits() : ai.RealCdsLimits();   // 'open' could be only a single variant gene
447                 gene_lim_for_nested += model_lim_for_nested;
448             }
449 
450             vector<int> grange(gene_lim_for_nested.GetLength(),1);
451             ITERATE(TIterList, im, ig->second) {   // exclude all positions included in CDS (any exons for not coding genes) and holes
452                 const CGeneModel& ai = **im;
453                 if(coding_gene && ai.ReadingFrame().Empty())
454                     continue;
455 
456                 TSignedSeqRange model_lim_for_nested = ai.Limits();
457                 if(ai.ReadingFrame().NotEmpty())
458                     model_lim_for_nested = ai.OpenCds() ? ai.MaxCdsLimits() : ai.RealCdsLimits();   // 'open' could be only a single variant gene
459 
460                 for(int i = 0; i < (int)ai.Exons().size(); ++i) {
461                     TSignedSeqRange overlap = (model_lim_for_nested & ai.Exons()[i].Limits());
462                     for(int j = overlap.GetFrom(); j <= overlap.GetTo(); ++j)
463                         grange[j-gene_lim_for_nested.GetFrom()] = 0;
464                 }
465 
466                 for(int i = 1; i < (int)ai.Exons().size(); ++i) {
467                     if(!ai.Exons()[i-1].m_ssplice || !ai.Exons()[i].m_fsplice) {
468                         TSignedSeqRange hole(ai.Exons()[i-1].Limits().GetTo()+1,ai.Exons()[i].Limits().GetFrom()-1);
469                         _ASSERT(Include(model_lim_for_nested, hole));
470                         for(int j = hole.GetFrom(); j <= hole.GetTo(); ++j)
471                             grange[j-gene_lim_for_nested.GetFrom()] = 0;
472                     }
473                 }
474             }
475             _ASSERT(grange.front() == 0 && grange.back() == 0);
476 
477             int left = -1;
478             int right;
479             for(int j = 0; j < (int)grange.size(); ++j) {
480                 if(left < 0) {
481                     if(grange[j] == 1) {
482                         left = j;
483                         right = j;
484                     }
485                 } else if(grange[j] == 1) {
486                     right = j;
487                 } else {
488                     TSignedSeqRange interval(left+gene_lim_for_nested.GetFrom(),right+gene_lim_for_nested.GetFrom());
489                     hosting_intervals.insert(interval);
490                     left = -1;
491                 }
492             }
493         }
494 
495         typedef map<TSignedSeqRange,TIterList> TRangeModels;
496         TRangeModels nested_models;
497         ITERATE(TGIDIterlist, ig, genes) {  // first - ID; second - list
498             TGeneModelList::iterator nested_modeli = ig->second.front();
499             if(!nested_modeli->GoodEnoughToBeAnnotation()) {
500                 _ASSERT(ig->second.size() == 1);
501                 TSignedSeqRange lim_for_nested = nested_modeli->RealCdsLimits().Empty() ? nested_modeli->Limits() : nested_modeli->RealCdsLimits();
502 
503                 TSignedSeqRange hosting_interval;
504                 ITERATE(set<TSignedSeqRange>, ii, hosting_intervals) {
505                     TSignedSeqRange interval = *ii;
506                     if(Include(interval,lim_for_nested)) {
507                         if(hosting_interval.Empty())
508                             hosting_interval = interval;
509                         else
510                             hosting_interval = (hosting_interval&interval);
511                     }
512                 }
513 
514                 if(hosting_interval.NotEmpty()) {
515                     TIterList nested(1,nested_modeli);
516                     ITERATE(TGIDIterlist, igg, genes) {  // first - ID; second - list
517                         const TIterList& other_gene = igg->second;
518                         if(igg == ig || !other_gene.front()->GoodEnoughToBeAnnotation())
519                             continue;
520 
521                         bool coding_gene = false;
522                         ITERATE(TIterList, im, other_gene) {
523                             if((*im)->ReadingFrame().NotEmpty()) {
524                                 coding_gene = true;
525                                 break;
526                             }
527                         }
528 
529                         TSignedSeqRange finished_interval;
530                         ITERATE(TIterList, im, other_gene) {
531                             const CGeneModel& ai = **im;
532                             if(coding_gene && ai.ReadingFrame().Empty())
533                                 continue;
534 
535                             finished_interval += coding_gene ? ai.RealCdsLimits() : ai.Limits();
536                         }
537                         if(!finished_interval.IntersectingWith(hosting_interval) || Include(finished_interval,hosting_interval))
538                             continue;
539 
540                         if(Precede(finished_interval,lim_for_nested)) {           // before partial model
541                             hosting_interval.SetFrom(finished_interval.GetTo());
542                         } else if(Precede(lim_for_nested,finished_interval)) {    // after partial model
543                             hosting_interval.SetTo(finished_interval.GetFrom());
544                         } else if(CModelCompare::RangeNestedInIntron(finished_interval, *nested_modeli, true)) {
545                             //} else {                                                  // overlaps partial model
546                             // _ASSERT(CModelCompare::RangeNestedInIntron(finished_interval, *nested_modeli, true));
547                             ITERATE(TIterList, im, other_gene) {
548                                 nested.push_back(*im);
549                             }
550                         }
551                     }
552                     _ASSERT(hosting_interval.NotEmpty());
553                     nested_models[hosting_interval].splice(nested_models[hosting_interval].begin(),nested);
554                 }
555             }
556         }
557 
558         bool scaffold_wall = wall;
559         wall = true;
560         ITERATE(TRangeModels, i, nested_models) {
561             TSignedSeqRange hosting_interval = i->first;
562 
563             TGeneModelList nested;
564             set<Int8> included_complete_models;
565             ITERATE(TIterList, im, i->second) {
566                 nested.push_back(**im);
567 
568                 if(!(*im)->GoodEnoughToBeAnnotation()) {
569                     if(((*im)->Type()&CGeneModel::eNested)) {
570                         nested.back().SetType(nested.back().Type()-CGeneModel::eNested);  // remove flag to allow ab initio extension
571                     }
572 
573                     if(nested.back().HasStart() && !Include(hosting_interval,nested.back().MaxCdsLimits())) {
574                         CCDSInfo cds = nested.back().GetCdsInfo();
575                         if(nested.back().Strand() == ePlus)
576                             cds.Set5PrimeCdsLimit(cds.Start().GetFrom());
577                         else
578                             cds.Set5PrimeCdsLimit(cds.Start().GetTo());
579                         nested.back().SetCdsInfo(cds);
580                     }
581 
582 
583 #ifdef _DEBUG
584                     nested.back().AddComment("partialnested");
585 #endif
586 
587                     models.erase(*im);
588                 } else {
589                     //                    (*im)->SetType((*im)->Type()|CGeneModel::eNested);
590                     included_complete_models.insert((*im)->ID());
591                 }
592             }
593 
594             cerr << "Interval " << hosting_interval << '\t' << nested.size() << endl;
595 
596             Predict(nested, bad_aligns, hosting_interval.GetFrom()+1,hosting_interval.GetTo()-1);
597 
598             NON_CONST_ITERATE(TGeneModelList, im, nested) {
599                 if(!im->Support().empty()) {
600                     im->SetType(im->Type()|CGeneModel::eNested);
601                     if(im->ID() == 0 || included_complete_models.find(im->ID()) == included_complete_models.end())  // include only models which we tried to extend
602                         models.push_back(*im);
603                 }
604             }
605         }
606         wall = scaffold_wall;
607     }
608 
609     Predict(models, bad_aligns, 0, TSignedSeqPos(m_gnomon->GetSeq().size())-1);
610 
611     ERASE_ITERATE(TGeneModelList, im, models) {
612         CGeneModel& model = *im;
613         TSignedSeqRange cds = model.RealCdsLimits();
614         if(cds.Empty())
615             continue;
616 
617         bool gapfilled = false;
618         int genome_cds = 0;
619         ITERATE(CGeneModel::TExons, ie, model.Exons()) {
620             if(ie->m_fsplice_sig == "XX" || ie->m_ssplice_sig == "XX")
621                 gapfilled = true;
622             else
623                 genome_cds += (cds&ie->Limits()).GetLength();
624         }
625 
626         if(gapfilled && genome_cds < 45) {
627             model.Status() |= CGeneModel::eSkipped;
628             model.AddComment("Most CDS in genomic gap");
629             bad_aligns.push_back(model);
630             models.erase(im);
631         }
632     }
633 }
634 
Predict(TGeneModelList & models,TGeneModelList & bad_aligns,TSignedSeqPos left,TSignedSeqPos right)635 void CGnomonAnnotator::Predict(TGeneModelList& models, TGeneModelList& bad_aligns, TSignedSeqPos left, TSignedSeqPos right)
636 {
637     if (GnomonNeeded()) {
638 
639         models.sort(s_AlignSeqOrder);
640 
641         if(!models.empty()) {
642             for(auto it_loop = next(models.begin()); it_loop != models.end(); ) {
643                 auto it = it_loop++;
644                 if(it->RankInGene() != 1 || it->GoodEnoughToBeAnnotation() || it->Type()&CGeneModel::eNested)
645                     continue;
646                 auto it_prev = prev(it);
647                 if(it_prev->RankInGene() != 1 || it_prev->GoodEnoughToBeAnnotation() || it_prev->Type()&CGeneModel::eNested)
648                     continue;
649 
650                 if(it->MaxCdsLimits().IntersectingWith(it_prev->MaxCdsLimits())) {
651                     cerr << "Intersecting alignments " << it->ID() << " " << it_prev->ID() << " " << it->Score() << " " << it_prev->Score() << endl;
652                     auto it_erase = (it->Score() < it_prev->Score()) ? it : it_prev;
653                     it_erase->Status() |= CGeneModel::eSkipped;
654                     it_erase->AddComment("Intersects with other partial");
655                     bad_aligns.push_back(*it_erase);
656                     models.erase(it_erase);
657                 }
658             }
659         }
660 
661         TGeneModelList aligns;
662 
663         FindPartials(models, aligns, ePlus);
664         FindPartials(models, aligns, eMinus);
665 
666         aligns.sort(s_AlignSeqOrder);
667 
668         TGeneModelList models_tmp;
669         Predict(left, right, aligns.begin(), aligns.end(), models_tmp,(left!=0 || wall), wall, left!=0, false, bad_aligns);
670         ITERATE(TGeneModelList, it, models_tmp) {
671             if(!it->Support().empty() || it->RealCdsLen() >= minCdsLen)
672                 models.push_back(*it);
673         }
674     }
675 
676     NON_CONST_ITERATE(TGeneModelList, it, models) {
677         CCDSInfo cds_info = it->GetCdsInfo();
678 
679         // removing fshifts in UTRs
680         TInDels fs;
681         TSignedSeqRange fullcds = cds_info.Cds();
682         ITERATE(TInDels, i, it->FrameShifts()) {
683             if(((i->IsInsertion() || i->IsMismatch()) && Include(fullcds,i->Loc())) ||
684                (i->IsDeletion() && i->Loc() > fullcds.GetFrom() && i->Loc() <= fullcds.GetTo())) {
685                 fs.push_back(*i);
686             }
687         }
688         it->FrameShifts() = fs;
689 
690         // removing pstops in UTRs
691         CCDSInfo::TPStops pstops = cds_info.PStops();
692         cds_info.ClearPStops();
693         ITERATE(CCDSInfo::TPStops, ps, pstops) {
694             if(Include(fullcds,*ps))
695                 cds_info.AddPStop(*ps);
696         }
697         it->SetCdsInfo(cds_info);
698 
699 #ifdef _DEBUG
700         {
701             string protein = it->GetProtein(m_gnomon->GetSeq());
702             int nstar = 0;
703             ITERATE(string, is, protein) {
704                 if(*is == '*')
705                     ++nstar;
706             }
707             int nstop = it->HasStop() ? 1 : 0;
708             ITERATE(CCDSInfo::TPStops, stp, it->GetCdsInfo().PStops()) {
709                 if(stp->m_status != CCDSInfo::eSelenocysteine)
710                     ++nstop;
711             }
712         }
713 #endif
714         if (it->PStop(false) || !it->FrameShifts().empty()) {
715             it->Status() |= CGeneModel::ePseudo;
716         }
717         if(it->OpenCds()) {
718             CCDSInfo cds_info = it->GetCdsInfo();
719             cds_info.SetScore(cds_info.Score(),false);     // kill the Open flag
720             it->SetCdsInfo(cds_info);
721         }
722     }
723 }
724 
RemoveTrailingNs(const CResidueVec & _seq)725 RemoveTrailingNs::RemoveTrailingNs(const CResidueVec& _seq)
726     : seq(_seq)
727 {
728 }
729 
transform_model(CGeneModel & m)730 void RemoveTrailingNs::transform_model(CGeneModel& m)
731 {
732         CAlignMap mrnamap(m.GetAlignMap());
733         CResidueVec vec;
734         mrnamap.EditedSequence(seq, vec);
735 
736         int five_p, three_p;
737         for(five_p=0; five_p < (int)vec.size() && vec[five_p] == 'N'; ++five_p);
738         for(three_p=0; three_p < (int)vec.size() && vec[(int)vec.size()-1-three_p] == 'N'; ++three_p);
739 
740         if(five_p > 0 || three_p > 0) {
741             int left = five_p;
742             int right = three_p;
743             if(m.Strand() == eMinus)
744                 swap(left,right);
745 
746             TSignedSeqRange new_lim(m.Limits());
747             if(left > 0) {
748                 _ASSERT(m.Exons().front().Limits().GetLength() > left);
749                 new_lim.SetFrom(new_lim.GetFrom()+left);
750             }
751             if(right > 0) {
752                 _ASSERT(m.Exons().back().Limits().GetLength() > right);
753                 new_lim.SetTo(new_lim.GetTo()-right);
754             }
755 
756             double score = m.Score();
757             m.Clip(new_lim,CAlignModel::eDontRemoveExons);
758             CCDSInfo cds_info = m.GetCdsInfo();
759             cds_info.SetScore(score, false);
760             m.SetCdsInfo(cds_info);
761 
762             if (m.Type() & (CGeneModel::eChain | CGeneModel::eGnomon)) {
763                 CAlignModel* a = dynamic_cast<CAlignModel*>(&m);
764                 if (a != NULL) {
765                     a->ResetAlignMap();
766                 }
767             }
768         }
769 }
770 
SetupArgDescriptions(CArgDescriptions * arg_desc)771 void CGnomonAnnotatorArgUtil::SetupArgDescriptions(CArgDescriptions* arg_desc)
772 {
773     arg_desc->AddFlag("nognomon","Skips ab initio prediction and ab initio extension of partial chains.");
774     arg_desc->AddKey("param", "param",
775                      "Organism specific parameters",
776                      CArgDescriptions::eInputFile);
777     arg_desc->AddDefaultKey("window","window","Prediction window",CArgDescriptions::eInteger,"200000");
778     arg_desc->AddDefaultKey("margin","margin","The minimal distance between chains to place the end of prediction window",CArgDescriptions::eInteger,"1000");
779     arg_desc->AddFlag("open","Allow partial predictions at the ends of contigs. Used for poorly assembled genomes with lots of unfinished contigs.");
780     arg_desc->AddDefaultKey("mpp","mpp","Penalty for connection two protein containing chains into one model.",CArgDescriptions::eDouble,"10.0");
781     arg_desc->AddFlag("nonconsens","Allows to accept nonconsensus splices starts/stops to complete partial alignmet. If not allowed some partial alignments "
782                       "may be rejected if there is no way to complete them.");
783     arg_desc->AddDefaultKey("ncsp","ncsp","Nonconsensus penalty",CArgDescriptions::eDouble,"25");
784 
785     arg_desc->AddFlag("norep","DO NOT mask lower case letters");
786     arg_desc->AddDefaultKey("mincont","mincont","Contigs shorter than that will be skipped unless they have alignments.",CArgDescriptions::eInteger,"1000");
787 
788     arg_desc->SetCurrentGroup("Prediction tuning");
789     arg_desc->AddFlag("singlest","Allow single exon EST chains as evidence");
790     arg_desc->AddDefaultKey("minlen","minlen","Minimal CDS length for pure ab initio models",CArgDescriptions::eInteger,"100");
791 }
792 
ReadArgs(CGnomonAnnotator * annot,const CArgs & args)793 void CGnomonAnnotatorArgUtil::ReadArgs(CGnomonAnnotator* annot, const CArgs& args)
794 {
795     CNcbiIfstream param_file(args["param"].AsString().c_str());
796     annot->SetHMMParameters(new CHMMParameters(param_file));
797 
798     annot->window = args["window"].AsInteger();
799     annot->margin = args["margin"].AsInteger();
800     annot->wall = !args["open"];
801     annot->mpp = args["mpp"].AsDouble();
802     bool nonconsens = args["nonconsens"];
803     annot->nonconsensp = (nonconsens ? -args["ncsp"].AsDouble() : BadScore());
804     annot->do_gnomon = !args["nognomon"];
805 
806     annot->mincontig = args["mincont"].AsInteger();
807 
808     annot->minCdsLen = args["minlen"].AsInteger();
809 
810     if (!args["norep"])
811         annot->EnableSeqMasking();
812 }
813 END_SCOPE(gnomon)
814 END_NCBI_SCOPE
815 
816