1 /* $Id: annot.cpp 620759 2020-11-30 16:00:57Z souvorov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Vyacheslav Chetvernin
27 *
28 * File Description:
29 *
30 * Builds annotation models out of chained alignments:
31 * selects good chains as alternatively spliced genes,
32 * selects good chains inside other chains introns,
33 * other chains filtered to leave one chain per placement,
34 * gnomon is run to improve chains and predict models in regions w/o chains
35 *
36 */
37
38 #include <ncbi_pch.hpp>
39 #include <corelib/ncbiapp.hpp>
40 #include <corelib/ncbienv.hpp>
41 #include <corelib/ncbiargs.hpp>
42
43 #include <algo/gnomon/annot.hpp>
44 #include "gnomon_engine.hpp"
45 #include <algo/gnomon/gnomon_model.hpp>
46 #include <algo/gnomon/gnomon.hpp>
47 #include <algo/gnomon/id_handler.hpp>
48
49 #include <objects/seqloc/Seq_loc.hpp>
50
51 BEGIN_NCBI_SCOPE
52 USING_SCOPE(objects);
BEGIN_SCOPE(gnomon)53 BEGIN_SCOPE(gnomon)
54
55 CGeneSelector::CGeneSelector()
56 {
57 }
58
CGnomonAnnotator()59 CGnomonAnnotator::CGnomonAnnotator()
60 {
61 }
62
~CGnomonAnnotator()63 CGnomonAnnotator::~CGnomonAnnotator()
64 {
65 }
66
s_AlignScoreOrder(const CGeneModel & ap,const CGeneModel & bp)67 bool s_AlignScoreOrder(const CGeneModel& ap, const CGeneModel& bp)
68 {
69 return (ap.Score() < bp.Score());
70 }
71
RemoveShortHolesAndRescore(TGeneModelList chains)72 void CGnomonAnnotator::RemoveShortHolesAndRescore(TGeneModelList chains)
73 {
74 NON_CONST_ITERATE(TGeneModelList, it, chains) {
75 it->RemoveShortHolesAndRescore(*m_gnomon);
76 }
77 }
78
ExtendJustThisChain(CGeneModel & chain,TSignedSeqPos left,TSignedSeqPos right)79 double CGnomonAnnotator::ExtendJustThisChain(CGeneModel& chain,
80 TSignedSeqPos left, TSignedSeqPos right)
81 {
82 TGeneModelList test_align;
83 test_align.push_back(chain);
84 int l = max((int)left,(int)chain.Limits().GetFrom()-10000);
85 int r = min(right,chain.Limits().GetTo()+10000);
86 cerr << "Testing alignment " << chain.ID() << " in fragment " << l << ' ' << r << endl;
87
88 m_gnomon->ResetRange(l,r);
89 return m_gnomon->Run(test_align, false, false, false, false, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
90 }
91
TryWithoutObviouslyBadAlignments(TGeneModelList & aligns,TGeneModelList & suspect_aligns,TGeneModelList & bad_aligns,bool leftwall,bool rightwall,bool leftanchor,bool rightanchor,TSignedSeqPos left,TSignedSeqPos right,TSignedSeqRange & tested_range)92 double CGnomonAnnotator::TryWithoutObviouslyBadAlignments(TGeneModelList& aligns, TGeneModelList& suspect_aligns, TGeneModelList& bad_aligns,
93 bool leftwall, bool rightwall, bool leftanchor, bool rightanchor,
94 TSignedSeqPos left, TSignedSeqPos right,
95 TSignedSeqRange& tested_range)
96 {
97 bool already_tested = Include(tested_range, TSignedSeqRange(left,right));
98
99 if (already_tested) {
100 for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); it++) {
101 if(left <= it->Limits().GetTo() && it->Limits().GetFrom() <= right)
102 suspect_aligns.push_back(*it);
103 }
104 } else {
105 tested_range = TSignedSeqRange(left,right);
106
107 bool found_bad_cluster = false;
108 for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); ) {
109 if(it->Limits().GetTo() < left || it->Limits().GetFrom() > right) {
110 ++it;
111 continue;
112 }
113
114 if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))==0 &&
115 ExtendJustThisChain(*it, left, right) == BadScore()) {
116 found_bad_cluster = true;
117 cerr << "Deleting alignment " << it->ID() << endl;
118 it->Status() |= CGeneModel::eSkipped;
119 it->AddComment("Bad score prediction alone");
120 bad_aligns.push_back(*it);
121
122 it = aligns.erase(it);
123 continue;
124 }
125 suspect_aligns.push_back(*it++);
126 }
127
128 m_gnomon->ResetRange(left, right);
129 if(found_bad_cluster) {
130 cerr << "Testing w/o bad alignments in fragment " << left << ' ' << right << endl;
131 return m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
132 }
133 }
134 return BadScore();
135 }
136
TryToEliminateOneAlignment(TGeneModelList & suspect_aligns,TGeneModelList & bad_aligns,bool leftwall,bool rightwall,bool leftanchor,bool rightanchor)137 double CGnomonAnnotator::TryToEliminateOneAlignment(TGeneModelList& suspect_aligns, TGeneModelList& bad_aligns,
138 bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
139 {
140 double score = BadScore();
141 for(TGeneModelList::iterator it = suspect_aligns.begin(); it != suspect_aligns.end();) {
142 if((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0) {
143 ++it;
144 continue;
145 }
146 CGeneModel algn = *it;
147 it = suspect_aligns.erase(it);
148
149 cerr << "Testing w/o " << algn.ID();
150 score = m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
151 if (score != BadScore()) {
152 cerr << "- Good. Deleting alignment " << algn.ID() << endl;
153 algn.Status() |= CGeneModel::eSkipped;
154 algn.AddComment("Good score prediction without");
155 bad_aligns.push_back(algn);
156 break;
157 } else {
158 cerr << " - Still bad." << endl;
159 }
160 suspect_aligns.insert(it,algn);
161 }
162 return score;
163 }
164
TryToEliminateAlignmentsFromTail(TGeneModelList & suspect_aligns,TGeneModelList & bad_aligns,bool leftwall,bool rightwall,bool leftanchor,bool rightanchor)165 double CGnomonAnnotator::TryToEliminateAlignmentsFromTail(TGeneModelList& suspect_aligns, TGeneModelList& bad_aligns,
166 bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
167 {
168 double score = BadScore();
169 for (TGeneModelList::iterator it = suspect_aligns.begin(); score == BadScore() && it != suspect_aligns.end(); ) {
170 if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0 || it->GoodEnoughToBeAnnotation()) {
171 ++it;
172 continue;
173 }
174 cerr << "Deleting alignment " << it->ID() << endl;
175 it->Status() |= CGeneModel::eSkipped;
176 it->AddComment("Bad score prediction in combination");
177 bad_aligns.push_back(*it);
178 it = suspect_aligns.erase(it);
179
180 cerr << "Testing fragment " << left << ' ' << right << endl;
181 score = m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
182 }
183 return score;
184 }
185
Predict(TSignedSeqPos llimit,TSignedSeqPos rlimit,TGeneModelList::const_iterator il,TGeneModelList::const_iterator ir,TGeneModelList & models,bool leftmostwall,bool rightmostwall,bool leftmostanchor,bool rightmostanchor,TGeneModelList & bad_aligns)186 void CGnomonAnnotator::Predict(TSignedSeqPos llimit, TSignedSeqPos rlimit, TGeneModelList::const_iterator il, TGeneModelList::const_iterator ir, TGeneModelList& models,
187 bool leftmostwall, bool rightmostwall, bool leftmostanchor, bool rightmostanchor, TGeneModelList& bad_aligns)
188 {
189 TGeneModelList aligns(il, ir);
190
191 Int8 left = llimit;
192 bool leftwall = leftmostwall;
193 bool leftanchor = leftmostanchor;
194
195 Int8 right = llimit+window;
196 bool rightwall = false;
197 bool rightanchor = false;
198
199 Int8 prev_bad_right = rlimit+1;
200 bool do_it_again = false;
201
202 m_gnomon->ResetRange(left, right);
203
204 RemoveShortHolesAndRescore(aligns);
205
206 TGeneModelList suspect_aligns;
207 TSignedSeqRange tested_range;
208
209 TIVec busy_spots(rlimit+1,0);
210 ITERATE(TGeneModelList, it_c, aligns) {
211 int a = max(0,it_c->Limits().GetFrom()-margin);
212 int b = min(rlimit,it_c->Limits().GetTo()+margin);
213 for(int i = a; i<=b; ++i)
214 busy_spots[i] = 1;
215 }
216
217 do {
218 for( ; right < rlimit && busy_spots[right] != 0; ++right);
219
220 if (right + (right-left)/2 >= rlimit) {
221 right = rlimit;
222 rightwall = rightmostwall;
223 rightanchor = rightmostanchor;
224 } else {
225 rightwall = false;
226 rightanchor = false;
227 }
228
229 if (do_it_again)
230 rightwall = true;
231
232 double score = BadScore();
233
234 if (right < prev_bad_right) {
235 suspect_aligns.clear();
236
237 m_gnomon->ResetRange(left,right);
238
239 cerr << left << ' ' << right << ' ' << m_gnomon->GetGCcontent() << endl;
240
241 score = m_gnomon->Run(aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs);
242
243 if(score == BadScore()) {
244 cerr << "Inconsistent alignments in fragment " << left << ' ' << right << '\n';
245
246 score = TryWithoutObviouslyBadAlignments(aligns, suspect_aligns, bad_aligns,
247 leftwall, rightwall, leftanchor, rightanchor,
248 left, right, tested_range);
249 }
250
251 if(score == BadScore()) {
252
253 prev_bad_right = right;
254 right = (left+right)/2;
255
256 continue;
257 }
258 } else {
259 suspect_aligns.sort(s_AlignScoreOrder);
260
261 score = TryToEliminateOneAlignment(suspect_aligns, bad_aligns,
262 leftwall, rightwall, leftanchor, rightanchor);
263 if (score == BadScore())
264 score = TryToEliminateAlignmentsFromTail(suspect_aligns, bad_aligns,
265 leftwall, rightwall, leftanchor, rightanchor);
266 if(score == BadScore()) {
267 cerr << "!!! BAD SCORE EVEN WITH FINISHED ALIGNMENTS !!! " << endl;
268 ITERATE(TGeneModelList, it, suspect_aligns) {
269 if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))==0 && it->GoodEnoughToBeAnnotation())
270 models.push_back(*it);
271 }
272 }
273 }
274 prev_bad_right = rlimit+1;
275
276 list<CGeneModel> genes = m_gnomon->GetGenes();
277
278 TSignedSeqPos partial_start = right;
279
280 if (right < rlimit && !genes.empty() && !genes.back().RightComplete() && !do_it_again) {
281 partial_start = genes.back().LeftComplete() ? genes.back().RealCdsLimits().GetFrom() : left;
282 _ASSERT ( partial_start < right );
283 genes.pop_back();
284 }
285
286 do_it_again = false;
287
288 if (!genes.empty()) {
289 left = genes.back().ReadingFrame().GetTo()+1;
290 leftanchor = true;
291 } else if (partial_start < left+1000) {
292 do_it_again=true;
293 } else if (partial_start < right) {
294 int new_left = partial_start-100;
295 for( ; new_left > left && busy_spots[new_left] != 0; --new_left);
296 if(new_left > left+1000) {
297 left = new_left;
298 leftanchor = false;
299 } else {
300 do_it_again=true;
301 }
302 } else {
303 left = (left+right)/2+1;
304 leftanchor = false;
305 }
306
307 models.splice(models.end(), genes);
308
309 if (right >= rlimit)
310 break;
311
312 if (!do_it_again)
313 leftwall = true;
314
315 right = left + window;
316
317 } while(left <= rlimit);
318 }
319
WalledCdsLimits(const CGeneModel & a)320 TSignedSeqRange WalledCdsLimits(const CGeneModel& a)
321 {
322 return ((a.Type() & CGeneModel::eWall)!=0) ? a.Limits() : a.MaxCdsLimits();
323 }
324
GetWallLimits(const CGeneModel & m)325 TSignedSeqRange GetWallLimits(const CGeneModel& m)
326 {
327 return m.RealCdsLimits().Empty() ? m.Limits() : m.RealCdsLimits();
328 }
329
s_AlignSeqOrder(const CGeneModel & ap,const CGeneModel & bp)330 bool s_AlignSeqOrder(const CGeneModel& ap, const CGeneModel& bp)
331 {
332 TSignedSeqRange a = GetWallLimits(ap);
333 TSignedSeqRange b = GetWallLimits(bp);
334
335 return (a.GetFrom() != b.GetFrom() ?
336 a.GetFrom() < b.GetFrom() :
337 a.GetTo() > b.GetTo()
338 );
339 }
340
341 typedef map<int,CGeneModel> TNestedGenes;
342
SaveWallModel(auto_ptr<CGeneModel> & wall_model,TNestedGenes & nested_genes,TGeneModelList & aligns)343 void SaveWallModel(auto_ptr<CGeneModel>& wall_model, TNestedGenes& nested_genes, TGeneModelList& aligns)
344 {
345 if (wall_model.get() != 0 && wall_model->Type() == CGeneModel::eWall+CGeneModel::eGnomon) {
346 aligns.push_back(*wall_model);
347 }
348 ITERATE(TNestedGenes, nested, nested_genes) {
349 aligns.push_back(nested->second);
350 }
351 nested_genes.clear();
352 }
353
FindPartials(TGeneModelList & models,TGeneModelList & aligns,EStrand strand)354 void FindPartials(TGeneModelList& models, TGeneModelList& aligns, EStrand strand)
355 {
356 TSignedSeqPos right = -1;
357 auto_ptr<CGeneModel> wall_model;
358 TNestedGenes nested_genes;
359
360 for (TGeneModelList::iterator loop_it = models.begin(); loop_it != models.end();) {
361 TGeneModelList::iterator ir = loop_it;
362 ++loop_it;
363
364 if (ir->Strand() != strand) {
365 continue;
366 }
367
368 TSignedSeqRange limits = GetWallLimits(*ir);
369
370 if ( right < limits.GetFrom() ) { // new cluster
371 SaveWallModel(wall_model, nested_genes, aligns);
372 }
373
374 if ((ir->Type() & CGeneModel::eNested) !=0) {
375 // ir->SetType( ir->Type() - CGeneModel::eNested );
376 TNestedGenes::iterator nested_wall = nested_genes.find(ir->GeneID());
377 if (nested_wall == nested_genes.end()) {
378 CGeneModel new_nested_wall(ir->Strand(),ir->ID(),CGeneModel::eNested+CGeneModel::eGnomon);
379 new_nested_wall.SetGeneID(ir->GeneID());
380 new_nested_wall.AddExon(limits);
381 nested_genes[ir->GeneID()] = new_nested_wall;
382 } else if (limits.GetTo() - nested_wall->second.Limits().GetTo() > 0) {
383 nested_wall->second.ExtendRight(limits.GetTo()- nested_wall->second.Limits().GetTo());
384 }
385 continue;
386 }
387
388 if ( right < limits.GetFrom() ) { // new cluster
389 wall_model.reset( new CGeneModel(ir->Strand(),ir->ID(),CGeneModel::eWall+CGeneModel::eGnomon));
390 wall_model->SetGeneID(ir->GeneID());
391 wall_model->AddExon(limits);
392 }
393 /* doesn't work for genes which onclude coding and non coding together
394 else { // same cluster
395 _ASSERT( wall_model.get() != 0 && wall_model->GeneID()==ir->GeneID() );
396 }
397 */
398
399 right = max(right, limits.GetTo());
400 if (ir->RankInGene() == 1 && !ir->GoodEnoughToBeAnnotation()) {
401 ir->Status() &= ~CGeneModel::eFullSupCDS;
402 aligns.splice(aligns.end(), models, ir);
403 wall_model->SetType(CGeneModel::eGnomon);
404 } else if (limits.GetTo()- wall_model->Limits().GetTo() > 0) {
405 wall_model->ExtendRight(limits.GetTo() - wall_model->Limits().GetTo());
406 }
407 }
408 SaveWallModel(wall_model, nested_genes, aligns);
409 }
410
Predict(TGeneModelList & models,TGeneModelList & bad_aligns)411 void CGnomonAnnotator::Predict(TGeneModelList& models, TGeneModelList& bad_aligns)
412 {
413 if (models.empty() && int(m_gnomon->GetSeq().size()) < mincontig)
414 return;
415
416 if (GnomonNeeded()) {
417
418 //extend partial nested
419
420 typedef list<TGeneModelList::iterator> TIterList;
421 typedef map<Int8,TIterList> TGIDIterlist;
422 TGIDIterlist genes;
423 NON_CONST_ITERATE(TGeneModelList, im, models) {
424 if(im->Type()&CGeneModel::eNested)
425 im->SetType(im->Type()-CGeneModel::eNested); // ignore flag set in chainer
426 genes[im->GeneID()].push_back(im);
427 }
428
429 set<TSignedSeqRange> hosting_intervals;
430 ITERATE(TGIDIterlist, ig, genes) { // first - ID; second - list
431 bool coding_gene = false;
432 ITERATE(TIterList, im, ig->second) {
433 if((*im)->ReadingFrame().NotEmpty()) {
434 coding_gene = true;
435 break;
436 }
437 }
438
439 TSignedSeqRange gene_lim_for_nested;
440 ITERATE(TIterList, im, ig->second) {
441 const CGeneModel& ai = **im;
442 if(coding_gene && ai.ReadingFrame().Empty())
443 continue;
444 TSignedSeqRange model_lim_for_nested = ai.Limits();
445 if(ai.ReadingFrame().NotEmpty())
446 model_lim_for_nested = ai.OpenCds() ? ai.MaxCdsLimits() : ai.RealCdsLimits(); // 'open' could be only a single variant gene
447 gene_lim_for_nested += model_lim_for_nested;
448 }
449
450 vector<int> grange(gene_lim_for_nested.GetLength(),1);
451 ITERATE(TIterList, im, ig->second) { // exclude all positions included in CDS (any exons for not coding genes) and holes
452 const CGeneModel& ai = **im;
453 if(coding_gene && ai.ReadingFrame().Empty())
454 continue;
455
456 TSignedSeqRange model_lim_for_nested = ai.Limits();
457 if(ai.ReadingFrame().NotEmpty())
458 model_lim_for_nested = ai.OpenCds() ? ai.MaxCdsLimits() : ai.RealCdsLimits(); // 'open' could be only a single variant gene
459
460 for(int i = 0; i < (int)ai.Exons().size(); ++i) {
461 TSignedSeqRange overlap = (model_lim_for_nested & ai.Exons()[i].Limits());
462 for(int j = overlap.GetFrom(); j <= overlap.GetTo(); ++j)
463 grange[j-gene_lim_for_nested.GetFrom()] = 0;
464 }
465
466 for(int i = 1; i < (int)ai.Exons().size(); ++i) {
467 if(!ai.Exons()[i-1].m_ssplice || !ai.Exons()[i].m_fsplice) {
468 TSignedSeqRange hole(ai.Exons()[i-1].Limits().GetTo()+1,ai.Exons()[i].Limits().GetFrom()-1);
469 _ASSERT(Include(model_lim_for_nested, hole));
470 for(int j = hole.GetFrom(); j <= hole.GetTo(); ++j)
471 grange[j-gene_lim_for_nested.GetFrom()] = 0;
472 }
473 }
474 }
475 _ASSERT(grange.front() == 0 && grange.back() == 0);
476
477 int left = -1;
478 int right;
479 for(int j = 0; j < (int)grange.size(); ++j) {
480 if(left < 0) {
481 if(grange[j] == 1) {
482 left = j;
483 right = j;
484 }
485 } else if(grange[j] == 1) {
486 right = j;
487 } else {
488 TSignedSeqRange interval(left+gene_lim_for_nested.GetFrom(),right+gene_lim_for_nested.GetFrom());
489 hosting_intervals.insert(interval);
490 left = -1;
491 }
492 }
493 }
494
495 typedef map<TSignedSeqRange,TIterList> TRangeModels;
496 TRangeModels nested_models;
497 ITERATE(TGIDIterlist, ig, genes) { // first - ID; second - list
498 TGeneModelList::iterator nested_modeli = ig->second.front();
499 if(!nested_modeli->GoodEnoughToBeAnnotation()) {
500 _ASSERT(ig->second.size() == 1);
501 TSignedSeqRange lim_for_nested = nested_modeli->RealCdsLimits().Empty() ? nested_modeli->Limits() : nested_modeli->RealCdsLimits();
502
503 TSignedSeqRange hosting_interval;
504 ITERATE(set<TSignedSeqRange>, ii, hosting_intervals) {
505 TSignedSeqRange interval = *ii;
506 if(Include(interval,lim_for_nested)) {
507 if(hosting_interval.Empty())
508 hosting_interval = interval;
509 else
510 hosting_interval = (hosting_interval&interval);
511 }
512 }
513
514 if(hosting_interval.NotEmpty()) {
515 TIterList nested(1,nested_modeli);
516 ITERATE(TGIDIterlist, igg, genes) { // first - ID; second - list
517 const TIterList& other_gene = igg->second;
518 if(igg == ig || !other_gene.front()->GoodEnoughToBeAnnotation())
519 continue;
520
521 bool coding_gene = false;
522 ITERATE(TIterList, im, other_gene) {
523 if((*im)->ReadingFrame().NotEmpty()) {
524 coding_gene = true;
525 break;
526 }
527 }
528
529 TSignedSeqRange finished_interval;
530 ITERATE(TIterList, im, other_gene) {
531 const CGeneModel& ai = **im;
532 if(coding_gene && ai.ReadingFrame().Empty())
533 continue;
534
535 finished_interval += coding_gene ? ai.RealCdsLimits() : ai.Limits();
536 }
537 if(!finished_interval.IntersectingWith(hosting_interval) || Include(finished_interval,hosting_interval))
538 continue;
539
540 if(Precede(finished_interval,lim_for_nested)) { // before partial model
541 hosting_interval.SetFrom(finished_interval.GetTo());
542 } else if(Precede(lim_for_nested,finished_interval)) { // after partial model
543 hosting_interval.SetTo(finished_interval.GetFrom());
544 } else if(CModelCompare::RangeNestedInIntron(finished_interval, *nested_modeli, true)) {
545 //} else { // overlaps partial model
546 // _ASSERT(CModelCompare::RangeNestedInIntron(finished_interval, *nested_modeli, true));
547 ITERATE(TIterList, im, other_gene) {
548 nested.push_back(*im);
549 }
550 }
551 }
552 _ASSERT(hosting_interval.NotEmpty());
553 nested_models[hosting_interval].splice(nested_models[hosting_interval].begin(),nested);
554 }
555 }
556 }
557
558 bool scaffold_wall = wall;
559 wall = true;
560 ITERATE(TRangeModels, i, nested_models) {
561 TSignedSeqRange hosting_interval = i->first;
562
563 TGeneModelList nested;
564 set<Int8> included_complete_models;
565 ITERATE(TIterList, im, i->second) {
566 nested.push_back(**im);
567
568 if(!(*im)->GoodEnoughToBeAnnotation()) {
569 if(((*im)->Type()&CGeneModel::eNested)) {
570 nested.back().SetType(nested.back().Type()-CGeneModel::eNested); // remove flag to allow ab initio extension
571 }
572
573 if(nested.back().HasStart() && !Include(hosting_interval,nested.back().MaxCdsLimits())) {
574 CCDSInfo cds = nested.back().GetCdsInfo();
575 if(nested.back().Strand() == ePlus)
576 cds.Set5PrimeCdsLimit(cds.Start().GetFrom());
577 else
578 cds.Set5PrimeCdsLimit(cds.Start().GetTo());
579 nested.back().SetCdsInfo(cds);
580 }
581
582
583 #ifdef _DEBUG
584 nested.back().AddComment("partialnested");
585 #endif
586
587 models.erase(*im);
588 } else {
589 // (*im)->SetType((*im)->Type()|CGeneModel::eNested);
590 included_complete_models.insert((*im)->ID());
591 }
592 }
593
594 cerr << "Interval " << hosting_interval << '\t' << nested.size() << endl;
595
596 Predict(nested, bad_aligns, hosting_interval.GetFrom()+1,hosting_interval.GetTo()-1);
597
598 NON_CONST_ITERATE(TGeneModelList, im, nested) {
599 if(!im->Support().empty()) {
600 im->SetType(im->Type()|CGeneModel::eNested);
601 if(im->ID() == 0 || included_complete_models.find(im->ID()) == included_complete_models.end()) // include only models which we tried to extend
602 models.push_back(*im);
603 }
604 }
605 }
606 wall = scaffold_wall;
607 }
608
609 Predict(models, bad_aligns, 0, TSignedSeqPos(m_gnomon->GetSeq().size())-1);
610
611 ERASE_ITERATE(TGeneModelList, im, models) {
612 CGeneModel& model = *im;
613 TSignedSeqRange cds = model.RealCdsLimits();
614 if(cds.Empty())
615 continue;
616
617 bool gapfilled = false;
618 int genome_cds = 0;
619 ITERATE(CGeneModel::TExons, ie, model.Exons()) {
620 if(ie->m_fsplice_sig == "XX" || ie->m_ssplice_sig == "XX")
621 gapfilled = true;
622 else
623 genome_cds += (cds&ie->Limits()).GetLength();
624 }
625
626 if(gapfilled && genome_cds < 45) {
627 model.Status() |= CGeneModel::eSkipped;
628 model.AddComment("Most CDS in genomic gap");
629 bad_aligns.push_back(model);
630 models.erase(im);
631 }
632 }
633 }
634
Predict(TGeneModelList & models,TGeneModelList & bad_aligns,TSignedSeqPos left,TSignedSeqPos right)635 void CGnomonAnnotator::Predict(TGeneModelList& models, TGeneModelList& bad_aligns, TSignedSeqPos left, TSignedSeqPos right)
636 {
637 if (GnomonNeeded()) {
638
639 models.sort(s_AlignSeqOrder);
640
641 if(!models.empty()) {
642 for(auto it_loop = next(models.begin()); it_loop != models.end(); ) {
643 auto it = it_loop++;
644 if(it->RankInGene() != 1 || it->GoodEnoughToBeAnnotation() || it->Type()&CGeneModel::eNested)
645 continue;
646 auto it_prev = prev(it);
647 if(it_prev->RankInGene() != 1 || it_prev->GoodEnoughToBeAnnotation() || it_prev->Type()&CGeneModel::eNested)
648 continue;
649
650 if(it->MaxCdsLimits().IntersectingWith(it_prev->MaxCdsLimits())) {
651 cerr << "Intersecting alignments " << it->ID() << " " << it_prev->ID() << " " << it->Score() << " " << it_prev->Score() << endl;
652 auto it_erase = (it->Score() < it_prev->Score()) ? it : it_prev;
653 it_erase->Status() |= CGeneModel::eSkipped;
654 it_erase->AddComment("Intersects with other partial");
655 bad_aligns.push_back(*it_erase);
656 models.erase(it_erase);
657 }
658 }
659 }
660
661 TGeneModelList aligns;
662
663 FindPartials(models, aligns, ePlus);
664 FindPartials(models, aligns, eMinus);
665
666 aligns.sort(s_AlignSeqOrder);
667
668 TGeneModelList models_tmp;
669 Predict(left, right, aligns.begin(), aligns.end(), models_tmp,(left!=0 || wall), wall, left!=0, false, bad_aligns);
670 ITERATE(TGeneModelList, it, models_tmp) {
671 if(!it->Support().empty() || it->RealCdsLen() >= minCdsLen)
672 models.push_back(*it);
673 }
674 }
675
676 NON_CONST_ITERATE(TGeneModelList, it, models) {
677 CCDSInfo cds_info = it->GetCdsInfo();
678
679 // removing fshifts in UTRs
680 TInDels fs;
681 TSignedSeqRange fullcds = cds_info.Cds();
682 ITERATE(TInDels, i, it->FrameShifts()) {
683 if(((i->IsInsertion() || i->IsMismatch()) && Include(fullcds,i->Loc())) ||
684 (i->IsDeletion() && i->Loc() > fullcds.GetFrom() && i->Loc() <= fullcds.GetTo())) {
685 fs.push_back(*i);
686 }
687 }
688 it->FrameShifts() = fs;
689
690 // removing pstops in UTRs
691 CCDSInfo::TPStops pstops = cds_info.PStops();
692 cds_info.ClearPStops();
693 ITERATE(CCDSInfo::TPStops, ps, pstops) {
694 if(Include(fullcds,*ps))
695 cds_info.AddPStop(*ps);
696 }
697 it->SetCdsInfo(cds_info);
698
699 #ifdef _DEBUG
700 {
701 string protein = it->GetProtein(m_gnomon->GetSeq());
702 int nstar = 0;
703 ITERATE(string, is, protein) {
704 if(*is == '*')
705 ++nstar;
706 }
707 int nstop = it->HasStop() ? 1 : 0;
708 ITERATE(CCDSInfo::TPStops, stp, it->GetCdsInfo().PStops()) {
709 if(stp->m_status != CCDSInfo::eSelenocysteine)
710 ++nstop;
711 }
712 }
713 #endif
714 if (it->PStop(false) || !it->FrameShifts().empty()) {
715 it->Status() |= CGeneModel::ePseudo;
716 }
717 if(it->OpenCds()) {
718 CCDSInfo cds_info = it->GetCdsInfo();
719 cds_info.SetScore(cds_info.Score(),false); // kill the Open flag
720 it->SetCdsInfo(cds_info);
721 }
722 }
723 }
724
RemoveTrailingNs(const CResidueVec & _seq)725 RemoveTrailingNs::RemoveTrailingNs(const CResidueVec& _seq)
726 : seq(_seq)
727 {
728 }
729
transform_model(CGeneModel & m)730 void RemoveTrailingNs::transform_model(CGeneModel& m)
731 {
732 CAlignMap mrnamap(m.GetAlignMap());
733 CResidueVec vec;
734 mrnamap.EditedSequence(seq, vec);
735
736 int five_p, three_p;
737 for(five_p=0; five_p < (int)vec.size() && vec[five_p] == 'N'; ++five_p);
738 for(three_p=0; three_p < (int)vec.size() && vec[(int)vec.size()-1-three_p] == 'N'; ++three_p);
739
740 if(five_p > 0 || three_p > 0) {
741 int left = five_p;
742 int right = three_p;
743 if(m.Strand() == eMinus)
744 swap(left,right);
745
746 TSignedSeqRange new_lim(m.Limits());
747 if(left > 0) {
748 _ASSERT(m.Exons().front().Limits().GetLength() > left);
749 new_lim.SetFrom(new_lim.GetFrom()+left);
750 }
751 if(right > 0) {
752 _ASSERT(m.Exons().back().Limits().GetLength() > right);
753 new_lim.SetTo(new_lim.GetTo()-right);
754 }
755
756 double score = m.Score();
757 m.Clip(new_lim,CAlignModel::eDontRemoveExons);
758 CCDSInfo cds_info = m.GetCdsInfo();
759 cds_info.SetScore(score, false);
760 m.SetCdsInfo(cds_info);
761
762 if (m.Type() & (CGeneModel::eChain | CGeneModel::eGnomon)) {
763 CAlignModel* a = dynamic_cast<CAlignModel*>(&m);
764 if (a != NULL) {
765 a->ResetAlignMap();
766 }
767 }
768 }
769 }
770
SetupArgDescriptions(CArgDescriptions * arg_desc)771 void CGnomonAnnotatorArgUtil::SetupArgDescriptions(CArgDescriptions* arg_desc)
772 {
773 arg_desc->AddFlag("nognomon","Skips ab initio prediction and ab initio extension of partial chains.");
774 arg_desc->AddKey("param", "param",
775 "Organism specific parameters",
776 CArgDescriptions::eInputFile);
777 arg_desc->AddDefaultKey("window","window","Prediction window",CArgDescriptions::eInteger,"200000");
778 arg_desc->AddDefaultKey("margin","margin","The minimal distance between chains to place the end of prediction window",CArgDescriptions::eInteger,"1000");
779 arg_desc->AddFlag("open","Allow partial predictions at the ends of contigs. Used for poorly assembled genomes with lots of unfinished contigs.");
780 arg_desc->AddDefaultKey("mpp","mpp","Penalty for connection two protein containing chains into one model.",CArgDescriptions::eDouble,"10.0");
781 arg_desc->AddFlag("nonconsens","Allows to accept nonconsensus splices starts/stops to complete partial alignmet. If not allowed some partial alignments "
782 "may be rejected if there is no way to complete them.");
783 arg_desc->AddDefaultKey("ncsp","ncsp","Nonconsensus penalty",CArgDescriptions::eDouble,"25");
784
785 arg_desc->AddFlag("norep","DO NOT mask lower case letters");
786 arg_desc->AddDefaultKey("mincont","mincont","Contigs shorter than that will be skipped unless they have alignments.",CArgDescriptions::eInteger,"1000");
787
788 arg_desc->SetCurrentGroup("Prediction tuning");
789 arg_desc->AddFlag("singlest","Allow single exon EST chains as evidence");
790 arg_desc->AddDefaultKey("minlen","minlen","Minimal CDS length for pure ab initio models",CArgDescriptions::eInteger,"100");
791 }
792
ReadArgs(CGnomonAnnotator * annot,const CArgs & args)793 void CGnomonAnnotatorArgUtil::ReadArgs(CGnomonAnnotator* annot, const CArgs& args)
794 {
795 CNcbiIfstream param_file(args["param"].AsString().c_str());
796 annot->SetHMMParameters(new CHMMParameters(param_file));
797
798 annot->window = args["window"].AsInteger();
799 annot->margin = args["margin"].AsInteger();
800 annot->wall = !args["open"];
801 annot->mpp = args["mpp"].AsDouble();
802 bool nonconsens = args["nonconsens"];
803 annot->nonconsensp = (nonconsens ? -args["ncsp"].AsDouble() : BadScore());
804 annot->do_gnomon = !args["nognomon"];
805
806 annot->mincontig = args["mincont"].AsInteger();
807
808 annot->minCdsLen = args["minlen"].AsInteger();
809
810 if (!args["norep"])
811 annot->EnableSeqMasking();
812 }
813 END_SCOPE(gnomon)
814 END_NCBI_SCOPE
815
816