1 /* $Id: chainer.cpp 635425 2021-08-03 16:41:35Z fukanchi $
2 ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alexandre Souvorov
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbiargs.hpp>
36
37 #include <algo/gnomon/chainer.hpp>
38 #include <algo/gnomon/id_handler.hpp>
39 #include <algo/gnomon/gnomon_exception.hpp>
40 #include <algo/gnomon/glb_align.hpp>
41
42 #include <util/sequtil/sequtil_manip.hpp>
43
44 #include <algo/gnomon/gnomon_model.hpp>
45 #include <algo/gnomon/gnomon.hpp>
46 #include <algo/gnomon/annot.hpp>
47
48 #include <map>
49 #include <sstream>
50 #include <tuple>
51
52 #include <objects/general/Object_id.hpp>
53 #include <objmgr/object_manager.hpp>
54 #include <objmgr/feat_ci.hpp>
55 #include <objmgr/util/sequence.hpp>
56
57 #include "gnomon_seq.hpp"
58
59
60 BEGIN_SCOPE(ncbi)
BEGIN_SCOPE(gnomon)61 BEGIN_SCOPE(gnomon)
62
63 bool BelongToExon(const CGeneModel::TExons& exons, int pos) {
64 ITERATE(CGeneModel::TExons, i, exons) {
65 if(Include(i->Limits(),pos))
66 return true;
67 }
68 return false;
69 }
70
71 class CChain;
72 typedef list<CChain> TChainList;
73 typedef list<CChain*> TChainPointerList;
74
75
76 struct SChainMember;
77 typedef vector<SChainMember*> TContained;
78
79 typedef map<Int8,CAlignModel*> TOrigAligns;
80 typedef map<Int8,CGeneModel> TUnmodAligns;
81 struct SFShiftsCluster;
82 class CChainMembers;
83
84 class CGene;
85
86 class CChainer::CChainerImpl {
87
88 private:
89 CChainerImpl(CRef<CHMMParameters>& hmm_params, unique_ptr<CGnomonEngine>& gnomon, const CAlignMap& edited_contig_map, const TSignedSeqRange& limits, const string& m_contig_acc);
90 void SetGenomicRange(const TAlignModelList& alignments);
91 void SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments);
92
93 void FilterOutChimeras(TGeneModelList& clust);
94
95 TGeneModelList MakeChains(TGeneModelList& models, bool coding_estimates_only);
96 void FilterOutBadScoreChainsHavingBetterCompatibles(TGeneModelList& chains);
97 void CombineCompatibleChains(TChainList& chains);
98 void SetFlagsForChains(TChainList& chains);
99 SChainMember* FindOptimalChainForProtein(TContained& pointers_all, vector<CGeneModel*>& parts, CGeneModel& palign);
100 void CreateChainsForPartialProteins(TChainList& chains, TContained& pointers, TGeneModelList& unma_aligns, CChainMembers& unma_members);
101 void CutParts(TGeneModelList& clust);
102 bool CanIncludeJinI(const SChainMember& mi, const SChainMember& mj);
103 void IncludeInContained(SChainMember& big, SChainMember& small);
104 void FindContainedAlignments(TContained& pointers);
105 void DuplicateNotOriented(CChainMembers& pointers, TGeneModelList& clust);
106 void Duplicate5pendsAndShortCDSes(CChainMembers& pointers);
107 void ReplicatePStops(CChainMembers& pointers);
108 void ScoreCdnas(CChainMembers& pointers);
109 void DuplicateUTRs(CChainMembers& pointers);
110 void CalculateSpliceWeights(CChainMembers& pointers);
111 bool LRCanChainItoJ(int& delta_cds, double& delta_num, double& delta_splice_num, SChainMember& mi, SChainMember& mj, TContained& contained);
112 void LRIinit(SChainMember& mi);
113 void LeftRight(TContained& pointers);
114 void RightLeft(TContained& pointers);
115 double GoodCDNAScore(const CGeneModel& algn);
116 void RemovePoorCds(CGeneModel& algn, double minscor);
117 void SkipReason(CGeneModel* orig_align, const string& comment);
118 bool AddIfCompatible(set<SFShiftsCluster>& fshift_clusters, const CGeneModel& algn);
119 bool FsTouch(const TSignedSeqRange& lim, const CInDelInfo& fs);
120 void SplitAlignmentsByStrand(const TGeneModelList& clust, TGeneModelList& clust_plus, TGeneModelList& clust_minus);
121
122 void FindGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
123 void ReplacePseudoGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
124 void FindAltsForGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
125 void PlaceAllYouCan(list<CGene>& alts, TChainPointerList& not_placed_yet, TChainPointerList& rejected);
126 enum ECompat { eNotCompatible, eAlternative, eNested, eExternal, eOtherGene };
127 ECompat CheckCompatibility(const CGene& gene, const CChain& algn);
128 list<CGene> FindGenes(TChainList& cls);
129 void FilterOutSimilarsWithLowerScore(TChainPointerList& not_placed_yet, TChainPointerList& rejected);
130 void FilterOutTandemOverlap(TChainPointerList& not_placed_yet, TChainPointerList& rejected, double fraction);
131 void TrimAlignmentsIncludedInDifferentGenes(list<CGene>& genes);
132
133
134 CRef<CHMMParameters>& m_hmm_params;
135 unique_ptr<CGnomonEngine>& m_gnomon;
136 const CAlignMap& m_edited_contig_map;
137 const TSignedSeqRange& m_limits;
138 const string& m_contig_acc;
139
140
141 SMinScor minscor;
142 int intersect_limit;
143 int trim;
144 map<string,TSignedSeqRange> mrnaCDS;
145 map<string, pair<bool,bool> > prot_complet;
146 double mininframefrac;
147 bool no5pextension;
148
149 int min_cap_weight;
150 int min_cap_blob;
151 int min_polya_weight;
152 int min_polya_blob;
153 int max_dist;
154 double secondary_peak;
155 double tertiary_peak;
156 double tertiary_peak_coverage;
157 int min_flank_exon;
158
159 int minpolya;
160 bool use_confirmed_ends;
161 TIntMap confirmed_ends; // [splice], end
162
163 TOrigAligns orig_aligns;
164 TUnmodAligns unmodified_aligns;
165
166 map<TSignedSeqRange,int> mrna_count;
167 map<TSignedSeqRange,int> est_count;
168 map<TSignedSeqRange,int> rnaseq_count;
169 bool has_rnaseq;
170 set<TSignedSeqRange> oriented_introns_plus;
171 set<TSignedSeqRange> oriented_introns_minus;
172
173 double altfrac;
174 int composite;
175 bool allow_opposite_strand;
176 bool allow_partialalts;
177 int tolerance;
178
179 int m_idnext;
180 int m_idinc;
181
182 TInDels all_frameshifts;
183
184 int flex_len;
185
186 friend class CChainer;
187 friend class CChainerArgUtil;
188 };
189
CGnomonAnnotator_Base()190 CGnomonAnnotator_Base::CGnomonAnnotator_Base() : m_masking(false) { }
191
~CGnomonAnnotator_Base()192 CGnomonAnnotator_Base::~CGnomonAnnotator_Base(){ }
193
EnableSeqMasking()194 void CGnomonAnnotator_Base::EnableSeqMasking()
195 {
196 m_masking = true;
197 }
198
CChainer()199 CChainer::CChainer()
200 {
201 m_data.reset( new CChainerImpl(m_hmm_params, m_gnomon, m_edited_contig_map, m_limits, m_contig_acc) );
202 }
203
~CChainer()204 CChainer::~CChainer()
205 {
206 }
207
CChainerImpl(CRef<CHMMParameters> & hmm_params,unique_ptr<CGnomonEngine> & gnomon,const CAlignMap & edited_contig_map,const TSignedSeqRange & limits,const string & contig_acc)208 CChainer::CChainerImpl::CChainerImpl(CRef<CHMMParameters>& hmm_params, unique_ptr<CGnomonEngine>& gnomon, const CAlignMap& edited_contig_map, const TSignedSeqRange& limits, const string& contig_acc)
209 :m_hmm_params(hmm_params), m_gnomon(gnomon), m_edited_contig_map(edited_contig_map), m_limits(limits), m_contig_acc(contig_acc), m_idnext(1), m_idinc(1)
210 {
211 }
212
MakeChains(TGeneModelList & models,bool coding_estimates_only)213 TGeneModelList CChainer::MakeChains(TGeneModelList& models, bool coding_estimates_only)
214 {
215 return m_data->MakeChains(models, coding_estimates_only);
216 }
217
218 enum {
219 eCDS,
220 eLeftUTR,
221 eRightUTR
222 };
223
224 typedef set<SChainMember*> TMemberPtrSet;
225
226 struct SChainMember
227 {
SChainMemberSChainMember228 SChainMember() :
229 m_align(0), m_cds_info(0), m_align_map(0), m_left_member(0), m_right_member(0), m_sink_for_contained(0),
230 m_copy(0), m_contained(0), m_identical_count(0),
231 m_left_num(0), m_right_num(0), m_num(0),
232 m_splice_weight(0), m_left_splice_num(0), m_right_splice_num(0), m_splice_num(0),
233 m_type(eCDS), m_left_cds(0), m_right_cds(0), m_cds(0), m_included(false), m_postponed(false),
234 m_marked_for_deletion(false), m_marked_for_retention(false), m_restricted_to_start(false),
235 m_gapped_connection(false), m_fully_connected_to_part(-1), m_not_for_chaining(false),
236 m_rlimb(numeric_limits<int>::max()), m_llimb(numeric_limits<int>::max()), m_orig_align(0), m_unmd_align(0), m_mem_id(0) {}
237
238 TContained CollectContainedForChain();
239 void MarkIncludedForChain();
240 void MarkPostponedForChain();
241 void MarkUnwantedCopiesForChain(const TSignedSeqRange& cds);
242 TContained CollectContainedForMemeber();
243 void AddToContained(TContained& contained, TMemberPtrSet& included_in_list);
244
245 CGeneModel* m_align;
246 const CCDSInfo* m_cds_info;
247 CAlignMap* m_align_map;
248 SChainMember* m_left_member;
249 SChainMember* m_right_member;
250 SChainMember* m_sink_for_contained;
251 TContained* m_copy; // is used to make sure that the copy of already incuded duplicated alignment is not included in contained and doesn't trigger a new chain genereation
252 TContained* m_contained;
253 int m_identical_count;
254 double m_left_num, m_right_num, m_num;
255 double m_splice_weight;
256 double m_left_splice_num, m_right_splice_num, m_splice_num;
257 int m_type, m_left_cds, m_right_cds, m_cds;
258 bool m_included;
259 bool m_postponed;
260 bool m_marked_for_deletion;
261 bool m_marked_for_retention;
262 bool m_restricted_to_start;
263 bool m_gapped_connection; // used for gapped proteins
264 int m_fully_connected_to_part; // used for gapped proteins
265 bool m_not_for_chaining; // included in other alignmnet(s) or supressed and can't trigger a different chain
266 int m_rlimb; // leftmost compatible rexon
267 int m_llimb; // leftmost not compatible lexon
268 CAlignModel* m_orig_align;
269 CGeneModel* m_unmd_align;
270 int m_mem_id;
271 };
272
273 class CChain : public CGeneModel
274 {
275 private:
276 typedef map<int, double> TIDMap;
277 tuple<TIDMap, TSignedSeqRange> PeaksAndLimits(EStatus determinant, int min_blob_weight, int max_empty_dist, int min_splice_dist);
278 tuple<TIVec, TSignedSeqRange> MainPeaks(TIDMap& peak_weights, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage, bool right_end);
279 public:
280 CChain(SChainMember& mbr, CGeneModel* gapped_helper = 0, bool keep_all_evidence = false);
281
282 void RestoreTrimmedEnds(int trim);
283 void RemoveFshiftsFromUTRs();
284 void RestoreReasonableConfirmedStart(const CGnomonEngine& gnomon, TOrigAligns& orig_aligns);
285 void SetOpenForPartialyAlignedProteins(map<string, pair<bool,bool> >& prot_complet);
286 pair<bool,bool> ValidPolyA(int pos, const CResidueVec& contig);
287 void ClipToCap(int min_cap_blob, int max_dist, int min_flank_exon, double secondary_peak);
288 void ClipToPolyA(const CResidueVec& contig, int min_polya_blob, int max_dist, int min_flank_exon, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage);
289 void CheckSecondaryCapPolyAEnds();
290 void ClipLowCoverageUTR(double utr_clip_threshold);
291 void CalculateDropLimits();
292 void CalculateSupportAndWeightFromMembers(bool keep_all_evidence = false);
293 void ClipChain(TSignedSeqRange limits);
294 bool SetConfirmedEnds(const CGnomonEngine& gnomon, CGnomonAnnotator_Base::TIntMap& confirmed_ends);
295
296 void SetConfirmedStartStopForCompleteProteins(map<string, pair<bool,bool> >& prot_complet, const SMinScor& minscor);
297 void CollectTrustedmRNAsProts(TOrigAligns& orig_aligns, const SMinScor& minscor, CScope& scope, SMatrix& matrix, const CResidueVec& contig);
298 void SetBestPlacement(TOrigAligns& orig_aligns);
299 void SetConsistentCoverage();
300
301 bool HarborsNested(const CChain& other_chain, bool check_in_holes) const;
302 bool HarborsNested(const CGene& other_gene, bool check_in_holes) const;
303
304 bool HasTrustedEvidence(TOrigAligns& orig_aligns) const;
305
306 TContained m_members;
307 int m_polya_cap_right_soft_limit;
308 int m_polya_cap_left_soft_limit;
309 int m_coverage_drop_left;
310 int m_coverage_drop_right;
311 int m_coverage_bump_left;
312 int m_coverage_bump_right;
313 double m_core_coverage;
314 vector<double> m_coverage;
315 double m_splice_weight;
316 CGeneModel m_gapped_helper_align;
317 TSignedSeqRange m_supported_range;
318 TIVec m_cap_peaks;
319 TIVec m_polya_peaks;
320 };
321
322
323 class CGene : public TChainPointerList
324 {
325 public:
CGene()326 CGene() : m_maxscore(BadScore()) {}
327 typedef list<CGeneModel>::iterator TIt;
328 typedef list<CGeneModel>::const_iterator TConstIt;
Limits() const329 TSignedSeqRange Limits() const { return m_limits; }
RealCdsLimits() const330 TSignedSeqRange RealCdsLimits() const { return m_real_cds_limits; }
331 bool IsAlternative(const CChain& a, TOrigAligns& orig_aligns) const;
332 bool IsAllowedAlternative(const ncbi::gnomon::CGeneModel&, int maxcomposite) const;
333 void Insert(CChain& a);
MaxScore() const334 double MaxScore() const { return m_maxscore; }
Nested() const335 bool Nested() const { return !m_nested_in_genes.empty(); }
336 bool LargeCdsOverlap(const CGeneModel& a) const;
337 bool HarborsNested(const CChain& other_chain, bool check_in_holes) const;
338 bool HarborsNested(const CGene& other_gene, bool check_in_holes) const;
339
AddToHarbored(CGene * p)340 void AddToHarbored(CGene* p) { m_harbors_genes.insert(p); }
AddToNestedIn(CGene * p)341 void AddToNestedIn(CGene* p) {m_nested_in_genes.insert(p); };
342 set<CGene*> RemoveGeneFromOtherGenesSets();
343
344
345 private:
346 bool HarborsRange(TSignedSeqRange range, bool check_in_holes) const;
RemoveFromHarbored(CGene * p)347 void RemoveFromHarbored(CGene* p) { m_harbors_genes.erase(p); }
RemoveFromNestedIn(CGene * p)348 void RemoveFromNestedIn(CGene* p) {m_nested_in_genes.erase(p); };
349
350 TSignedSeqRange m_limits, m_real_cds_limits;
351 double m_maxscore;
352 set<CGene*> m_nested_in_genes;
353 set<CGene*> m_harbors_genes;
354 };
355
RemoveGeneFromOtherGenesSets()356 set<CGene*> CGene::RemoveGeneFromOtherGenesSets() {
357 NON_CONST_ITERATE(set<CGene*>, i, m_nested_in_genes)
358 (*i)->RemoveFromHarbored(this);
359 NON_CONST_ITERATE(set<CGene*>, i,m_harbors_genes)
360 (*i)->RemoveFromNestedIn(this);
361
362 return m_harbors_genes;
363 }
364
365 // if external model is 'open' all 5' introns can harbor
366 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
367 // non coding models in external coding genes have no effect
HarborsRange(TSignedSeqRange range,bool check_in_holes) const368 bool CGene::HarborsRange(TSignedSeqRange range, bool check_in_holes) const {
369 TSignedSeqRange gene_lim_for_nested = Limits();
370 if(RealCdsLimits().NotEmpty())
371 gene_lim_for_nested = front()->OpenCds() ? front()->MaxCdsLimits() : RealCdsLimits(); // 'open' could be only a single variant gene
372 if(!Include(gene_lim_for_nested,range))
373 return false;
374
375 bool nested = true;
376 ITERATE(CGene, it, *this) {
377 if(RealCdsLimits().NotEmpty() && (*it)->ReadingFrame().Empty()) // non coding model in coding gene
378 continue;
379 TSignedSeqRange model_lim_for_nested = (*it)->Limits();
380 if((*it)->ReadingFrame().NotEmpty())
381 model_lim_for_nested = (*it)->OpenCds() ? (*it)->MaxCdsLimits() : (*it)->RealCdsLimits(); // 'open' could be only a single variant gene
382 if(range.IntersectingWith(model_lim_for_nested) && !CModelCompare::RangeNestedInIntron(range, **it, check_in_holes)) {
383 nested = false;
384 break;
385 }
386 }
387
388 return nested;
389 }
390
391 // if external model is 'open' all 5' introns can harbor
392 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
393 // for nested model 'open' is ignored
394 // non coding models in external coding genes have no effect
HarborsNested(const CChain & other_chain,bool check_in_holes) const395 bool CGene::HarborsNested(const CChain& other_chain, bool check_in_holes) const {
396 TSignedSeqRange other_lim_for_nested = other_chain.Limits();
397 if(!other_chain.ReadingFrame().Empty())
398 other_lim_for_nested = other_chain.RealCdsLimits();
399
400 return HarborsRange(other_lim_for_nested, check_in_holes);
401 }
402
403 // if external model is 'open' all 5' introns can harbor
404 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
405 // for nested model 'open' is ignored
406 // non coding models in external coding genes have no effect
HarborsNested(const CGene & other_gene,bool check_in_holes) const407 bool CGene::HarborsNested(const CGene& other_gene, bool check_in_holes) const {
408 TSignedSeqRange other_lim_for_nested = other_gene.Limits();
409 if(!other_gene.RealCdsLimits().Empty())
410 other_lim_for_nested = other_gene.RealCdsLimits();
411
412 return HarborsRange(other_lim_for_nested, check_in_holes);
413 }
414
415
LargeCdsOverlap(const CGeneModel & a) const416 bool CGene::LargeCdsOverlap(const CGeneModel& a) const {
417
418 ITERATE(CGene, it, *this) {
419 const CGeneModel& b = **it;
420 int common_cds = 0;
421 ITERATE(CGeneModel::TExons, ib, b.Exons()) {
422 ITERATE(CGeneModel::TExons, ia, a.Exons()) {
423 common_cds += (ib->Limits()&b.RealCdsLimits()&ia->Limits()&a.RealCdsLimits()).GetLength();
424 }
425 }
426 if(common_cds > 50)
427 return true;
428 }
429
430 return false;
431 }
432
Insert(CChain & a)433 void CGene::Insert(CChain& a)
434 {
435 push_back(&a);
436 m_limits += a.Limits();
437 m_real_cds_limits += a.RealCdsLimits();
438 m_maxscore = max(m_maxscore,a.Score());
439 }
440
IsAllowedAlternative(const CGeneModel & a,int maxcomposite) const441 bool CGene::IsAllowedAlternative(const CGeneModel& a, int maxcomposite) const
442 {
443 if(a.Exons().size() > 1 && (a.Status()&CGeneModel::ecDNAIntrons) == 0 && a.TrustedmRNA().empty() && a.TrustedProt().empty()) {
444 return false;
445 }
446
447 if (a.Support().empty()) {
448 return false;
449 }
450
451 int composite = 0;
452 ITERATE(CSupportInfoSet, s, a.Support()) {
453 if(s->IsCore() && ++composite > maxcomposite) return false;
454 }
455
456 if(a.PStop(false) || !a.FrameShifts().empty())
457 return false;
458 if(front()->PStop(false) || !front()->FrameShifts().empty())
459 return false;
460
461 // check for gapfillers
462
463 vector<TSignedSeqRange> gene_gapfill_exons;
464 ITERATE(CGeneModel::TExons, e, front()->Exons()) {
465 if(e->m_fsplice_sig == "XX" || e->m_ssplice_sig == "XX")
466 gene_gapfill_exons.push_back(e->Limits());
467 }
468 vector<TSignedSeqRange> a_gapfill_exons;
469 ITERATE(CGeneModel::TExons, e, a.Exons()) {
470 if(e->m_fsplice_sig == "XX" || e->m_ssplice_sig == "XX")
471 a_gapfill_exons.push_back(e->Limits());
472 }
473 if(gene_gapfill_exons != a_gapfill_exons)
474 return false;
475
476 bool a_share_intron = false;
477 ITERATE(CGene, it, *this) {
478 const CGeneModel& b = **it;
479 set<TSignedSeqRange> b_introns;
480 for(int i = 1; i < (int)b.Exons().size(); ++i) {
481 if(b.Exons()[i-1].m_ssplice && b.Exons()[i].m_fsplice) {
482 TSignedSeqRange intron(b.Exons()[i-1].GetTo()+1,b.Exons()[i].GetFrom()-1);
483 b_introns.insert(intron);
484 }
485 }
486
487 bool a_has_new_intron = false;
488 for(int i = 1; i < (int)a.Exons().size(); ++i) {
489 if(a.Exons()[i-1].m_ssplice && a.Exons()[i].m_fsplice && a.Exons()[i-1].m_ssplice_sig != "XX" && a.Exons()[i].m_fsplice_sig != "XX") {
490 TSignedSeqRange intron(a.Exons()[i-1].GetTo()+1,a.Exons()[i].GetFrom()-1);
491 if(b_introns.insert(intron).second)
492 a_has_new_intron = true;
493 else
494 a_share_intron = true;
495 }
496 }
497
498 if(a_has_new_intron) {
499 continue;
500 } else if(!gene_gapfill_exons.empty()) {
501 return false;
502 } else if(a.RealCdsLimits().NotEmpty() && b.RealCdsLimits().NotEmpty() && !a.RealCdsLimits().IntersectingWith(b.RealCdsLimits()) && (!a.TrustedmRNA().empty() || !a.TrustedProt().empty())) {
503 #ifdef _DEBUG
504 const_cast<CGeneModel&>(a).AddComment("Secondary CDS");
505 #endif
506 continue;
507 } else if(a.RealCdsLen() <= b.RealCdsLen()){
508 return false;
509 }
510 }
511
512 return (a_share_intron || gene_gapfill_exons.empty());
513 }
514
IsAlternative(const CChain & a,TOrigAligns & orig_aligns) const515 bool CGene::IsAlternative(const CChain& a, TOrigAligns& orig_aligns) const
516 {
517 _ASSERT( size()>0 );
518
519 if (a.Strand() != front()->Strand())
520 return false;
521
522 bool has_common_splice = false;
523
524 ITERATE(CGene, it, *this) {
525 if(CModelCompare::CountCommonSplices(**it, a) > 0) { // has common splice
526 has_common_splice = true;
527 break;
528 }
529 }
530
531 if(a.ReadingFrame().NotEmpty() && RealCdsLimits().NotEmpty()) {
532 CAlignMap amap(a.Exons(), a.FrameShifts(), a.Strand(), a.GetCdsInfo().Cds());
533 TIVec acds_map(amap.FShiftedLen(a.GetCdsInfo().Cds()),0);
534 for(unsigned int j = 0; j < a.Exons().size(); ++j) {
535 for(TSignedSeqPos k = max(a.Exons()[j].GetFrom(),a.GetCdsInfo().Cds().GetFrom()); k <= min(a.Exons()[j].GetTo(),a.GetCdsInfo().Cds().GetTo()); ++k) {
536 TSignedSeqPos p = amap.MapOrigToEdited(k);
537 _ASSERT(p < (int)acds_map.size());
538 if(p >= 0)
539 acds_map[p] = k;
540 }
541 }
542
543
544 bool has_common_cds = false;
545
546 ITERATE(CGene, it, *this) {
547 CAlignMap gmap((*it)->Exons(), (*it)->FrameShifts(), (*it)->Strand(), (*it)->GetCdsInfo().Cds());
548 TIVec cds_map(gmap.FShiftedLen((*it)->GetCdsInfo().Cds()),0);
549 for(unsigned int j = 0; j < (*it)->Exons().size(); ++j) {
550 for(TSignedSeqPos k = max((*it)->Exons()[j].GetFrom(),(*it)->GetCdsInfo().Cds().GetFrom()); k <= min((*it)->Exons()[j].GetTo(),(*it)->GetCdsInfo().Cds().GetTo()); ++k) {
551 TSignedSeqPos p = gmap.MapOrigToEdited(k);
552 _ASSERT(p < (int)cds_map.size());
553 if(p >= 0)
554 cds_map[p] = k;
555 }
556 }
557
558 for(unsigned int i = 0; i < acds_map.size(); ) {
559 unsigned int j = 0;
560 for( ; j < cds_map.size() && (acds_map[i] != cds_map[j] || i%3 != j%3); ++j);
561 if(j == cds_map.size()) {
562 ++i;
563 continue;
564 }
565
566 int count = 0;
567 for( ; j < cds_map.size() && i < acds_map.size() && acds_map[i] == cds_map[j]; ++j, ++i, ++count);
568
569 if(count > 30) { // has common cds
570 has_common_cds = true;
571 break;
572 }
573 }
574 }
575
576 bool gene_has_trusted = false;
577 ITERATE(CGene, it, *this) {
578 if((*it)->HasTrustedEvidence(orig_aligns)) {
579 gene_has_trusted = true;
580 break;
581 }
582 }
583
584 if(has_common_cds || (has_common_splice && (!gene_has_trusted || !a.HasTrustedEvidence(orig_aligns)))) // separate trusted genes with similar splices if they don't have common cds
585 return true;
586 else
587 return false;
588 }
589
590 return has_common_splice;
591 }
592
DescendingModelOrder(const CChain & a,const CChain & b)593 static bool DescendingModelOrder(const CChain& a, const CChain& b)
594 {
595 if (!a.Support().empty() && b.Support().empty())
596 return true;
597 else if (a.Support().empty() && !b.Support().empty())
598 return false;
599
600
601 bool atrusted = !a.TrustedmRNA().empty() || !a.TrustedProt().empty();
602 bool btrusted = !b.TrustedmRNA().empty() || !b.TrustedProt().empty();
603 if(atrusted && !btrusted) { // trusted gene is always better
604 return true;
605 } else if(btrusted && !atrusted) {
606 return false;
607 } else if(a.ReadingFrame().NotEmpty() && b.ReadingFrame().Empty()) { // coding is always better
608 return true;
609 } else if(b.ReadingFrame().NotEmpty() && a.ReadingFrame().Empty()) {
610 return false;
611 } else if(a.ReadingFrame().NotEmpty()) { // both coding
612
613 double ds = 0.05*fabs(a.Score());
614 double as = a.Score();
615 if((a.Status()&CGeneModel::ecDNAIntrons) != 0)
616 as += 2*ds;
617 if((a.Status()&CGeneModel::ePolyA) != 0)
618 as += ds;
619 if((a.Status()&CGeneModel::eCap) != 0)
620 as += ds;
621 if(a.isNMD())
622 as -= ds;
623
624 ds = 0.05*fabs(b.Score());
625 double bs = b.Score();
626 if((b.Status()&CGeneModel::ecDNAIntrons) != 0)
627 bs += 2*ds;
628 if((b.Status()&CGeneModel::ePolyA) != 0)
629 bs += ds;
630 if((b.Status()&CGeneModel::eCap) != 0)
631 bs += ds;
632 if(b.isNMD())
633 bs -= ds;
634
635 if(as > bs) // better score
636 return true;
637 else if(bs > as)
638 return false;
639 else if(a.m_splice_weight > b.m_splice_weight) // more splice support
640 return true;
641 else if(a.m_splice_weight < b.m_splice_weight)
642 return false;
643 else if(a.Weight() > b.Weight()) // more alignments is better
644 return true;
645 else if(a.Weight() < b.Weight())
646 return false;
647 else if(a.Limits().GetLength() != b.Limits().GetLength())
648 return (a.Limits().GetLength() < b.Limits().GetLength()); // everything else equal prefer compact model
649 else
650 return a.ID() < b.ID();
651 } else { // both noncoding
652 double asize = a.m_splice_weight;
653 double bsize = b.m_splice_weight;
654 double ds = 0.025*(asize+bsize);
655
656 if((a.Status()&CGeneModel::ePolyA) != 0)
657 asize += ds;
658 if((a.Status()&CGeneModel::eCap) != 0)
659 asize += ds;
660 if(a.isNMD())
661 asize -= ds;
662
663 if((b.Status()&CGeneModel::ePolyA) != 0)
664 bsize += ds;
665 if((b.Status()&CGeneModel::eCap) != 0)
666 bsize += ds;
667 if(b.isNMD())
668 bsize -= ds;
669
670 if(asize > bsize)
671 return true;
672 else if(bsize > asize)
673 return false;
674 else if(a.Limits().GetLength() != b.Limits().GetLength())
675 return (a.Limits().GetLength() < b.Limits().GetLength()); // everything else equal prefer compact model
676 else
677 return a.ID() < b.ID();
678 }
679 }
680
681 typedef CChain* TChainPtr;
DescendingModelOrderP(const TChainPtr & a,const TChainPtr & b)682 static bool DescendingModelOrderP(const TChainPtr& a, const TChainPtr& b)
683 {
684 return DescendingModelOrder(*a, *b);
685 }
DescendingModelOrderPConsistentCoverage(const TChainPtr & a,const TChainPtr & b)686 static bool DescendingModelOrderPConsistentCoverage(const TChainPtr& a, const TChainPtr& b)
687 {
688 if((a->Status()&CGeneModel::eConsistentCoverage) != (b->Status()&CGeneModel::eConsistentCoverage))
689 return (a->Status()&CGeneModel::eConsistentCoverage) > (b->Status()&CGeneModel::eConsistentCoverage);
690 else
691 return DescendingModelOrder(*a, *b);
692 }
693
CheckCompatibility(const CGene & gene,const CChain & algn)694 CChainer::CChainerImpl::ECompat CChainer::CChainerImpl::CheckCompatibility(const CGene& gene, const CChain& algn)
695 {
696 bool gene_good_enough_to_be_annotation = allow_partialalts || gene.front()->GoodEnoughToBeAnnotation();
697 bool algn_good_enough_to_be_annotation = allow_partialalts || algn.GoodEnoughToBeAnnotation();
698
699 TSignedSeqRange gene_cds = (gene.size() > 1 || gene.front()->CompleteCds() || algn_good_enough_to_be_annotation) ? gene.RealCdsLimits() : gene.front()->MaxCdsLimits();
700 TSignedSeqRange algn_cds = (algn.CompleteCds() || gene_good_enough_to_be_annotation) ? algn.RealCdsLimits() : algn.MaxCdsLimits();
701
702 if(!gene_good_enough_to_be_annotation && !algn_good_enough_to_be_annotation) { // both need ab initio
703 const CGeneModel& b = *gene.front();
704 for(int i = 1; i < (int)b.Exons().size(); ++i) {
705 if(b.Exons()[i].m_ssplice_sig == "XX" && b.Exons()[i].m_fsplice_sig == "XX" && b.Exons()[i].Limits().IntersectingWith(gene_cds)) { // if gap cds extend range to left exon
706 gene_cds.SetFrom(min(gene_cds.GetFrom(), b.Exons()[i-1].GetTo()));
707 }
708 }
709
710 for(int i = 1; i < (int)algn.Exons().size(); ++i) {
711 if(algn.Exons()[i].m_ssplice_sig == "XX" && algn.Exons()[i].m_fsplice_sig == "XX" && algn.Exons()[i].Limits().IntersectingWith(algn_cds)) { // if gap cds extend range to left exon
712 algn_cds.SetFrom(min(algn_cds.GetFrom(), algn.Exons()[i-1].GetTo()));
713 }
714 }
715 }
716
717 if(!gene.Limits().IntersectingWith(algn.Limits())) // don't overlap
718 return eOtherGene;
719
720 if(gene.IsAlternative(algn, orig_aligns)) { // has common splice or common CDS
721
722 if(gene.IsAllowedAlternative(algn, composite) && algn_good_enough_to_be_annotation) {
723 if(!algn.TrustedmRNA().empty() || !algn.TrustedProt().empty()) { // trusted gene
724 return eAlternative;
725 } else if(algn.ReadingFrame().Empty() || gene.front()->ReadingFrame().Empty()) { // one noncoding
726 if(algn.m_splice_weight > altfrac/100*gene.front()->m_splice_weight) // long enough
727 return eAlternative;
728 else
729 return eNotCompatible;
730 } else if(algn.RealCdsLen() > altfrac/100*gene.front()->RealCdsLen() || algn.Score() > altfrac/100*gene.front()->Score()) { // good score or long enough cds
731 return eAlternative;
732 }
733 }
734
735 return eNotCompatible;
736 }
737
738 // don't include overlapping gapfil 'introns' in different genes
739 set<TSignedSeqRange> gene_gapfill_introns;
740 set<TSignedSeqRange> align_gapfill_introns;
741 ITERATE(CGene, it, gene) {
742 const CGeneModel& b = **it;
743 for(int i = 1; i < (int)b.Exons().size(); ++i) {
744 if(b.Exons()[i-1].m_ssplice_sig == "XX" || b.Exons()[i].m_fsplice_sig == "XX") {
745 TSignedSeqRange intron(b.Exons()[i-1].GetTo(),b.Exons()[i].GetFrom());
746 gene_gapfill_introns.insert(intron);
747 }
748 }
749 }
750 for(int i = 1; i < (int)algn.Exons().size(); ++i) {
751 if(algn.Exons()[i-1].m_ssplice_sig == "XX" || algn.Exons()[i].m_fsplice_sig == "XX") {
752 TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
753 align_gapfill_introns.insert(intron);
754 }
755 }
756 ITERATE(set<TSignedSeqRange>, ig, gene_gapfill_introns) {
757 ITERATE(set<TSignedSeqRange>, ia, align_gapfill_introns) {
758 if(ig->IntersectingWith(*ia))
759 return eNotCompatible;
760 }
761 }
762
763 if(algn.HarborsNested(gene, gene_good_enough_to_be_annotation)) { // gene is nested in align's intron (could be partial)
764 if(gene_good_enough_to_be_annotation || algn.HasTrustedEvidence(orig_aligns))
765 return eExternal;
766 else
767 return eNotCompatible;
768 }
769
770 if(gene.HarborsNested(algn, algn_good_enough_to_be_annotation)) { // algn is nested in gene (could be partial)
771 if(algn_good_enough_to_be_annotation || algn.HasTrustedEvidence(orig_aligns))
772 return eNested;
773 else
774 return eNotCompatible;
775 }
776
777 if(!algn_cds.Empty() && !gene_cds.Empty()) { // both coding
778 if (!gene_cds.IntersectingWith(algn_cds)) { // don't overlap
779 #ifdef _DEBUG
780 if((gene_cds+algn_cds).GetLength() < gene_cds.GetLength()+algn_cds.GetLength()+20)
781 const_cast<CChain&>(algn).AddComment("Close proximity");
782 #endif
783 return eOtherGene;
784 } else if(gene.LargeCdsOverlap(algn)) {
785 return eNotCompatible;
786 }
787 }
788
789 if(gene_good_enough_to_be_annotation && algn_good_enough_to_be_annotation) {
790 if(gene.front()->Strand() != algn.Strand() && allow_opposite_strand &&
791 ((algn.Status()&CGeneModel::eBestPlacement) || (algn.Exons().size() > 1 && gene.front()->Exons().size() > 1)))
792 return eOtherGene;
793 else if(algn.Status() & CGeneModel::eBestPlacement && (algn.Exons().size() == 1 || (algn.Status()&CGeneModel::ecDNAIntrons))) {
794 #ifdef _DEBUG
795 const_cast<CChain&>(algn).AddComment("Best placement overlap");
796 #endif
797 return eOtherGene;
798 }
799 }
800
801 return eNotCompatible;
802 }
803
FindGeneSeeds(list<CGene> & alts,TChainPointerList & not_placed_yet)804 void CChainer::CChainerImpl::FindGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
805
806 not_placed_yet.sort(DescendingModelOrderP);
807
808 for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
809 TChainPointerList::iterator it = itloop++;
810 CChain& algn(**it);
811
812 if(algn.Score() == BadScore()) // postpone noncoding models
813 continue;
814 else if(algn.Score() < 2*minscor.m_min && algn.GetCdsInfo().ProtReadingFrame().Empty()) // postpone not so good models
815 continue;
816
817 list<CGene*> possibly_nested;
818
819 bool good_model = true;
820 for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
821 ECompat cmp = CheckCompatibility(*itl, algn);
822
823 switch(cmp) {
824 case eExternal:
825 possibly_nested.push_back(&(*itl)); // already created gene is nested in this model
826 case eOtherGene:
827 break;
828 default:
829 good_model = false;
830 break;
831 }
832 }
833
834 if(good_model) {
835 alts.push_back(CGene());
836 #ifdef _DEBUG
837 algn.AddComment("Pass1");
838 #endif
839 alts.back().Insert(algn);
840 not_placed_yet.erase(it);
841 }
842
843 ITERATE(list<CGene*>, itl, possibly_nested) {
844 (*itl)->AddToNestedIn(&alts.back());
845 alts.back().AddToHarbored(*itl);
846 }
847 }
848 }
849
ReplacePseudoGeneSeeds(list<CGene> & alts,TChainPointerList & not_placed_yet)850 void CChainer::CChainerImpl::ReplacePseudoGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
851
852 not_placed_yet.sort(DescendingModelOrderP);
853
854 for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
855 TChainPointerList::iterator it = itloop++;
856 CChain& algn(**it);
857
858 list<list<CGene>::iterator> included_in;
859 list<CGene*> possibly_nested; // genes which 'could' become nested
860 list<CGene*> nested_in;
861
862 bool good_model = true;
863 for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
864 ECompat cmp = CheckCompatibility(*itl, algn);
865
866 switch(cmp) {
867 case eNested:
868 nested_in.push_back(&(*itl));
869 break;
870 case eExternal:
871 possibly_nested.push_back(&(*itl)); // already created gene is nested in this model
872 break;
873 case eOtherGene:
874 break;
875 case eAlternative:
876 included_in.push_back(itl);
877 break;
878 case eNotCompatible:
879 if(itl->IsAlternative(algn, orig_aligns))
880 included_in.push_back(itl);
881 else
882 good_model = false;
883 break;
884 default:
885 good_model = false;
886 break;
887 }
888 }
889
890 if(!good_model || included_in.size() != 1 || (!(algn.Status()&CGeneModel::ecDNAIntrons) && algn.TrustedmRNA().empty() && algn.TrustedProt().empty()))
891 continue;
892
893 CGene& gene = *included_in.front();
894 CChain& model = *gene.front();
895 // if((!model.PStop(false) && model.FrameShifts().empty()) || algn.PStop(false) || !algn.FrameShifts().empty())
896 if(!model.PStop(false) || algn.PStop(false) || !algn.FrameShifts().empty()) // use only for pstops
897 continue;
898
899 int algn_cds_len = algn.FShiftedLen(algn.GetCdsInfo().Cds(),false);
900 int model_cds_len = model.FShiftedLen(model.GetCdsInfo().Cds(),false);
901 if(algn_cds_len < 0.8*model_cds_len)
902 continue;
903
904 #ifdef _DEBUG
905 algn.AddComment("Replacing pseudo "+NStr::NumericToString(model.ID()));
906 #endif
907 not_placed_yet.push_back(gene.front()); // position doesn't matter - will go to 'bad' models
908 gene.RemoveGeneFromOtherGenesSets();
909 gene = CGene();
910 gene.Insert(algn);
911 ITERATE(list<CGene*>, itl, nested_in) {
912 gene.AddToNestedIn(*itl);
913 (*itl)->AddToHarbored(&gene);
914 }
915 ITERATE(list<CGene*>, itl, possibly_nested) {
916 (*itl)->AddToNestedIn(&gene);
917 gene.AddToHarbored(*itl);
918 }
919
920 not_placed_yet.erase(it);
921 }
922 }
923
FindAltsForGeneSeeds(list<CGene> & alts,TChainPointerList & not_placed_yet)924 void CChainer::CChainerImpl::FindAltsForGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
925
926 not_placed_yet.sort(DescendingModelOrderPConsistentCoverage);
927
928 for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
929 TChainPointerList::iterator it = itloop++;
930 CChain& algn(**it);
931
932 list<list<CGene>::iterator> included_in;
933 list<CGene*> possibly_nested; // genes which 'could' become nested
934
935 bool good_model = true;
936 for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
937 ECompat cmp = CheckCompatibility(*itl, algn);
938
939 switch(cmp) {
940 case eExternal:
941 possibly_nested.push_back(&(*itl)); // already created gene is nested in this model
942 case eOtherGene:
943 break;
944 case eAlternative:
945 included_in.push_back(itl);
946 break;
947 default:
948 good_model = false;
949 break;
950 }
951 }
952
953 if(good_model && !included_in.empty() && (allow_partialalts || included_in.front()->front()->GoodEnoughToBeAnnotation())) {
954 if(included_in.size() == 1) { // alternative to only one seed
955 #ifdef _DEBUG
956 algn.AddComment("Pass2a");
957 #endif
958
959 CGene& gene = *included_in.front();
960 gene.Insert(algn);
961 not_placed_yet.erase(it);
962
963 ITERATE(list<CGene*>, itl, possibly_nested) {
964 if(gene.HarborsNested(**itl, true)) {
965 (*itl)->AddToNestedIn(&gene);
966 gene.AddToHarbored(*itl);
967 }
968 }
969 } else { // connects seeds
970
971 bool allow_connection = false;
972
973 if(!algn.TrustedmRNA().empty() || !algn.TrustedProt().empty() || (algn.Status()&CGeneModel::eConsistentCoverage)) { // connects seeds but trusted
974 bool cds_overlap = true;
975 if(algn.ReadingFrame().Empty()) {
976 cds_overlap = false;
977 } else {
978 CChain a = algn;
979 a.Clip(a.RealCdsLimits(), CAlignModel::eRemoveExons);
980 ITERATE(list<list<CGene>::iterator>, k, included_in) {
981 if(!(*k)->IsAlternative(a, orig_aligns)) {
982 cds_overlap = false;
983 break;
984 }
985 }
986 }
987
988 if(cds_overlap || (algn.Status()&CGeneModel::eConsistentCoverage)) {
989 #ifdef _DEBUG
990 algn.AddComment("Gene overlap override");
991 #endif
992 allow_connection = true;
993 }
994 }
995
996 if(allow_connection) {
997 CGene& gene = *included_in.front();
998 gene.Insert(algn);
999
1000 ITERATE(list<list<CGene>::iterator>, k, included_in) {
1001 if(k != included_in.begin()) {
1002 ITERATE(CGene, l, **k) {
1003 if(itloop == not_placed_yet.end() || !DescendingModelOrder(**itloop, **l)) { // next is not better
1004 if(CheckCompatibility(*included_in.front(), **l) == eAlternative) { // check that the thresholds are met
1005 #ifdef _DEBUG
1006 (*l)->AddComment("Pass2b");
1007 #endif
1008 included_in.front()->Insert(**l);
1009 } else {
1010 not_placed_yet.push_back(*l); // position doesn't matter - will go to 'bad' models
1011 }
1012 } else {
1013 TChainPointerList::iterator idest = itloop;
1014 for( ;idest != not_placed_yet.end() && DescendingModelOrder(**idest, **l); ++idest);
1015 not_placed_yet.insert(idest, *l);
1016 }
1017 }
1018 set<CGene*> nested_genes = (*k)->RemoveGeneFromOtherGenesSets();
1019 ITERATE(set<CGene*>, i, nested_genes)
1020 possibly_nested.push_back(*i);
1021 alts.erase(*k);
1022 }
1023 }
1024 not_placed_yet.erase(it);
1025
1026 ITERATE(list<CGene*>, itl, possibly_nested) {
1027 if(gene.HarborsNested(**itl, true)) {
1028 (*itl)->AddToNestedIn(&gene);
1029 gene.AddToHarbored(*itl);
1030 }
1031 }
1032 }
1033 }
1034 }
1035 }
1036 }
1037
PlaceAllYouCan(list<CGene> & alts,TChainPointerList & not_placed_yet,TChainPointerList & rejected)1038 void CChainer::CChainerImpl::PlaceAllYouCan(list<CGene>& alts, TChainPointerList& not_placed_yet, TChainPointerList& rejected) {
1039
1040 not_placed_yet.sort(DescendingModelOrderP);
1041
1042 ITERATE(TChainPointerList, it, not_placed_yet) {
1043 CChain& algn(**it);
1044 list<CGene>::iterator included_in(alts.end());
1045 list<CGene*> possibly_nested;
1046 list<CGene*> nested_in;
1047
1048 bool good_model = true;
1049 for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
1050 ECompat cmp = CheckCompatibility(*itl, algn);
1051 CNcbiOstrstream ost;
1052 switch(cmp) {
1053 case eNotCompatible:
1054 rejected.push_back(&algn);
1055 rejected.back()->Status() |= CGeneModel::eSkipped;
1056 ost << "Trumped by another model " << itl->front()->ID();
1057 rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1058 good_model = false;
1059 break;
1060 case eAlternative:
1061 if(!allow_partialalts && !itl->front()->GoodEnoughToBeAnnotation()) {
1062 rejected.push_back(&algn);
1063 rejected.back()->Status() |= CGeneModel::eSkipped;
1064 ost << " Trumped by another model " << itl->front()->ID();
1065 rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1066 good_model = false;
1067 } else if(included_in == alts.end()) {
1068 included_in = itl;
1069 } else { // tries to connect two different genes
1070 good_model = false;
1071 rejected.push_back(&algn);
1072 rejected.back()->Status() |= CGeneModel::eSkipped;
1073 ost << "Connects two genes " << itl->front()->ID() << " " << included_in->front()->ID();
1074 rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1075 }
1076 break;
1077 case eNested:
1078 nested_in.push_back(&(*itl));
1079 break;
1080 case eExternal:
1081 possibly_nested.push_back(&(*itl)); // already created gene is nested in this model
1082 break;
1083 case eOtherGene:
1084 break;
1085 }
1086 }
1087 if(good_model) {
1088 CGene* genep;
1089 if(included_in != alts.end()) {
1090 #ifdef _DEBUG
1091 algn.AddComment("Pass3a");
1092 #endif
1093 included_in->Insert(algn);
1094 genep = &(*included_in);
1095 } else {
1096 alts.push_back(CGene());
1097 genep = &alts.back();
1098 #ifdef _DEBUG
1099 algn.AddComment("Pass3b");
1100 #endif
1101 alts.back().Insert(algn);
1102 }
1103 ITERATE(list<CGene*>, itl, nested_in) {
1104 if((*itl)->HarborsNested(*genep, true)) {
1105 genep->AddToNestedIn(*itl);
1106 (*itl)->AddToHarbored(genep);
1107 }
1108 }
1109 ITERATE(list<CGene*>, itl, possibly_nested) {
1110 if(genep->HarborsNested(**itl, true)) {
1111 (*itl)->AddToNestedIn(genep);
1112 genep->AddToHarbored(*itl);
1113 }
1114 }
1115 }
1116 }
1117 }
1118
FilterOutSimilarsWithLowerScore(TChainPointerList & not_placed_yet,TChainPointerList & rejected)1119 void CChainer::CChainerImpl::FilterOutSimilarsWithLowerScore(TChainPointerList& not_placed_yet, TChainPointerList& rejected)
1120 {
1121 not_placed_yet.sort(DescendingModelOrderP);
1122
1123 NON_CONST_ITERATE(TChainPointerList, it, not_placed_yet) {
1124 CChain& ai(**it);
1125 TChainPointerList::iterator jt_loop = it;
1126 for(++jt_loop; jt_loop != not_placed_yet.end();) {
1127 TChainPointerList::iterator jt = jt_loop++;
1128 CChain& aj(**jt);
1129 if (CModelCompare::AreSimilar(ai,aj,tolerance)) {
1130 CNcbiOstrstream ost;
1131 ost << "Trumped by similar chain " << ai.ID();
1132 aj.AddComment(CNcbiOstrstreamToString(ost));
1133 rejected.push_back(&aj);
1134 not_placed_yet.erase(jt);
1135 }
1136 }
1137 }
1138 }
1139
FilterOutTandemOverlap(TChainPointerList & not_placed_yet,TChainPointerList & rejected,double fraction)1140 void CChainer::CChainerImpl::FilterOutTandemOverlap(TChainPointerList& not_placed_yet, TChainPointerList& rejected, double fraction)
1141 {
1142 for(TChainPointerList::iterator it_loop = not_placed_yet.begin(); it_loop != not_placed_yet.end();) {
1143 TChainPointerList::iterator it = it_loop++;
1144 CChain& ai(**it);
1145
1146 if(!ai.TrustedmRNA().empty() || !ai.TrustedProt().empty() || ai.ReadingFrame().Empty())
1147 continue;
1148 int cds_len = ai.RealCdsLen();
1149
1150 vector<const CChain*> candidates;
1151 ITERATE(TChainPointerList, jt, not_placed_yet) {
1152 const CChain& aj(**jt);
1153 if(!aj.HasStart() || !aj.HasStop() || aj.Score() < fraction/100*ai.Score() || aj.RealCdsLen() < fraction/100*cds_len || !CModelCompare::HaveCommonExonOrIntron(ai,aj))
1154 continue;
1155 candidates.push_back(&aj);
1156 }
1157
1158 bool alive = true;
1159 for (size_t i = 0; alive && i < candidates.size(); ++i) {
1160 for (size_t j = i+1; alive && j < candidates.size(); ++j) {
1161 if(!candidates[i]->Limits().IntersectingWith(candidates[j]->Limits())) {
1162 CNcbiOstrstream ost;
1163 ost << "Overlapping tandem " << candidates[i]->ID() - ai.ID() << " " << candidates[j]->ID() - ai.ID();
1164 ai.AddComment(CNcbiOstrstreamToString(ost));
1165 rejected.push_back(*it);
1166 not_placed_yet.erase(it);
1167 alive = false;
1168 }
1169 }
1170 }
1171 }
1172 }
1173
FindGenes(TChainList & cls)1174 list<CGene> CChainer::CChainerImpl::FindGenes(TChainList& cls)
1175 {
1176 TChainPointerList not_placed_yet;
1177 NON_CONST_ITERATE(TChainList, it, cls) {
1178 if((it->Status()&CGeneModel::eSkipped) == 0) {
1179 if(it->Type()&CGeneModel::eNested)
1180 it->SetType(it->Type()^CGeneModel::eNested);
1181 it->SetGeneID(it->ID());
1182 it->SetRankInGene(0);
1183 not_placed_yet.push_back(&(*it));
1184 }
1185 }
1186
1187 list<CGene> alts;
1188 TChainPointerList bad_aligns;
1189
1190 FilterOutSimilarsWithLowerScore(not_placed_yet, bad_aligns);
1191 FilterOutTandemOverlap(not_placed_yet, bad_aligns, 80);
1192
1193 FindGeneSeeds(alts, not_placed_yet);
1194 ReplacePseudoGeneSeeds(alts, not_placed_yet);
1195 FindAltsForGeneSeeds(alts, not_placed_yet);
1196 PlaceAllYouCan(alts, not_placed_yet, bad_aligns);
1197
1198 NON_CONST_ITERATE(list<CGene>, k, alts) {
1199 int rank = 0;
1200 NON_CONST_ITERATE(CGene, l, *k) {
1201 (*l)->SetGeneID(k->front()->ID());
1202 (*l)->SetRankInGene(++rank);
1203 if(k->Nested())
1204 (*l)->SetType((*l)->Type()|CGeneModel::eNested);
1205 }
1206 }
1207
1208 NON_CONST_ITERATE(TChainPointerList, l, bad_aligns)
1209 (*l)->Status() |= CGeneModel::eSkipped;
1210
1211 return alts;
1212 }
1213
1214
1215 struct GenomeOrderD
1216 {
operator ()GenomeOrderD1217 bool operator()(const SChainMember* ap, const SChainMember* bp) // left end increasing, long first if left end equal
1218 {
1219 TSignedSeqRange alimits = ap->m_align->Limits();
1220 //ignore flexible ends for sorting
1221 if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1222 alimits.SetFrom(alimits.GetTo());
1223 if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1224 alimits.SetTo(alimits.GetFrom());
1225 TSignedSeqRange blimits = bp->m_align->Limits();
1226 //ignore flexible ends for sorting
1227 if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1228 blimits.SetFrom(blimits.GetTo());
1229 if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1230 blimits.SetTo(blimits.GetFrom());
1231 if(alimits == blimits)
1232 return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1233 else if(alimits.GetFrom() == blimits.GetFrom())
1234 return (alimits.GetTo() > blimits.GetTo());
1235 else
1236 return (alimits.GetFrom() < blimits.GetFrom());
1237 }
1238 };
1239
1240
1241 typedef vector< pair<SChainMember*,CGene*> > TMemeberGeneVec;
1242
1243 typedef tuple<Int8, TSignedSeqRange> TIdLim;
AlignIdLimits(SChainMember * mp)1244 TIdLim AlignIdLimits(SChainMember* mp) {
1245 return make_tuple(mp->m_align->ID(), mp->m_align->Limits());
1246 }
1247 struct AlignIdOrder
1248 {
operator ()AlignIdOrder1249 bool operator()(const TMemeberGeneVec::value_type& a, const TMemeberGeneVec::value_type& b)
1250 {
1251 return AlignIdLimits(a.first) < AlignIdLimits(b.first);
1252 }
1253 };
1254
1255
// Resolves alignments which are members of chains belonging to different genes.
//
// Steps:
//  1. collect, per gene, the non-flexible members whose original alignment is Continuous();
//  2. group them by (alignment id, limits) and keep only the groups seen in two or more genes;
//  3. for every pair of chains from different genes sharing such an alignment decide which
//     chain(s) the member conflicts with (nesting, relative position, coverage drop limits);
//  4. for every chain with conflicting members compute new limits, clip the chain and,
//     for coding chains, rescore it.
//
// genes - genes produced by FindGenes(); their chains may be clipped in place
void CChainer::CChainerImpl::TrimAlignmentsIncludedInDifferentGenes(list<CGene>& genes) {

    // step 1: (member, gene) pairs; a member is listed once per gene even if several chains use it
    TMemeberGeneVec members_genes;
    NON_CONST_ITERATE(list<CGene>, ig, genes) {
        CGene& gene = *ig;
        TMemberPtrSet gmembers;
        ITERATE(CGene, ic, gene) {
            CChain& chain = **ic;
            ITERATE(TContained, im, chain.m_members) {
                SChainMember& m = **im;
                _ASSERT(m.m_orig_align);
                if(m.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
                    continue;
                if(m.m_orig_align->Continuous())
                    gmembers.insert(&m);
            }
        }
        ITERATE(TMemberPtrSet, im, gmembers) {
            SChainMember& m = **im;
            members_genes.push_back(TMemeberGeneVec::value_type(&m,&gene));
        }
    }

    if(members_genes.empty())
        return;

    // equal (id, limits) keys become adjacent
    sort(members_genes.begin(),members_genes.end(),AlignIdOrder());

    // step 2: key -> gene -> members; a group is dropped as soon as it is complete
    // (the key changes) and turns out to belong to a single gene
    typedef map<CGene*,list<SChainMember*> > TGeneToMembers;
    typedef map<TIdLim, TGeneToMembers> TMembersInDiffGenes;
    TMembersInDiffGenes members_in_different_genes;
    {
        SChainMember* mp = members_genes.front().first;
        TIdLim idlim = AlignIdLimits(mp);
        CGene* genep = members_genes.front().second;
        members_in_different_genes[idlim][genep].push_back(mp);
    }
    for(int i = 1; i < (int)members_genes.size(); ++i) {
        TIdLim idlim_prev = AlignIdLimits(members_genes[i-1].first);
        SChainMember* mp = members_genes[i].first;
        TIdLim idlim = AlignIdLimits(mp);
        CGene* genep = members_genes[i].second;
        if(idlim_prev != idlim) {
            TMembersInDiffGenes::iterator it = members_in_different_genes.find(idlim_prev);
            if(it->second.size() < 2) // alignment in only one gene
                members_in_different_genes.erase(it);
        }
        members_in_different_genes[idlim][genep].push_back(mp);
    }
    {
        // the last group has no following key change, check it separately
        SChainMember* mp = members_genes.back().first;
        TIdLim idlim = AlignIdLimits(mp);
        TMembersInDiffGenes::iterator it = members_in_different_genes.find(idlim);
        if(it->second.size() < 2) // alignment in only one gene
            members_in_different_genes.erase(it);
    }

    // temporarily sort the members of the affected chains by pointer value so that
    // binary_search() can be used below; the genomic order is restored after step 3
    ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
        ITERATE(TGeneToMembers, ig1, imdg->second) {
            CGene& gene1 = *ig1->first;
            ITERATE(CGene, ic1, gene1) {
                CChain& chain1 = **ic1;
                sort(chain1.m_members.begin(),chain1.m_members.end());
            }
        }
    }

    // step 3: chain -> members which should not extend this chain
    typedef map<CChain*,TMemberPtrSet> TConflictMemebersInChains;
    TConflictMemebersInChains conflict_members_in_chains;

    ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
        ITERATE(TGeneToMembers, ig1, imdg->second) {
            CGene& gene1 = *ig1->first;
            ITERATE(CGene, ic1, gene1) {
                CChain* chain1p_orig = *ic1;
                // first member of this group which actually belongs to the chain (0 if none)
                SChainMember* mbr1p_orig = 0;
                for(list<SChainMember*>::const_iterator im = ig1->second.begin(); im != ig1->second.end() && mbr1p_orig == 0; ++im) {
                    if(binary_search(chain1p_orig->m_members.begin(),chain1p_orig->m_members.end(),*im))
                        mbr1p_orig = *im;
                }
                // only genes preceding gene1 in the map are visited, so each gene pair is handled once
                for(TGeneToMembers::const_iterator ig2 = imdg->second.begin(); mbr1p_orig != 0 && ig2 != ig1; ++ig2) {
                    CGene& gene2 = *ig2->first;
                    ITERATE(CGene, ic2, gene2) {
                        CChain* chain1p = chain1p_orig;
                        SChainMember* mbr1p = mbr1p_orig;
                        CChain* chain2p = *ic2;
                        SChainMember* mbr2p = 0;
                        for(list<SChainMember*>::const_iterator im = ig2->second.begin(); im != ig2->second.end() && mbr2p == 0; ++im) {
                            if(binary_search(chain2p->m_members.begin(),chain2p->m_members.end(),*im))
                                mbr2p = *im;
                        }

                        if(mbr2p != 0) {    // both chains have alignment

                            // 'core' of a chain: real CDS limits combined with the span from the
                            // end of the first exon to the start of the last exon (multi-exon chains)
                            TSignedSeqRange core1 = chain1p->RealCdsLimits();
                            if(chain1p->Exons().size() > 1)
                                core1 += TSignedSeqRange(chain1p->Exons().front().Limits().GetTo(),chain1p->Exons().back().Limits().GetFrom());
                            TSignedSeqRange core2 = chain2p->RealCdsLimits();
                            if(chain2p->Exons().size() > 1)
                                core2 += TSignedSeqRange(chain2p->Exons().front().Limits().GetTo(),chain2p->Exons().back().Limits().GetFrom());
                            _ASSERT(core1.NotEmpty() && core2.NotEmpty());

                            if(Precede(core2,core1)) {  // chain2 is on the left change them over to simplify coding below
                                swap(chain1p,chain2p);
                                swap(mbr1p,mbr2p);
                                swap(core1,core2);
                            }

                            CChain& chain1 = *chain1p;
                            CChain& chain2 = *chain2p;
                            // both members come from the same (id, limits) group, so one set of limits is enough
                            TSignedSeqRange align_lim = mbr1p->m_align->Limits();

                            if(CModelCompare::RangeNestedInIntron(core2, chain1)) {           // chain2 is nested
                                conflict_members_in_chains[&chain2].insert(mbr2p);
                            } else if(CModelCompare::RangeNestedInIntron(core1, chain2)) {    // chain1 is nested
                                conflict_members_in_chains[&chain1].insert(mbr1p);
                            }else if(Precede(core1,core2)) {                                  // chain1 on the left
                                if(Precede(align_lim,core1))                 // alignment on the left of chain1
                                    conflict_members_in_chains[&chain2].insert(mbr2p);
                                else if(Precede(core2,align_lim))            // alignment on the right of chain2
                                    conflict_members_in_chains[&chain1].insert(mbr1p);
                                else {                                       // alignment in between
                                    if(chain1.m_coverage_drop_right > 0 && chain2.m_coverage_drop_left > chain1.m_coverage_drop_right) {  // non overlapping drop limits
                                        if(align_lim.GetTo() > chain1.m_coverage_drop_right)
                                            conflict_members_in_chains[&chain1].insert(mbr1p);
                                        if(align_lim.GetFrom() < chain2.m_coverage_drop_left)
                                            conflict_members_in_chains[&chain2].insert(mbr2p);
                                    } else if(chain1.m_coverage_drop_right > 0 && chain2.m_coverage_drop_left < 0 && chain1.m_core_coverage > 2*chain2.m_core_coverage) {  // only chain1 has drop limit and is more expressed
                                        if(align_lim.GetTo() > chain1.m_coverage_drop_right)
                                            conflict_members_in_chains[&chain1].insert(mbr1p);
                                        if(align_lim.GetFrom() < max(chain2.m_coverage_bump_left,chain1.m_coverage_drop_right+50))
                                            conflict_members_in_chains[&chain2].insert(mbr2p);
                                    } else if(chain1.m_coverage_drop_right < 0 && chain2.m_coverage_drop_left > 0 && chain2.m_core_coverage > 2*chain1.m_core_coverage) {  // only chain2 has drop limit and is more expressed
                                        if(align_lim.GetFrom() < chain2.m_coverage_drop_left)
                                            conflict_members_in_chains[&chain2].insert(mbr2p);
                                        if(align_lim.GetTo() > chain2.m_coverage_drop_left-50 || (chain1.m_coverage_bump_right > 0 && align_lim.GetTo() > chain1.m_coverage_bump_right))
                                            conflict_members_in_chains[&chain1].insert(mbr1p);
                                    } else {
                                        // no usable coverage information: the member conflicts with both chains
                                        conflict_members_in_chains[&chain1].insert(mbr1p);
                                        conflict_members_in_chains[&chain2].insert(mbr2p);
                                    }
                                }
                            } else {
                                // cores overlap without nesting: the member conflicts with both chains
                                conflict_members_in_chains[&chain1].insert(mbr1p);
                                conflict_members_in_chains[&chain2].insert(mbr2p);
                            }
                        }
                    }
                }
            }
        }
    }

    // restore the genomic order of the members changed for binary_search() above
    ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
        ITERATE(TGeneToMembers, ig1, imdg->second) {
            CGene& gene1 = *ig1->first;
            ITERATE(CGene, ic1, gene1) {
                CChain& chain1 = **ic1;
                sort(chain1.m_members.begin(),chain1.m_members.end(),GenomeOrderD());
            }
        }
    }


    // step 4: clip every chain which has conflicting members
    ITERATE(TConflictMemebersInChains, it, conflict_members_in_chains) {
        CChain& chain = *it->first;
        const TMemberPtrSet& conflict_members = it->second;

        CAlignMap amap = chain.GetAlignMap();

        // region which is never clipped: first-exon end to last-exon start widened by 15 bases
        // (restricted to the chain), replaced by the CDS limits for coding chains
        TSignedSeqRange hard_limits(chain.Exons().front().Limits().GetTo()-15,chain.Exons().back().Limits().GetFrom()+15);
        hard_limits = (hard_limits & chain.Limits());
        if(chain.ReadingFrame().NotEmpty())
            hard_limits = (chain.OpenCds() ? chain.MaxCdsLimits() : chain.RealCdsLimits());

        TSignedSeqRange noclip_limits = hard_limits;

        /*
        int hard_limits_len = amap.FShiftedLen(hard_limits);
        ITERATE(TContained, i, chain.m_members) {
            const CGeneModel& a = *(*i)->m_align;
            if(a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
                continue;
            TSignedSeqRange alim(amap.ShrinkToRealPoints(a.Limits()&chain.Limits(),false));
            if(Include(alim,hard_limits.GetFrom()) ) {
                TSignedSeqRange l(hard_limits.GetFrom(),alim.GetTo());
                l = amap.ShrinkToRealPoints(l,false);
                int len = 0;
                if(l.NotEmpty())
                    len = amap.FShiftedLen(l);
                if(len > 0.75*a.AlignLen() || len > 0.75*hard_limits_len)
                    noclip_limits.SetFrom(min(noclip_limits.GetFrom(),alim.GetFrom()));
            }
            if(Include(alim,hard_limits.GetTo())) {
                TSignedSeqRange l(alim.GetFrom(),hard_limits.GetTo());
                l = amap.ShrinkToRealPoints(l,false);
                int len = 0;
                if(l.NotEmpty())
                    len = amap.FShiftedLen(l);
                if(len > 0.75*a.AlignLen() || len > 0.75*hard_limits_len)
                    noclip_limits.SetTo(max(noclip_limits.GetTo(),alim.GetTo()));
            }
        }

        noclip_limits = (noclip_limits & chain.Limits());
        */

        // chains with polyA/cap: extend the protected region on the corresponding side
        // (which side depends on the strand) up to the coverage drop or, if there is
        // no drop limit (negative value), up to the polyA/cap soft limit
        if(chain.Status()&CGeneModel::ePolyA) {
            if(chain.Strand() == ePlus) {
                if(chain.m_coverage_drop_right < 0)
                    noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_polya_cap_right_soft_limit));
                else
                    noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_coverage_drop_right));
            } else {
                if(chain.m_coverage_drop_left < 0)
                    noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_polya_cap_left_soft_limit));
                else
                    noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_coverage_drop_left));
            }
        }
        if(chain.Status()&CGeneModel::eCap) {
            if(chain.Strand() == ePlus) {
                if(chain.m_coverage_drop_left < 0)
                    noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_polya_cap_left_soft_limit));
                else
                    noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_coverage_drop_left));
            } else {
                if(chain.m_coverage_drop_right < 0)
                    noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_polya_cap_right_soft_limit));
                else
                    noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_coverage_drop_right));
            }
        }

        // shrink the chain limits so that conflicting members sticking out of the
        // protected region no longer contribute, but never cut into the protected region
        TSignedSeqRange new_limits = chain.Limits();
        ITERATE(TMemberPtrSet, im, conflict_members) {
            TSignedSeqRange alim = (*im)->m_align->Limits()&chain.Limits();
            if(alim.Empty())
                continue;
            alim = amap.ShrinkToRealPoints(alim);
            if(alim.Empty())
                continue;
            if(alim.GetFrom() < noclip_limits.GetFrom()) {
                int to = min(noclip_limits.GetFrom(),alim.GetTo());
                if(chain.m_coverage_drop_left > 0 && Include(alim,chain.m_coverage_drop_left)) {
                    to = min(noclip_limits.GetFrom(),chain.m_coverage_drop_left);
                }
                new_limits.SetFrom(max(new_limits.GetFrom(),to));
            } else if(alim.GetTo() > noclip_limits.GetTo()) {
                int from = max(noclip_limits.GetTo(),alim.GetFrom());
                if(chain.m_coverage_drop_right > 0 && Include(alim,chain.m_coverage_drop_right)) {
                    from = max(noclip_limits.GetTo(),chain.m_coverage_drop_right);
                }
                new_limits.SetTo(min(new_limits.GetTo(),from));
            }
        }

        // leftmost donor side (exon end) and rightmost acceptor side (exon start)
        // of the spliced exon boundaries still inside the new limits; -1 if none
        int left_splice = -1;
        int right_splice = -1;
        for(int e = 1; e < (int)chain.Exons().size(); ++e) {
            if(left_splice < 0 && chain.Exons()[e-1].m_ssplice && Include(new_limits,chain.Exons()[e-1].GetTo()))
                left_splice = chain.Exons()[e-1].GetTo();
            if(chain.Exons()[e].m_fsplice && Include(new_limits,chain.Exons()[e].GetFrom()))
                right_splice = chain.Exons()[e].GetFrom();
        }
        // weights of the member ends, collected over the members which use these splices
        map<int,double> left_weights;
        double left_weights_total = 0.;
        map<int,double> right_weights;
        double right_weights_total = 0.;
        ITERATE(TContained, i, chain.m_members) {
            const CGeneModel& a = *(*i)->m_align;
            if(a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
                continue;
            TSignedSeqRange alim(amap.ShrinkToRealPoints(a.Limits()&chain.Limits(),false));
            for(int e = 1; e < (int)a.Exons().size(); ++e) {
                if(a.Exons()[e-1].m_ssplice && a.Exons()[e-1].GetTo() == left_splice) {
                    left_weights[alim.GetFrom()] += a.Weight();
                    left_weights_total += a.Weight();
                }
                if(a.Exons()[e].m_fsplice && a.Exons()[e].GetFrom() == right_splice) {
                    right_weights[alim.GetTo()] += a.Weight();
                    right_weights_total += a.Weight();
                }
            }
        }
        // extend the new limits outwards until 90% of the supporting weight is covered
        if(left_weights_total > 0.) {
            int left = numeric_limits<int>::max();
            double t = 0;
            for(map<int,double>::reverse_iterator it = left_weights.rbegin(); it != left_weights.rend(); ++it) {
                if(t < 0.9*left_weights_total)
                    left = it->first;
                t += it->second;
            }
            if(left < new_limits.GetFrom())
                new_limits.SetFrom(left);
        }
        if(right_weights_total > 0.) {
            int right = 0;
            double t = 0;
            for(map<int,double>::iterator it = right_weights.begin(); it != right_weights.end(); ++it) {
                if(t < 0.9*right_weights_total)
                    right = it->first;
                t += it->second;
            }
            if(right > new_limits.GetTo())
                new_limits.SetTo(right);
        }

        //if has to clip, clip to next cap/polya
        if(new_limits.GetFrom() != chain.Limits().GetFrom() && chain.m_polya_cap_left_soft_limit < chain.Limits().GetTo())
            new_limits.SetFrom(chain.m_polya_cap_left_soft_limit);
        if(new_limits.GetTo() != chain.Limits().GetTo() && chain.m_polya_cap_right_soft_limit > chain.Limits().GetFrom())
            new_limits.SetTo(chain.m_polya_cap_right_soft_limit);

        //don't clip confirmed ends
        if(chain.Status()&CGeneModel::eLeftConfirmed)
            new_limits.SetFrom(chain.Limits().GetFrom());
        if(chain.Status()&CGeneModel::eRightConfirmed)
            new_limits.SetTo(chain.Limits().GetTo());

        if(new_limits != chain.Limits()) {
            // record which side(s) are clipped
            string note;
            if(new_limits.GetFrom() != chain.Limits().GetFrom())
                note += "Left";
            if(new_limits.GetTo() != chain.Limits().GetTo())
                note += "Right";
            note += " overlap UTR clip";
            chain.AddComment(note);
            _ASSERT(new_limits.NotEmpty());

            bool wasopen = chain.OpenCds();
            chain.ClipChain(new_limits);
            if(chain.Type()&CGeneModel::eNested)
                chain.ClipLowCoverageUTR(0.1);
            _ASSERT(chain.Limits().NotEmpty());
            if(chain.ReadingFrame().NotEmpty()) {
                // rescore the clipped coding chain; put the 'open' state of the CDS back
                // if rescoring changed it (when opening, only if the CDS has a start)
                m_gnomon->GetScore(chain, !no5pextension);
                CCDSInfo cds = chain.GetCdsInfo();
                if(wasopen != chain.OpenCds() && (wasopen == false || cds.HasStart())) {
                    cds.SetScore(cds.Score(),wasopen);
                    chain.SetCdsInfo(cds);
                }
            }
            chain.CalculateDropLimits();
        }
    }
}
1603
1604
1605
1606 //visits all levels of nested and adds uniquely to contained
AddToContained(TContained & contained,TMemberPtrSet & included_in_list)1607 void SChainMember::AddToContained(TContained& contained, TMemberPtrSet& included_in_list) {
1608
1609 list<const SChainMember*> not_visited(1,this);
1610 while(!not_visited.empty()) {
1611 const SChainMember* mbr = not_visited.front();
1612 for(int c = 0; c < (int)mbr->m_contained->size(); ++c) {
1613 SChainMember* mi = (*mbr->m_contained)[c];
1614 if(c < mbr->m_identical_count) {
1615 if(included_in_list.insert(mi).second) {
1616 contained.push_back(mi); //action
1617 if(mi->m_copy != 0)
1618 included_in_list.insert(mi->m_copy->begin(),mi->m_copy->end());
1619 }
1620 } else if(included_in_list.find(mi) == included_in_list.end()) {
1621 not_visited.push_back(mi); //store for future
1622 }
1623 }
1624 not_visited.pop_front();
1625 }
1626 }
1627
CollectContainedForMemeber()1628 TContained SChainMember::CollectContainedForMemeber() {
1629
1630 TContained contained;
1631 TMemberPtrSet included_in_list;
1632 AddToContained(contained, included_in_list);
1633
1634 return contained;
1635 }
1636
CollectContainedForChain()1637 TContained SChainMember::CollectContainedForChain()
1638 {
1639 TContained contained;
1640 TMemberPtrSet included_in_list;
1641
1642 AddToContained(contained, included_in_list);
1643
1644 for (SChainMember* left = m_left_member; left != 0; left = left->m_left_member) {
1645 left->AddToContained(contained, included_in_list);
1646 }
1647
1648 for (SChainMember* right = m_right_member; right != 0; right = right->m_right_member) {
1649 right->AddToContained(contained, included_in_list);
1650 }
1651
1652 return contained;
1653 }
1654
// Constant used in m_cds comparisons (e.g. m_cds < START_BONUS+25 in the Mark*ForChain methods).
// NOTE(review): presumably the score bonus given for a start codon - confirm against the scoring code.
#define START_BONUS 600
1656
MarkIncludedForChain()1657 void SChainMember::MarkIncludedForChain()
1658 {
1659 TContained contained = CollectContainedForChain();
1660 NON_CONST_ITERATE (TContained, i, contained) {
1661 SChainMember* mi = *i;
1662 mi->m_included = true;
1663 if (mi->m_copy != 0) {
1664 ITERATE(TContained, j, *mi->m_copy) {
1665 SChainMember* mj = *j;
1666 if(mj->m_type != eCDS || mj->m_cds < START_BONUS+25 ||
1667 (mi->m_align->Strand() == mj->m_align->Strand() &&
1668 (mi->m_cds_info->ReadingFrame().GetFrom() == mj->m_cds_info->ReadingFrame().GetFrom() || // same copy or supressed start
1669 mi->m_cds_info->ReadingFrame().GetTo() == mj->m_cds_info->ReadingFrame().GetTo()))) // same copy or supressed start
1670 mj->m_included = true;
1671 }
1672 }
1673 }
1674 }
1675
MarkPostponedForChain()1676 void SChainMember::MarkPostponedForChain()
1677 {
1678 TContained contained = CollectContainedForChain();
1679 NON_CONST_ITERATE (TContained, i, contained) {
1680 SChainMember* mi = *i;
1681 mi->m_postponed = true;
1682 if (mi->m_copy != 0) {
1683 ITERATE(TContained, j, *mi->m_copy) {
1684 SChainMember* mj = *j;
1685 if(mj->m_type != eCDS || mj->m_cds < START_BONUS+25 ||
1686 (mi->m_align->Strand() == mj->m_align->Strand() &&
1687 (mi->m_cds_info->ReadingFrame().GetFrom() == mj->m_cds_info->ReadingFrame().GetFrom() || // same copy or supressed start
1688 mi->m_cds_info->ReadingFrame().GetTo() == mj->m_cds_info->ReadingFrame().GetTo()))) // same copy or supressed start
1689 mj->m_postponed = true;
1690 }
1691 }
1692 }
1693 }
1694
MarkUnwantedCopiesForChain(const TSignedSeqRange & cds)1695 void SChainMember::MarkUnwantedCopiesForChain(const TSignedSeqRange& cds)
1696 {
1697 TContained contained = CollectContainedForChain();
1698 NON_CONST_ITERATE (TContained, i, contained) {
1699 SChainMember* mi = *i;
1700 CGeneModel& algni = *mi->m_align;
1701 const CCDSInfo& cinfoi = *mi->m_cds_info;
1702 if(Include(cds, cinfoi.ReadingFrame())) {
1703 mi->m_marked_for_retention = true;
1704 mi->m_marked_for_deletion = false;
1705 if (mi->m_copy != 0) {
1706 ITERATE(TContained, j, *mi->m_copy) {
1707 SChainMember* mj = *j;
1708 const CCDSInfo& cinfoj = *mj->m_cds_info;
1709 if(mj->m_marked_for_retention) // already included in cds
1710 continue;
1711 else if(cinfoi.HasStart() || cinfoj.HasStart()) { // don't delete copy which overrides the start or has the start
1712 if((algni.Strand() == ePlus && cinfoi.ReadingFrame().GetTo() == cinfoj.ReadingFrame().GetTo()) ||
1713 (algni.Strand() == eMinus && cinfoi.ReadingFrame().GetFrom() == cinfoj.ReadingFrame().GetFrom()))
1714 continue;
1715 }
1716 mj->m_marked_for_deletion = true;
1717 }
1718 }
1719 }
1720 }
1721 }
1722
1723
1724 struct LeftOrder
1725 {
operator ()LeftOrder1726 bool operator()(const SChainMember* ap, const SChainMember* bp) // right end increasing, short first if right end equal
1727 {
1728 TSignedSeqRange alimits = ap->m_align->Limits();
1729 //ignore flexible ends for sorting
1730 if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1731 alimits.SetFrom(alimits.GetTo());
1732 if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1733 alimits.SetTo(alimits.GetFrom());
1734 TSignedSeqRange blimits = bp->m_align->Limits();
1735 //ignore flexible ends for sorting
1736 if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1737 blimits.SetFrom(blimits.GetTo());
1738 if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1739 blimits.SetTo(blimits.GetFrom());
1740
1741 if(alimits.GetTo() == blimits.GetTo())
1742 return (alimits.GetFrom() > blimits.GetFrom());
1743 else
1744 return (alimits.GetTo() < blimits.GetTo());
1745 }
1746 };
1747
1748 struct LeftOrderD // use for sorting not for finding
1749 {
operator ()LeftOrderD1750 bool operator()(const SChainMember* ap, const SChainMember* bp) // right end increasing, short first if right end equal
1751 {
1752 TSignedSeqRange alimits = ap->m_align->Limits();
1753 //ignore flexible ends for sorting
1754 if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1755 alimits.SetFrom(alimits.GetTo());
1756 if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1757 alimits.SetTo(alimits.GetFrom());
1758 TSignedSeqRange blimits = bp->m_align->Limits();
1759 //ignore flexible ends for sorting
1760 if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1761 blimits.SetFrom(blimits.GetTo());
1762 if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1763 blimits.SetTo(blimits.GetFrom());
1764
1765 if(alimits == blimits)
1766 return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1767 else if(alimits.GetTo() == blimits.GetTo())
1768 return (alimits.GetFrom() > blimits.GetFrom());
1769 else
1770 return (alimits.GetTo() < blimits.GetTo());
1771 }
1772 };
1773
1774
1775 struct RightOrder
1776 {
operator ()RightOrder1777 bool operator()(const SChainMember* ap, const SChainMember* bp) // left end decreasing, short first if left end equal
1778 {
1779 TSignedSeqRange alimits = ap->m_align->Limits();
1780 //ignore flexible ends for sorting
1781 if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1782 alimits.SetFrom(alimits.GetTo());
1783 if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1784 alimits.SetTo(alimits.GetFrom());
1785 TSignedSeqRange blimits = bp->m_align->Limits();
1786 //ignore flexible ends for sorting
1787 if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1788 blimits.SetFrom(blimits.GetTo());
1789 if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1790 blimits.SetTo(blimits.GetFrom());
1791
1792 if(alimits.GetFrom() == blimits.GetFrom())
1793 return (alimits.GetTo() < blimits.GetTo());
1794 else
1795 return (alimits.GetFrom() > blimits.GetFrom());
1796 }
1797 };
1798
1799 struct RightOrderD
1800 {
operator ()RightOrderD1801 bool operator()(const SChainMember* ap, const SChainMember* bp) // left end decreasing, short first if left end equal
1802 {
1803 TSignedSeqRange alimits = ap->m_align->Limits();
1804 //ignore flexible ends for sorting
1805 if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1806 alimits.SetFrom(alimits.GetTo());
1807 if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1808 alimits.SetTo(alimits.GetFrom());
1809 TSignedSeqRange blimits = bp->m_align->Limits();
1810 //ignore flexible ends for sorting
1811 if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1812 blimits.SetFrom(blimits.GetTo());
1813 if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1814 blimits.SetTo(blimits.GetFrom());
1815
1816 if(alimits == blimits)
1817 return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1818 else if(alimits.GetFrom() == blimits.GetFrom())
1819 return (alimits.GetTo() < blimits.GetTo());
1820 else
1821 return (alimits.GetFrom() > blimits.GetFrom());
1822 }
1823 };
1824
1825
1826 struct CdsNumOrder
1827 {
operator ()CdsNumOrder1828 bool operator()(const SChainMember* ap, const SChainMember* bp)
1829 {
1830 if(max(ap->m_cds,bp->m_cds) >= 300 && ap->m_cds != bp->m_cds) // only long cdses count
1831 return (ap->m_cds > bp->m_cds);
1832 else if(fabs(ap->m_splice_num - bp->m_splice_num) > 0.001)
1833 return (ap->m_splice_num > bp->m_splice_num);
1834 else if(fabs(ap->m_num - bp->m_num) > 0.001)
1835 return (ap->m_num > bp->m_num);
1836 else
1837 return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1838 }
1839 };
1840
1841 struct ScoreOrder
1842 {
operator ()ScoreOrder1843 bool operator()(const SChainMember* ap, const SChainMember* bp)
1844 {
1845 if (ap->m_cds_info->Score() == bp->m_cds_info->Score())
1846 return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1847 else
1848 return (ap->m_cds_info->Score() > bp->m_cds_info->Score());
1849 }
1850 };
1851
// Sorts 'container' in place and removes duplicate elements, leaving one
// element of each group of equal values. Requires random-access iterators and
// a range erase(first, last).
template <class C>
void uniq(C& container)
{
    typename C::iterator first = container.begin();
    typename C::iterator last = container.end();

    sort(first, last);
    // unique() compacts the distinct values to the front; drop the leftover tail
    container.erase(unique(first, last), last);
}
1858
1859 class CChainMembers : public vector<SChainMember*> {
1860 public:
CChainMembers()1861 CChainMembers() { m_extra_cds.push_back(CCDSInfo()); } // empty cds for utrs; first in the list
1862 CChainMembers(TGeneModelList& clust, TOrigAligns& orig_aligns, TUnmodAligns& unmodified_aligns);
1863 void InsertMember(CGeneModel& algn, SChainMember* copy_ofp = 0);
1864 void InsertMemberCopyWithCds(const CCDSInfo& cds, SChainMember* copy_ofp);
1865 void InsertMemberCopyAndStoreCds(const CCDSInfo& cds, SChainMember* copy_ofp);
1866 void InsertMemberCopyWithoutCds(SChainMember* copy_ofp);
1867 void InsertMember(SChainMember& m, SChainMember* copy_ofp = 0);
1868 void DuplicateUTR(SChainMember* copy_ofp);
1869 void SpliceFromOther(CChainMembers& other);
1870 private:
1871 CChainMembers(const CChainMembers& object) = delete;
1872 CChainMembers& operator=(const CChainMembers& object) = delete;
1873 list<SChainMember> m_members;
1874 list<TContained> m_copylist;
1875 list<CAlignMap> m_align_maps;
1876 list<TContained> m_containedlist;
1877 list<CCDSInfo> m_extra_cds;
1878 };
1879
SpliceFromOther(CChainMembers & other)1880 void CChainMembers::SpliceFromOther(CChainMembers& other) {
1881 m_members.splice(m_members.end(),other.m_members);
1882 m_copylist.splice(m_copylist.end(),other.m_copylist);
1883 m_align_maps.splice(m_align_maps.end(),other.m_align_maps);
1884 m_containedlist.splice(m_containedlist.end(),other.m_containedlist);
1885 m_extra_cds.splice(m_extra_cds.end(),other.m_extra_cds);
1886 insert(end(),other.begin(),other.end());
1887 }
1888
InsertMemberCopyWithCds(const CCDSInfo & cds,SChainMember * copy_ofp)1889 void CChainMembers::InsertMemberCopyWithCds(const CCDSInfo& cds, SChainMember* copy_ofp) {
1890
1891 SChainMember mbr = *copy_ofp;
1892 mbr.m_cds_info = &cds;
1893 mbr.m_type = eCDS;
1894 InsertMember(mbr, copy_ofp);
1895 }
1896
InsertMemberCopyAndStoreCds(const CCDSInfo & cds,SChainMember * copy_ofp)1897 void CChainMembers::InsertMemberCopyAndStoreCds(const CCDSInfo& cds, SChainMember* copy_ofp) {
1898
1899 m_extra_cds.push_back(cds);
1900 InsertMemberCopyWithCds(m_extra_cds.back(), copy_ofp);
1901 }
1902
InsertMemberCopyWithoutCds(SChainMember * copy_ofp)1903 void CChainMembers::InsertMemberCopyWithoutCds(SChainMember* copy_ofp) {
1904
1905 SChainMember mbr = *copy_ofp;
1906 mbr.m_cds_info = &m_extra_cds.front(); // empty cds
1907 mbr.m_type = eLeftUTR;
1908 InsertMember(mbr, copy_ofp);
1909 }
1910
1911
InsertMember(CGeneModel & algn,SChainMember * copy_ofp)1912 void CChainMembers::InsertMember(CGeneModel& algn, SChainMember* copy_ofp)
1913 {
1914 SChainMember mbr;
1915 mbr.m_align = &algn;
1916 mbr.m_cds_info = &algn.GetCdsInfo();
1917 mbr.m_type = eCDS;
1918 if(algn.Score() == BadScore())
1919 mbr.m_type = eLeftUTR;
1920 if(copy_ofp) {
1921 mbr.m_orig_align = copy_ofp->m_orig_align;
1922 mbr.m_unmd_align = copy_ofp->m_unmd_align;
1923 }
1924 InsertMember(mbr, copy_ofp);
1925 }
1926
InsertMember(SChainMember & m,SChainMember * copy_ofp)1927 void CChainMembers::InsertMember(SChainMember& m, SChainMember* copy_ofp)
1928 {
1929 m.m_mem_id = size()+1;
1930 m_members.push_back(m);
1931 push_back(&m_members.back());
1932
1933 m_containedlist.push_back(TContained());
1934 m_members.back().m_contained = &m_containedlist.back();
1935
1936 _ASSERT(copy_ofp == 0 || (m.m_align->Exons()==copy_ofp->m_align->Exons() && m.m_align->FrameShifts()==copy_ofp->m_align->FrameShifts()));
1937
1938 if(copy_ofp == 0 || m.m_align->Strand() != copy_ofp->m_align->Strand()) { // first time or reversed copy
1939 m_align_maps.push_back(CAlignMap(m.m_align->Exons(), m.m_align->FrameShifts(), m.m_align->Strand()));
1940 m_members.back().m_align_map = &m_align_maps.back();
1941 } else {
1942 m_members.back().m_align_map = copy_ofp->m_align_map;
1943 }
1944
1945 if(copy_ofp != 0) { // we are making a copy of member
1946 if(copy_ofp->m_copy == 0) {
1947 m_copylist.push_back(TContained(1,copy_ofp));
1948 copy_ofp->m_copy = &m_copylist.back();
1949 }
1950 m_members.back().m_copy = copy_ofp->m_copy;
1951 copy_ofp->m_copy->push_back(&m_members.back());
1952 }
1953 }
1954
DuplicateUTR(SChainMember * copy_ofp)1955 void CChainMembers::DuplicateUTR(SChainMember* copy_ofp)
1956 {
1957 _ASSERT(copy_ofp->m_type == eLeftUTR);
1958 SChainMember new_mbr = *copy_ofp;
1959 new_mbr.m_type = eRightUTR;
1960 InsertMember(new_mbr, copy_ofp);
1961 }
1962
1963
CChainMembers(TGeneModelList & clust,TOrigAligns & orig_aligns,TUnmodAligns & unmodified_aligns)1964 CChainMembers::CChainMembers(TGeneModelList& clust, TOrigAligns& orig_aligns, TUnmodAligns& unmodified_aligns)
1965 {
1966 m_extra_cds.push_back(CCDSInfo()); // empty cds for utrs; first in the list
1967 NON_CONST_ITERATE(TGeneModelList, itcl, clust) {
1968 InsertMember(*itcl);
1969 m_members.back().m_orig_align = orig_aligns[itcl->ID()];
1970 if(unmodified_aligns.count(itcl->ID()))
1971 m_members.back().m_unmd_align = &unmodified_aligns[itcl->ID()];
1972 }
1973 }
1974
1975
ExtendedMaxCdsLimits(const CGeneModel & a,const CCDSInfo & cds)1976 TSignedSeqRange ExtendedMaxCdsLimits(const CGeneModel& a, const CCDSInfo& cds)
1977 {
1978 TSignedSeqRange limits(a.Limits().GetFrom()-1,a.Limits().GetTo()+1);
1979
1980 return limits & cds.MaxCdsLimits();
1981 }
1982
1983
IncludeInContained(SChainMember & big,SChainMember & small)1984 void CChainer::CChainerImpl::IncludeInContained(SChainMember& big, SChainMember& small)
1985 {
1986 //all identical members are contained in each other; only one of them (with smaller m_mem_id) is contained in other members
1987 TSignedSeqRange big_limits = big.m_align->Limits();
1988 if(big.m_align->Status()&CGeneModel::eLeftFlexible)
1989 big_limits.SetFrom(big_limits.GetTo());
1990 if(big.m_align->Status()&CGeneModel::eRightFlexible)
1991 big_limits.SetTo(big_limits.GetFrom());
1992 TSignedSeqRange small_limits = small.m_align->Limits();
1993 bool small_flex = false;
1994 if(small.m_align->Status()&CGeneModel::eLeftFlexible) {
1995 small_limits.SetFrom(small_limits.GetTo());
1996 small_flex = true;
1997 }
1998 if(small.m_align->Status()&CGeneModel::eRightFlexible) {
1999 small_limits.SetTo(small_limits.GetFrom());
2000 small_flex = true;
2001 }
2002
2003 if(big_limits == small_limits) { // identical
2004 ++big.m_identical_count;
2005 big.m_contained->push_back(&small);
2006 return;
2007 } else if(big.m_sink_for_contained != nullptr &&
2008 small_limits.GetTo() <= big.m_sink_for_contained->m_align->Limits().GetTo() &&
2009 CanIncludeJinI(*big.m_sink_for_contained, small)) {
2010 return; // contained in next level
2011 } else {
2012 big.m_contained->push_back(&small);
2013 if(!small_flex && (big.m_sink_for_contained == nullptr || small_limits.GetTo() > big.m_sink_for_contained->m_align->Limits().GetTo()))
2014 big.m_sink_for_contained = &small;
2015 }
2016 }
2017
2018
// Public entry point: forwards 'models' to the CutParts() of the
// implementation object held in m_data.
void CChainer::CutParts(TGeneModelList& models)
{
    m_data->CutParts(models);
}
2023
CutParts(TGeneModelList & models)2024 void CChainer::CChainerImpl::CutParts(TGeneModelList& models) {
2025 ERASE_ITERATE(TGeneModelList, im, models) {
2026 TGeneModelList parts = GetAlignParts(*im, true);
2027 if(!parts.empty()) {
2028 models.splice(models.begin(),parts);
2029 models.erase(im);
2030 }
2031 }
2032 }
2033
DuplicateNotOriented(CChainMembers & pointers,TGeneModelList & clust)2034 void CChainer::CChainerImpl::DuplicateNotOriented(CChainMembers& pointers, TGeneModelList& clust)
2035 {
2036 unsigned int initial_size = pointers.size();
2037 for(unsigned int i = 0; i < initial_size; ++i) {
2038 SChainMember& mbr = *pointers[i];
2039 CGeneModel& algn = *mbr.m_align;
2040 if((algn.Status()&CGeneModel::eUnknownOrientation) != 0) {
2041 CGeneModel new_algn = algn;
2042 new_algn.ReverseComplementModel();
2043 new_algn.Status() &= ~CGeneModel::eReversed;
2044 clust.push_back(new_algn);
2045 pointers.InsertMember(clust.back(), &mbr); //reversed copy
2046 }
2047 }
2048 }
2049
DuplicateUTRs(CChainMembers & pointers)2050 void CChainer::CChainerImpl::DuplicateUTRs(CChainMembers& pointers)
2051 {
2052 unsigned int initial_size = pointers.size();
2053 for(unsigned int i = 0; i < initial_size; ++i) {
2054 SChainMember& mbr = *pointers[i];
2055 if(mbr.m_align->Status()&CGeneModel::eLeftFlexible)
2056 mbr.m_type = eRightUTR;
2057 else if(mbr.m_align->Status()&CGeneModel::eRightFlexible)
2058 mbr.m_type = eLeftUTR;
2059 else if(mbr.m_cds_info->Score() == BadScore())
2060 pointers.DuplicateUTR(&mbr);
2061 }
2062 }
2063
CalculateSpliceWeights(CChainMembers & pointers)2064 void CChainer::CChainerImpl::CalculateSpliceWeights(CChainMembers& pointers)
2065 {
2066 map<int, set<int> > oriented_splices;
2067 ITERATE(set<TSignedSeqRange>, i, oriented_introns_plus) {
2068 oriented_splices[ePlus].insert(i->GetFrom());
2069 oriented_splices[ePlus].insert(i->GetTo());
2070 }
2071 ITERATE(set<TSignedSeqRange>, i, oriented_introns_minus) {
2072 oriented_splices[eMinus].insert(i->GetFrom());
2073 oriented_splices[eMinus].insert(i->GetTo());
2074 }
2075
2076 NON_CONST_ITERATE(CChainMembers, i, pointers) {
2077 SChainMember& mbr = **i;
2078 CGeneModel& algn = *mbr.m_align;
2079 if(algn.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
2080 continue;
2081 set<int>& ospl = oriented_splices[algn.Strand()];
2082 ITERATE(CGeneModel::TExons, ie, algn.Exons()) {
2083 TSignedSeqRange exon = *ie;
2084 for(set<int>::iterator spli = ospl.lower_bound(exon.GetFrom()); spli != ospl.end() && *spli <= exon.GetTo(); ++spli)
2085 mbr.m_splice_weight += algn.Weight();
2086 }
2087 }
2088 }
2089
// Copies premature stops with status eSelenocysteine or eGenomeNotCorrect
// found in the cds info of some alignments onto the other alignments of
// 'pointers' that cover the same positions on the same strand.
//
// Pass 1 collects such pstops per strand, each paired with an intron range
//        (or the (0,0) placeholder), and records the overall span
//        [left, right] they occupy.
// Pass 2 visits the alignments overlapping that span:
//        - protein alignments get the status of matching collected pstops
//          assigned to their own pstops;
//        - alignments with an empty reading frame get a new cds info holding
//          the collected pstops that fit into their exons.
void CChainer::CChainerImpl::ReplicatePStops(CChainMembers& pointers)
{
    // span covered by all collected pstops; stays (int max, 0) when none is found
    int left = numeric_limits<int>::max();
    int right = 0;
    typedef vector<pair<CCDSInfo::SPStop,TSignedSeqRange> > TPstopIntron;
    TPstopIntron pstops_with_intron_plus;
    TPstopIntron pstops_with_intron_minus;
    ITERATE(CChainMembers, i, pointers) {
        SChainMember& mbr = **i;
        CGeneModel& algn = *mbr.m_align;
        TPstopIntron& pstops_with_intron = (algn.Strand() == ePlus) ? pstops_with_intron_plus : pstops_with_intron_minus;
        ITERATE(CCDSInfo::TPStops, s, algn.GetCdsInfo().PStops()) {
            if(s->m_status == CCDSInfo::eSelenocysteine || s->m_status == CCDSInfo::eGenomeNotCorrect) {
                left = min(left,s->GetFrom());
                right = max(right,s->GetTo());
                if(s->GetLength() == 3) {
                    // pstop of length 3 is stored with the (0,0) placeholder instead of an intron
                    pstops_with_intron.push_back(make_pair(*s,TSignedSeqRange(0,0)));
                } else {
                    // any other length: the pstop is stored once with every intron of this alignment
                    // (this inner 'i' shadows the outer loop iterator; 'algn' is already bound)
                    for(int i = 1; i < (int)algn.Exons().size(); ++i) {
                        TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
                        pstops_with_intron.push_back(make_pair(*s,intron));
                    }
                }
            }
        }
    }
    // sort and drop duplicates; the early 'break' in the second pass relies on the sorted order
    uniq(pstops_with_intron_plus);
    uniq(pstops_with_intron_minus);

    ITERATE(CChainMembers, i, pointers) {
        SChainMember& mbr = **i;
        CGeneModel& algn = *mbr.m_align;
        // alignment does not overlap the span of collected pstops
        if(algn.Limits().GetFrom() > right || algn.Limits().GetTo() < left)
            continue;
        // protein alignment without pstops has nothing to be updated
        if((algn.Type()&CGeneModel::eProt) && !algn.PStop())
            continue;
        // flexible alignments are skipped
        if(algn.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
            continue;

        TPstopIntron& pstops_with_intron = (algn.Strand() == ePlus) ? pstops_with_intron_plus : pstops_with_intron_minus;
        if(pstops_with_intron.empty())
            continue;

        if(algn.Type()&CGeneModel::eProt) {
            // protein: work on copies of the cds info and its pstops, then write the cds info back
            CCDSInfo cds = algn.GetCdsInfo();
            CCDSInfo::TPStops pstops = cds.PStops();
            NON_CONST_ITERATE(CCDSInfo::TPStops, s, pstops) {
                ITERATE(TPstopIntron, si, pstops_with_intron) {
                    if(si->second.GetLength() == 1) {  // no split
                        // (presumably matches the (0,0) placeholder stored in the first pass)
                        if(si->first == *s)
                            *s = si->first;           // assigns status
                    } else {
                        // split pstop: the stored intron must be one of this alignment's introns
                        for(int i = 1; i < (int)algn.Exons().size(); ++i) {
                            TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
                            if(si->second == intron && si->first == *s)
                                *s = si->first;           // assigns status
                        }
                    }
                }
            }
            cds.ClearPStops();
            ITERATE(CCDSInfo::TPStops, s, pstops)
                cds.AddPStop(*s);
            algn.SetCdsInfo(cds);
        } else if(algn.ReadingFrame().Empty()) {
            // not a protein and no reading frame: build a cds info that carries only pstops
            CCDSInfo cds;
            const CGeneModel::TExons& exons = algn.Exons();
            ITERATE(TPstopIntron, si, pstops_with_intron) {
                if(si->first.GetTo() < algn.Limits().GetFrom())
                    continue;
                if(si->first.GetFrom() > algn.Limits().GetTo())
                    break;
                for(int i = 0; i < (int)exons.size(); ++i) {
                    if(Include(exons[i].Limits(),si->first.GetFrom())) {
                        if(si->second.GetLength() == 1) {  // no split
                            // whole pstop must end within the same exon
                            if(si->first.GetTo() <= exons[i].GetTo())
                                cds.AddPStop(si->first);
                        } else {
                            // split pstop: the intron after this exon must be the stored one
                            // and the pstop must end within the next exon
                            if(i < (int)exons.size()-1) {
                                TSignedSeqRange intron(exons[i].GetTo(),exons[i+1].GetFrom());
                                if(intron == si->second && si->first.GetTo() <= exons[i+1].GetTo())
                                    cds.AddPStop(si->first);
                            }
                        }
                    }
                }
            }
            // the alignment is touched only when at least one pstop was added
            if(cds.PStop())
                algn.SetCdsInfo(cds);
        }
    }
}
2182
ScoreCdnas(CChainMembers & pointers)2183 void CChainer::CChainerImpl::ScoreCdnas(CChainMembers& pointers)
2184 {
2185 NON_CONST_ITERATE(CChainMembers, i, pointers) {
2186 SChainMember& mbr = **i;
2187 CGeneModel& algn = *mbr.m_align;
2188
2189 if(algn.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
2190 continue;
2191 if((algn.Type() & CGeneModel::eProt)!=0 || algn.ConfirmedStart())
2192 continue;
2193
2194 m_gnomon->GetScore(algn);
2195 double ms = GoodCDNAScore(algn);
2196 RemovePoorCds(algn,ms);
2197
2198 if(algn.Score() != BadScore())
2199 mbr.m_type = eCDS;
2200 }
2201 }
2202
2203
// Adds alternative copies of members in two passes.
//
// Pass 1 (short/weak CDSes): for every non-flexible, non-eRightUTR member
//        whose alignment has an empty protein reading frame and a score below
//        5*minscor.m_min, a CDS copy is inserted for each edge reading frame
//        that differs from the alignment's reading frame and, when the
//        alignment has a score at all, a copy without cds (UTR copy).
// Pass 2 (5' ends): for every member whose cds has a start and whose
//        MaxCdsLimits() are unbounded on the 5' side, the member's own cds is
//        restricted to the start and a copy is inserted whose cds has no
//        start and whose reading frame is extended towards the 5' end of the
//        alignment.
// Each pass examines only the members present when the pass begins.
void CChainer::CChainerImpl::Duplicate5pendsAndShortCDSes(CChainMembers& pointers)
{
    unsigned int initial_size = pointers.size();
    for(unsigned int i = 0; i < initial_size; ++i) {
        SChainMember& mbr = *pointers[i];
        CGeneModel& algn = *mbr.m_align;

        if(algn.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
            continue;

        if(mbr.m_type == eRightUTR)   // avoid copying UTR copies
            continue;

        if(algn.GetCdsInfo().ProtReadingFrame().Empty() && algn.Score() < 5*minscor.m_min) {
            // (inner 'i' shadows the outer index; 'mbr' and 'algn' are already bound)
            for(int i = 0; i < (int)algn.GetEdgeReadingFrames()->size(); ++i) {
                const CCDSInfo& cds_info = (*algn.GetEdgeReadingFrames())[i];
                if(cds_info.ReadingFrame() != algn.ReadingFrame()) {
                    // the new member keeps the address of this edge reading frame entry
                    pointers.InsertMemberCopyWithCds(cds_info, &mbr);    //copy with CDS
                }
            }

            if(algn.Score() != BadScore()) {
                pointers.InsertMemberCopyWithoutCds(&mbr); //UTR copy
            }
        }
    }


    // second pass also covers the copies inserted above
    initial_size = pointers.size();
    for(unsigned int i = 0; i < initial_size; ++i) {
        SChainMember& mbr = *pointers[i];
        CGeneModel& algn = *mbr.m_align;
        // the cds info is modified in place below, hence the const_cast
        CCDSInfo& acdsinfo = const_cast<CCDSInfo&>(*mbr.m_cds_info);

        if(acdsinfo.HasStart()) {
            // is the maximal cds unbounded on the 5' side (left on plus, right on minus)?
            bool inf_5prime;
            if (algn.Strand()==ePlus) {
                inf_5prime = acdsinfo.MaxCdsLimits().GetFrom()==TSignedSeqRange::GetWholeFrom();
            } else {
                inf_5prime = acdsinfo.MaxCdsLimits().GetTo()==TSignedSeqRange::GetWholeTo();
            }
            if (inf_5prime) {
                // snapshot of the cds before it is restricted; it becomes the cds of the new copy
                CCDSInfo cdsinfo = acdsinfo;

                // restrict the existing member to its start
                TSignedSeqPos start = (algn.Strand() == ePlus) ? acdsinfo.Start().GetFrom() : acdsinfo.Start().GetTo();
                acdsinfo.Set5PrimeCdsLimit(start);
                mbr.m_restricted_to_start = true;

                // the copy loses the start and gets its reading frame extended, keeping the
                // frame of the start, to a position near the 5' end of the alignment
                if(algn.Strand() == ePlus) {
                    int full_rf_left = algn.FShiftedMove(algn.Limits().GetFrom(),(algn.FShiftedLen(algn.Limits().GetFrom(), cdsinfo.Start().GetFrom(), false)-1)%3);
                    cdsinfo.SetStart(TSignedSeqRange::GetEmpty());
                    cdsinfo.SetScore(cdsinfo.Score(),false);
                    cdsinfo.SetReadingFrame(TSignedSeqRange(full_rf_left,cdsinfo.ReadingFrame().GetTo()));
                } else {
                    int full_rf_right = algn.FShiftedMove(algn.Limits().GetTo(),-(algn.FShiftedLen(cdsinfo.Start().GetTo(),algn.Limits().GetTo(),false)-1)%3);
                    cdsinfo.SetStart(TSignedSeqRange::GetEmpty());
                    cdsinfo.SetScore(cdsinfo.Score(),false);
                    cdsinfo.SetReadingFrame(TSignedSeqRange(cdsinfo.ReadingFrame().GetFrom(),full_rf_right));
                }

                // skip the insertion when the original member of the copy group on this
                // strand already has exactly this reading frame
                if(mbr.m_copy != 0) {
                    if(mbr.m_copy->front()->m_align->Strand() == algn.Strand()) {    // first copy is original alignment; for not oriented the second copy is reverse
                        if(mbr.m_copy->front()->m_cds_info->ReadingFrame() == cdsinfo.ReadingFrame())
                            continue;
                    } else if((*mbr.m_copy)[1]->m_cds_info->ReadingFrame() == cdsinfo.ReadingFrame()) {
                        continue;
                    }
                }

                // 'cdsinfo' is a local, so a copy of it is stored inside 'pointers'
                pointers.InsertMemberCopyAndStoreCds(cdsinfo, &mbr);
            }
        }

    }
}
2279
// Returns, in their original order, the indels of 'indels' that satisfy
// InDelEnd() > lim.GetFrom() and Loc() <= lim.GetTo().
TInDels StrictlyContainedInDels(const TInDels& indels, const TSignedSeqRange& lim)
{
    TInDels selected;
    for (TInDels::const_iterator it = indels.begin(); it != indels.end(); ++it) {
        // reject anything ending at or before the left limit or starting after the right one
        if (it->InDelEnd() <= lim.GetFrom() || it->Loc() > lim.GetTo())
            continue;
        selected.push_back(*it);
    }
    return selected;
}
2288
// Decides whether member 'mj' can be treated as contained in member 'mi'.
// Returns false as soon as one of the checks below fails, true otherwise:
//   - 'mi' must not be flexible;
//   - same strand, and the limits of 'mj' (flexible ends collapsed) must lie
//     within the limits of 'mi';
//   - a UTR 'mi' accepts only members of its own type;
//   - UTR-in-CDS overlap rules (5bp tolerance, special handling of flexible
//     polyA/cap ends);
//   - frameshifts of 'mj' must equal those of 'mi' selected by
//     StrictlyContainedInDels() for the limits of 'mj';
//   - CDS-in-CDS: compatible maximal cds limits, nested reading frames, same
//     frame;
//   - exon structure: every intron of 'mj' must coincide with an intron of
//     'mi' and the ends of 'mj' must not fall into introns of 'mi'.
bool CChainer::CChainerImpl::CanIncludeJinI(const SChainMember& mi, const SChainMember& mj) {
    const CGeneModel& ai = *mi.m_align;
    const CGeneModel& aj = *mj.m_align;

    // a flexible alignment never contains anything
    if(ai.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
        return false;

    // limits of aj with flexible ends collapsed to a single position
    bool jflex = false;
    TSignedSeqRange jlimits = aj.Limits();
    if(aj.Status()&CGeneModel::eLeftFlexible) {
        jlimits.SetFrom(jlimits.GetTo());
        jflex = true;
    }
    if(aj.Status()&CGeneModel::eRightFlexible) {
        jlimits.SetTo(jlimits.GetFrom());
        jflex = true;
    }

    if(aj.Strand() != ai.Strand() || !Include(ai.Limits(),jlimits))
        return false;

    if(mi.m_type != eCDS && mj.m_type != mi.m_type)
        return false;    // avoid including UTR copy and avoid including CDS into UTR because that will change m_type

    // full coding span (start + reading frame + stop) and maximal cds clipped to the alignment, for ai
    const CCDSInfo& ai_cds_info = *mi.m_cds_info;
    TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
    TSignedSeqRange ai_max_cds = ai_cds_info.MaxCdsLimits()&ai.Limits();

    // full coding span for aj
    const CCDSInfo& aj_cds_info = *mj.m_cds_info;
    TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();

    // UTR in CDS
    if(mi.m_type == eCDS && mj.m_type == eLeftUTR) {
        if(!jflex && jlimits.GetTo()-ai_max_cds.GetFrom() >= 5)                                     // normal UTR don't go into CDS > 5bp
            return false;
        else if(jflex && (aj.Status()&CGeneModel::ePolyA) && (!ai_cds_info.HasStop() || jlimits.GetTo()-ai_max_cds.GetFrom() >= 5)) // flex polyA needs stop and don't go into CDS > 5bp
            return false;
        else if(jflex && (aj.Status()&CGeneModel::eCap) && ai_cds_info.HasStop() && ai_max_cds.GetTo()-jlimits.GetTo() <= 5) // flex cap is allowed almost up to 3' UTR to be available if start moves
            return false;
    }
    // mirror image of the rules above for a UTR on the right side
    if(mi.m_type == eCDS && mj.m_type == eRightUTR) {
        if(!jflex && ai_max_cds.GetTo()-jlimits.GetFrom() >= 5)
            return false;
        else if(jflex && (aj.Status()&CGeneModel::ePolyA) && (!ai_cds_info.HasStop() || ai_max_cds.GetTo()-jlimits.GetFrom() >= 5))
            return false;
        else if(jflex && (aj.Status()&CGeneModel::eCap) && ai_cds_info.HasStop() && jlimits.GetFrom()-ai_max_cds.GetFrom() <= 5)
            return false;
    }

    if(aj.FrameShifts() != StrictlyContainedInDels(ai.FrameShifts(), aj.Limits()))  // not compatible frameshifts
        return false;

    if(mi.m_type == eCDS && mj.m_type == eCDS) { // CDS in CDS
        // the common maximal cds must cover the (one position widened) maximal cds of both alignments
        TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();
        if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
            return false;;
        if(!Include(ai_rf,aj_rf))
            return false;

        // different left ends: both must map onto the same frame of the edited sequence of ai
        if(ai_rf.GetFrom() != aj_rf.GetFrom()) {
            TSignedSeqPos j_from = mi.m_align_map->MapOrigToEdited(aj_rf.GetFrom());
            if(j_from < 0)
                return false;
            TSignedSeqPos i_from = mi.m_align_map->MapOrigToEdited(ai_rf.GetFrom());
            if(abs(j_from-i_from)%3 != 0)
                return false;
        }
    }

    int iex = ai.Exons().size();
    int jex = aj.Exons().size();
    if(jex > iex)
        return false;
    if(iex > 1) {                                      // big alignment is spliced
        // fex: first exon of ai that does not end before the left end of aj
        int fex = 0;
        while(fex < iex && ai.Exons()[fex].GetTo() < jlimits.GetFrom()) {
            ++fex;
        }
        if(ai.Exons()[fex].GetFrom() > jlimits.GetFrom())  // first aj exon is in ai intron
            return false;

        if(jlimits.GetLength() == 1)   // flexible alignment
            return true;

        if(iex-fex < jex)    // not enough exons left in ai
            return false;

        if(ai.Exons()[fex+jex-1].GetTo() < jlimits.GetTo())   // last aj exon is in ai intron
            return false;

        for(int j = 0; j < jex-1; ++j) {
            if(aj.Exons()[j].GetTo() != ai.Exons()[fex+j].GetTo() || aj.Exons()[j+1].GetFrom() != ai.Exons()[fex+j+1].GetFrom())  // different intron
                return false;
        }
    }

    return true;
}
2387
// Fills the contained lists of all members of 'pointers' and flags members
// that should not be used for chaining (m_not_for_chaining).
//
// Steps:
//   1. collect the exon ends adjacent to spliced introns of all alignments;
//   2. for every member compute m_rlimb / m_llimb from those exon ends;
//   3. sort 'pointers' with GenomeOrderD and, for every member mi, examine
//      the following members mj that start no later than mi ends:
//      mj is registered in mi through IncludeInContained() when
//      CanIncludeJinI(mi, mj) holds, and is additionally flagged as not for
//      chaining when none of the exemptions listed in the inner loop applies.
// Note: 'pointers' is reordered by this call.
void CChainer::CChainerImpl::FindContainedAlignments(TContained& pointers) {

    // exon boundaries next to introns that have both splice flags set
    set<int> left_exon_ends, right_exon_ends;
    ITERATE(TContained, ip, pointers) {
        const CGeneModel& algn = *(*ip)->m_align;
        for(int i = 1; i < (int)algn.Exons().size(); ++i) {
            if(algn.Exons()[i-1].m_ssplice && algn.Exons()[i].m_fsplice) {
                left_exon_ends.insert(algn.Exons()[i].GetFrom());
                right_exon_ends.insert(algn.Exons()[i-1].GetTo());
            }
        }
    }
    // m_rlimb: smallest collected right exon end that is >= the right limit of the alignment
    // m_llimb: smallest collected left exon end that is > the left limit of the alignment
    // both default to int max when no such end exists
    NON_CONST_ITERATE(TContained, ip, pointers) {
        SChainMember& mi = **ip;
        CGeneModel& ai = *mi.m_align;

        set<int>::iterator ri = right_exon_ends.lower_bound(ai.Limits().GetTo()); // leftmost compatible rexon
        mi.m_rlimb = numeric_limits<int>::max();
        if(ri != right_exon_ends.end())
            mi.m_rlimb = *ri;
        set<int>::iterator li = left_exon_ends.upper_bound(ai.Limits().GetFrom()); // leftmost not compatible lexon
        mi.m_llimb = numeric_limits<int>::max() ;
        if(li != left_exon_ends.end())
            mi.m_llimb = *li;
    }

    // finding contained subalignments (alignment is contained in itself) and selecting longer alignments for chaining

    sort(pointers.begin(),pointers.end(),GenomeOrderD());
    // jfirst: index of the first member of the current run of members with identical limits
    int jfirst = 0;
    for(int i = 0; i < (int)pointers.size(); ++i) {
        SChainMember& mi = *pointers[i];
        CGeneModel& ai = *mi.m_align;
        const CCDSInfo& ai_cds_info = *mi.m_cds_info;

        // knockdown spliced notconsensus UTRs in reads
        if(mi.m_type != eCDS && ai.Exons().size() > 1) {
            if(ai.Status()&CGeneModel::eUnknownOrientation) {
                mi.m_not_for_chaining = true;
            } else {
                // (inner 'i' shadows the outer index; 'mi' and 'ai' are already bound)
                for(int i = 1; i < (int)ai.Exons().size(); ++i) {
                    if(ai.Exons()[i-1].m_ssplice_sig == "XX" || ai.Exons()[i].m_fsplice_sig == "XX")
                        continue;
                    else if(ai.Strand() == ePlus && (ai.Exons()[i-1].m_ssplice_sig != "GT" || ai.Exons()[i].m_fsplice_sig != "AG"))
                        mi.m_not_for_chaining = true;
                    else if(ai.Strand() == eMinus && (ai.Exons()[i-1].m_ssplice_sig != "AG" || ai.Exons()[i].m_fsplice_sig != "GT"))
                        mi.m_not_for_chaining = true;
                }
            }
        }

        //don't use alignments intersection with frameshifts for hiding smaller alignments
        // intersect_with_fs: combined range of all known frameshifts that intersect an exon of ai
        // (the early 'break' presumably relies on all_frameshifts being sorted by location - confirm)
        TSignedSeqRange intersect_with_fs;
        ITERATE(TInDels, indl, all_frameshifts) {
            if(indl->InDelEnd() < ai.Limits().GetFrom())
                continue;
            else if(indl->Loc() > ai.Limits().GetTo()+1)
                break;
            else {
                ITERATE(CGeneModel::TExons, e, ai.Exons()) {
                    if(indl->IntersectingWith(e->GetFrom(), e->GetTo()))
                        intersect_with_fs += TSignedSeqRange(indl->Loc(), indl->InDelEnd());
                }
            }
        }

        // a new run of identical limits starts here
        if(pointers[jfirst]->m_align->Limits() != ai.Limits())
            jfirst = i;
        // candidates: members from the start of the run up to the last one starting within ai
        for(int j = jfirst; j < (int)pointers.size() && pointers[j]->m_align->Limits().GetFrom() <= ai.Limits().GetTo(); ++j) {

            if(i == j) {
                IncludeInContained(mi, mi);          // include self
                continue;
            }

            SChainMember& mj = *pointers[j];
            CGeneModel& aj = *mj.m_align;
            const CCDSInfo& aj_cds_info = *mj.m_cds_info;

            if(CanIncludeJinI(mi, mj))
                IncludeInContained(mi, mj);
            else
                continue;

            // from here on: decide whether the contained mj can be hidden from chaining

            // a member that is itself excluded does not hide others
            if(mi.m_not_for_chaining)
                continue;

            // mj must cover the frameshift region of ai to be hidden
            if(intersect_with_fs.NotEmpty() && !Include(aj.Limits(), intersect_with_fs))
                continue;

            if(mj.m_type != mi.m_type)
                continue;
            // alignments with polyA or cap status stay available
            if((aj.Status()&CGeneModel::ePolyA) != 0 || (aj.Status()&CGeneModel::eCap) != 0)
                continue;
            if((aj.Type()&CGeneModel::eProt) != 0)   // proteins (actually only gapped) should be directly available
                continue;
            // identical limits: neither hides the other
            if(ai.Limits() == aj.Limits())
                continue;
            if(mj.m_rlimb < ai.Limits().GetTo() || mj.m_llimb != mi.m_llimb)  // bigger alignment may interfere with splices
                continue;
            if(mi.m_type == eCDS && mj.m_type == eCDS && !Include(ai_cds_info.MaxCdsLimits(),aj_cds_info.MaxCdsLimits())) // bigger alignment restricts the cds
                continue;

            mj.m_not_for_chaining = true;
        }
    }
}
2495
// Amount subtracted from a CDS score for every spliced intron that lies inside the reading frame
// and has zero combined mRNA + EST + RNA-seq count. The visible users apply it only when has_rnaseq is set.
// NOTE(review): presumably expressed in the same units as FShiftedLen() (bases) - confirm.
#define NON_CDNA_INTRON_PENALTY 20
2497
LRCanChainItoJ(int & delta_cds,double & delta_num,double & delta_splice_num,SChainMember & mi,SChainMember & mj,TContained & contained)2498 bool CChainer::CChainerImpl::LRCanChainItoJ(int& delta_cds, double& delta_num, double& delta_splice_num, SChainMember& mi, SChainMember& mj, TContained& contained) {
2499
2500 const CGeneModel& ai = *mi.m_align;
2501 const CGeneModel& aj = *mj.m_align;
2502
2503
2504 if(aj.Strand() != ai.Strand())
2505 return false;
2506
2507 const CCDSInfo& ai_cds_info = *mi.m_cds_info;
2508 TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
2509 bool ai_left_complete = ai.Strand() == ePlus ? ai_cds_info.HasStart() : ai_cds_info.HasStop();
2510
2511 const CCDSInfo& aj_cds_info = *mj.m_cds_info;
2512 TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();
2513 bool aj_right_complete = aj.Strand() == ePlus ? aj_cds_info.HasStop() : aj_cds_info.HasStart();
2514
2515 bool j_rflexible = aj.Status()&CGeneModel::eRightFlexible;
2516 bool i_lflexible = ai.Status()&CGeneModel::eLeftFlexible;
2517 switch(mi.m_type) {
2518 case eCDS:
2519 if(mj.m_type == eRightUTR)
2520 return false;
2521 else if(mj.m_type == eLeftUTR && (!ai_left_complete || (!j_rflexible && (aj.Limits()&ai_rf).GetLength() > 5)))
2522 return false;
2523 else
2524 break;
2525 case eLeftUTR:
2526 if(mj.m_type != eLeftUTR)
2527 return false;
2528 else
2529 break;
2530 case eRightUTR:
2531 if(mj.m_type == eLeftUTR)
2532 return false;
2533 else if(mj.m_type == eCDS && (!aj_right_complete || (!i_lflexible && (ai.Limits()&aj_rf).GetLength() > 5)))
2534 return false;
2535 else
2536 break;
2537 default:
2538 return false;
2539 }
2540
2541 switch(ai.MutualExtension(aj)) {
2542 case 0: // not compatible
2543 return false;
2544 case 1: // no introns in intersection
2545 if(mi.m_type == eCDS && mj.m_type == eCDS) // no intersecting limit for coding
2546 break;
2547 if ((ai.Limits() & aj.Limits()).GetLength() < intersect_limit)
2548 return false;
2549 break;
2550 default: // one or more introns in intersection
2551 break;
2552 }
2553
2554 TSignedSeqRange overlap = (ai.Limits() & aj.Limits());
2555 if(StrictlyContainedInDels(ai.FrameShifts(), overlap) != StrictlyContainedInDels(aj.FrameShifts(), overlap)) // incompatible frameshifts
2556 return false;
2557
2558 int cds_overlap = 0;
2559
2560 if(mi.m_type == eCDS && mj.m_type == eCDS) {
2561 int genome_overlap = ai_rf.GetLength()+aj_rf.GetLength()-(ai_rf+aj_rf).GetLength();
2562 if(genome_overlap < 0)
2563 return false;
2564
2565 TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();
2566
2567 if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
2568 return false;
2569
2570 if((Include(ai_rf,aj_rf) || Include(aj_rf,ai_rf)) && ai_rf.GetFrom() != aj_rf.GetFrom() && ai_rf.GetTo() != aj_rf.GetTo())
2571 return false;
2572
2573 cds_overlap = mi.m_align_map->FShiftedLen(ai_rf&aj_rf,false);
2574 if(cds_overlap%3 != 0)
2575 return false;
2576
2577 if(ai_cds_info.HasStart() && aj_cds_info.HasStart())
2578 cds_overlap += START_BONUS;
2579
2580 if(has_rnaseq) {
2581 for(int i = 1; i < (int)ai.Exons().size(); ++i) {
2582 if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
2583 TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
2584 if(Include(ai_rf,intron) && Include(aj_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2585 cds_overlap -= NON_CDNA_INTRON_PENALTY;
2586 }
2587 }
2588 }
2589 }
2590 }
2591
2592 delta_cds = mi.m_cds-cds_overlap;
2593
2594 TContained::const_iterator endsp = contained.begin();
2595 if(!j_rflexible && !i_lflexible)
2596 endsp = upper_bound(contained.begin(), contained.end(), &mj, LeftOrder()); // first alignmnet contained in ai and outside aj
2597 delta_num = 0;
2598 delta_splice_num = 0;
2599 for(TContained::const_iterator ic = endsp; ic != contained.end(); ++ic) {
2600 delta_num += (*ic)->m_align->Weight();
2601 delta_splice_num += (*ic)->m_splice_weight;
2602 }
2603
2604 return true;
2605 }
2606
2607
LRIinit(SChainMember & mi)2608 void CChainer::CChainerImpl::LRIinit(SChainMember& mi) {
2609 const CCDSInfo& ai_cds_info = *mi.m_cds_info;
2610 TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
2611
2612 TContained micontained = mi.CollectContainedForMemeber();
2613 mi.m_num = 0;
2614 mi.m_splice_num = 0;
2615 for(auto p : micontained) {
2616 mi.m_num += p->m_align->Weight();
2617 mi.m_splice_num = p->m_splice_weight;
2618 }
2619
2620 const CGeneModel& ai = *mi.m_align;
2621 mi.m_cds = mi.m_align_map->FShiftedLen(ai_rf,false);
2622 if(ai_cds_info.HasStart()) {
2623 mi.m_cds += START_BONUS;
2624 _ASSERT((ai.Strand() == ePlus && ai_cds_info.Start().GetFrom() == ai_cds_info.MaxCdsLimits().GetFrom()) ||
2625 (ai.Strand() == eMinus && ai_cds_info.Start().GetTo() == ai_cds_info.MaxCdsLimits().GetTo()));
2626 }
2627
2628 if(has_rnaseq) {
2629 for(int i = 1; i < (int)ai.Exons().size(); ++i) {
2630 if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
2631 TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
2632 if(Include(ai_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2633 mi.m_cds -= NON_CDNA_INTRON_PENALTY;
2634 }
2635 }
2636 }
2637 }
2638
2639 mi.m_left_member = 0;
2640 mi.m_left_num = mi.m_num;
2641 mi.m_left_splice_num = mi.m_splice_num;
2642 mi.m_left_cds = mi.m_cds;
2643
2644 mi.m_gapped_connection = false;
2645 mi.m_fully_connected_to_part = -1;
2646 }
2647
LeftRight(TContained & pointers)2648 void CChainer::CChainerImpl::LeftRight(TContained& pointers)
2649 {
2650 sort(pointers.begin(),pointers.end(),LeftOrderD());
2651 TIVec right_ends(pointers.size());
2652 for(int k = 0; k < (int)pointers.size(); ++k) {
2653 auto& kalign = *pointers[k]->m_align;
2654 int rend = kalign.Limits().GetTo();
2655 if(kalign.Status()&CGeneModel::eRightFlexible)
2656 rend = kalign.Limits().GetFrom();
2657 right_ends[k] = rend;
2658 }
2659 NON_CONST_ITERATE(TContained, i, pointers) {
2660 SChainMember& mi = **i;
2661 CGeneModel& ai = *mi.m_align;
2662
2663 LRIinit(mi);
2664 TContained micontained = mi.CollectContainedForMemeber();
2665 sort(micontained.begin(),micontained.end(),LeftOrderD());
2666
2667 TIVec::iterator lb = lower_bound(right_ends.begin(),right_ends.end(),ai.Limits().GetFrom()-2*flex_len); // give some extra for flexible
2668 TContained::iterator jfirst = pointers.begin();
2669 if(lb != right_ends.end())
2670 jfirst = pointers.begin()+(lb-right_ends.begin()); // skip all on the left side
2671 for(TContained::iterator j = jfirst; j < i; ++j) {
2672 SChainMember& mj = **j;
2673 CGeneModel& aj = *mj.m_align;
2674 if(aj.Limits().GetTo() < ai.Limits().GetFrom()) // skip not overlapping (may exist because of flex_len)
2675 continue;
2676
2677 int delta_cds;
2678 double delta_num;
2679 double delta_splice_num;
2680 if(LRCanChainItoJ(delta_cds, delta_num, delta_splice_num, mi, mj, micontained)) {
2681 int newcds = mj.m_left_cds+delta_cds;
2682 double newnum = mj.m_left_num+delta_num;
2683 double newsplicenum = mj.m_left_splice_num+delta_splice_num;
2684
2685 bool better_connection = false;
2686 if(newcds != mi.m_left_cds) {
2687 better_connection = (newcds > mi.m_left_cds);
2688 } else if(fabs(newsplicenum - mi.m_left_splice_num) > 0.001) {
2689 better_connection = (newsplicenum > mi.m_left_splice_num);
2690 } else if(newnum > mi.m_left_num) {
2691 better_connection = true;
2692 }
2693
2694 if(better_connection) {
2695 mi.m_left_cds = newcds;
2696 mi.m_left_splice_num = newsplicenum;
2697 mi.m_left_num = newnum;
2698 mi.m_left_member = &mj;
2699 _ASSERT(((ai.Status()&CGeneModel::eLeftFlexible) || aj.Limits().GetFrom() < ai.Limits().GetFrom())
2700 && ((aj.Status()&CGeneModel::eRightFlexible) || aj.Limits().GetTo() < ai.Limits().GetTo()));
2701 }
2702 }
2703 }
2704 }
2705 }
2706
RightLeft(TContained & pointers)2707 void CChainer::CChainerImpl::RightLeft(TContained& pointers)
2708 {
2709 sort(pointers.begin(),pointers.end(),RightOrderD());
2710 TIVec left_ends(pointers.size());
2711 for(int k = 0; k < (int)pointers.size(); ++k) {
2712 auto& kalign = *pointers[k]->m_align;
2713 int lend = kalign.Limits().GetFrom();
2714 if(kalign.Status()&CGeneModel::eRightFlexible)
2715 lend = kalign.Limits().GetTo();
2716 left_ends[k] = lend;
2717 }
2718 NON_CONST_ITERATE(TContained, i, pointers) {
2719 SChainMember& mi = **i;
2720 CGeneModel& ai = *mi.m_align;
2721 const CCDSInfo& ai_cds_info = *mi.m_cds_info;
2722 TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
2723 TSignedSeqRange ai_limits = ai.Limits();
2724 bool ai_right_complete = ai.Strand() == ePlus ? ai_cds_info.HasStop() : ai_cds_info.HasStart();
2725
2726 mi.m_right_member = 0;
2727 mi.m_right_num = mi.m_num;
2728 mi.m_right_splice_num = mi.m_splice_num;
2729 mi.m_right_cds = mi.m_cds;
2730 TContained micontained = mi.CollectContainedForMemeber();
2731 sort(micontained.begin(),micontained.end(),RightOrderD());
2732
2733 TIVec::iterator lb = lower_bound(left_ends.begin(),left_ends.end(),ai.Limits().GetTo()+2*flex_len,greater<int>()); // first potentially intersecting
2734 TContained::iterator jfirst = pointers.begin();
2735 if(lb != left_ends.end())
2736 jfirst = pointers.begin()+(lb-left_ends.begin()); // skip all on the right side
2737 for(TContained::iterator j = jfirst; j < i; ++j) {
2738 SChainMember& mj = **j;
2739 CGeneModel& aj = *mj.m_align;
2740
2741 if(aj.Strand() != ai.Strand())
2742 continue;
2743 if(aj.Limits().GetFrom() > ai.Limits().GetTo()) // skip not overlapping (may exist because of flex_len)
2744 continue;
2745
2746 const CCDSInfo& aj_cds_info = *mj.m_cds_info;
2747 TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();
2748 bool aj_left_complete = aj.Strand() == ePlus ? aj_cds_info.HasStart() : aj_cds_info.HasStop();
2749
2750 bool j_lflexible = aj.Status()&CGeneModel::eLeftFlexible;
2751 bool i_rflexible = ai.Status()&CGeneModel::eRightFlexible;
2752 switch(mi.m_type)
2753 {
2754 case eCDS:
2755 if(mj.m_type == eLeftUTR)
2756 continue;
2757 if(mj.m_type == eRightUTR && (!ai_right_complete || (!j_lflexible && (aj.Limits()&ai_rf).GetLength() > 5)))
2758 continue;
2759 else
2760 break;
2761 case eRightUTR:
2762 if(mj.m_type != eRightUTR)
2763 continue;
2764 else
2765 break;
2766 case eLeftUTR:
2767 if(mj.m_type == eRightUTR)
2768 continue;
2769 if(mj.m_type == eCDS && (!aj_left_complete || (!i_rflexible && (ai.Limits()&aj_rf).GetLength() > 5)))
2770 continue;
2771 else
2772 break;
2773 default:
2774 continue;
2775 }
2776
2777 switch(ai.MutualExtension(aj))
2778 {
2779 case 0: // not compatible
2780 continue;
2781 case 1: // no introns in intersection
2782 {
2783 if(mi.m_type == eCDS && mj.m_type == eCDS) // no intersecting limit for coding
2784 break;
2785
2786 int intersect = (ai_limits & aj.Limits()).GetLength();
2787 if(intersect < intersect_limit) continue;
2788 break;
2789 }
2790 default: // one or more introns in intersection
2791 break;
2792 }
2793
2794 TSignedSeqRange overlap = (ai.Limits() & aj.Limits());
2795 if(StrictlyContainedInDels(ai.FrameShifts(), overlap) != StrictlyContainedInDels(aj.FrameShifts(), overlap)) // incompatible frameshifts
2796 continue;
2797
2798 int cds_overlap = 0;
2799
2800 if(mi.m_type == eCDS && mj.m_type == eCDS) {
2801 int genome_overlap = ai_rf.GetLength()+aj_rf.GetLength()-(ai_rf+aj_rf).GetLength();
2802 if(genome_overlap < 0)
2803 continue;
2804
2805 TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();
2806
2807 if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
2808 continue;
2809
2810 if((Include(ai_rf,aj_rf) || Include(aj_rf,ai_rf)) && ai_rf.GetFrom() != aj_rf.GetFrom() && ai_rf.GetTo() != aj_rf.GetTo())
2811 continue;
2812
2813 cds_overlap = mi.m_align_map->FShiftedLen(ai_rf&aj_rf,false);
2814 if(cds_overlap%3 != 0)
2815 continue;
2816
2817 if(ai_cds_info.HasStart() && aj_cds_info.HasStart())
2818 cds_overlap += START_BONUS;
2819
2820 if(has_rnaseq) {
2821 for(int i = 1; i < (int)ai.Exons().size(); ++i) {
2822 if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
2823 TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
2824 if(Include(ai_rf,intron) && Include(aj_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2825 cds_overlap -= NON_CDNA_INTRON_PENALTY;
2826 }
2827 }
2828 }
2829 }
2830 }
2831
2832
2833 int delta_cds = mi.m_cds-cds_overlap;
2834 int newcds = mj.m_right_cds+delta_cds;
2835
2836 TContained::iterator endsp = micontained.begin();
2837 if(!j_lflexible && !i_rflexible)
2838 endsp = upper_bound(micontained.begin(),micontained.end(),&mj,RightOrder()); // first alignment contained in ai and outside aj
2839 double delta_num = 0;
2840 double delta_splice_num = 0;
2841 for(TContained::iterator ic = endsp; ic != micontained.end(); ++ic) {
2842 delta_num += (*ic)->m_align->Weight();
2843 delta_splice_num += (*ic)->m_splice_weight;
2844 }
2845 double newnum = mj.m_right_num+delta_num;
2846 double newsplicenum = mj.m_right_splice_num+delta_splice_num;
2847
2848 bool better_connection = false;
2849 if(newcds != mi.m_right_cds) {
2850 better_connection = (newcds > mi.m_right_cds);
2851 } else if(fabs(newsplicenum - mi.m_right_splice_num) > 0.001) {
2852 better_connection = (newsplicenum > mi.m_right_splice_num);
2853 } else if(newnum > mi.m_right_num) {
2854 better_connection = true;
2855 }
2856
2857 if(better_connection) {
2858 mi.m_right_cds = newcds;
2859 mi.m_right_splice_num = newsplicenum;
2860 mi.m_right_num = newnum;
2861 mi.m_right_member = &mj;
2862 _ASSERT(((aj.Status()&CGeneModel::eLeftFlexible) || aj.Limits().GetFrom() > ai.Limits().GetFrom())
2863 && ((ai.Status()&CGeneModel::eRightFlexible) || aj.Limits().GetTo() > ai.Limits().GetTo()));
2864 }
2865 }
2866 }
2867 }
2868
2869
2870
2871
2872 #include <stdio.h>
2873 #include <time.h>
2874 /*
2875 time_t seconds0 = time (NULL);
2876 time_t seconds1 = time (NULL);
2877 cerr << "Time1: " << (seconds1-seconds0)/60. << endl;
2878 */
2879
2880
MemberIsCoding(const SChainMember * mp)2881 bool MemberIsCoding(const SChainMember* mp) {
2882 return (mp->m_cds_info->Score() != BadScore());
2883 }
2884
MemberIsMarkedForDeletion(const SChainMember * mp)2885 bool MemberIsMarkedForDeletion(const SChainMember* mp) {
2886 return mp->m_marked_for_deletion;
2887 }
2888
2889 // returns essential members of the chain for debugging
GetLinkedIdsForMember(const SChainMember & mi)2890 string GetLinkedIdsForMember(const SChainMember& mi) {
2891 vector<const SChainMember*> mal;
2892 mal.push_back(&mi);
2893 for (SChainMember* left = mi.m_left_member; left != 0; left = left->m_left_member) {
2894 mal.push_back(left);
2895 }
2896 for (SChainMember* right = mi.m_right_member; right != 0; right = right->m_right_member) {
2897 mal.push_back(right);
2898 }
2899 sort(mal.begin(),mal.end(),GenomeOrderD());
2900 string note = to_string(mi.m_align->ID()); //+":"+to_string(mi.m_mem_id);;
2901 ITERATE(vector<const SChainMember*>, imal, mal) {
2902 note = note+" "+to_string((*imal)->m_align->ID()); //+":"+to_string((*imal)->m_mem_id);
2903 }
2904 return note;
2905 }
2906
GoodSupportForIntrons(const CGeneModel & chain,const SMinScor & minscor,map<TSignedSeqRange,int> & mrna_count,map<TSignedSeqRange,int> & est_count,map<TSignedSeqRange,int> & rnaseq_count)2907 bool GoodSupportForIntrons(const CGeneModel& chain, const SMinScor& minscor,
2908 map<TSignedSeqRange,int>& mrna_count, map<TSignedSeqRange,int>& est_count, map<TSignedSeqRange,int>& rnaseq_count) {
2909 bool good = true;
2910 for(int i = 1; i < (int)chain.Exons().size() && good; ++i) {
2911 if(chain.Exons()[i-1].m_ssplice && chain.Exons()[i].m_fsplice) {
2912 TSignedSeqRange intron(chain.Exons()[i-1].Limits().GetTo(),chain.Exons()[i].Limits().GetFrom());
2913 if(mrna_count[intron] < minscor.m_minsupport_mrna && mrna_count[intron]+est_count[intron] < minscor.m_minsupport && rnaseq_count[intron] < minscor.m_minsupport_rnaseq)
2914 good = false;
2915 }
2916 }
2917
2918 return good;
2919 }
2920
MarkUnwantedLowSupportIntrons(TContained & pointers,const SMinScor & minscor,map<TSignedSeqRange,int> & mrna_count,map<TSignedSeqRange,int> & est_count,map<TSignedSeqRange,int> & rnaseq_count)2921 void MarkUnwantedLowSupportIntrons(TContained& pointers, const SMinScor& minscor,
2922 map<TSignedSeqRange,int>& mrna_count, map<TSignedSeqRange,int>& est_count, map<TSignedSeqRange,int>& rnaseq_count) {
2923
2924 NON_CONST_ITERATE(TContained, i, pointers)
2925 (*i)->m_marked_for_deletion = !GoodSupportForIntrons(*(*i)->m_align, minscor, mrna_count, est_count, rnaseq_count);
2926 }
2927
2928 struct GModelOrder
2929 {
GModelOrderGModelOrder2930 GModelOrder(TOrigAligns& oa) : orig_aligns(oa) {}
2931
2932 TOrigAligns& orig_aligns;
2933
operator ()GModelOrder2934 bool operator()(const CGeneModel& a, const CGeneModel& b)
2935 {
2936 if(a.Limits() != b.Limits())
2937 return a.Limits() < b.Limits();
2938 else
2939 return *orig_aligns[a.ID()]->GetTargetId() < *orig_aligns[ b.ID()]->GetTargetId(); // to make sort deterministic
2940 }
2941 };
2942
2943
MakeChains(TGeneModelList & clust,bool coding_estimates_only)2944 TGeneModelList CChainer::CChainerImpl::MakeChains(TGeneModelList& clust, bool coding_estimates_only)
2945 {
2946 if(clust.empty()) return TGeneModelList();
2947
2948 clust.sort(GModelOrder(orig_aligns));
2949
2950 {
2951 map<tuple<int, int>, TGeneModelList::iterator> special_aligns; // [left/right flex|cap/polya, position]
2952 //all known flexible
2953 for(TGeneModelList::iterator it = clust.begin(); it != clust.end(); ++it) {
2954 if(it->Status()&CGeneModel::eLeftFlexible) {
2955 int status = it->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eCap|CGeneModel::ePolyA);
2956 special_aligns.emplace(make_tuple(status, it->Limits().GetTo()), it);
2957 }
2958 if(it->Status()&CGeneModel::eRightFlexible) {
2959 int status = it->Status()&(CGeneModel::eRightFlexible|CGeneModel::eCap|CGeneModel::ePolyA);
2960 special_aligns.emplace(make_tuple(status, it->Limits().GetFrom()), it);
2961 }
2962 }
2963 //make flexible from normal cap/polya
2964 int contig_len = m_gnomon->GetSeq().size();
2965 int spec_extend = SPECIAL_ALIGN_LEN-1;
2966 for(TGeneModelList::iterator it = clust.begin(); it != clust.end(); ++it) {
2967 if(it->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
2968 continue;
2969
2970 if(it->Status()&CGeneModel::eCap) {
2971 it->Status() &= ~CGeneModel::eCap;
2972 CGeneModel galign(it->Strand(), it->ID(), CGeneModel::eSR);
2973 galign.SetWeight(it->Weight());
2974
2975 int pos;
2976 int status = CGeneModel::eCap;
2977 if(it->Strand() == ePlus) {
2978 pos = it->Limits().GetFrom();
2979 galign.AddExon(TSignedSeqRange(pos, pos+spec_extend));
2980 status |= CGeneModel::eRightFlexible;
2981 } else {
2982 pos = it->Limits().GetTo();
2983 galign.AddExon(TSignedSeqRange(pos-spec_extend, pos));
2984 status |= CGeneModel::eLeftFlexible;
2985 }
2986 if(galign.Limits().GetFrom() >= 0 && galign.Limits().GetTo() < contig_len) {
2987 galign.Status() |= status;
2988 clust.push_front(galign);
2989 auto rslt = special_aligns.emplace(make_tuple(status, pos), clust.begin());
2990 if(!rslt.second) { //this position already exists
2991 auto ialign = rslt.first->second;
2992 ialign->SetWeight(ialign->Weight()+galign.Weight());
2993 clust.pop_front();
2994 }
2995 }
2996 }
2997 if(it->Status()&CGeneModel::ePolyA) {
2998 it->Status() &= ~CGeneModel::ePolyA;
2999 CGeneModel galign(it->Strand(), it->ID(), CGeneModel::eSR);
3000 galign.SetWeight(it->Weight());
3001
3002 int pos;
3003 int status = CGeneModel::ePolyA;
3004 if(it->Strand() == eMinus) {
3005 pos = it->Limits().GetFrom();
3006 galign.AddExon(TSignedSeqRange(pos, pos+spec_extend));
3007 status |= CGeneModel::eRightFlexible;
3008 } else {
3009 pos = it->Limits().GetTo();
3010 galign.AddExon(TSignedSeqRange(pos-spec_extend, pos));
3011 status |= CGeneModel::eLeftFlexible;
3012 }
3013 if(galign.Limits().GetFrom() >= 0 && galign.Limits().GetTo() < contig_len) {
3014 galign.Status() |= status;
3015 clust.push_front(galign);
3016 auto rslt = special_aligns.emplace(make_tuple(status, pos), clust.begin());
3017 if(!rslt.second) { //this position already exists
3018 auto ialign = rslt.first->second;
3019 ialign->SetWeight(ialign->Weight()+galign.Weight());
3020 clust.pop_front();
3021 }
3022 }
3023 }
3024 }
3025
3026 //remove below threshold and crossing contig boundaries
3027 for(auto& sa : special_aligns) {
3028 auto ialign = sa.second;
3029 double min_pos_weight = ((ialign->Status()&CGeneModel::eCap) ? min_cap_weight : min_polya_weight);
3030 if(ialign->Limits().GetFrom() < 0 || ialign->Limits().GetTo() >= contig_len || ialign->Weight() < min_pos_weight)
3031 clust.erase(ialign);
3032 }
3033
3034 clust.sort(GModelOrder(orig_aligns));
3035 }
3036
3037 confirmed_ends.clear();
3038 ITERATE (TGeneModelList, it, clust) {
3039 const CGeneModel& align = *it;
3040 if(use_confirmed_ends) {
3041 if(align.Status()&CGeneModel::eLeftConfirmed) {
3042 auto rslt = confirmed_ends.emplace(align.Exons().front().GetTo(), align.Exons().front().GetFrom());
3043 if(!rslt.second)
3044 rslt.first->second = min(rslt.first->second, align.Exons().front().GetFrom());
3045 }
3046 if(align.Status()&CGeneModel::eRightConfirmed) {
3047 auto rslt = confirmed_ends.emplace(align.Exons().back().GetFrom(), align.Exons().back().GetTo());
3048 if(!rslt.second)
3049 rslt.first->second = max(rslt.first->second, align.Exons().back().GetTo());
3050 }
3051 }
3052 all_frameshifts.insert(all_frameshifts.end(), align.FrameShifts().begin(), align.FrameShifts().end());
3053 for(int i = 1; i < (int)align.Exons().size(); ++i) {
3054 if(align.Exons()[i-1].m_ssplice && align.Exons()[i].m_fsplice) {
3055 TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
3056
3057 if((align.Status()&CGeneModel::eUnknownOrientation) == 0) {
3058 if(align.Strand() == ePlus)
3059 oriented_introns_plus.insert(intron);
3060 else
3061 oriented_introns_minus.insert(intron);
3062 }
3063
3064 if(align.Type() == CGeneModel::emRNA)
3065 mrna_count[intron] += align.Weight();
3066 else if(align.Type() == CGeneModel::eEST)
3067 est_count[intron] += align.Weight();
3068 else if(align.Type() == CGeneModel::eSR)
3069 rnaseq_count[intron] += align.Weight();
3070 }
3071 }
3072 }
3073
3074 has_rnaseq = !rnaseq_count.empty();
3075 sort(all_frameshifts.begin(),all_frameshifts.end());
3076 if(!all_frameshifts.empty())
3077 uniq(all_frameshifts);
3078
3079 flex_len = 0;
3080 NON_CONST_ITERATE (TGeneModelList, it, clust) {
3081 CGeneModel& align = *it;
3082 if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3083 flex_len = max(flex_len, align.Limits().GetLength());
3084
3085 if(align.Status()&CGeneModel::eUnknownOrientation) {
3086 int pluses = 0;
3087 int minuses = 0;
3088 for(int i = 1; i < (int)align.Exons().size(); ++i) {
3089 if(align.Exons()[i-1].m_ssplice && align.Exons()[i].m_fsplice) {
3090 TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
3091 if(oriented_introns_plus.find(intron) != oriented_introns_plus.end())
3092 ++pluses;
3093 if(oriented_introns_minus.find(intron) != oriented_introns_minus.end())
3094 ++minuses;
3095 }
3096 }
3097 if(pluses > 0 && minuses == 0) {
3098 align.Status() ^= CGeneModel::eUnknownOrientation;
3099 if(align.Strand() == eMinus)
3100 align.ReverseComplementModel();
3101 } else if(minuses > 0 && pluses == 0) {
3102 align.Status() ^= CGeneModel::eUnknownOrientation;
3103 if(align.Strand() == ePlus)
3104 align.ReverseComplementModel();
3105 }
3106 align.Status() &= ~CGeneModel::eReversed;
3107 }
3108 }
3109
3110
3111 CChainMembers allpointers(clust, orig_aligns, unmodified_aligns);
3112
3113 DuplicateNotOriented(allpointers, clust);
3114 ReplicatePStops(allpointers);
3115 ScoreCdnas(allpointers);
3116 Duplicate5pendsAndShortCDSes(allpointers);
3117 DuplicateUTRs(allpointers);
3118 CalculateSpliceWeights(allpointers);
3119 FindContainedAlignments(allpointers);
3120
3121 TContained pointers;
3122 ITERATE(TContained, ip, allpointers) {
3123 _ASSERT((*ip)->m_orig_align);
3124 if(!(*ip)->m_not_for_chaining)
3125 pointers.push_back(*ip);
3126 }
3127
3128 TContained coding_pointers;
3129 ITERATE(CChainMembers, i, pointers) {
3130 if(MemberIsCoding(*i))
3131 coding_pointers.push_back(*i);
3132 }
3133
3134 LeftRight(coding_pointers);
3135 RightLeft(coding_pointers);
3136
3137 TChainList tmp_chains;
3138
3139 set<tuple<int,int,int>> coding_splices; // position, strand, donor/acceptor
3140
3141 NON_CONST_ITERATE(TContained, i, coding_pointers) {
3142 SChainMember& mi = **i;
3143 mi.m_cds = mi.m_left_cds+mi.m_right_cds-mi.m_cds;
3144 mi.m_splice_num = mi.m_left_splice_num+mi.m_right_splice_num-mi.m_splice_num;
3145 mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
3146 }
3147 sort(coding_pointers.begin(),coding_pointers.end(),CdsNumOrder());
3148 NON_CONST_ITERATE(TContained, i, coding_pointers) {
3149 SChainMember& mi = **i;
3150
3151 if(mi.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3152 continue;
3153
3154 if(mi.m_included)
3155 continue;
3156
3157 CChain chain(mi, 0, coding_estimates_only);
3158 TSignedSeqRange i_rf = chain.ReadingFrame();
3159
3160 m_gnomon->GetScore(chain, coding_estimates_only);
3161 mi.MarkIncludedForChain();
3162 if(chain.Score() == BadScore())
3163 continue;
3164
3165 if(coding_estimates_only) {
3166 if(chain.GetCdsInfo().ProtReadingFrame().NotEmpty() || chain.Score() > 30 || chain.FShiftedLen(chain.GetCdsInfo().Cds()) >= 300) {
3167 chain.SetID(m_idnext);
3168 chain.SetGeneID(m_idnext);
3169 m_idnext += m_idinc;
3170 tmp_chains.push_back(chain);
3171 }
3172
3173 continue;
3174 } else {
3175 if(chain.Score() == BadScore() || chain.PStop(false))
3176 continue;
3177
3178 int cdslen = chain.FShiftedLen(chain.GetCdsInfo().Cds(),true);
3179 if(chain.GetCdsInfo().ProtReadingFrame().Empty() &&
3180 (cdslen < minscor.m_minlen || (chain.Score() < 2*minscor.m_min && cdslen < 2*minscor.m_cds_len)))
3181 continue;
3182
3183 TSignedSeqRange real_cds = chain.RealCdsLimits();
3184 for(int i = 1; i < (int)chain.Exons().size(); ++ i) {
3185 int donor = chain.Exons()[i-1].GetTo();
3186 if(Include(real_cds, donor))
3187 coding_splices.emplace(donor, chain.Strand(), 0);
3188 int acceptor = chain.Exons()[i].GetFrom();
3189 if(Include(real_cds, acceptor))
3190 coding_splices.emplace(acceptor, chain.Strand(), 1);
3191 }
3192
3193 TSignedSeqRange n_rf = chain.ReadingFrame();
3194 if(!i_rf.IntersectingWith(n_rf))
3195 continue;
3196 int a,b;
3197 if(n_rf.GetFrom() <= i_rf.GetFrom()) {
3198 a = n_rf.GetFrom();
3199 b = i_rf.GetTo();
3200 } else {
3201 a = i_rf.GetFrom();
3202 b = n_rf.GetTo();
3203 }
3204 if(chain.FShiftedLen(a,b,true)%3 != 0)
3205 continue;
3206
3207 mi.MarkUnwantedCopiesForChain(chain.RealCdsLimits());
3208 }
3209 }
3210
3211 for(auto ip : pointers) {
3212 if(ip->m_align->Type()&CGeneModel::eSR)
3213 continue;
3214
3215 TSignedSeqRange cds = ip->m_cds_info->Cds();
3216 int strand = ip->m_align->Strand();
3217 for(int i = 1; i < (int)ip->m_align->Exons().size(); ++ i) {
3218 int donor = ip->m_align->Exons()[i-1].GetTo();
3219 if(coding_splices.count(make_tuple(donor, ip->m_align->Strand(), 0)) && !Include(cds, donor)) {
3220 if(ip->m_restricted_to_start && ((strand == ePlus && donor < cds.GetFrom()) || (strand == eMinus && donor > cds.GetTo())))
3221 continue;
3222 ip->m_marked_for_deletion = true;
3223 break;
3224 }
3225 int acceptor = ip->m_align->Exons()[i].GetFrom();
3226 if(coding_splices.count(make_tuple(acceptor, ip->m_align->Strand(), 1)) && !Include(cds, acceptor)) {
3227 if(ip->m_restricted_to_start && ((strand == ePlus && acceptor < cds.GetFrom()) || (strand == eMinus && acceptor > cds.GetTo())))
3228 continue;
3229 ip->m_marked_for_deletion = true;
3230 break;
3231 }
3232 }
3233 }
3234
3235 if(coding_estimates_only) {
3236 TGeneModelList chains;
3237 ITERATE(TChainList, it, tmp_chains) {
3238 chains.push_back(*it);
3239 CGeneModel& chain = chains.back();
3240 int introns = 0;
3241 int weight = 0;
3242 for(int i = 1; i < (int)chain.Exons().size(); ++i) {
3243 if(chain.Exons()[i-1].m_ssplice && chain.Exons()[i].m_fsplice) {
3244 TSignedSeqRange intron(chain.Exons()[i-1].Limits().GetTo(),chain.Exons()[i].Limits().GetFrom());
3245 weight += rnaseq_count[intron];
3246 ++introns;
3247 }
3248 }
3249 for(int i = 1; i < (int)chain.Exons().size(); ++i) {
3250 if(chain.Exons()[i-1].m_ssplice && chain.Exons()[i].m_fsplice) {
3251 TSignedSeqRange intron(chain.Exons()[i-1].Limits().GetTo(),chain.Exons()[i].Limits().GetFrom());
3252 if(rnaseq_count[intron] < weight/introns/5) {
3253 chain.SetSplices(i-1, chain.Exons()[i-1].m_fsplice_sig, "WL"); // set weak link
3254 chain.SetSplices(i, "WL", chain.Exons()[i].m_ssplice_sig); // set weak link
3255 }
3256 }
3257 }
3258 }
3259
3260 return chains;
3261 }
3262
3263 pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsMarkedForDeletion),pointers.end()); // wrong orientaition/UTR/frames are removed
3264
3265 LeftRight(pointers);
3266 RightLeft(pointers);
3267 NON_CONST_ITERATE(TContained, i, pointers) {
3268 SChainMember& mi = **i;
3269 mi.m_included = false;
3270 mi.m_cds = mi.m_left_cds+mi.m_right_cds-mi.m_cds;
3271 mi.m_splice_num = mi.m_left_splice_num+mi.m_right_splice_num-mi.m_splice_num;
3272 mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
3273 }
3274
3275 sort(pointers.begin(),pointers.end(),CdsNumOrder());
3276
3277 NON_CONST_ITERATE(TContained, i, pointers) {
3278 SChainMember& mi = **i;
3279
3280 if(mi.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3281 continue;
3282
3283 if(mi.m_included || mi.m_postponed) continue;
3284
3285 CChain chain(mi);
3286 mi.MarkPostponedForChain();
3287
3288 if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3289 continue;
3290 m_gnomon->GetScore(chain);
3291 if(chain.Score() == BadScore() || (chain.GetCdsInfo().Cds()&chain.m_supported_range).Empty())
3292 continue;
3293
3294 chain.RemoveFshiftsFromUTRs();
3295 chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns);
3296 const CResidueVec& contig = m_gnomon->GetSeq();
3297 // alignments clipped below might not be in any chain; clipping may produce redundant chains
3298 chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak);
3299 chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage);
3300 chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold);
3301 if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3302 continue;
3303 m_gnomon->GetScore(chain, !no5pextension); // this will return CDS to best/longest depending on no5pextension
3304 chain.CheckSecondaryCapPolyAEnds();
3305
3306 double ms = GoodCDNAScore(chain);
3307
3308 bool has_trusted = chain.HasTrustedEvidence(orig_aligns);
3309
3310 if(!has_trusted)
3311 RemovePoorCds(chain,ms);
3312 if(chain.Score() != BadScore() && (has_trusted || chain.RealCdsLen() >= minscor.m_minlen)) {
3313 mi.MarkIncludedForChain();
3314
3315 #ifdef _DEBUG
3316 chain.AddComment("Link1 "+GetLinkedIdsForMember(mi));
3317 #endif
3318 chain.CalculateDropLimits();
3319 tmp_chains.push_back(chain);
3320 _ASSERT( chain.FShiftedLen(chain.GetCdsInfo().Start()+chain.ReadingFrame()+chain.GetCdsInfo().Stop(), false)%3==0 );
3321 }
3322 }
3323
3324 TGeneModelList unma_aligns;
3325 CChainMembers unma_members;
3326 CreateChainsForPartialProteins(tmp_chains, pointers, unma_aligns, unma_members);
3327
3328
3329 pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsCoding),pointers.end()); // only noncoding left
3330
3331 MarkUnwantedLowSupportIntrons(pointers, minscor, mrna_count, est_count, rnaseq_count);
3332 pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsMarkedForDeletion),pointers.end()); // low support introns removed
3333
3334 // convert all flexible to left UTRs; copy contained flexible from right UTRs to left UTRs; remove right UTRs
3335 for(auto i : allpointers) {
3336 SChainMember& mi = *i;
3337 if(mi.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) {
3338 mi.m_type = eLeftUTR;
3339 } else if(mi.m_type == eLeftUTR) {
3340 if(mi.m_copy != nullptr) {
3341 for(auto j : *mi.m_copy) {
3342 if(j->m_type == eRightUTR && j->m_align->Strand() == mi.m_align->Strand()) {
3343 for(auto jc : *j->m_contained) {
3344 if(jc->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3345 mi.m_contained->push_back(jc);
3346 }
3347 }
3348 }
3349 }
3350 }
3351 }
3352 pointers.erase(std::remove_if(pointers.begin(),pointers.end(),[](SChainMember* p){ return p->m_type == eRightUTR; }), pointers.end());
3353
3354 LeftRight(pointers);
3355 RightLeft(pointers);
3356
3357 ITERATE(TContained, i, pointers) {
3358 SChainMember& mi = **i;
3359 mi.m_splice_num = mi.m_left_splice_num+mi.m_right_splice_num-mi.m_splice_num;
3360 mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
3361 _ASSERT(mi.m_cds == 0);
3362 }
3363
3364 sort(pointers.begin(),pointers.end(),CdsNumOrder());
3365
3366 NON_CONST_ITERATE(TContained, i, pointers) {
3367 SChainMember& mi = **i;
3368 if(mi.m_included)
3369 continue;
3370
3371 if(mi.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3372 continue;
3373
3374 CChain chain(mi);
3375 if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3376 continue;
3377
3378 chain.RemoveFshiftsFromUTRs();
3379 mi.MarkIncludedForChain();
3380 const CResidueVec& contig = m_gnomon->GetSeq();
3381 chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak);
3382 chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage);
3383 chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold);
3384 if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3385 continue;
3386 if(chain.Continuous() && chain.Exons().size() > 1) {
3387 #ifdef _DEBUG
3388 chain.AddComment("Link2 "+GetLinkedIdsForMember(mi));
3389 #endif
3390 chain.CalculateDropLimits();
3391 tmp_chains.push_back(chain);
3392 }
3393 }
3394
3395 NON_CONST_ITERATE(TChainList, it, tmp_chains) {
3396 CChain& chain = *it;
3397 chain.SetID(m_idnext);
3398 chain.SetGeneID(m_idnext);
3399 m_idnext += m_idinc;
3400 }
3401
3402 CombineCompatibleChains(tmp_chains);
3403 SetFlagsForChains(tmp_chains);
3404
3405 list<CGene> genes = FindGenes(tmp_chains); // assigns geneid, rank, skip, nested
3406
3407 if(genes.size() > 1) {
3408 TrimAlignmentsIncludedInDifferentGenes(genes);
3409 CombineCompatibleChains(tmp_chains);
3410 SetFlagsForChains(tmp_chains);
3411 }
3412
3413 if(genes.size() > 1)
3414 FindGenes(tmp_chains); // redo genes after trim
3415
3416
3417 TGeneModelList chains;
3418 NON_CONST_ITERATE(TChainList, it, tmp_chains) {
3419 it->RestoreTrimmedEnds(trim);
3420 chains.push_back(*it);
3421 }
3422
3423 enum { eFirstPeak = 1, eSecondPeak = 2, eThirdPeak = 4, eAs = 8};
3424 map<tuple<int, int, int>, int> cap_polya_info; // [cap/polya strand position]
3425 const CResidueVec& contig = m_gnomon->GetSeq();
3426 for(auto& chain : tmp_chains) {
3427 if(chain.Status()&CGeneModel::eSkipped)
3428 continue;
3429 if(chain.Status()&CGeneModel::eCap) {
3430 for(int i = 0; i < (int)chain.m_cap_peaks.size(); ++i) {
3431 int pos = chain.m_cap_peaks[i];
3432 if(pos >= 0)
3433 cap_polya_info[make_tuple(CGeneModel::eCap, chain.Strand(), pos)] |= (1 << i);
3434 }
3435 }
3436 if(chain.Status()&CGeneModel::ePolyA) {
3437 for(int i = 0; i < (int)chain.m_polya_peaks.size(); ++i) {
3438 int pos = chain.m_polya_peaks[i];
3439 if(pos >= 0) {
3440 cap_polya_info[make_tuple(CGeneModel::ePolyA, chain.Strand(), pos)] |= (1 << i);
3441 if(chain.ValidPolyA(pos, contig).second)
3442 cap_polya_info[make_tuple(CGeneModel::ePolyA, chain.Strand(), pos)] |= eAs;
3443 }
3444 }
3445 }
3446 }
3447 for(auto& info : cap_polya_info) {
3448 string determinant = get<0>(info.first) == CGeneModel::eCap ? "Cap" : "PolyA";
3449 char strand = get<1>(info.first) == ePlus ? '+' : '-';
3450 int pos = m_edited_contig_map.MapEditedToOrig(get<2>(info.first))+m_limits.GetFrom()+1;
3451 cerr << m_contig_acc << ' ' << determinant << ' ' << strand << ' ' << pos << ' ';
3452 if(info.second&eFirstPeak)
3453 cerr << ":FirstPeak";
3454 if(info.second&eSecondPeak)
3455 cerr << ":SecondPeak";
3456 if(info.second&eThirdPeak)
3457 cerr << ":ThirdPeak";
3458 if(info.second&eAs)
3459 cerr << ":As";
3460 cerr << ":\n";
3461 }
3462
3463
3464 return chains;
3465 }
3466
3467 struct AlignSeqOrder
3468 {
operator ()AlignSeqOrder3469 bool operator()(const CGeneModel* ap, const CGeneModel* bp)
3470 {
3471 if (ap->Limits().GetFrom() != bp->Limits().GetFrom()) return ap->Limits().GetFrom() < bp->Limits().GetFrom();
3472 if (ap->Limits().GetTo() != bp->Limits().GetTo()) return ap->Limits().GetTo() > bp->Limits().GetTo();
3473 return ap->ID() < bp->ID(); // to make sort deterministic
3474 }
3475 };
3476
// Searches 'pointers' for the best left-to-right chain of members that connects
// all 'parts' of a gapped protein alignment ('palign' is the combined alignment).
//
// For every candidate member i the best left neighbour j < i is chosen by comparing
// accumulated m_left_cds, then m_left_splice_num, then m_left_num. A parallel set of
// temporary members (no_gap_members) tracks the best connection that does not use a
// gap, so that at most one uncovered gap (penalised by PGAP_PENALTY) is bridged
// between two consecutive parts.
//
// Returns the rightmost member of the best chain connected to all parts; the chain
// is followed through m_left_member. Links that point into the temporary vector are
// redirected to the real members before returning.
//
// NOTE(review): lower_bound() below requires right_ends to be non-decreasing, i.e.
// 'pointers' must arrive ordered accordingly; the visible callers sort with
// LeftOrderD() first - confirm that this ordering provides it.
// NOTE(review): a suitable chain is only _ASSERT-ed to exist; in builds where
// _ASSERT is disabled a null pointer can be returned.
SChainMember* CChainer::CChainerImpl::FindOptimalChainForProtein(TContained& pointers, vector<CGeneModel*>& parts, CGeneModel& palign) {
    //        Int8 id = parts.front()->ID();

    TIVec right_ends(pointers.size());
    vector<SChainMember> no_gap_members(pointers.size());   // temporary helper chain members; will be used for gap filling optimisation
    for(int k = 0; k < (int)pointers.size(); ++k) {
        SChainMember& mi = *pointers[k];
        right_ends[k] = mi.m_align->Limits().GetTo();
        no_gap_members[k] = mi;
    }

    SChainMember* best_right = 0;

    // Walk from the end of the list while members still reach 'leftpos';
    // 'leftpos' is pushed left by every accepted member. The scan stops at
    // the first member that ends left of it.
    int first_member = pointers.size()-1;
    int leftpos = palign.Limits().GetFrom();
    for(int i = pointers.size()-1; i >= 0; --i) {
        TSignedSeqRange limi = pointers[i]->m_align->Limits();
        if(limi.GetTo() >= leftpos) {
            first_member = i;
            leftpos = min(leftpos,limi.GetFrom());
        } else {
            break;
        }
    }

    // The last member of interest is the last one whose limits include 'rightpos';
    // 'rightpos' grows with every such member.
    int last_member = 0;
    int rightpos = palign.Limits().GetTo();
    for(int i = 0; i < (int)pointers.size(); ++i) {
        TSignedSeqRange limi = pointers[i]->m_align->Limits();
        if(Include(limi,rightpos)) {
            last_member = i;
            rightpos = max(rightpos,limi.GetTo());
        }
    }

    int fully_connected_right = 0;     // rightmost point already connected to all parts

    for(int i = first_member; i <= last_member; ++i) {
        SChainMember& mi = *pointers[i];                   // best connection maybe gapped
        SChainMember& mi_no_gap = no_gap_members[i];       // best not gapped connection (if any)
        CGeneModel& ai = *mi.m_align;
        LRIinit(mi);
        LRIinit(mi_no_gap);

        if(ai.Strand() != palign.Strand())
            continue;

        // index of the last part that starts strictly left of this alignment (-1 if there is none)
        int part_to_connect = parts.size()-1;
        while(part_to_connect >= 0 && ai.Limits().GetFrom() <= parts[part_to_connect]->Limits().GetFrom())
            --part_to_connect;

        if(part_to_connect >=0 && ai.Limits().GetFrom() < parts[part_to_connect]->Limits().GetTo() && !parts[part_to_connect]->isCompatible(ai))  // overlaps with part but not compatible
            continue;

        if(fully_connected_right > 0 && ai.Limits().GetFrom() > fully_connected_right)    // can't possibly be connected
            continue;

        TContained micontained = mi.CollectContainedForMemeber();
        sort(micontained.begin(),micontained.end(),LeftOrderD());

        // Check the parts to the right of part_to_connect: a part fully inside this
        // alignment must agree in reading frame, stop and frameshifts; a partially
        // overlapping part must be compatible and have the same frameshifts in the overlap.
        bool compatible_with_included_parts = true;
        int last_included_part = -1;
        bool includes_first_part = false;
        for(int p = part_to_connect+1; p < (int)parts.size(); ++p) {
            if(Include(ai.Limits(),parts[p]->Limits())) {
                TSignedSeqRange ai_rf = mi.m_cds_info->ReadingFrame();
                TSignedSeqRange aj_rf = parts[p]->GetCdsInfo().ReadingFrame();
                TSignedSeqRange ai_cds = mi.m_cds_info->Cds();
                TSignedSeqRange aj_cds = parts[p]->GetCdsInfo().Cds();
                bool compatible = (parts[p]->isCompatible(ai) && Include(ai_rf,aj_rf) && mi.m_align_map->FShiftedLen(ai_cds.GetFrom(),aj_cds.GetFrom(),false)%3==1);
                bool samestop = (parts[p]->GetCdsInfo().HasStop() == mi.m_cds_info->HasStop() && (!parts[p]->GetCdsInfo().HasStop() || parts[p]->GetCdsInfo().Stop() == mi.m_cds_info->Stop()));
                bool samefshifts = (parts[p]->FrameShifts() == StrictlyContainedInDels(ai.FrameShifts(), parts[p]->Limits()));
                if(compatible && samestop && samefshifts) {
                    last_included_part = p;
                    if(p == 0)
                        includes_first_part = true;
                } else {
                    compatible_with_included_parts = false;
                    break;
                }
            } else if(ai.Limits().IntersectingWith(parts[p]->Limits())) {
                TSignedSeqRange overlap = (ai.Limits() & parts[p]->Limits());
                if(!parts[p]->isCompatible(ai) || StrictlyContainedInDels(ai.FrameShifts(), overlap) != StrictlyContainedInDels(parts[p]->FrameShifts(), overlap)) {
                    compatible_with_included_parts = false;
                    break;
                }
            } else {
                // no overlap with this part - stop scanning further parts
                break;
            }
        }

        if(!compatible_with_included_parts)
            continue;

        _ASSERT(part_to_connect < 0 || part_to_connect == (int)parts.size()-1 || mi.m_type == eCDS);    // coding if between parts

        // a member that itself includes the first part needs no left neighbour to be connected
        if(includes_first_part) {
            mi.m_fully_connected_to_part = last_included_part;
            mi_no_gap.m_fully_connected_to_part = last_included_part;
        }

        // first candidate neighbour: the first member whose right end is not left of the
        // right end of the part to connect (or of this alignment's left end if there is no such part)
        TIVec::iterator lb = lower_bound(right_ends.begin(),right_ends.end(),(part_to_connect >= 0 ? parts[part_to_connect]->Limits().GetTo() : ai.Limits().GetFrom()));
        int jfirst = 0;
        if(lb != right_ends.end())
            jfirst = lb-right_ends.begin(); // skip all on the left side

        for(int j = jfirst; j < i; ++j) {
            SChainMember& mj = *pointers[j];                   // best connection maybe gapped
            if(part_to_connect >= 0 && mj.m_fully_connected_to_part < part_to_connect)      // alignment is not connected to all previous parts
                continue;
            CGeneModel& aj = *mj.m_align;
            if( ai.Strand() != aj.Strand())
                continue;

            SChainMember& mj_no_gap = no_gap_members[j];       // best not gapped connection (if any)

            if(ai.Limits().GetFrom() > aj.Limits().GetTo() && part_to_connect >= 0 && part_to_connect < (int)parts.size()-1 &&  // gap is not closed
               mj_no_gap.m_fully_connected_to_part == part_to_connect &&                                                        // no additional gap
               mi.m_type == eCDS && mj.m_type == eCDS &&
               mj.m_cds_info->MaxCdsLimits().GetTo() == TSignedSeqRange::GetWholeTo() &&
               mi.m_cds_info->MaxCdsLimits().GetFrom() == TSignedSeqRange::GetWholeFrom()) {                                    // reading frame not interrupted

                // amount subtracted from the accumulated cds value when a gap is bridged
#define PGAP_PENALTY 120

                int newcds = mj_no_gap.m_left_cds+mi.m_cds - PGAP_PENALTY;
                double newnum = mj_no_gap.m_left_num+mi.m_num;

                if(mi.m_left_member == 0 || newcds > mi.m_left_cds || (newcds == mi.m_left_cds && newnum > mi.m_left_num)) {
                    mi.m_left_cds = newcds;
                    mi.m_left_num = newnum;
                    mi.m_left_member = &mj_no_gap;      // points into the temporary vector; redirected before returning
                    mi.m_gapped_connection = true;
                    mi.m_fully_connected_to_part = part_to_connect;
                }
            } else if(ai.Limits().IntersectingWith(aj.Limits())) {
                int delta_cds;
                double delta_num;
                double delta_splice_num;
                if(LRCanChainItoJ(delta_cds, delta_num, delta_splice_num, mi, mj, micontained)) {      // i and j connected continuously
                    int newcds = mj.m_left_cds+delta_cds;
                    double newnum = mj.m_left_num+delta_num;
                    double newsplicenum = mj.m_left_splice_num+delta_splice_num;

                    // priority: cds, then splice number (with 0.001 tolerance), then number
                    bool better_connection = false;
                    if(newcds != mi.m_left_cds) {
                        better_connection = (newcds > mi.m_left_cds);
                    } else if(fabs(newsplicenum - mi.m_left_splice_num) > 0.001) {
                        better_connection = (newsplicenum > mi.m_left_splice_num);
                    } else if(newnum > mi.m_left_num) {
                        better_connection = true;
                    }

                    if (mi.m_left_member == 0 || better_connection) {
                        mi.m_left_cds = newcds;
                        mi.m_left_splice_num = newsplicenum;
                        mi.m_left_num = newnum;
                        mi.m_gapped_connection = mj.m_gapped_connection;
                        mi.m_left_member = &mj;
                        mi.m_fully_connected_to_part = part_to_connect;
                        if(!mi.m_gapped_connection)
                            mi_no_gap = mi;
                    } else if(mj_no_gap.m_fully_connected_to_part == part_to_connect) {
                        // not the best overall, but may still be the best connection without a gap
                        newcds = mj_no_gap.m_left_cds+delta_cds;
                        newnum = mj_no_gap.m_left_num+delta_num;
                        newsplicenum = mj_no_gap.m_left_splice_num+delta_splice_num;

                        better_connection = false;
                        if(newcds != mi_no_gap.m_left_cds) {
                            better_connection = (newcds > mi_no_gap.m_left_cds);
                        } else if(fabs(newsplicenum - mi_no_gap.m_left_splice_num) > 0.001) {
                            better_connection = (newsplicenum > mi_no_gap.m_left_splice_num);
                        } else if(newnum > mi_no_gap.m_left_num) {
                            better_connection = true;
                        }

                        if (mi_no_gap.m_left_member == 0 || better_connection) {
                            mi_no_gap.m_left_cds = newcds;
                            mi_no_gap.m_left_splice_num = newsplicenum;
                            mi_no_gap.m_left_num = newnum;
                            mi_no_gap.m_left_member = &mj_no_gap;
                            mi_no_gap.m_fully_connected_to_part = part_to_connect;
                        }
                    }
                }
            }
        }

        // a connected member that fully includes further parts closes any gap before them
        if(mi.m_left_member != 0 && last_included_part >= 0) {
            mi.m_fully_connected_to_part = last_included_part;
            mi.m_gapped_connection = false;
            mi_no_gap = mi;
        }

        if(mi.m_fully_connected_to_part == (int)parts.size()-1) {   // includes all parts
            fully_connected_right = max(fully_connected_right,mi.m_align->Limits().GetTo());

            if(best_right == 0 || (mi.m_left_cds >  best_right->m_left_cds || (mi.m_left_cds ==  best_right->m_left_cds && mi.m_left_num >  best_right->m_left_num)) )
                best_right = &mi;
        }
    }

    _ASSERT(best_right != 0);

    _ASSERT(best_right < &no_gap_members.front() || best_right > &no_gap_members.back());   // don't point to temporary vector
    // no_gap_members dies with this function: copy every temporary member used by the
    // chain into the real member at the same index and relink to it
    for (SChainMember* mp = best_right; mp != 0; mp = mp->m_left_member) {
        if(mp->m_left_member >= &no_gap_members.front() && mp->m_left_member <= &no_gap_members.back()) {   // points to temporary vector
            SChainMember* p = pointers[mp->m_left_member-&no_gap_members.front()];
            *p = *mp->m_left_member;
            mp->m_left_member = p;
        }
    }

    return best_right;
}
3691
3692 struct AlignLenOrder
3693 {
AlignLenOrderAlignLenOrder3694 AlignLenOrder(TOrigAligns& oa) : orig_aligns(oa) {}
3695 TOrigAligns& orig_aligns;
3696
operator ()AlignLenOrder3697 bool operator()(const vector<CGeneModel*>* ap, const vector<CGeneModel*>* bp)
3698 {
3699 const vector<CGeneModel*>& partsa = *ap;
3700 const vector<CGeneModel*>& partsb = *bp;
3701
3702 int align_lena = 0;
3703 ITERATE(vector<CGeneModel*>, k, partsa)
3704 align_lena += (*k)->AlignLen();
3705
3706 int align_lenb = 0;
3707 ITERATE(vector<CGeneModel*>, k, partsb)
3708 align_lenb += (*k)->AlignLen();
3709
3710 if(align_lena != align_lenb) {
3711 return align_lena > align_lenb;
3712 } else {
3713 return *orig_aligns[partsa.front()->ID()]->GetTargetId() < *orig_aligns[partsb.front()->ID()]->GetTargetId(); // to make sort deterministic
3714 }
3715 }
3716 };
3717
// Builds chains for protein alignments that consist of several parts.
//
// Protein parts are grouped by alignment ID; groups with more than one part are
// processed from the longest (by summed AlignLen) down. For every group that is not
// already a sub-alignment of a continuous chain in 'chains', the best connecting chain
// is found with FindOptimalChainForProtein(). If holes remain that the unmodified
// alignment spans, a second attempt uses pieces cut from that unmodified alignment.
// Accepted chains are appended to 'chains'.
//
// chains       - in: existing chains; out: gets the newly created chains appended
// pointers_all - all chain members; re-sorted here with LeftOrderD()
// unma_aligns  - receives the alignment pieces created from unmodified alignments
// unma_members - receives the chain members created for those pieces
void CChainer::CChainerImpl::CreateChainsForPartialProteins(TChainList& chains, TContained& pointers_all, TGeneModelList& unma_aligns, CChainMembers& unma_members) {

    sort(pointers_all.begin(),pointers_all.end(),LeftOrderD());

    // collect protein parts by alignment ID
    typedef map<Int8, vector<CGeneModel*> > TIdChainMembermap;
    TIdChainMembermap protein_parts;
    for(int k = 0; k < (int)pointers_all.size(); ++k) {
        SChainMember& mi = *pointers_all[k];

        if((mi.m_align->Type() & CGeneModel::eProt) && (mi.m_copy == 0 || mi.m_cds_info->HasStart())) {  // only prots with start can have copies
            protein_parts[mi.m_align->ID()].push_back(mi.m_align);
        }
    }

    // keep only proteins with more than one part; the vector stores pointers into protein_parts
    vector<vector<CGeneModel*>*> gapped_sorted_protein_parts;
    NON_CONST_ITERATE(TIdChainMembermap, ip, protein_parts) {
        vector<CGeneModel*>& parts = ip->second;
        if(parts.size() > 1) {
            sort(parts.begin(),parts.end(),AlignSeqOrder());
            gapped_sorted_protein_parts.push_back(&parts);
        }
    }
    sort(gapped_sorted_protein_parts.begin(),gapped_sorted_protein_parts.end(),AlignLenOrder(orig_aligns));

    NON_CONST_ITERATE(vector<vector<CGeneModel*>*>, ip, gapped_sorted_protein_parts) {  // make chains starting from long proteins
        vector<CGeneModel*>& parts = **ip;
        Int8 id = parts.front()->ID();

        // combine all parts (with the 5' CDS limit cleared) into one gapped alignment
        CGeneModel palign(parts.front()->Strand(), id, CGeneModel::eProt);
        ITERATE(vector<CGeneModel*>, k, parts) {
            CGeneModel part = **k;
            CCDSInfo cds = part.GetCdsInfo();
            cds.Clear5PrimeCdsLimit();
            part.SetCdsInfo(cds);
            palign.Extend(part);
        }
        m_gnomon->GetScore(palign);

        // nothing to do if a continuous chain on the same strand already has this alignment as a sub-alignment
        bool connected = false;
        NON_CONST_ITERATE(TChainList, k, chains) {
            if(k->Continuous() && palign.Strand() == k->Strand() && palign.IsSubAlignOf(*k)) {
                connected = true;
#ifdef _DEBUG
                k->AddComment("Was connected "+orig_aligns[palign.ID()]->TargetAccession());
#endif
                break;
            }
        }

        if(connected)
            continue;


        // select the members that may take part in the connecting chain
        TContained pointers;
        for(int k = 0; k < (int)pointers_all.size(); ++k) {
            SChainMember* mip = pointers_all[k];

            if(mip->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))  // skip flexible
                continue;

            if((mip->m_type != eCDS || !Include(mip->m_cds_info->MaxCdsLimits(),mip->m_align->Limits())) && Include(palign.Limits(),mip->m_align->Limits()))  // skip all not entirely coding inside protein alignment
                continue;

            if(mip->m_align->Exons().front().m_ssplice_sig == "XX" && Include(palign.Limits(),mip->m_align->Exons().front().Limits()))   // skip 3'/5' cdna gapfillers inside protein alignment
                continue;

            if(mip->m_align->Exons().back().m_fsplice_sig == "XX" && Include(palign.Limits(),mip->m_align->Exons().back().Limits()))   // skip 3'/5' cdna gapfillers inside protein alignment
                continue;

            pointers.push_back(mip);
        }

        // NOTE(review): the result is dereferenced without a null check
        SChainMember* best_right = FindOptimalChainForProtein(pointers, parts, palign);

        best_right->m_right_member = 0;
        CChain chain(*best_right,&palign);

        if(unmodified_aligns.count(id)) {  // some unmodified alignments are deleted if they interfere with a gap
            CGeneModel unma = unmodified_aligns[id];
            // remaining_holes: every hole left in the chain;
            // new_holes: those lying strictly inside one spliced piece of the unmodified alignment
            vector<TSignedSeqRange> new_holes;
            vector<TSignedSeqRange> remaining_holes;
            for(int k = 1; k < (int)chain.Exons().size(); ++k) {
                CModelExon exonl = chain.Exons()[k-1];
                CModelExon exonr = chain.Exons()[k];
                if(!(exonl.m_ssplice && exonr.m_fsplice)) {
                    TSignedSeqRange h(exonl.GetTo()+1,exonr.GetFrom()-1);
                    remaining_holes.push_back(h);
                    for(int piece_begin = 0; piece_begin < (int)unma.Exons().size(); ++piece_begin) {
                        // advance piece_end to the first exon that has no m_ssplice set
                        int piece_end = piece_begin;
                        for( ; piece_end < (int)unma.Exons().size() && unma.Exons()[piece_end].m_ssplice; ++piece_end);
                        if(unma.Exons()[piece_begin].GetFrom() < h.GetFrom() && unma.Exons()[piece_end].GetTo() > h.GetTo()) {
                            new_holes.push_back(h);
                            break;
                        }
                        piece_begin = piece_end;
                    }
                }
            }

            if(!new_holes.empty()) {    // failed to connect all parts - try unsupported introns
                CAlignMap umap = unma.GetAlignMap();
                if(unma.Limits() != palign.Limits()) {
                    TSignedSeqRange lim = umap.ShrinkToRealPoints(palign.Limits(), true);
                    unma.Clip(lim,CGeneModel::eRemoveExons);
                }

                // holes that the unmodified alignment already has
                vector<TSignedSeqRange> existed_holes;
                for(int k = 1; k < (int)unma.Exons().size(); ++k) {
                    CModelExon exonl = unma.Exons()[k-1];
                    CModelExon exonr = unma.Exons()[k];
                    if(!(exonl.m_ssplice && exonr.m_fsplice))
                        existed_holes.push_back(TSignedSeqRange(exonl.GetTo()+1,exonr.GetFrom()-1));
                }

                for(int k = 1; k < (int)palign.Exons().size(); ++k) {  // cut holes which were connected or existed
                    CModelExon exonl = palign.Exons()[k-1];
                    CModelExon exonr = palign.Exons()[k];
                    if(!(exonl.m_ssplice && exonr.m_fsplice)) {
                        TSignedSeqRange hole(exonl.GetTo()+1,exonr.GetFrom()-1);
                        // local flag; shadows the outer 'connected'
                        bool connected = true;
                        ITERATE(vector<TSignedSeqRange>, h, remaining_holes) {
                            _ASSERT(Include(unma.Limits(), *h));
                            if(Include(hole, *h)) {
                                connected = false;
                                break;
                            }
                        }

                        bool existed = false;
                        ITERATE(vector<TSignedSeqRange>, h, existed_holes) {
                            if(Include(hole, *h)) {
                                existed = true;
                                break;
                            }
                        }

                        if(connected || existed) {
                            // cut only when both hole boundaries survive shrinking to real points
                            TSignedSeqRange left = umap.ShrinkToRealPoints(TSignedSeqRange(unma.Limits().GetFrom(),hole.GetFrom()-1), true);
                            TSignedSeqRange right = umap.ShrinkToRealPoints(TSignedSeqRange(hole.GetTo()+1,unma.Limits().GetTo()), true);
                            if(left.GetTo()+1 == hole.GetFrom() && right.GetFrom()-1 == hole.GetTo())
                                unma.CutExons(hole);
                        }
                    }
                }
                m_gnomon->GetScore(unma);

                // split the edited unmodified alignment into parts and score each of them
                TGeneModelList unmacl;
                unmacl.push_back(unma);
                CutParts(unmacl);

                vector<CGeneModel*> unmaparts;
                NON_CONST_ITERATE(TGeneModelList, im, unmacl) {
                    m_gnomon->GetScore(*im);
                    unmaparts.push_back(&(*im));
                }

                // create members for the new parts and fill their contained lists
                CChainMembers unmapointers(unmacl, orig_aligns, unmodified_aligns);
                Duplicate5pendsAndShortCDSes(unmapointers);
                sort(pointers.begin(),pointers.end(),GenomeOrderD());
                ITERATE(TContained, ip, unmapointers) {
                    SChainMember& mi = **ip;
                    IncludeInContained(mi, mi);          // include self
                    ITERATE(TContained, jp, pointers) {
                        SChainMember& mj = **jp;
                        if(CanIncludeJinI(mi, mj))
                            IncludeInContained(mi, mj);
                    }
                }

                ITERATE(TContained, ip, unmapointers) {
                    _ASSERT((*ip)->m_orig_align);
                    (*ip)->m_mem_id = -(*ip)->m_mem_id;   // unique m_mem_id
                    pointers.push_back(*ip);
                }

                // second attempt, now with the members of the unmodified alignment available
                sort(pointers.begin(),pointers.end(),LeftOrderD());
                best_right = FindOptimalChainForProtein(pointers, unmaparts, unma);
                ITERATE(TContained, jp, unmapointers) {  // add parts in case they were 'shadowed' by longer or identical alignment
                    SChainMember& mj = **jp;
                    bool present = false;
                    for(SChainMember* ip = best_right; ip != 0 && !present; ip = ip->m_left_member)
                        present = ip == &mj;
                    // not in the chain yet: put it in front if some chain member can include it
                    for(SChainMember* ip = best_right; ip != 0 && !present; ip = ip->m_left_member) {
                        SChainMember& mi = *ip;
                        if(CanIncludeJinI(mi, mj)) {
                            mj.m_left_member = best_right;
                            best_right = &mj;
                            break;
                        }
                    }
                }
                chain = CChain(*best_right, &unma);
                // hand the new alignments and members over to the caller-provided containers
                unma_aligns.splice(unma_aligns.end(), unmacl);
                unma_members.SpliceFromOther(unmapointers);
            }
        }

        // score the chain, clip its ends and score again
        if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
            continue;
        m_gnomon->GetScore(chain);
        if(chain.Score() == BadScore())
            continue;

        chain.RemoveFshiftsFromUTRs();
        chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns);
        const CResidueVec& contig = m_gnomon->GetSeq();
        chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak);
        chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage);
        chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold);
        if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
            continue;
        m_gnomon->GetScore(chain, !no5pextension); // this will return CDS to best/longest depending on no5pextension
        chain.CheckSecondaryCapPolyAEnds();
        chain.CalculateDropLimits();
        _ASSERT( chain.FShiftedLen(chain.GetCdsInfo().Start()+chain.ReadingFrame()+chain.GetCdsInfo().Stop(), false)%3==0 );

#ifdef _DEBUG
        chain.AddComment("Connected "+orig_aligns[palign.ID()]->TargetAccession());
        chain.AddComment("LinkForGapped  "+GetLinkedIdsForMember(*best_right));
#endif
        chains.push_back(chain);
    }
}
3941
SetFlagsForChains(TChainList & chains)3942 void CChainer::CChainerImpl::SetFlagsForChains(TChainList& chains) {
3943
3944 int left = numeric_limits<int>::max();
3945 int right = 0;
3946 ITERATE(TOrigAligns, it, orig_aligns) {
3947 const CAlignModel& align = *it->second;
3948 left = min(left,align.Limits().GetFrom());
3949 right = max(right,align.Limits().GetTo());
3950 }
3951
3952 int len = right-left+1;
3953
3954 vector<int> prot_cov[2][3];
3955 prot_cov[0][0].resize(len,0);
3956 prot_cov[0][1].resize(len,0);
3957 prot_cov[0][2].resize(len,0);
3958 prot_cov[1][0].resize(len,0);
3959 prot_cov[1][1].resize(len,0);
3960 prot_cov[1][2].resize(len,0);
3961 ITERATE(TOrigAligns, it, orig_aligns) {
3962 const CAlignModel& align = *it->second;
3963 if(align.GetCdsInfo().ProtReadingFrame().NotEmpty()) {
3964 CAlignMap amap = align.GetAlignMap();
3965 int cdstr = amap.MapOrigToEdited(align.GetCdsInfo().Cds().GetFrom());
3966 for(int i = 0; i < (int)align.Exons().size(); ++i) {
3967 TSignedSeqRange rf = (align.Exons()[i].Limits() & align.ReadingFrame());
3968 if(rf.NotEmpty()) {
3969 for(int j = rf.GetFrom(); j <= rf.GetTo(); ++j) {
3970 int jtr = amap.MapOrigToEdited(j);
3971 if(jtr >= 0)
3972 ++prot_cov[align.Strand()][abs(cdstr-jtr)%3][j-left];
3973 }
3974 }
3975 }
3976 }
3977 }
3978
3979 CScope scope(*CObjectManager::GetInstance());
3980 scope.AddDefaults();
3981
3982 SMatrix matrix;
3983
3984 const CResidueVec& contig = m_gnomon->GetSeq();
3985
3986 NON_CONST_ITERATE(TChainList, it, chains) {
3987 CChain& chain = *it;
3988 // chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns);
3989 chain.SetOpenForPartialyAlignedProteins(prot_complet);
3990 chain.SetConfirmedStartStopForCompleteProteins(prot_complet, minscor);
3991 chain.CollectTrustedmRNAsProts(orig_aligns, minscor, scope, matrix, contig);
3992 chain.SetBestPlacement(orig_aligns);
3993 chain.SetConsistentCoverage();
3994 if(chain.Continuous() && chain.Exons().size() > 1) {
3995 bool allcdnaintrons = true;
3996 int num = 0;
3997 for(int i = 1; i < (int)chain.Exons().size() && allcdnaintrons; ++i) {
3998 if(chain.Exons()[i-1].m_ssplice_sig != "XX" && chain.Exons()[i].m_fsplice_sig != "XX") {
3999 TSignedSeqRange intron(TSignedSeqRange(chain.Exons()[i-1].GetTo(),chain.Exons()[i].GetFrom()));
4000 allcdnaintrons = (mrna_count[intron]+est_count[intron]+rnaseq_count[intron] > 0);
4001 ++num;
4002 }
4003 }
4004 if(allcdnaintrons && num >0)
4005 chain.Status() |= CGeneModel::ecDNAIntrons;
4006 }
4007 if (chain.FullCds()) {
4008 chain.Status() |= CGeneModel::eFullSupCDS;
4009 }
4010
4011 if(chain.GetCdsInfo().ProtReadingFrame().Empty() && chain.ReadingFrame().NotEmpty()) { // coding chain without protein support
4012 int protcds = 0;
4013 int lrf_from_proteins = numeric_limits<int>::max();
4014 int rrf_from_proteins = 0;
4015 CAlignMap amap = chain.GetAlignMap();
4016 int cdstr = amap.MapOrigToEdited(chain.GetCdsInfo().Cds().GetFrom());
4017 for(int i = 0; i < (int)chain.Exons().size(); ++i) {
4018 TSignedSeqRange rf = (chain.Exons()[i].Limits() & chain.ReadingFrame());
4019 if(rf.NotEmpty()) {
4020 for(int j = rf.GetFrom(); j <= rf.GetTo(); ++j) {
4021 if(j < left || j > right)
4022 continue;
4023
4024 int jtr = amap.MapOrigToEdited(j);
4025 int frame = abs(cdstr-jtr)%3;
4026 if(jtr >= 0 && prot_cov[chain.Strand()][frame][j-left] > 0) {
4027 if(frame == 0)
4028 lrf_from_proteins = min(lrf_from_proteins,j);
4029 if(frame == 2)
4030 rrf_from_proteins = max(rrf_from_proteins,j);
4031 ++protcds;
4032 }
4033 }
4034 }
4035 }
4036 if(protcds > 0.2*amap.FShiftedLen(chain.GetCdsInfo().Cds()) && rrf_from_proteins > lrf_from_proteins) {
4037 CCDSInfo cds = chain.GetCdsInfo();
4038 TSignedSeqRange reading_frame = cds.ReadingFrame();
4039 cds.SetReadingFrame(reading_frame&TSignedSeqRange(lrf_from_proteins,rrf_from_proteins), true);
4040 cds.SetReadingFrame(reading_frame);
4041 chain.SetCdsInfo(cds);
4042 chain.SetType(chain.Type()|CGeneModel::eProt);
4043
4044 #ifdef _DEBUG
4045 chain.AddComment("Added protsupport");
4046 #endif
4047 }
4048 }
4049 }
4050 }
4051
4052
CombineCompatibleChains(TChainList & chains)4053 void CChainer::CChainerImpl::CombineCompatibleChains(TChainList& chains) {
4054 for(TChainList::iterator itt = chains.begin(); itt != chains.end(); ++itt) {
4055 if(itt->Status()&CGeneModel::eSkipped)
4056 continue;
4057 CCDSInfo::TPStops istops = itt->GetCdsInfo().PStops();
4058 for(TChainList::iterator jt = chains.begin(); jt != chains.end();) {
4059 TChainList::iterator jtt = jt++;
4060 if(jtt->Status()&CGeneModel::eSkipped)
4061 continue;
4062
4063 if(itt != jtt && itt->Strand() == jtt->Strand() && jtt->IsSubAlignOf(*itt) && itt->ReadingFrame().Empty() == jtt->ReadingFrame().Empty()) {
4064 if(itt->ReadingFrame().NotEmpty()) {
4065 if(!Include(jtt->GetCdsInfo().MaxCdsLimits(), itt->GetCdsInfo().MaxCdsLimits()))
4066 continue;
4067
4068 if(jtt->FrameShifts() != StrictlyContainedInDels(itt->FrameShifts(), jtt->Limits()))
4069 continue;
4070
4071 if((itt->FShiftedLen(itt->GetCdsInfo().Cds().GetFrom(),jtt->GetCdsInfo().Cds().GetFrom(),false)-1)%3 != 0)
4072 continue;
4073
4074 CCDSInfo::TPStops jstops = jtt->GetCdsInfo().PStops();
4075 bool same_stops = true;
4076 ITERATE(CCDSInfo::TPStops, istp, istops) {
4077 if(Include(jtt->Limits(),*istp) && find(jstops.begin(), jstops.end(), *istp) == jstops.end()) {
4078 same_stops = false;
4079 break;
4080 }
4081 }
4082 if(!same_stops)
4083 continue;
4084 }
4085
4086 TMemberPtrSet support;
4087 ITERATE(TContained, i, itt->m_members) {
4088 support.insert(*i);
4089 if((*i)->m_copy != 0)
4090 support.insert((*i)->m_copy->begin(),(*i)->m_copy->end());
4091 }
4092 ITERATE(TContained, i, jtt->m_members) {
4093 if(support.insert(*i).second) {
4094 itt->m_members.push_back(*i);
4095 if((*i)->m_copy != 0)
4096 support.insert((*i)->m_copy->begin(),(*i)->m_copy->end());
4097 }
4098 }
4099 sort(itt->m_members.begin(),itt->m_members.end(),GenomeOrderD());
4100 itt->CalculateSupportAndWeightFromMembers();
4101 chains.erase(jtt);
4102 }
4103 }
4104 }
4105 }
4106
GoodCDNAScore(const CGeneModel & algn)4107 double CChainer::CChainerImpl::GoodCDNAScore(const CGeneModel& algn)
4108 {
4109 if(algn.FShiftedLen(algn.GetCdsInfo().Cds(),true) > minscor.m_cds_len)
4110 return 0.99*BadScore();
4111 if(((algn.Type()&CGeneModel::eProt)!=0 || algn.ConfirmedStart()) && algn.FShiftedLen(algn.GetCdsInfo().ProtReadingFrame(),true) > minscor.m_prot_cds_len) return 0.99*BadScore();
4112
4113 int intron_left = 0, intron_internal = 0, intron_total =0;
4114 for(int i = 1; i < (int)algn.Exons().size(); ++i) {
4115 if(!algn.Exons()[i-1].m_ssplice || !algn.Exons()[i].m_fsplice) continue;
4116
4117 ++intron_total;
4118 if(algn.Exons()[i].GetFrom()-1 < algn.RealCdsLimits().GetFrom()) ++intron_left;
4119 if(algn.Exons()[i-1].GetTo()+1 > algn.RealCdsLimits().GetFrom() && algn.Exons()[i].GetFrom()-1 < algn.RealCdsLimits().GetTo()) ++intron_internal;
4120 }
4121
4122 int intron_3p, intron_5p;
4123 if(algn.Strand() == ePlus) {
4124 intron_5p = intron_left;
4125 intron_3p = intron_total -intron_5p - intron_internal;
4126 } else {
4127 intron_3p = intron_left;
4128 intron_5p = intron_total -intron_3p - intron_internal;
4129 }
4130
4131 int cdslen = algn.RealCdsLen();
4132 int len = algn.AlignLen();
4133
4134 // return max(0.,25+7*intron_5p+14*intron_3p-0.05*cdslen+0.005*len);
4135 return max(0.,minscor.m_min+minscor.m_i5p_penalty*intron_5p+minscor.m_i3p_penalty*intron_3p-minscor.m_cds_bonus*cdslen+minscor.m_length_penalty*len);
4136 }
4137
4138
RemovePoorCds(CGeneModel & algn,double minscor)4139 void CChainer::CChainerImpl::RemovePoorCds(CGeneModel& algn, double minscor)
4140 {
4141 if (algn.Score() < minscor)
4142 algn.SetCdsInfo(CCDSInfo());
4143 }
4144
// NOTE(review): the uses of SCAN_WINDOW lie outside this section; per the original
// note the value has to stay odd - confirm the reason at the point of use before changing it.
#define SCAN_WINDOW 49            // odd number!!!
4146
// Builds a chain model from the chain member mbr.
//
//   mbr               - member whose CollectContainedForChain() result becomes m_members
//   gapped_helper     - optional (may be 0) model that is merged into the chain together
//                       with the member alignments; a copy is kept in m_gapped_helper_align
//   keep_all_evidence - forwarded to CalculateSupportAndWeightFromMembers()
//
// Steps: merge overlapping member alignments into continuous parts, stretch the
// outer parts to the limits given by flexible-end alignments, assemble the chain
// from the parts, compute support/weight, and finally compute a smoothed
// per-position coverage profile (m_coverage) from eSR-type members.
CChain::CChain(SChainMember& mbr, CGeneModel* gapped_helper, bool keep_all_evidence) : m_coverage_drop_left(-1), m_coverage_drop_right(-1), m_coverage_bump_left(-1), m_coverage_bump_right(-1), m_core_coverage(0), m_splice_weight(0), m_cap_peaks(3, -1), m_polya_peaks(3, -1)
{
    m_members = mbr.CollectContainedForChain();
    _ASSERT(m_members.size()>0);
    sort(m_members.begin(),m_members.end(),GenomeOrderD());

    // extened_parts owns the merged continuous pieces; extened_parts_and_gapped holds
    // pointers to them (std::list keeps addresses stable) plus the optional helper.
    list<CGeneModel> extened_parts;
    vector<CGeneModel*> extened_parts_and_gapped;
    if(gapped_helper != 0) {
        extened_parts_and_gapped.push_back(gapped_helper);
        m_gapped_helper_align = *gapped_helper;
    }
    //limits extended by cap/polya info alignments without other support
    int left = numeric_limits<int>::max();
    int right = 0;
    ITERATE(TContained, i, m_members) {
        SChainMember* mi = *i;
        CGeneModel align = *mi->m_align;       // working copy: its CDS info is replaced below
        // Flexible-end alignments only contribute an outer limit and are not merged.
        if(align.Status()&CGeneModel::eLeftFlexible) {
            right = max(right, align.Limits().GetTo());
            continue;
        } else if(align.Status()&CGeneModel::eRightFlexible) {
            left = min(left, align.Limits().GetFrom());
            continue;
        }
        align.SetCdsInfo(*mi->m_cds_info);
        // Start a new part when this alignment does not intersect the last part,
        // otherwise grow the last part.
        if(extened_parts.empty() || !align.Limits().IntersectingWith(extened_parts.back().Limits())) {
            extened_parts.push_back(align);
            _ASSERT(extened_parts.back().Continuous());
            extened_parts_and_gapped.push_back(&extened_parts.back());
        } else {
            extened_parts.back().Extend(align, false);
            _ASSERT(extened_parts.back().Continuous());
        }
    }
    // NOTE(review): front()/back() below assume at least one non-flexible member
    // was collected, i.e. extened_parts is not empty - confirm against callers.
    if(left < extened_parts.front().Limits().GetFrom())
        extened_parts.front().ExtendLeft(extened_parts.front().Limits().GetFrom()-left);
    if(right > extened_parts.back().Limits().GetTo())
        extened_parts.back().ExtendRight(right-extened_parts.back().Limits().GetTo());

    SetType(eChain);
    EStrand strand = extened_parts_and_gapped.front()->Strand();
    SetStrand(strand);

    // Assemble the chain itself from all parts (and the helper) in AlignSeqOrder.
    sort(extened_parts_and_gapped.begin(),extened_parts_and_gapped.end(),AlignSeqOrder());
    ITERATE (vector<CGeneModel*>, it, extened_parts_and_gapped) {
        const CGeneModel& align = **it;
        Extend(align, false);
    }

    // Splice signals are kept only on exon sides that are actually spliced.
    NON_CONST_ITERATE(TExons, e, MyExons()) {
        if(!e->m_fsplice)
            e->m_fsplice_sig.clear();
        if(!e->m_ssplice)
            e->m_ssplice_sig.clear();
    }

    m_supported_range = Limits();

    CalculateSupportAndWeightFromMembers(keep_all_evidence);

    // Soft limits start outside the chain on the opposite side (left limit right of
    // the chain, right limit left of it).
    m_polya_cap_left_soft_limit = Limits().GetTo()+1;
    m_polya_cap_right_soft_limit = Limits().GetFrom()-1;

    // Raw coverage in transcript coordinates. The vector is SCAN_WINDOW longer than the
    // transcript and values are stored shifted by SCAN_WINDOW/2, which leaves zero
    // padding on both sides for the sliding window below.
    CAlignMap amap = GetAlignMap();
    int mrna_len = amap.FShiftedLen(Limits());
    vector<double> coverage_raw(mrna_len+SCAN_WINDOW);
    ITERATE (TContained, it, m_members) {
        const CGeneModel& align = *(*it)->m_align;
        if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
            continue;

        TSignedSeqRange overlap = Limits()&align.Limits();  // theoretically some ends could be outside (partially trimmed from other chain and combined)
        if(align.Type() == CGeneModel::eSR && overlap.NotEmpty()) {
            TSignedSeqRange on_mrna = amap.MapRangeOrigToEdited(overlap);    // for align partially in a hole will give the hole boundary
            for(int i = on_mrna.GetFrom(); i <= on_mrna.GetTo(); ++i)
                coverage_raw[i+SCAN_WINDOW/2] += align.Weight();
        }
    }

    // Running mean: m_coverage[i] is the average of coverage_raw[i .. i+SCAN_WINDOW-1],
    // i.e. a window centred on transcript position i.
    m_coverage.resize(mrna_len);
    double cov = 0;
    for(int i = 0; i < SCAN_WINDOW; ++i)
        cov += coverage_raw[i]/SCAN_WINDOW;
    for(int i = 0; i < mrna_len; ++i) {  // will decrease coverage in SCAN_WINDOW/2 end intervals
        m_coverage[i] = cov;
        cov -= coverage_raw[i]/SCAN_WINDOW;
        cov += coverage_raw[i+SCAN_WINDOW]/SCAN_WINDOW;
    }
}
4237
HasTrustedEvidence(TOrigAligns & orig_aligns) const4238 bool CChain::HasTrustedEvidence(TOrigAligns& orig_aligns) const {
4239 ITERATE (TContained, i, m_members) {
4240 const CGeneModel* align = (*i)->m_align;
4241 if(!align->TrustedProt().empty() || (!align->TrustedmRNA().empty() && (*i)->m_cds_info->ProtReadingFrame().NotEmpty())) {
4242 CAlignModel* orig_align = orig_aligns[align->ID()];
4243 if(align->AlignLen() > 0.5*orig_align->TargetLen())
4244 return true;
4245 }
4246 }
4247
4248 return false;
4249 }
4250
SetBestPlacement(TOrigAligns & orig_aligns)4251 void CChain::SetBestPlacement(TOrigAligns& orig_aligns) {
4252
4253 map<Int8,int> exonnum;
4254 ITERATE (TContained, it, m_members) {
4255 const CGeneModel& align = *(*it)->m_align;
4256
4257 if(align.GetCdsInfo().ProtReadingFrame().NotEmpty() && (align.Status()&eBestPlacement) && ((*it)->m_copy == 0 || (*it)->m_cds_info->HasStart())) // best placed protein or projected mRNA
4258 exonnum[align.ID()] += align.Exons().size();
4259 }
4260
4261 for(map<Int8,int>::iterator it = exonnum.begin(); it != exonnum.end(); ++it) {
4262 if(it->second >= (int)orig_aligns[it->first]->Exons().size()) { // all exons are included in the chain
4263 Status() |= eBestPlacement;
4264 break;
4265 }
4266 }
4267 }
4268
4269 struct SLinker
4270 {
SLinkerSLinker4271 SLinker() : m_member(0), m_value(0), m_matches(0), m_left(0), m_not_wanted(false), m_count(0), m_not_wanted_count(0), m_matches_count(0), m_connected(false) {}
operator <SLinker4272 bool operator<(const SLinker& sl) const {
4273 if(m_range != sl.m_range)
4274 return m_range < sl.m_range;
4275 else if(!m_member)
4276 return true;
4277 else if (!sl.m_member)
4278 return false;
4279 else
4280 return m_member->m_mem_id < sl.m_member->m_mem_id; // to make sort deterministic
4281 }
4282
4283 SChainMember* m_member;
4284 TSignedSeqRange m_range;
4285 TSignedSeqRange m_reading_frame;
4286 int m_value;
4287 int m_matches;
4288 SLinker* m_left;
4289 bool m_not_wanted;
4290 int m_count;
4291 int m_not_wanted_count;
4292 int m_matches_count;
4293 bool m_connected;
4294 };
4295
4296 typedef vector<SLinker> TLinkers;
4297
4298 struct RangeOrder {
operator ()RangeOrder4299 bool operator()(const TSignedSeqRange& a, const TSignedSeqRange& b) const {
4300 return Precede(a, b);
4301 }
4302 };
4303 typedef set<TSignedSeqRange,RangeOrder> TRangePrecedeSet;
4304
// Recomputes the chain's support list, type bits, weight, splice weight and the
// protein part of the CDS from m_members.
//
// Outline:
//   1. every member is turned into one or more SLinker ranges inside
//      m_supported_range, split at introns on which member and chain disagree;
//   2. linkers whose underlying alignment has introns missing from the chain are
//      marked "not wanted";
//   3. a path of overlapping/abutting linkers from the left to the right end of
//      m_supported_range is selected (smallest total m_value, then smallest
//      not-wanted total, then largest match total) - its members are the "core";
//   4. unless keep_all_evidence is set, not-wanted members outside the core are
//      dropped from support; everything else is added to support and weight.
//
// keep_all_evidence - when true no member is excluded from the support.
void CChain::CalculateSupportAndWeightFromMembers(bool keep_all_evidence) {

    TLinkers linkers;
    ITERATE(TContained, i, m_members) {
        SChainMember* mi = *i;
        CGeneModel* ai = mi->m_align;
        _ASSERT(mi->m_orig_align);
        // matches = alignment length, scaled by identity (rounded) when identity is known
        int matches = ai->AlignLen();
        if(ai->Ident() > 0.)
            matches = ai->Ident()*matches+0.5;
        bool not_wanted = false;
        TSignedSeqRange alimits = ai->Limits();

        // Flexible-end alignments: trim the range to the chain exon containing the
        // fixed end (or to the first/last exon when the fixed end is outside the chain);
        // skip the member if it cannot be attached at all.
        if(ai->Status()&CGeneModel::eRightFlexible) {
            matches = 0;
            not_wanted = true;
            for(auto& exon : Exons()) {
                if(Include(exon.Limits(), alimits.GetFrom())) {
                    alimits.SetTo(min(alimits.GetTo(), exon.Limits().GetTo()));
                    matches = alimits.GetLength();
                    break;
                }
            }
            if(matches == 0) {
                if(alimits.GetFrom() < Limits().GetFrom()) {
                    alimits.SetTo(min(alimits.GetTo(), Exons().front().Limits().GetTo()));
                    matches = alimits.GetLength();
                } else {
                    continue;
                }
            }
        }
        if(ai->Status()&CGeneModel::eLeftFlexible) {
            matches = 0;
            not_wanted = true;
            for(auto& exon : Exons()) {
                if(Include(exon.Limits(), alimits.GetTo())) {
                    alimits.SetFrom(max(alimits.GetFrom(), exon.Limits().GetFrom()));
                    matches = alimits.GetLength();
                    break;
                }
            }
            if(matches == 0) {
                if(alimits.GetTo() > Limits().GetTo()) {
                    alimits.SetFrom(max(alimits.GetFrom(), Exons().back().Limits().GetFrom()));
                    matches = alimits.GetLength();
                } else {
                    continue;
                }
            }
        }

        // Start with all chain introns (and holes) that intersect the member ...
        TRangePrecedeSet incompatible_ranges;
        for(int j = 1; j < (int)Exons().size(); ++j) {
            TSignedSeqRange intron(Exons()[j-1].GetTo()+1,Exons()[j].GetFrom()-1);
            if(intron.IntersectingWith(alimits))
                incompatible_ranges.insert(incompatible_ranges.end(),intron);
        }
        // ... then remove those the member shares exactly, and merge every other member
        // intron with the chain introns it touches into one incompatible range.
        for(int j = 1; j < (int)ai->Exons().size(); ++j) {
            TSignedSeqRange intron(ai->Exons()[j-1].GetTo()+1,ai->Exons()[j].GetFrom()-1);
            if(intron.IntersectingWith(m_supported_range)) {
                TRangePrecedeSet::iterator first = incompatible_ranges.lower_bound(TSignedSeqRange(intron.GetFrom(),intron.GetFrom()));
                if(first != incompatible_ranges.end() && *first == intron) { // compatible intron
                    incompatible_ranges.erase(first);
                    continue;
                }

                TRangePrecedeSet::iterator second = incompatible_ranges.upper_bound(TSignedSeqRange(intron.GetTo(),intron.GetTo()));
                for(TRangePrecedeSet::iterator ir = first; ir != second; ) {
                    intron += *ir;
                    incompatible_ranges.erase(ir++);     // post-increment keeps ir valid across erase
                }
                incompatible_ranges.insert(second,intron);
            }
        }

        if(!incompatible_ranges.empty())
            not_wanted = true;

        // Clip the member to the supported range and move the ends inward past
        // incompatible ranges that cover them.
        int left = (alimits&m_supported_range).GetFrom();
        if(!incompatible_ranges.empty() && incompatible_ranges.begin()->GetFrom() <= left) {
            left = incompatible_ranges.begin()->GetTo()+1;
            incompatible_ranges.erase(incompatible_ranges.begin());
        }
        int right = (alimits&m_supported_range).GetTo();
        if(!incompatible_ranges.empty()) {
            TRangePrecedeSet::iterator last = incompatible_ranges.end();
            if((--last)->GetTo() >= right) {
                right = last->GetFrom()-1;
                incompatible_ranges.erase(last);
            }
        }
        // Emit one linker per stretch between the remaining incompatible ranges.
        while(left <= right) {
            SLinker sl;
            sl.m_not_wanted = not_wanted;
            sl.m_member = mi;
            sl.m_value = 1;
            if(ai->Status()&CGeneModel::eLeftFlexible)
                sl.m_value = (ai->Limits().GetTo() == Limits().GetTo()) ? 1000 : 10000;     // remove from support if possible; keep exact end if needed
            if(ai->Status()&CGeneModel::eRightFlexible)
                sl.m_value = (ai->Limits().GetFrom() == Limits().GetFrom()) ? 1000 : 10000; // remove from support if possible; keep exact end if needed
            sl.m_matches = matches;
            sl.m_range.SetFrom(left);
            if(!incompatible_ranges.empty()) {
                sl.m_range.SetTo(incompatible_ranges.begin()->GetFrom()-1);
                left = incompatible_ranges.begin()->GetTo()+1;
                incompatible_ranges.erase(incompatible_ranges.begin());
            } else {
                sl.m_range.SetTo(right);
                left = right+1;
            }
            sl.m_reading_frame = ReadingFrame()&sl.m_range;
            linkers.push_back(sl);
        }
    }

    // Spliced chain introns, stored as (last base of left exon, first base of right
    // exon) - note this differs from the interior-only convention used above.
    set<TSignedSeqRange> chain_introns;
    for(int i = 1; i < (int)Exons().size(); ++i) {
        if(Exons()[i-1].m_ssplice && Exons()[i].m_fsplice)
            chain_introns.insert(TSignedSeqRange(Exons()[i-1].GetTo(),Exons()[i].GetFrom()));
    }

    Status() &= ~CGeneModel::eChangedByFilter;

    // Mark linkers whose source alignment is not fully represented in the chain.
    NON_CONST_ITERATE(TLinkers, l, linkers) {
        SLinker& sl = *l;
        SChainMember* mi = sl.m_member;
        CGeneModel& align = *mi->m_align;
        if(mi->m_unmd_align) {
            // compare against the m_unmd_align version of the alignment
            CGeneModel& unma = *mi->m_unmd_align;
            bool all_introns_included = true;
            for(int i = 1; all_introns_included && i < (int)unma.Exons().size(); ++i) {
                if(unma.Exons()[i-1].m_ssplice && unma.Exons()[i].m_fsplice)
                    all_introns_included = chain_introns.count(TSignedSeqRange(unma.Exons()[i-1].GetTo(),unma.Exons()[i].GetFrom()));
            }
            if(!all_introns_included) {            // protein intron was clipped and not restored or part is not in chain
                sl.m_not_wanted = true;
                if(align.ID() == m_gapped_helper_align.ID())
                    Status() |= CGeneModel::eChangedByFilter;
            }
        } else if(align.Status()&CGeneModel::eChangedByFilter) {   // for proteins could be restored
            sl.m_not_wanted = true;
        } else {
            // compare against the original alignment
            CAlignModel& orig_align = *mi->m_orig_align;
            bool all_introns_included = true;
            for(int i = 1; all_introns_included && i < (int)orig_align.Exons().size(); ++i) {
                if(orig_align.Exons()[i-1].m_ssplice && orig_align.Exons()[i].m_fsplice)
                    all_introns_included = chain_introns.count(TSignedSeqRange(orig_align.Exons()[i-1].GetTo(),orig_align.Exons()[i].GetFrom()));
            }
            if(!all_introns_included) {     // intron was clipped by UTR clip or part is not in chain
                sl.m_not_wanted = true;
                if(align.Type()&eNotForChaining)   // if TSA was clipped remove from support if possible
                    sl.m_value = 10000;
            }
        }
    }

    // With a gapped helper, add member-less linkers (m_value 0): one per continuous
    // piece of the helper, and one bridging every non-spliced gap of the chain, so
    // that a path can pass through the helper and across holes.
    if(m_gapped_helper_align.ID()) {
        int left = m_gapped_helper_align.Limits().GetFrom();
        for(int i = 0; i < (int)m_gapped_helper_align.Exons().size(); ++i) {
            if(!m_gapped_helper_align.Exons()[i].m_ssplice) {
                SLinker sl;
                sl.m_range = TSignedSeqRange(left,m_gapped_helper_align.Exons()[i].GetTo())&m_supported_range;
                sl.m_reading_frame = sl.m_range&ReadingFrame();
                if(sl.m_range.NotEmpty())
                    linkers.push_back(sl);

                if(i+1 < (int)m_gapped_helper_align.Exons().size())
                    left = m_gapped_helper_align.Exons()[i+1].GetFrom();
            }
        }

        for(int i = 1; i < (int)Exons().size(); ++i) {
            if(!Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice) {
                SLinker sl;
                sl.m_range = TSignedSeqRange(Exons()[i-1].GetTo(),Exons()[i].GetFrom());
                sl.m_reading_frame = sl.m_range&ReadingFrame();
                linkers.push_back(sl);
            }
        }
    }

    // Dynamic programming over linkers sorted by range: a linker is "connected" when
    // it starts at the left end of the supported range or can be appended to a
    // connected linker that starts and ends earlier and overlaps or abuts it.
    sort(linkers.begin(), linkers.end());
    for(int i = 0; i < (int)linkers.size(); ++i) {
        SLinker& sli = linkers[i];
        if(sli.m_range.GetFrom() == m_supported_range.GetFrom()) {
            sli.m_count = sli.m_value;
            sli.m_matches_count = sli.m_matches;
            if(sli.m_not_wanted)
                sli.m_not_wanted_count = sli.m_value;
            sli.m_connected = true;
        } else {
            for(int j = i-1; j >= 0; --j) {
                SLinker& slj = linkers[j];
                if(slj.m_connected &&
                   slj.m_range.GetFrom() < sli.m_range.GetFrom() &&
                   slj.m_range.GetTo() < sli.m_range.GetTo() &&
                   slj.m_range.GetTo() >= sli.m_range.GetFrom()-1) {    //overlaps and extends and connected to the left end

                    bool divided_pstop = false;
                    for(int is = 0; is < (int)GetCdsInfo().PStops().size() && !divided_pstop; ++is) {
                        const TSignedSeqRange& s = GetCdsInfo().PStops()[is];
                        divided_pstop = (Include(s,slj.m_range.GetTo()) || Include(s,sli.m_range.GetFrom())) && !Include(slj.m_reading_frame,s) && !Include(sli.m_reading_frame,s);
                    }
                    if(divided_pstop)   // both alignmnets just touch the pstop without actually crossing it
                        continue;

                    // Preference: smaller total value, then smaller not-wanted total,
                    // then larger total of matches.
                    int new_count = slj.m_count + sli.m_value;
                    int new_matches_count = slj.m_matches_count + sli.m_matches;
                    int new_not_wanted_count = slj.m_not_wanted_count;
                    if(sli.m_not_wanted)
                        new_not_wanted_count += sli.m_value;
                    if(!sli.m_connected || new_count < sli.m_count || (new_count == sli.m_count && new_not_wanted_count < sli.m_not_wanted_count) ||
                       (new_count == sli.m_count && new_not_wanted_count == sli.m_not_wanted_count && new_matches_count > sli.m_matches_count)) {
                        sli.m_count = new_count;
                        sli.m_matches_count = new_matches_count;
                        sli.m_not_wanted_count = new_not_wanted_count;
                        sli.m_connected = true;
                        sli.m_left = &slj;
                    }
                }
            }
        }
    }
    // Best connected linker that reaches the right end of the supported range
    // (same preference order as above).
    SLinker* best_right = 0;
    for(int i = 0; i < (int)linkers.size(); ++i) {
        SLinker& sli = linkers[i];
        if(sli.m_connected && sli.m_range.GetTo() == m_supported_range.GetTo()) {
            if(best_right == 0 || sli.m_count < best_right->m_count || (sli.m_count == best_right->m_count && sli.m_not_wanted_count < best_right->m_not_wanted_count) ||
               (sli.m_count == best_right->m_count && sli.m_not_wanted_count == best_right->m_not_wanted_count && sli.m_matches_count > best_right->m_matches_count))
                best_right = &sli;
        }
    }

    _ASSERT(best_right != 0);

    // Core support: IDs of all members on the selected path plus the gapped helper.
    set<Int8> sp_core;
    for(SLinker* l = best_right; l != 0; l = l->m_left) {
        if(l->m_member)
            sp_core.insert(l->m_member->m_align->ID());
    }
    if(m_gapped_helper_align.ID())
        sp_core.insert(m_gapped_helper_align.ID());

    // Not-wanted members outside the core are excluded from support; a not-wanted
    // member inside the core flags the chain as changed by filter.
    set<Int8> sp_not_wanted;
    if(!keep_all_evidence) {
        for(int i = 0; i < (int)linkers.size(); ++i) {
            SLinker& sli = linkers[i];
            if(sli.m_member && sli.m_not_wanted) {
                if(!sp_core.count(sli.m_member->m_align->ID()))
                    sp_not_wanted.insert(sli.m_member->m_align->ID());
                else
                    Status() |= CGeneModel::eChangedByFilter;
            }
        }
    }

    // Rebuild support, evidence type bits, weights and the union of protein
    // reading frames from the members that were kept.
    double weight = 0;
    m_splice_weight = 0;
    set<Int8> sp;
    TSignedSeqRange protreadingframe;
    ReplaceSupport(CSupportInfoSet());

    SetType(Type() & (~(eSR | eEST | emRNA | eProt | eNotForChaining)));
    ITERATE (TContained, it, m_members) {
        const CGeneModel& align = *(*it)->m_align;
        Int8 id = align.ID();
        if(!sp_not_wanted.count(id)) {
            SetType(Type() | (align.Type() & (eSR | eEST | emRNA | eProt | eNotForChaining)));
            protreadingframe += align.GetCdsInfo().ProtReadingFrame();
            m_splice_weight += (*it)->m_splice_weight;
            if(sp.insert(id).second) {   // avoid counting parts of splitted aligns
                weight += align.Weight();
                AddSupport(CSupportInfo(id,sp_core.count(id)));
            }
        }
    }



    // Protein-supported part of the reading frame is limited to the chain's reading
    // frame; the second call sets the full reading frame.
    CCDSInfo cds = GetCdsInfo();
    TSignedSeqRange readingframe = cds.ReadingFrame();
    protreadingframe &= readingframe;
    cds.SetReadingFrame(protreadingframe, true);
    cds.SetReadingFrame(readingframe, false);

    // NOTE(review): the mapped cds_info computed in this scope is never used after
    // the block ends; presumably kept as a consistency check - confirm before removing.
    {
        CAlignMap mrnamap(Exons(),FrameShifts(),Strand());
        CCDSInfo cds_info = cds;
        if(cds_info.IsMappedToGenome())
            cds_info = cds_info.MapFromOrigToEdited(mrnamap);
    }

    SetCdsInfo(cds);

    SetWeight(weight);
}
4602
RestoreTrimmedEnds(int trim)4603 void CChain::RestoreTrimmedEnds(int trim)
4604 {
4605 // add back trimmed off UTRs
4606
4607 if(((Status()&eLeftConfirmed) == 0) && (!OpenLeftEnd() || ReadingFrame().Empty()) && (Strand() == ePlus || (Status()&ePolyA) == 0) && (Strand() == eMinus || (Status()&eCap) == 0)) {
4608 for(int ia = 0; ia < (int)m_members.size(); ++ia) {
4609 const CGeneModel a = *m_members[ia]->m_align;
4610 if((a.Type() & eProt)==0 && (a.Status() & CGeneModel::eLeftTrimmed)!=0 &&
4611 a.Exons().size() > 1 && Exons().front().Limits().GetFrom() == a.Limits().GetFrom()) {
4612 ExtendLeft( trim );
4613 break;
4614 }
4615 }
4616 }
4617
4618 if(((Status()&eRightConfirmed) == 0) && (!OpenRightEnd() || ReadingFrame().Empty()) && (Strand() == eMinus || (Status()&ePolyA) == 0) && (Strand() == ePlus || (Status()&eCap) == 0)) {
4619 for(int ia = 0; ia < (int)m_members.size(); ++ia) {
4620 const CGeneModel a = *m_members[ia]->m_align;
4621 if((a.Type() & eProt)==0 && (a.Status() & CGeneModel::eRightTrimmed)!=0 &&
4622 a.Exons().size() > 1 && Exons().back().Limits().GetTo() == a.Limits().GetTo()) {
4623 ExtendRight( trim );
4624 break;
4625 }
4626 }
4627 }
4628 }
4629
SetOpenForPartialyAlignedProteins(map<string,pair<bool,bool>> & prot_complet)4630 void CChain::SetOpenForPartialyAlignedProteins(map<string, pair<bool,bool> >& prot_complet) {
4631 if(ConfirmedStart() || !HasStart() || !HasStop() || OpenCds() || !Open5primeEnd() || (Type()&CGeneModel::eProt) == 0)
4632 return;
4633
4634 bool found_length_match = false;
4635 ITERATE (TContained, it, m_members) {
4636 CAlignModel* orig_align = (*it)->m_orig_align;
4637 _ASSERT(orig_align);
4638 if((orig_align->Type() & CGeneModel::eProt) == 0 || orig_align->TargetLen() == 0) // not a protein or not known length
4639 continue;
4640
4641 string accession = orig_align->TargetAccession();
4642 map<string, pair<bool,bool> >::iterator iter = prot_complet.find(accession);
4643 _ASSERT(iter != prot_complet.end());
4644 if(iter == prot_complet.end() || !iter->second.first || !iter->second.second) // unknown or partial protein
4645 continue;
4646
4647 if(orig_align->TargetLen()*0.8 < RealCdsLen()) {
4648 found_length_match = true;
4649 break;
4650 }
4651 }
4652
4653 if(!found_length_match) {
4654 CCDSInfo cds_info = GetCdsInfo();
4655 cds_info.SetScore(Score(), true);
4656 SetCdsInfo(cds_info);
4657 }
4658
4659 return;
4660 }
4661
// Tries to replace the chain's CDS start with a confirmed start taken from an
// alignment, and rescoring the chain if one is found.
//
//   gnomon      - engine used to rescore the chain after the CDS was changed
//   orig_aligns - all original alignments, keyed by ID
//
// Search order:
//   1. any alignment in orig_aligns on the same strand with a confirmed start,
//      trusted protein/mRNA evidence and eBestPlacement, whose start codon fits the
//      chain (same exon structure inside the codon, compatible with the chain's
//      frame, more than 75 bases of CDS after it);
//   2. if none, the original alignments of the chain members that have a confirmed
//      start inside the member's limits.
// The most upstream qualifying start wins. The start is applied only when the
// protein-supported reading frame upstream of it is empty or shorter than 20% of
// the real CDS length.
void CChain::RestoreReasonableConfirmedStart(const CGnomonEngine& gnomon, TOrigAligns& orig_aligns)
{
    //    if(ReadingFrame().Empty() || ConfirmedStart())
    if(ReadingFrame().Empty())
        return;

    TSignedSeqRange conf_start;     // chosen start codon (genomic), empty if none yet
    TSignedSeqPos rf=0;             // new 5' boundary of the reading frame matching conf_start
    bool trusted = false;           // set when the start came from step 1; passed to GetScore()

    CAlignMap amap = GetAlignMap();
    ITERATE(TOrigAligns, it, orig_aligns) {
        const CAlignModel& align = *it->second;
        if(align.Strand() != Strand() || !align.ConfirmedStart() || (align.TrustedProt().empty() && align.TrustedmRNA().empty()) || !(align.Status()&CGeneModel::eBestPlacement))
            continue;

        TSignedSeqRange start = align.GetCdsInfo().Start();

        // both ends of the start codon must map to the chain's transcript, 2 positions apart
        int a = amap.MapOrigToEdited(start.GetFrom());
        int b = amap.MapOrigToEdited(start.GetTo());
        if(a < 0 || b < 0 || abs(a-b) != 2)
            continue;

        // the start must lie within the maximal CDS limits and satisfy the
        // length%3 == 1 condition relative to the left end of the current CDS
        int l = GetCdsInfo().Cds().GetFrom();
        int r = start.GetFrom();
        if(l > r)
            swap(l,r);
        if(!Include(GetCdsInfo().MaxCdsLimits(),start) || amap.FShiftedLen(l,r)%3 != 1)
            continue;

        // introns of the alignment that fall inside the start codon range
        // (stored as last exon base .. first base of next exon)
        list<TSignedSeqRange> align_introns;
        for(int i = 1; i < (int)align.Exons().size(); ++i) {
            TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
            if(Include(start,intron))
                align_introns.push_back(intron);
        }

        // same for the chain; also track the remaining codon length and holes
        list<TSignedSeqRange> introns;
        bool hole = false;
        int len = start.GetLength();
        for(int i = 1; i < (int)Exons().size(); ++i) {
            TSignedSeqRange intron(Exons()[i-1].Limits().GetTo(),Exons()[i].Limits().GetFrom());
            if(Include(start,intron)) {
                introns.push_back(intron);
                // NOTE(review): `intron` includes the two flanking exon bases, so the
                // intron interior is GetLength()-2 long; subtracting GetLength()+2
                // looks like it removes 4 bases too many - confirm intended arithmetic.
                len -= intron.GetLength()+2;
                if(!Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice)
                    hole = true;
            }
        }

        if(len !=3 || hole || align_introns != introns)
            continue;

        if(Strand() == ePlus) {
            // plus strand: prefer the leftmost start
            if(conf_start.Empty() || start.GetFrom() < conf_start.GetFrom()) {
                // rf = first chain position after the start codon (same exon or next exon)
                bool found = false;
                for(int i = 0; i < (int)Exons().size() && !found; ++i) {
                    if(Include(Exons()[i].Limits(),start.GetTo())) {
                        if(Exons()[i].Limits().GetTo() > start.GetTo()) {
                            rf = start.GetTo()+1;
                            found = true;
                        } else if(i != (int)Exons().size()-1) {
                            rf = Exons()[i+1].Limits().GetFrom();
                            found = true;
                        }
                    }
                }

                if(found && amap.FShiftedLen(rf,GetCdsInfo().Cds().GetTo()) > 75) {
                    conf_start = start;
                    trusted = true;
                }
            }
        } else {
            // minus strand: prefer the rightmost start
            if(conf_start.Empty() || start.GetTo() > conf_start.GetTo()) {
                // rf = last chain position before the start codon (same exon or previous exon)
                bool found = false;
                for(int i = 0; i < (int)Exons().size() && !found; ++i) {
                    if(Include(Exons()[i].Limits(),start.GetFrom())) {
                        if(Exons()[i].Limits().GetFrom() < start.GetFrom()) {
                            rf = start.GetFrom()-1;
                            found = true;
                        } else if(i != 0) {
                            rf = Exons()[i-1].Limits().GetTo();
                            found = true;
                        }
                    }
                }

                if(found && amap.FShiftedLen(GetCdsInfo().Cds().GetFrom(),rf) > 75) {
                    conf_start = start;
                    trusted = true;
                }
            }
        }
    }


    // Fallback: confirmed starts of the members' own original alignments.
    if(conf_start.Empty()) {
        ITERATE (TContained, it, m_members) {
            CAlignModel* orig_align = (*it)->m_orig_align;
            _ASSERT(orig_align);

            if(orig_align->ConfirmedStart() && Include((*it)->m_align->Limits(),orig_align->GetCdsInfo().Start())) {    // right part of orig is included
                TSignedSeqRange start = orig_align->GetCdsInfo().Start();
                int l = GetCdsInfo().Cds().GetFrom();
                int r = start.GetFrom();
                if(l > r)
                    swap(l,r);
                if(!Include(GetCdsInfo().MaxCdsLimits(),start) || amap.FShiftedLen(l,r)%3 != 1)  // orig_align could be dropped beacause it was modified and have frameshifts between its start and 'best' start
                    continue;

                if(Strand() == ePlus) {
                    if(conf_start.Empty() || start.GetFrom() < conf_start.GetFrom()) {
                        conf_start = start;
                        rf = orig_align->ReadingFrame().GetFrom();
                    }
                } else {
                    if(conf_start.Empty() || start.GetTo() > conf_start.GetTo()) {
                        conf_start = start;
                        rf = orig_align->ReadingFrame().GetTo();
                    }
                }
            }
        }
    }


    if(conf_start.NotEmpty()) {
        // extra_cds: protein-supported reading frame that lies upstream of the new start
        TSignedSeqRange extra_cds;
        CCDSInfo cds = GetCdsInfo();
        if(cds.ProtReadingFrame().NotEmpty()) {
            if(Strand() == ePlus && cds.ProtReadingFrame().GetFrom() < conf_start.GetFrom())
                extra_cds = TSignedSeqRange(cds.ProtReadingFrame().GetFrom(), conf_start.GetFrom());
            else if(Strand() == eMinus && cds.ProtReadingFrame().GetTo() > conf_start.GetTo())
                extra_cds = TSignedSeqRange(conf_start.GetTo(), cds.ProtReadingFrame().GetTo());
        }
        if(extra_cds.Empty() || FShiftedLen(extra_cds) < 0.2*RealCdsLen()) {
            // Rebuild the CDS info from scratch: new reading frame starting at rf,
            // confirmed start, the old stop, and only the pstops still inside the frame.
            TSignedSeqRange reading_frame = cds.ReadingFrame();
            if(Strand() == ePlus)
                reading_frame.SetFrom(rf);
            else
                reading_frame.SetTo(rf);
            TSignedSeqRange protreadingframe = cds.ProtReadingFrame();
            TSignedSeqRange stop = cds.Stop();
            bool confirmed_stop = cds.ConfirmedStop();
            CCDSInfo::TPStops pstops = cds.PStops();
            cds.Clear();

            if(protreadingframe.NotEmpty())
                cds.SetReadingFrame(reading_frame&protreadingframe, true);
            cds.SetReadingFrame(reading_frame);
            cds.SetStart(conf_start,true);
            if(stop.NotEmpty())
                cds.SetStop(stop,confirmed_stop);
            ITERATE(CCDSInfo::TPStops, s, pstops) {
                if(Include(reading_frame, *s))
                    cds.AddPStop(*s);
            }
            SetCdsInfo(cds);

            // Cut the chain at the holes closest to the reading frame on either side,
            // so that no hole remains outside the reading frame.
            TSignedSeqRange new_lim = Limits();
            for(int i = 1; i < (int)Exons().size(); ++i) {
                if(!Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice) {
                    TSignedSeqRange hole(Exons()[i-1].GetTo(),Exons()[i].GetFrom());
                    if(Precede(hole,reading_frame)) {
                        new_lim.SetFrom(hole.GetTo());
                    } else if(Precede(reading_frame,hole)) {
                        new_lim.SetTo(hole.GetFrom());
                        break;
                    }
                }
            }
            if(new_lim != Limits())
                ClipChain(new_lim);   // remove holes from new UTRs

            gnomon.GetScore(*this, false, trusted);
            RemoveFshiftsFromUTRs();
            AddComment("Restored confirmed start");
        }
    }
}
4843
RemoveFshiftsFromUTRs()4844 void CChain::RemoveFshiftsFromUTRs()
4845 {
4846 TInDels fs;
4847 ITERATE(TInDels, i, FrameShifts()) { // removing fshifts in UTRs
4848 TSignedSeqRange cds = GetCdsInfo().Cds();
4849 if(OpenCds())
4850 cds = MaxCdsLimits();
4851 if(Include(cds,i->Loc()))
4852 fs.push_back(*i);
4853 }
4854 if(FrameShifts().size() != fs.size()) {
4855 FrameShifts() = fs;
4856 int mrna_len = AlignLen();
4857 m_coverage.resize(mrna_len, m_coverage.back()); // this will slightly shift values compared to recalculation from scratch but will keep better ends
4858 }
4859 }
4860
4861
// Clips the chain to the genomic range `limits`.
//
// `limits` must lie inside the current Limits() and, if the chain has a real CDS,
// must contain it (asserted). Members that no longer intersect the new limits are
// dropped, exons and the coverage profile are cut, the CDS info is adjusted
// (5' limit, pstops), m_supported_range is narrowed and support/weight are
// recalculated from the remaining members.
void CChain::ClipChain(TSignedSeqRange limits) {

    _ASSERT(Include(Limits(),limits) && (RealCdsLimits().Empty() || Include(limits,RealCdsLimits())));

    // transcript coordinates of the new limits, taken before any exon is cut
    TSignedSeqRange limits_on_mrna = GetAlignMap().MapRangeOrigToEdited(limits,false);
    _ASSERT(limits_on_mrna.NotEmpty());

    // keep only members that still intersect the clipped chain
    TContained new_members;
    ITERATE (TContained, it, m_members) {
        auto ai = (*it)->m_align;
        TSignedSeqRange alimits = ai->Limits();
        if(limits.IntersectingWith(alimits))  // not clipped
            new_members.push_back(*it);
    }
    m_members = new_members;

    // cut exons on each side; Limits() is re-evaluated after each cut
    if(limits.GetFrom() > Limits().GetFrom()) {
        TSignedSeqRange clip_range(Limits().GetFrom(),limits.GetFrom()-1);
        CutExons(clip_range);
        RecalculateLimits();
    }
    if(limits.GetTo() < Limits().GetTo()) {
        TSignedSeqRange clip_range(limits.GetTo()+1,Limits().GetTo());
        CutExons(clip_range);
        RecalculateLimits();
    }

    // trim the coverage profile to the retained transcript interval
    if(limits_on_mrna.GetFrom() > 0)
        m_coverage.erase(m_coverage.begin(),m_coverage.begin()+limits_on_mrna.GetFrom());
    m_coverage.resize(limits_on_mrna.GetLength());

    if(RealCdsLimits().NotEmpty()) {
        CCDSInfo cds = GetCdsInfo();
        bool changed = false;
        // maximal CDS limits reaching beyond the new 5' end are no longer valid
        if((Strand() == ePlus && cds.MaxCdsLimits().GetFrom() < Limits().GetFrom()) ||
           (Strand() == eMinus && cds.MaxCdsLimits().GetTo() > Limits().GetTo())) {
            cds.Clear5PrimeCdsLimit();
            changed = true;
        }
        // drop pstops that are not fully inside the new limits
        if(cds.PStop()) {
            CCDSInfo::TPStops pstops;
            for(auto& pstop : cds.PStops()) {
                if(Include(limits, pstop))
                    pstops.push_back(pstop);
            }
            if(pstops.size() != cds.PStops().size()) {
                cds.ClearPStops();
                for(auto& pstop : pstops)
                    cds.AddPStop(pstop);
                changed = true;
            }
        }

        if(changed)
            SetCdsInfo(cds);
    }

    // the supported range can only shrink
    if(limits.GetFrom() > m_supported_range.GetFrom())
        m_supported_range.SetFrom(limits.GetFrom());
    if(limits.GetTo() < m_supported_range.GetTo())
        m_supported_range.SetTo(limits.GetTo());

    CalculateSupportAndWeightFromMembers();
}
4926
SetConfirmedEnds(const CGnomonEngine & gnomon,CGnomonAnnotator_Base::TIntMap & confirmed_ends)4927 bool CChain::SetConfirmedEnds(const CGnomonEngine& gnomon, CGnomonAnnotator_Base::TIntMap& confirmed_ends) {
4928 if(Exons().size() < 2)
4929 return true;
4930
4931 auto old_limits = Limits();
4932 auto new_limits = old_limits;
4933 bool left_confirmed = false;
4934 bool right_confirmed = false;
4935
4936 auto rslt = confirmed_ends.find(Exons().front().GetTo());
4937 if(rslt != confirmed_ends.end() && rslt->second < Exons().front().GetTo()) {
4938 left_confirmed = true;
4939 new_limits.SetFrom(rslt->second);
4940 }
4941 rslt = confirmed_ends.find(Exons().back().GetFrom());
4942 if(rslt != confirmed_ends.end() && rslt->second > Exons().back().GetFrom()) {
4943 right_confirmed = true;
4944 new_limits.SetTo(rslt->second);
4945 }
4946
4947 if(!left_confirmed && !right_confirmed)
4948 return true;
4949 else if(!Continuous())
4950 return false;
4951
4952 CCDSInfo cds_info = GetCdsInfo();
4953 bool left_complete = LeftComplete(); // has start/stop on left
4954 bool right_complete = RightComplete(); // has start/stop on right
4955
4956 SetCdsInfo(CCDSInfo()); //we will deal with CDS separately
4957
4958 //extend chain
4959 if(new_limits.GetFrom() < old_limits.GetFrom()) {
4960 int delta = old_limits.GetFrom()-new_limits.GetFrom();
4961 ExtendLeft(delta);
4962 m_coverage.insert(m_coverage.begin(), delta, 0);
4963 }
4964 if(new_limits.GetTo() > old_limits.GetTo()) {
4965 int delta = new_limits.GetTo()-old_limits.GetTo();
4966 ExtendRight(delta);
4967 m_coverage.insert(m_coverage.end(), delta, 0);
4968 }
4969
4970 CAlignMap amap = GetAlignMap(); //includes extended ends and keeps clipped ends
4971
4972 { // removing fshifts outside of clip
4973 TInDels fs;
4974 ITERATE(TInDels, i, FrameShifts()) {
4975 if(i->Loc() > new_limits.GetFrom() && i->InDelEnd() < new_limits.GetTo())
4976 fs.push_back(*i);
4977 }
4978
4979 if(FrameShifts().size() != fs.size()) {
4980 FrameShifts() = fs;
4981 int mrna_len = AlignLen();
4982 m_coverage.resize(mrna_len, m_coverage.back()); // this will slightly shift values compared to recalculation from scratch but will keep better ends
4983 }
4984 }
4985
4986 //clip chain
4987 if(Limits() != new_limits)
4988 ClipChain(new_limits);
4989
4990 //set limits
4991 m_polya_cap_left_soft_limit = max(m_polya_cap_left_soft_limit, new_limits.GetFrom());
4992 m_polya_cap_right_soft_limit = min(m_polya_cap_right_soft_limit, new_limits.GetTo());
4993
4994 //set status
4995 if(left_confirmed) {
4996 Status() |= eLeftConfirmed;
4997 if(new_limits.GetFrom() < old_limits.GetFrom())
4998 AddComment("Extended to confirmed left");
4999 else if(new_limits.GetFrom() > old_limits.GetFrom())
5000 AddComment("Clipped to confirmed left");
5001 }
5002 if(right_confirmed) {
5003 Status() |= eRightConfirmed;
5004 if(new_limits.GetTo() > old_limits.GetTo())
5005 AddComment("Extended to confirmed right");
5006 else if(new_limits.GetTo() < old_limits.GetTo())
5007 AddComment("Clipped to confirmed right");
5008 }
5009
5010 if(cds_info.ReadingFrame().Empty()) //non coding chain
5011 return true;
5012
5013 if(!Include(new_limits, cds_info.Cds()) || (left_confirmed && !left_complete) || (right_confirmed && !right_complete)) { //CDS may need clipping to expose startstop
5014 auto cds_info_t = cds_info.MapFromOrigToEdited(amap);
5015 int frame = cds_info_t.ReadingFrame().GetFrom()%3;
5016
5017 //project new_limits to transcript and align to frame
5018 auto cds_limits_t = amap.ShrinkToRealPoints(new_limits);
5019 cds_limits_t = amap.MapRangeOrigToEdited(cds_limits_t, CAlignMap::eSinglePoint, CAlignMap::eSinglePoint);
5020 for(int i = cds_limits_t.GetFrom(); i <= cds_limits_t.GetTo(); ++i) {
5021 cds_limits_t.SetFrom(i);
5022 if(i%3 == frame && amap.MapEditedToOrig(i) >= 0)
5023 break;
5024 }
5025 for(int i = cds_limits_t.GetTo(); i >= cds_limits_t.GetFrom(); --i) {
5026 cds_limits_t.SetTo(i);
5027 if((i+1)%3 == frame && amap.MapEditedToOrig(i) >= 0)
5028 break;
5029 }
5030 if(cds_limits_t.Empty())
5031 return false;
5032 cds_info_t.Clip(cds_limits_t); // remove extra CDS
5033
5034
5035 bool fivep_confirmed = (Strand() == ePlus) ? left_confirmed : right_confirmed;
5036 bool threep_confirmed = (Strand() == ePlus) ? right_confirmed : left_confirmed;
5037 bool has_start = cds_info_t.HasStart();
5038 bool has_stop = cds_info_t.HasStop();
5039 auto prot_rf = cds_info_t.ProtReadingFrame();
5040 if(prot_rf.NotEmpty() && ((fivep_confirmed && !has_start) || (threep_confirmed && !has_stop))) { //CDS may need some additional clipping to expose starts/stops
5041 const CResidueVec& contig = gnomon.GetSeq();
5042 CResidueVec mrna;
5043 amap.EditedSequence(contig, mrna);
5044
5045 auto IndelInCodon = [this](int i, CAlignMap& map) {
5046 int a = map.MapEditedToOrig(i);
5047 int b = map.MapEditedToOrig(i+2);
5048 if(Strand() == eMinus)
5049 swap(a, b);
5050 return (a < 0 || b < 0 || map.MapEditedToOrig(i+1) < 0 || !GetInDels(a, b, false).empty()); // genomic indels inside, if true
5051 };
5052
5053 if(fivep_confirmed && !has_start) {
5054 for(int i = prot_rf.GetFrom(); !has_start && i >= cds_limits_t.GetFrom() && !IsStopCodon(&mrna[i]); i =- 3) { //find start outside protein (no clip will be needed)
5055 has_start = IsStartCodon(&mrna[i]) && !IndelInCodon(i, amap);
5056 }
5057 for(int i = prot_rf.GetFrom(); !has_start && i < cds_limits_t.GetTo(); i += 3) { //find start inside protein (clip will be needed)
5058 if(i > prot_rf.GetTo() && IsStopCodon(&mrna[i]))
5059 break;
5060 has_start = IsStartCodon(&mrna[i]) && !IndelInCodon(i, amap);
5061 cds_limits_t.SetFrom(i);
5062 }
5063 if(!has_start)
5064 return false;
5065 }
5066 if(threep_confirmed && !has_stop) {
5067 for(int i = prot_rf.GetTo()+1; !has_stop && i < cds_limits_t.GetTo(); i += 3) //find stop outside protein (no clip will be needed)
5068 has_stop = IsStopCodon(&mrna[i]) && !IndelInCodon(i, amap);
5069 if(!has_stop && cds_info_t.PStop(false)) { //find stop inside protein (clip will be needed)
5070 CCDSInfo::TPStops pstops = cds_info_t.PStops();
5071 sort(pstops.begin(), pstops.end());
5072 for(auto& stp : pstops) {
5073 if(stp.m_status != CCDSInfo::eGenomeNotCorrect && stp.m_status != CCDSInfo::eSelenocysteine && !IndelInCodon(stp.GetFrom(), amap)) {
5074 has_stop = true;
5075 cds_limits_t.SetTo(stp.GetFrom()-1);
5076 }
5077 }
5078 if(!has_stop)
5079 return false;
5080 }
5081 }
5082
5083 if(cds_limits_t.Empty())
5084 return false;
5085
5086 cds_info_t.Clip(cds_limits_t);
5087 }
5088
5089 cds_info = cds_info_t.MapFromEditedToOrig(amap);
5090 }
5091
5092 { // removing fshifts in UTRs
5093 auto cds = cds_info.Cds();
5094 TInDels fs;
5095 ITERATE(TInDels, i, FrameShifts()) {
5096 if(Include(cds, i->Loc()))
5097 fs.push_back(*i);
5098 }
5099
5100 if(FrameShifts().size() != fs.size()) {
5101 FrameShifts() = fs;
5102 int mrna_len = AlignLen();
5103 m_coverage.resize(mrna_len, m_coverage.back()); // this will slightly shift values compared to recalculation from scratch but will keep better ends
5104 }
5105 }
5106
5107 SetCdsInfo(cds_info);
5108
5109 return true;
5110 }
5111
5112 // valid, found As
ValidPolyA(int pos,const CResidueVec & contig)5113 pair<bool, bool> CChain::ValidPolyA(int pos, const CResidueVec& contig) {
5114 string motif1 = "AATAAA";
5115 string motif2 = "ATTAAA";
5116 string motif3 = "AGTAAA";
5117 int block_of_As_len = 6;
5118 CResidueVec block_of_As;
5119 if(Strand() == ePlus)
5120 block_of_As.assign(block_of_As_len, 'A');
5121 else
5122 block_of_As.assign(block_of_As_len, 'T');
5123
5124 int a = max(0, pos-block_of_As_len);
5125 int b = min((int)contig.size()-1, pos+block_of_As_len);
5126 if(b-a+1 < block_of_As_len)
5127 return make_pair(false, false);
5128 if(search(contig.begin()+a, contig.begin()+b+1, block_of_As.begin(), block_of_As.end()) != contig.begin()+b+1) { // found As
5129 int left;
5130 int right;
5131 if(Strand() == ePlus) {
5132 left = pos-35;
5133 right = pos-18;
5134 } else {
5135 left = pos+18;
5136 right = pos+35;
5137 }
5138 if(left < 0 || right >= (int)contig.size())
5139 return make_pair(false, false);
5140
5141 string segment(contig.begin()+left, contig.begin()+right+1);
5142 if(Strand() == eMinus)
5143 ReverseComplement(segment.begin(), segment.end());
5144
5145 if(segment.find(motif1) != string::npos || segment.find(motif2) != string::npos || segment.find(motif3) != string::npos)
5146 return make_pair(true, true);
5147 else
5148 return make_pair(false, true);
5149 } else {
5150 return make_pair(true, false);
5151 }
5152 }
5153
5154 #define MIN_UTR_EXON 15
5155
PeaksAndLimits(EStatus determinant,int min_blob_weight,int max_empty_dist,int min_splice_dist)5156 tuple<CChain::TIDMap, TSignedSeqRange> CChain::PeaksAndLimits(EStatus determinant, int min_blob_weight, int max_empty_dist, int min_splice_dist) {
5157 bool right_end = (determinant == ePolyA && Strand() == ePlus) || (determinant == eCap && Strand() == eMinus); // determinant is on the right gene side
5158 bool coding = ReadingFrame().NotEmpty();
5159
5160 TIDMap peak_weights;
5161 TSignedSeqRange real_limits;
5162
5163 int flex_len = 0;
5164 TIDMap raw_weights;
5165 for(auto& mi : m_members) {
5166 const CGeneModel& align = *mi->m_align;
5167 if(align.Status()&determinant) {
5168 if(right_end) {
5169 int rlimit = (coding ? RealCdsLimits().GetTo() : Exons().back().Limits().GetFrom()); // look in the last exon of notcoding or right UTR of coding
5170 bool belong_to_exon = false;
5171 int pos = align.Limits().GetTo();
5172 for(auto& exon : Exons()) {
5173 if(pos >= exon.Limits().GetFrom()+min_splice_dist && pos <= exon.Limits().GetTo()) {
5174 belong_to_exon = true;
5175 break;
5176 }
5177 }
5178 if(rlimit < pos && belong_to_exon)
5179 raw_weights[align.Limits().GetTo()] += align.Weight();
5180 } else {
5181 int llimit = (coding ? RealCdsLimits().GetFrom() : Exons().front().Limits().GetTo()); // look in the first exon of notcoding or left UTR of coding
5182 bool belong_to_exon = false;
5183 int pos = align.Limits().GetFrom();
5184 for(auto& exon : Exons()) {
5185 if(pos >= exon.Limits().GetFrom() && pos <= exon.Limits().GetTo()-min_splice_dist) {
5186 belong_to_exon = true;
5187 break;
5188 }
5189 }
5190 if(llimit > pos && belong_to_exon)
5191 raw_weights[-align.Limits().GetFrom()] += align.Weight(); // negative position, so the map is in convinient order
5192 }
5193 }
5194 if(align.Status()&(eLeftFlexible|eRightFlexible))
5195 flex_len = max(flex_len, align.Limits().GetLength());
5196 else
5197 real_limits += (align.Limits()&Limits());
5198 }
5199 if(raw_weights.empty())
5200 return make_tuple(peak_weights,real_limits);
5201
5202 int last_allowed = right_end ? real_limits.GetTo()+flex_len : -(real_limits.GetFrom()-flex_len);
5203 auto ipeak = raw_weights.begin();
5204 double w = ipeak->second;
5205 for(auto it = next(raw_weights.begin()); it != raw_weights.end(); ++it) {
5206 if(it->first > prev(it)->first+1+max_empty_dist) { // next blob
5207 if(ipeak->first > last_allowed)
5208 break;
5209 if(w >= min_blob_weight) {
5210 auto still_good = ipeak;
5211 for(auto i = ipeak; i != it && i->first <= last_allowed; ++i) { // shift position to furthest 50% within blob
5212 if(i->second >= 0.5*ipeak->second)
5213 still_good = i;
5214 }
5215 peak_weights.emplace(still_good->first, w); // peak position, blob weight
5216 }
5217 ipeak = it;
5218 w = it->second;
5219 } else {
5220 w += it->second;
5221 if(it->second > ipeak->second) // new peak position; first for equals
5222 ipeak = it;
5223 }
5224 }
5225 if(ipeak->first <= last_allowed && w >= min_blob_weight) { // last peak
5226 auto still_good = ipeak;
5227 for(auto i = ipeak; i != raw_weights.end() && i->first <= last_allowed; ++i) { // shift position to furthest 50% within blob
5228 if(i->second >= 0.5*ipeak->second)
5229 still_good = i;
5230 }
5231 peak_weights.emplace(still_good->first, w); // peak position, blob weight
5232 }
5233
5234 return make_tuple(peak_weights,real_limits);
5235 }
5236
MainPeaks(TIDMap & peak_weights,double secondary_peak,double tertiary_peak,double tertiary_peak_coverage,bool right_end)5237 tuple<TIVec, TSignedSeqRange> CChain::MainPeaks(TIDMap& peak_weights, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage, bool right_end) {
5238 TIVec peaks(3, -1);
5239 auto limits = Limits();
5240 auto ifirst_peak = max_element(peak_weights.begin(), peak_weights.end(), [](const TIDMap::value_type& a, const TIDMap::value_type& b) { return a.second < b.second; });
5241 peaks[0] = abs(ifirst_peak->first);
5242 if(right_end) {
5243 int first_peak = ifirst_peak->first;
5244 limits.SetTo(first_peak);
5245 m_polya_cap_right_soft_limit = first_peak;
5246 } else {
5247 int first_peak = -ifirst_peak->first;
5248 limits.SetFrom(first_peak);
5249 m_polya_cap_left_soft_limit = first_peak;
5250 }
5251 auto isecond_peak = prev(peak_weights.end());
5252 for( ; isecond_peak != ifirst_peak && isecond_peak->second < secondary_peak*ifirst_peak->second; --isecond_peak);
5253 if(isecond_peak != ifirst_peak)
5254 peaks[1] = abs(isecond_peak->first);
5255
5256 if(tertiary_peak > 0) {
5257 CAlignMap amap = GetAlignMap();
5258 TSignedSeqRange genome_core_lim = RealCdsLimits();
5259 if(genome_core_lim.Empty()) {
5260 genome_core_lim = Limits();
5261 if(Exons().size() > 1) {
5262 if(Exons().front().Limits().GetLength() >= MIN_UTR_EXON)
5263 genome_core_lim.SetFrom(Exons().front().Limits().GetTo()-MIN_UTR_EXON+1);
5264 if(Exons().back().Limits().GetLength() >= MIN_UTR_EXON)
5265 genome_core_lim.SetTo(Exons().back().Limits().GetFrom()+MIN_UTR_EXON-1);
5266 }
5267 }
5268 genome_core_lim = amap.ShrinkToRealPoints(genome_core_lim);
5269 TSignedSeqRange core_lim = amap.MapRangeOrigToEdited(genome_core_lim);
5270 double core_coverage = 0;
5271 for (int i = core_lim.GetFrom(); i <= core_lim.GetTo(); ++i) {
5272 core_coverage += m_coverage[i];
5273 }
5274 core_coverage /= core_lim.GetLength();
5275
5276 TSignedSeqRange fpeak_exon;
5277 for(auto& exon : Exons()) {
5278 if(Include(exon.Limits(), abs(ifirst_peak->first))) {
5279 fpeak_exon = exon.Limits();
5280 break;
5281 }
5282 }
5283
5284 auto ithird_peak = prev(peak_weights.end());
5285 for( ; ithird_peak != isecond_peak; --ithird_peak) {
5286 if(Include(fpeak_exon, abs(ithird_peak->first))) {
5287 int p = amap.MapOrigToEdited(abs(ithird_peak->first));
5288 if(p < 0)
5289 continue;
5290 if(ithird_peak->second >= tertiary_peak*ifirst_peak->second && m_coverage[p] > tertiary_peak_coverage*core_coverage)
5291 break;
5292 }
5293 }
5294 if(ithird_peak != isecond_peak)
5295 peaks[2] = abs(ithird_peak->first);
5296 isecond_peak = ithird_peak;
5297 }
5298
5299 if(isecond_peak != ifirst_peak) {
5300 if(right_end) {
5301 int second_peak = isecond_peak->first;
5302 limits.SetTo(second_peak);
5303 } else {
5304 int second_peak = -isecond_peak->first;
5305 limits.SetFrom(second_peak);
5306 }
5307 }
5308
5309 return make_tuple(peaks, limits);
5310 }
5311
ClipToCap(int min_cap_blob,int max_dist,int min_flank_exon,double secondary_peak)5312 void CChain::ClipToCap(int min_cap_blob, int max_dist, int min_flank_exon, double secondary_peak) {
5313 bool right_end = Strand() == eMinus; // cap is on the right gene side
5314 if((Status()&eLeftConfirmed) && !right_end)
5315 return;
5316 if((Status()&eRightConfirmed) && right_end)
5317 return;
5318
5319 bool coding = ReadingFrame().NotEmpty();
5320 if(!HasStart() && coding)
5321 return;
5322
5323 auto rslt = PeaksAndLimits(eCap, min_cap_blob, max_dist, min_flank_exon);
5324 TIDMap& peak_weights(get<0>(rslt));
5325 TSignedSeqRange real_limits(get<1>(rslt));
5326
5327 if(peak_weights.empty()) {
5328 TSignedSeqRange limits = Limits();
5329 Status() &= ~eCap;
5330 if(right_end && real_limits.GetTo() < Limits().GetTo())
5331 limits.SetTo(real_limits.GetTo());
5332 else if(!right_end && real_limits.GetFrom() > Limits().GetFrom())
5333 limits.SetFrom(real_limits.GetFrom());
5334
5335 if (limits != Limits()) {
5336 if(!coding || Include(limits,RealCdsLimits())) {
5337 AddComment("capsupressed");
5338 ClipChain(limits);
5339 } else {
5340 AddComment("capoverlapcds");
5341 }
5342 }
5343
5344 if(right_end)
5345 m_polya_cap_right_soft_limit = Limits().GetFrom()-1;
5346 else
5347 m_polya_cap_left_soft_limit = Limits().GetTo()+1;
5348
5349 return;
5350 }
5351
5352 Status() |= eCap;
5353 auto rslt1 = MainPeaks(peak_weights, secondary_peak, 0., 0., right_end);
5354 m_cap_peaks = get<0>(rslt1);
5355 TSignedSeqRange limits = get<1>(rslt1);
5356
5357 if (limits != Limits()) {
5358 AddComment("capclip");
5359 ClipChain(limits);
5360 }
5361 }
ClipToPolyA(const CResidueVec & contig,int min_polya_blob,int max_dist,int min_flank_exon,double secondary_peak,double tertiary_peak,double tertiary_peak_coverage)5362 void CChain::ClipToPolyA(const CResidueVec& contig, int min_polya_blob, int max_dist, int min_flank_exon, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage) {
5363 bool right_end = Strand() == ePlus; // polya is on the right gene side
5364 if((Status()&eLeftConfirmed) && !right_end)
5365 return;
5366 if((Status()&eRightConfirmed) && right_end)
5367 return;
5368
5369 bool coding = ReadingFrame().NotEmpty();
5370 if(!HasStop() && coding)
5371 return;
5372
5373 auto rslt = PeaksAndLimits(ePolyA, min_polya_blob, max_dist, min_flank_exon);
5374 TIDMap& peak_weights(get<0>(rslt));
5375 TSignedSeqRange real_limits(get<1>(rslt));
5376 //check for As
5377 for(auto ip_loop = peak_weights.begin(); ip_loop != peak_weights.end(); ) {
5378 auto ip = ip_loop++;
5379 if(!ValidPolyA(abs(ip->first), contig).first)
5380 peak_weights.erase(ip);
5381 }
5382
5383 if(peak_weights.empty()) {
5384 TSignedSeqRange limits = Limits();
5385 Status() &= ~ePolyA;
5386 if(right_end && real_limits.GetTo() < Limits().GetTo())
5387 limits.SetTo(real_limits.GetTo());
5388 else if(!right_end && real_limits.GetFrom() > Limits().GetFrom())
5389 limits.SetFrom(real_limits.GetFrom());
5390
5391 if (limits != Limits()) {
5392 if(!coding || Include(limits,RealCdsLimits())) {
5393 AddComment("polyasupressed");
5394 ClipChain(limits);
5395 } else {
5396 AddComment("polyaoverlapcds");
5397 }
5398 }
5399
5400 if(right_end)
5401 m_polya_cap_right_soft_limit = Limits().GetFrom()-1;
5402 else
5403 m_polya_cap_left_soft_limit = Limits().GetTo()+1;
5404
5405 return;
5406 }
5407
5408 Status() |= ePolyA;
5409 auto rslt1 = MainPeaks(peak_weights, secondary_peak, tertiary_peak, tertiary_peak_coverage, right_end);
5410 m_polya_peaks = get<0>(rslt1);
5411 TSignedSeqRange limits = get<1>(rslt1);
5412
5413 if (limits != Limits()) {
5414 AddComment("polyaclip");
5415 ClipChain(limits);
5416 }
5417 }
5418
CheckSecondaryCapPolyAEnds()5419 void CChain::CheckSecondaryCapPolyAEnds() {
5420 if(m_polya_cap_left_soft_limit < Limits().GetTo() && Include(RealCdsLimits(), m_polya_cap_left_soft_limit))
5421 m_polya_cap_left_soft_limit = Limits().GetFrom();
5422
5423 if(m_polya_cap_right_soft_limit > Limits().GetFrom() && Include(RealCdsLimits(), m_polya_cap_right_soft_limit))
5424 m_polya_cap_right_soft_limit = Limits().GetTo();
5425 }
5426
5427 #define COVERAGE_DROP 0.1
5428 #define COVERAGE_BUMP 3
5429 #define SMALL_GAP_UTR 100
5430
ClipLowCoverageUTR(double utr_clip_threshold)5431 void CChain::ClipLowCoverageUTR(double utr_clip_threshold)
5432 {
5433 if((Type()&CGeneModel::eSR) == 0) // don't have SR coverage
5434 return;
5435
5436 CAlignMap amap = GetAlignMap();
5437
5438 int mrna_len = amap.FShiftedLen(Limits());
5439
5440 TSignedSeqRange genome_core_lim;
5441 if(ReadingFrame().NotEmpty()) {
5442 if(OpenCds())
5443 genome_core_lim = MaxCdsLimits();
5444 else
5445 genome_core_lim = RealCdsLimits();
5446 ITERATE (CGeneModel::TExons, e, Exons()) {
5447 if(Include(e->Limits(),genome_core_lim.GetFrom()))
5448 genome_core_lim.SetFrom(max(genome_core_lim.GetFrom()-MIN_UTR_EXON,e->GetFrom()));
5449 if(Include(e->Limits(),genome_core_lim.GetTo()))
5450 genome_core_lim.SetTo(min(genome_core_lim.GetTo()+MIN_UTR_EXON,e->GetTo()));
5451 }
5452 } else {
5453 genome_core_lim = Limits();
5454 if(Exons().size() > 1) {
5455 if(Exons().front().Limits().GetLength() >= MIN_UTR_EXON)
5456 genome_core_lim.SetFrom(Exons().front().Limits().GetTo()-MIN_UTR_EXON+1);
5457 if(Exons().back().Limits().GetLength() >= MIN_UTR_EXON)
5458 genome_core_lim.SetTo(Exons().back().Limits().GetFrom()+MIN_UTR_EXON-1);
5459 }
5460 }
5461
5462 TSignedSeqRange core_lim = amap.MapRangeOrigToEdited(genome_core_lim);
5463
5464 vector<double> coverage = m_coverage;
5465 _ASSERT((int)coverage.size() == mrna_len && core_lim.GetFrom() >= 0 && core_lim.GetTo() < mrna_len);
5466
5467 double core_coverage = 0;
5468 for (int i = core_lim.GetFrom(); i <= core_lim.GetTo(); ++i) {
5469 core_coverage += coverage[i];
5470 }
5471 core_coverage /= core_lim.GetLength();
5472 m_core_coverage = core_coverage;
5473
5474 if(core_lim.GetFrom() <= 0 && core_lim.GetTo() >= mrna_len-1) //nothing to clip
5475 return;
5476
5477 if(core_lim.GetTo()-core_lim.GetFrom() < SCAN_WINDOW) // too short
5478 return;
5479
5480 map<int,double> intron_coverage; // in transcript space
5481 vector<double> longseq_coverage(mrna_len);
5482 ITERATE (TContained, it, m_members) {
5483 const CGeneModel& align = *(*it)->m_align;
5484 if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
5485 continue;
5486 TSignedSeqRange overlap = Limits()&align.Limits();
5487 if(overlap.Empty()) // some could be cut by polya clip
5488 continue;
5489
5490 for(int i = 1; i < (int)align.Exons().size(); ++i) {
5491 if(align.Exons()[i-1].m_ssplice && align.Exons()[i].m_fsplice && align.Exons()[i-1].m_ssplice_sig != "XX" && align.Exons()[i].m_fsplice_sig != "XX") {
5492 TSignedSeqRange intr(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
5493 bool valid_intron = false; // some introns might be clipped by previous UTR clips but still be in members
5494 for(int j = 1; j < (int)Exons().size() && !valid_intron; ++j) {
5495 if(Exons()[j-1].m_ssplice && Exons()[j].m_fsplice) {
5496 TSignedSeqRange jntr(Exons()[j-1].Limits().GetTo(),Exons()[j].Limits().GetFrom());
5497 valid_intron = (intr == jntr);
5498 }
5499 }
5500 if(valid_intron) {
5501 int intron = 0; // donor in transcript space
5502 if(Strand() == ePlus) {
5503 intron = amap.MapRangeOrigToEdited(Limits()&align.Exons()[i-1].Limits()).GetTo();
5504 } else {
5505 intron = amap.MapRangeOrigToEdited(Limits()&align.Exons()[i].Limits()).GetTo();
5506 }
5507 intron_coverage[intron] += (align.Type() == CGeneModel::eSR) ? align.Weight() : 0;
5508 }
5509 }
5510 }
5511
5512
5513 TSignedSeqRange overlap_on_mrna = amap.MapRangeOrigToEdited(overlap);
5514
5515 if(align.Type() == CGeneModel::emRNA || align.Type() == CGeneModel::eEST || align.Type() == CGeneModel::eNotForChaining) { //OK to clip protein in UTR
5516 for(int i = overlap_on_mrna.GetFrom(); i <= overlap_on_mrna.GetTo(); ++i)
5517 longseq_coverage[i] += align.Weight();
5518 }
5519 }
5520
5521 //don't save short gap utrs
5522 TSignedSeqRange cds = GetCdsInfo().Cds();
5523 if(Exons().front().m_ssplice_sig == "XX" && (cds&Exons().front().Limits()).Empty() && Exons().front().Limits().GetLength() < SMALL_GAP_UTR) {
5524 TSignedSeqRange texon = TranscriptExon(0);
5525 for(int i = texon.GetFrom(); i <= texon.GetTo(); ++i) {
5526 coverage[i] = 0;
5527 longseq_coverage[i] = 0;
5528 }
5529 }
5530 if(Exons().back().m_fsplice_sig == "XX" && (cds&Exons().back().Limits()).Empty() && Exons().back().Limits().GetLength() < SMALL_GAP_UTR) {
5531 TSignedSeqRange texon = TranscriptExon(Exons().size()-1);
5532 for(int i = texon.GetFrom(); i <= texon.GetTo(); ++i) {
5533 coverage[i] = 0;
5534 longseq_coverage[i] = 0;
5535 }
5536 }
5537
5538 double core_inron_coverage = 0;
5539 int core_introns = 0;
5540 for(int i = 1; i < (int)Exons().size(); ++i) {
5541 if(Exons()[i-1].m_ssplice && Exons()[i].m_fsplice) {
5542 int intron; // donor in transcript space
5543 if(Strand() == ePlus)
5544 intron = amap.MapRangeOrigToEdited(Exons()[i-1].Limits(), true).GetTo();
5545 else
5546 intron = amap.MapRangeOrigToEdited(Exons()[i].Limits(), true).GetTo();
5547 if(Include(core_lim, intron)) {
5548 ++core_introns;
5549 core_inron_coverage += intron_coverage[intron];
5550 }
5551 }
5552 }
5553 if(core_introns > 0)
5554 core_inron_coverage /= core_introns;
5555 else
5556 core_inron_coverage = 0.5*core_coverage;
5557
5558 // 5' UTR
5559 bool fivep_confirmed = (Strand() == ePlus) ? (Status()&eLeftConfirmed) : (Status()&eRightConfirmed);
5560 if(!fivep_confirmed && !(Status()&eCap) && core_lim.GetFrom() > SCAN_WINDOW/2) {
5561 int left_limit = core_lim.GetFrom(); // cds/splice
5562 int right_limit = core_lim.GetTo(); // cds/splice
5563 int len = right_limit-left_limit+1;
5564 double wlen = 0;
5565 for(int i = left_limit; i <= right_limit; ++i)
5566 wlen += coverage[i];
5567
5568 while(left_limit > 0 && (longseq_coverage[left_limit] > 0 ||
5569 (coverage[left_limit] > max(core_coverage,wlen/len)*utr_clip_threshold &&
5570 (intron_coverage.find(left_limit-1) == intron_coverage.end() || intron_coverage[left_limit-1] > core_inron_coverage*utr_clip_threshold)))) {
5571
5572 ++len;
5573 --left_limit;
5574 wlen += coverage[left_limit];
5575 }
5576
5577 if(left_limit > 0) {
5578 AddComment("5putrclip");
5579 ClipChain(amap.MapRangeEditedToOrig(TSignedSeqRange(left_limit,mrna_len-1)));
5580 if(Strand() == ePlus && Exons().front().Limits().GetLength() < MIN_UTR_EXON && Exons().front().Limits().GetTo() < genome_core_lim.GetFrom())
5581 ClipChain(TSignedSeqRange(Exons()[1].Limits().GetFrom(),Limits().GetTo()));
5582 else if(Strand() == eMinus && Exons().back().Limits().GetLength() < MIN_UTR_EXON && Exons().back().Limits().GetFrom() > genome_core_lim.GetTo())
5583 ClipChain(TSignedSeqRange(Limits().GetFrom(),Exons()[Exons().size()-2].GetTo()));
5584 }
5585 }
5586
5587
5588 // 3' UTR
5589 bool threep_confirmed = (Strand() == ePlus) ? (Status()&eRightConfirmed) : (Status()&eLeftConfirmed);
5590 if(!threep_confirmed && !(Status()&ePolyA) && core_lim.GetTo() < mrna_len-1-SCAN_WINDOW/2) {
5591 int right_limit = core_lim.GetTo(); // cds/splice
5592 int left_limit = core_lim.GetFrom(); // cds/splice
5593 int len = right_limit-left_limit+1;
5594 double wlen = 0;
5595 for(int i = left_limit; i <= right_limit; ++i)
5596 wlen += coverage[i];
5597
5598 double window_wlen = 0;
5599 for(int i = right_limit-SCAN_WINDOW/2; i <= right_limit+SCAN_WINDOW/2; ++i)
5600 window_wlen += coverage[i];
5601
5602 while(right_limit < mrna_len-1 && (longseq_coverage[right_limit] > 0 ||
5603 (coverage[right_limit] > wlen/len*utr_clip_threshold &&
5604 (intron_coverage.find(right_limit) == intron_coverage.end() || intron_coverage[right_limit] > core_inron_coverage*utr_clip_threshold)))) {
5605
5606 ++len;
5607 ++right_limit;
5608 wlen += coverage[right_limit];
5609 }
5610
5611 if(right_limit < mrna_len-1) {
5612 AddComment("3putrclip");
5613 int new_5p = amap.MapRangeOrigToEdited(Limits()).GetFrom();
5614 ClipChain(amap.MapRangeEditedToOrig(TSignedSeqRange(new_5p,right_limit)));
5615 if(Strand() == ePlus && Exons().back().Limits().GetLength() < MIN_UTR_EXON && Exons().back().Limits().GetFrom() > genome_core_lim.GetTo())
5616 ClipChain(TSignedSeqRange(Limits().GetFrom(),Exons()[Exons().size()-2].GetTo()));
5617 else if(Strand() == eMinus && Exons().front().Limits().GetLength() < MIN_UTR_EXON && Exons().front().Limits().GetTo() < genome_core_lim.GetFrom())
5618 ClipChain(TSignedSeqRange(Exons()[1].Limits().GetFrom(),Limits().GetTo()));
5619 }
5620 }
5621 }
5622
CalculateDropLimits()5623 void CChain::CalculateDropLimits() {
5624
5625 m_coverage_drop_left = -1;
5626 m_coverage_drop_right = -1;
5627 m_coverage_bump_left = -1;
5628 m_coverage_bump_right = -1;
5629
5630 bool fivep_confirmed = (Strand() == ePlus) ? (Status()&eLeftConfirmed) : (Status()&eRightConfirmed);
5631 bool threep_confirmed = (Strand() == ePlus) ? (Status()&eRightConfirmed) : (Status()&eLeftConfirmed);
5632
5633 if(fivep_confirmed && threep_confirmed)
5634 return;
5635
5636 CAlignMap amap = GetAlignMap();
5637
5638 int mrna_len = amap.FShiftedLen(Limits());
5639
5640 vector<double> longseq_coverage(mrna_len);
5641 ITERATE (TContained, it, m_members) {
5642 const CGeneModel& align = *(*it)->m_align;
5643 if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
5644 continue;
5645 TSignedSeqRange overlap = Limits()&align.Limits();
5646 if(overlap.Empty())
5647 continue;
5648
5649 TSignedSeqRange overlap_on_mrna = amap.MapRangeOrigToEdited(overlap);
5650
5651 if(align.Type() == CGeneModel::emRNA || align.Type() == CGeneModel::eEST) { //OK to clip protein in UTR
5652 for(int i = overlap_on_mrna.GetFrom(); i <= overlap_on_mrna.GetTo(); ++i)
5653 longseq_coverage[i] += align.Weight();
5654 }
5655 }
5656
5657 TSignedSeqRange sfl(Exons().front().Limits().GetTo(),Exons().back().Limits().GetFrom());
5658 if(ReadingFrame().NotEmpty()) {
5659 TSignedSeqRange cds = (OpenCds() ? MaxCdsLimits() : RealCdsLimits());
5660 sfl.SetFrom(min(sfl.GetFrom(),cds.GetFrom()));
5661 sfl.SetTo(max(sfl.GetTo(),cds.GetTo()));
5662 }
5663 TSignedSeqRange soft_limit = sfl;
5664 ITERATE(TContained, i, m_members) {
5665 if((*i)->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
5666 continue;
5667 TSignedSeqRange overlap = ((*i)->m_align->Limits() & Limits());
5668 if(Include(overlap,sfl.GetFrom()+1))
5669 soft_limit.SetFrom(min(soft_limit.GetFrom(),overlap.GetFrom()));
5670 if(Include(overlap,sfl.GetTo()-1))
5671 soft_limit.SetTo(max(soft_limit.GetTo(),overlap.GetTo()));
5672 }
5673 soft_limit.SetFrom(min(soft_limit.GetFrom(),m_polya_cap_left_soft_limit));
5674 soft_limit.SetTo(max(soft_limit.GetTo(),m_polya_cap_right_soft_limit));
5675
5676 soft_limit = amap.MapRangeOrigToEdited(soft_limit);
5677
5678 // 5' UTR
5679 if(!fivep_confirmed) {
5680 int left_limit = soft_limit.GetFrom();
5681 int first_bump = -1;
5682 double max_cov = 0;
5683 while(left_limit > 0 && first_bump < 0 && (longseq_coverage[left_limit] > 0 || m_coverage[left_limit] > m_core_coverage*COVERAGE_DROP)) {
5684 max_cov = max(max_cov,m_coverage[left_limit]);
5685 if(max_cov > m_core_coverage*COVERAGE_BUMP)
5686 first_bump = left_limit;
5687
5688 --left_limit;
5689 }
5690
5691 if(first_bump > 0) {
5692 for( ; first_bump < soft_limit.GetFrom()-SCAN_WINDOW && m_coverage[first_bump+SCAN_WINDOW] < m_coverage[first_bump]; ++first_bump);
5693 if(Strand() == ePlus)
5694 m_coverage_bump_left = amap.MapEditedToOrig(first_bump);
5695 else
5696 m_coverage_bump_right = amap.MapEditedToOrig(first_bump);
5697 } else if(left_limit > 0 || m_coverage[left_limit] <= m_core_coverage*COVERAGE_DROP) {
5698 int first_drop = left_limit;
5699 if(first_drop+SCAN_WINDOW/2 < mrna_len) {
5700 for( ; first_drop-SCAN_WINDOW/2 > 0; --first_drop) {
5701 if(m_coverage[first_drop-SCAN_WINDOW/2] >= m_coverage[first_drop+SCAN_WINDOW/2]) // check for negative gradient
5702 break;
5703 if(m_coverage[first_drop-SCAN_WINDOW/2]+m_coverage[first_drop+SCAN_WINDOW/2]-2*m_coverage[first_drop] >= 0) // check for decrease of gradient
5704 break;
5705 }
5706 }
5707 if(Strand() == ePlus)
5708 m_coverage_drop_left = amap.MapEditedToOrig(first_drop);
5709 else
5710 m_coverage_drop_right = amap.MapEditedToOrig(first_drop);
5711 }
5712 }
5713
5714 // 3' UTR
5715 if(!threep_confirmed) {
5716 int right_limit = soft_limit.GetTo();
5717 int first_bump = -1;
5718 double max_cov = 0;
5719 while(right_limit < mrna_len-1 && first_bump < 0 && (longseq_coverage[right_limit] > 0 || m_coverage[right_limit] > m_core_coverage*COVERAGE_DROP)) {
5720 max_cov = max(max_cov,m_coverage[right_limit]);
5721 if(first_bump < 0 && max_cov > m_core_coverage*COVERAGE_BUMP)
5722 first_bump = right_limit;
5723
5724 ++right_limit;
5725 }
5726 if(first_bump > 0) {
5727 for( ; first_bump > soft_limit.GetTo()+SCAN_WINDOW && m_coverage[first_bump-SCAN_WINDOW] < m_coverage[first_bump]; --first_bump);
5728 if(Strand() == ePlus)
5729 m_coverage_bump_right = amap.MapEditedToOrig(first_bump);
5730 else
5731 m_coverage_bump_left = amap.MapEditedToOrig(first_bump);
5732 } else if(right_limit < mrna_len-1 || m_coverage[right_limit] <= m_core_coverage*COVERAGE_DROP) { // garanteed that right_limit <= mrna_len-1
5733 int first_drop = right_limit;
5734 if(first_drop-SCAN_WINDOW/2 > 0) {
5735 for( ; first_drop < mrna_len-SCAN_WINDOW/2; ++first_drop) {
5736 if(m_coverage[first_drop+SCAN_WINDOW/2] >= m_coverage[first_drop-SCAN_WINDOW/2]) // check for negative gradient
5737 break;
5738 if(m_coverage[first_drop-SCAN_WINDOW/2]+m_coverage[first_drop+SCAN_WINDOW/2]-2*m_coverage[first_drop] >= 0) // check for decrease of gradient
5739 break;
5740 }
5741 }
5742 if(Strand() == ePlus)
5743 m_coverage_drop_right = amap.MapEditedToOrig(first_drop);
5744 else
5745 m_coverage_drop_left = amap.MapEditedToOrig(first_drop);
5746 }
5747 }
5748 }
5749
SetConsistentCoverage()5750 void CChain::SetConsistentCoverage()
5751 {
5752 if(!(Type()&CGeneModel::eSR))
5753 return;
5754
5755 CAlignMap amap = GetAlignMap();
5756 int mrna_len = amap.FShiftedLen(Limits());
5757 map<TSignedSeqRange,double> intron_coverage;
5758 vector<double> coverage(mrna_len);
5759 ITERATE (TContained, it, m_members) {
5760 const CGeneModel& align = *(*it)->m_align;
5761 if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
5762 continue;
5763 TSignedSeqRange overlap = Limits()&align.Limits();
5764 if(overlap.Empty()) // some could be cut by polya clip
5765 continue;
5766
5767 if(align.Type() == CGeneModel::eSR) {
5768 TSignedSeqRange overlap_on_mrna = amap.MapRangeOrigToEdited(overlap);
5769 for(int i = overlap_on_mrna.GetFrom(); i <= overlap_on_mrna.GetTo(); ++i)
5770 coverage[i] += align.Weight();
5771 }
5772
5773 for(int i = 1; i < (int)align.Exons().size(); ++i) {
5774 if(align.Exons()[i-1].m_ssplice_sig != "XX" && align.Exons()[i].m_fsplice_sig != "XX") {
5775 TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
5776 if(Include(Limits(),intron))
5777 intron_coverage[intron] += (align.Type() == CGeneModel::eSR) ? align.Weight() : 0;
5778 }
5779 }
5780 }
5781
5782 double minintroncount = numeric_limits<double>::max();
5783 double maxintroncount = 0;
5784 for(map<TSignedSeqRange,double>::iterator it = intron_coverage.begin(); it != intron_coverage.end(); ++it) {
5785 minintroncount = min(minintroncount,it->second);
5786 maxintroncount = max(maxintroncount,it->second);
5787 }
5788 if(minintroncount < 0.1*maxintroncount)
5789 return;
5790
5791 vector<int> dips(mrna_len,0);
5792 double maxsofar = 0;
5793 for(int i = 0; i < mrna_len; ++i) {
5794 if(coverage[i] < 0.1*maxsofar)
5795 dips[i] = 1;
5796 maxsofar = max(maxsofar,coverage[i]);
5797 }
5798 for(int i = 0; i < (int)Exons().size(); ++i) {
5799 if(Exons()[i].m_fsplice_sig == "XX" || Exons()[i].m_ssplice_sig == "XX") {
5800 TSignedSeqRange te = amap.MapRangeOrigToEdited(Exons()[i].Limits(),false);
5801 _ASSERT(te.NotEmpty());
5802 for(int p = max(0,te.GetFrom()-50); p <= min(mrna_len-1,te.GetTo()+50); ++p)
5803 dips[p] = 0;
5804 }
5805 }
5806 maxsofar = 0;
5807 for(int i = mrna_len-1; i >= 0; --i) {
5808 if(coverage[i] < 0.1*maxsofar && dips[i] > 0)
5809 return;
5810 maxsofar = max(maxsofar,coverage[i]);
5811 }
5812
5813 if(intron_coverage.size() > 1)
5814 Status() |= eConsistentCoverage;
5815 }
5816
SetConfirmedStartStopForCompleteProteins(map<string,pair<bool,bool>> & prot_complet,const SMinScor & minscor)5817 void CChain::SetConfirmedStartStopForCompleteProteins(map<string, pair<bool,bool> >& prot_complet, const SMinScor& minscor)
5818 {
5819 if(ConfirmedStart() && ConfirmedStop())
5820 return;
5821
5822 bool setconfstart = false;
5823 bool setconfstop = false;
5824
5825 CAlignMap mrnamap = GetAlignMap();
5826 ITERATE(TContained, i, m_members) {
5827
5828 if((*i)->m_align->GetCdsInfo().ProtReadingFrame().Empty()) // not known CDS
5829 continue;
5830
5831 if((*i)->m_align->Type() & emRNA) {
5832 if(!ConfirmedStart() && HasStart())
5833 setconfstart = true;
5834 if(!ConfirmedStop() && HasStop())
5835 setconfstop = true;
5836 } else {
5837 CAlignModel* orig_align = (*i)->m_orig_align;
5838 if(orig_align->TargetLen() == 0) // protein of not known length
5839 continue;
5840
5841 string accession = orig_align->TargetAccession();
5842 map<string, pair<bool,bool> >::iterator iter = prot_complet.find(accession);
5843 _ASSERT(iter != prot_complet.end());
5844 if(iter == prot_complet.end())
5845 continue;
5846
5847 TSignedSeqRange fivep_exon = orig_align->Exons().front().Limits();
5848 TSignedSeqRange threep_exon = orig_align->Exons().back().Limits();
5849 if((*i)->m_align->Strand() == eMinus)
5850 swap(fivep_exon,threep_exon);
5851
5852 if(!ConfirmedStart() && HasStart() && fivep_exon.IntersectingWith((*i)->m_align->Limits()) &&
5853 iter->second.first && Include(Limits(),(*i)->m_align->Limits())) { // protein has start
5854
5855 TSignedSeqPos not_aligned = orig_align->GetAlignMap().MapRangeOrigToEdited((*i)->m_align->Limits(),false).GetFrom()-1;
5856 if(not_aligned <= (1.-minscor.m_minprotfrac)*orig_align->TargetLen()) { // well aligned
5857 TSignedSeqPos fivep = mrnamap.MapOrigToEdited(Strand() == ePlus ? (*i)->m_align->Limits().GetFrom() : (*i)->m_align->Limits().GetTo());
5858 if(fivep > 0) { // the end is still in chain
5859 TSignedSeqPos extra_length = fivep-mrnamap.MapRangeOrigToEdited(GetCdsInfo().Start(),false).GetFrom()-1;
5860 if(extra_length > not_aligned-minscor.m_endprotfrac*orig_align->TargetLen()) {
5861 setconfstart = true;
5862 }
5863 }
5864 }
5865 }
5866
5867 if(!ConfirmedStop() && HasStop() && threep_exon.IntersectingWith((*i)->m_align->Limits()) &&
5868 iter->second.second && Include(Limits(),(*i)->m_align->Limits())) { // protein has stop
5869
5870 TSignedSeqPos not_aligned = orig_align->TargetLen()-orig_align->GetAlignMap().MapRangeOrigToEdited((*i)->m_align->Limits(),false).GetTo();
5871 if(not_aligned <= (1.-minscor.m_minprotfrac)*orig_align->TargetLen()) { // well aligned
5872 TSignedSeqPos threep = mrnamap.MapOrigToEdited(Strand() == ePlus ? (*i)->m_align->Limits().GetTo() : (*i)->m_align->Limits().GetFrom());
5873 if(threep >= 0) { // the end is still in chain
5874 TSignedSeqPos extra_length = mrnamap.MapRangeOrigToEdited(GetCdsInfo().Stop(),false).GetTo()-threep;
5875 if(extra_length > not_aligned-minscor.m_endprotfrac*orig_align->TargetLen()) {
5876 setconfstop = true;
5877 }
5878 }
5879 }
5880 }
5881 }
5882 }
5883
5884 CCDSInfo cds_info = GetCdsInfo();
5885 double score = cds_info.Score();
5886 if((setconfstart || ConfirmedStart()) && (setconfstop || ConfirmedStop()) && Continuous()) {
5887 score += max(1.,0.3*score);
5888 cds_info.SetScore(score, false); // not open
5889 }
5890
5891 if(setconfstart) {
5892 cds_info.SetScore(score, false); // not open
5893 cds_info.SetStart(cds_info.Start(), true); // confirmed start
5894 }
5895
5896 if(setconfstop) {
5897 cds_info.SetStop(cds_info.Stop(), true); // confirmed stop
5898 }
5899
5900 SetCdsInfo(cds_info);
5901 }
5902
CollectTrustedmRNAsProts(TOrigAligns & orig_aligns,const SMinScor & minscor,CScope & scope,SMatrix & delta,const CResidueVec & contig)5903 void CChain::CollectTrustedmRNAsProts(TOrigAligns& orig_aligns, const SMinScor& minscor, CScope& scope, SMatrix& delta, const CResidueVec& contig)
5904 {
5905 ClearTrustedmRNA();
5906 ClearTrustedProt();
5907
5908 if(HasStart() && HasStop()) {
5909 typedef map<Int8, set<TSignedSeqRange> > Tint8range;
5910 Tint8range aexons;
5911 Tint8range uexons;
5912 ITERATE(TContained, i, m_members) {
5913 if(IntersectingWith(*(*i)->m_align)) { // just in case we clipped this alignment
5914 if(!(*i)->m_align->TrustedProt().empty()) {
5915 ITERATE(TExons, e, (*i)->m_align->Exons()) {
5916 if((*i)->m_mem_id > 0)
5917 aexons[(*i)->m_align->ID()].insert(e->Limits());
5918 else
5919 uexons[(*i)->m_align->ID()].insert(e->Limits());
5920 }
5921 }
5922 else if(!(*i)->m_align->TrustedmRNA().empty() && (*i)->m_align->ConfirmedStart() && (*i)->m_align->ConfirmedStop()) // trusted mRNA with aligned CDS (correctly checks not duplicated cds)
5923 InsertTrustedmRNA(*(*i)->m_align->TrustedmRNA().begin()); // could be only one 'part'
5924 }
5925 }
5926 typedef map<Int8, int> Tint8int;
5927 Tint8int palignedlen;
5928 ITERATE(Tint8range, i, aexons) {
5929 int len = 0;
5930 ITERATE(set<TSignedSeqRange>, e, i->second)
5931 len += e->GetLength();
5932 palignedlen[i->first] = len;
5933 }
5934 ITERATE(Tint8range, i, uexons) {
5935 int len = 0;
5936 ITERATE(set<TSignedSeqRange>, e, i->second)
5937 len += e->GetLength();
5938 palignedlen[i->first] = max(len,palignedlen[i->first]);
5939 }
5940
5941 if(ConfirmedStart() && ConfirmedStop()) {
5942 ITERATE(Tint8int, i, palignedlen) {
5943 CAlignModel* orig_align = orig_aligns[i->first];
5944 if((Continuous() && i->second > 0.8*orig_align->TargetLen()) || i->second > minscor.m_minprotfrac*orig_align->TargetLen()) // well aligned trusted protein
5945 InsertTrustedProt(*orig_align->TrustedProt().begin());
5946 }
5947 }
5948
5949 if(Continuous() && TrustedmRNA().empty() && TrustedProt().empty() && !palignedlen.empty()) {
5950 TSignedSeqRange cds = RealCdsLimits();
5951 int gap_cds = 0;
5952 ITERATE(CGeneModel::TExons, ie, Exons()) {
5953 if(ie->m_fsplice_sig == "XX" || ie->m_ssplice_sig == "XX")
5954 gap_cds += (cds&ie->Limits()).GetLength();
5955 }
5956
5957 if(gap_cds > 0) {
5958 string mprotein = GetProtein(contig);
5959 ITERATE(Tint8int, i, palignedlen) {
5960 CAlignModel* orig_align = orig_aligns[i->first];
5961 if(i->second+gap_cds > 0.8*orig_align->TargetLen()) { //realign proteins if close enough
5962 CSeqVector protein_seqvec(scope.GetBioseqHandle(*orig_align->GetTargetId()), CBioseq_Handle::eCoding_Iupac);
5963 string tprotein(protein_seqvec.begin(),protein_seqvec.end());
5964 CCigar cigar = LclAlign(mprotein.c_str(), mprotein.size(), tprotein.c_str(), tprotein.size(), 10, 1, delta.matrix);
5965 if(cigar.SubjectRange().GetLength() > 0.8*tprotein.size()) {
5966 InsertTrustedProt(*orig_align->TrustedProt().begin());
5967 break;
5968 }
5969 }
5970 }
5971 }
5972 }
5973 }
5974 }
5975
5976 // if external model is 'open' all 5' introns can harbor
5977 // for nested model 'open' is ignored
HarborsNested(const CChain & other_chain,bool check_in_holes) const5978 bool CChain::HarborsNested(const CChain& other_chain, bool check_in_holes) const {
5979 TSignedSeqRange lim_for_nested = Limits();
5980 if(!ReadingFrame().Empty())
5981 lim_for_nested = OpenCds() ? MaxCdsLimits() : RealCdsLimits();
5982
5983 TSignedSeqRange other_lim_for_nested = other_chain.Limits();
5984 if(!other_chain.ReadingFrame().Empty())
5985 other_lim_for_nested = other_chain.RealCdsLimits();
5986
5987 if(lim_for_nested.IntersectingWith(other_lim_for_nested))
5988 return CModelCompare::RangeNestedInIntron(other_lim_for_nested, *this, check_in_holes);
5989 else
5990 return false;
5991 }
5992
5993 // if external model is 'open' all 5' introns can harbor
5994 // for nested model 'open' is ignored
HarborsNested(const CGene & other_gene,bool check_in_holes) const5995 bool CChain::HarborsNested(const CGene& other_gene, bool check_in_holes) const {
5996 TSignedSeqRange lim_for_nested = Limits();
5997 if(!ReadingFrame().Empty())
5998 lim_for_nested = OpenCds() ? MaxCdsLimits() : RealCdsLimits();
5999
6000 TSignedSeqRange other_lim_for_nested = other_gene.Limits();
6001 if(!other_gene.RealCdsLimits().Empty())
6002 other_lim_for_nested = other_gene.RealCdsLimits();
6003
6004 if(lim_for_nested.IntersectingWith(other_lim_for_nested))
6005 return CModelCompare::RangeNestedInIntron(other_lim_for_nested, *this, check_in_holes);
6006 else
6007 return false;
6008 }
6009
GetAccVer(const CAlignModel & a,CScope & scope)6010 pair<string,int> GetAccVer(const CAlignModel& a, CScope& scope)
6011 {
6012 if((a.Type()&CGeneModel::eProt) == 0)
6013 return make_pair(a.TargetAccession(), 0);
6014
6015 try {
6016 CSeq_id_Handle idh = sequence::GetId(*a.GetTargetId(), scope,
6017 sequence::eGetId_ForceAcc);
6018 if (idh) {
6019 CConstRef<CSeq_id> acc = idh.GetSeqId();
6020 const CTextseq_id* txtid = acc->GetTextseq_Id();
6021 return (txtid && txtid->IsSetAccession() && txtid->IsSetVersion()) ?
6022 make_pair(txtid->GetAccession(), txtid->GetVersion()) : make_pair(idh.AsString(), 0);
6023 }
6024 }
6025 catch (sequence::CSeqIdFromHandleException&) {
6026 }
6027 return make_pair(a.TargetAccession(), 0);
6028 }
6029
6030 static int s_ExonLen(const CGeneModel& a);
6031
6032 struct s_ByAccVerLen {
s_ByAccVerLens_ByAccVerLen6033 s_ByAccVerLen(CScope& scope_) : scope(scope_) {}
6034 CScope& scope;
operator ()s_ByAccVerLen6035 bool operator()(const CAlignModel* a, const CAlignModel* b)
6036 {
6037 pair<string,int> a_acc = GetAccVer(*a, scope);
6038 pair<string,int> b_acc = GetAccVer(*b, scope);
6039 int acc_cmp = NStr::CompareCase(a_acc.first,b_acc.first);
6040 if (acc_cmp != 0)
6041 return acc_cmp<0;
6042 if (a_acc.second != b_acc.second)
6043 return a_acc.second > b_acc.second;
6044
6045 int a_stt = a->HasStart()+a->HasStop();
6046 int b_stt = b->HasStart()+b->HasStop();
6047 if (a_stt != b_stt)
6048 return a_stt > b_stt;
6049
6050 int a_len = s_ExonLen(*a);
6051 int b_len = s_ExonLen(*b);
6052 if (a_len!=b_len)
6053 return a_len > b_len;
6054
6055 if((a->Status()&CGeneModel::eBestPlacement) != (b->Status()&CGeneModel::eBestPlacement))
6056 return (a->Status()&CGeneModel::eBestPlacement) > (b->Status()&CGeneModel::eBestPlacement);
6057
6058 return a->ID() < b->ID(); // to make sort deterministic
6059 }
6060 };
s_ExonLen(const CGeneModel & a)6061 static int s_ExonLen(const CGeneModel& a)
6062 {
6063 int len = 0;
6064 ITERATE(CGeneModel::TExons, e, a.Exons())
6065 len += e->Limits().GetLength();
6066 return len;
6067 }
6068
SkipReason(CGeneModel * orig_align,const string & comment)6069 void CChainer::CChainerImpl::SkipReason(CGeneModel* orig_align, const string& comment)
6070 {
6071 orig_align->Status() |= CGeneModel::eSkipped;
6072 orig_align->AddComment(comment);
6073 }
6074
FilterOutChimeras(TGeneModelList & clust)6075 void CChainer::FilterOutChimeras(TGeneModelList& clust)
6076 {
6077 m_data->FilterOutChimeras(clust);
6078 }
6079
FilterOutChimeras(TGeneModelList & clust)6080 void CChainer::CChainerImpl::FilterOutChimeras(TGeneModelList& clust)
6081 {
6082 typedef map<int,TGeneModelClusterSet> TClustersByStrand;
6083 TClustersByStrand trusted_aligns;
6084 ITERATE(TGeneModelList, it, clust) {
6085 if(it->Status()&CGeneModel::eUnmodifiedAlign)
6086 continue;
6087
6088 CAlignModel* orig_align = orig_aligns[it->ID()];
6089 if(orig_align->Continuous() && (!it->TrustedmRNA().empty() || !it->TrustedProt().empty())
6090 && it->AlignLen() > minscor.m_minprotfrac*orig_aligns[it->ID()]->TargetLen()) {
6091 trusted_aligns[it->Strand()].Insert(*it);
6092 }
6093 }
6094
6095 if(trusted_aligns[ePlus].size() < 2 && trusted_aligns[eMinus].size() < 2)
6096 return;
6097
6098 typedef set<int> TSplices;
6099 typedef list<TSplices> TSplicesList;
6100 typedef map<int,TSplicesList> TSplicesByStrand;
6101 TSplicesByStrand trusted_splices;
6102
6103 ITERATE(TClustersByStrand, it, trusted_aligns) {
6104 int strand = it->first;
6105 const TGeneModelClusterSet& clset = it->second;
6106 ITERATE(TGeneModelClusterSet, jt, clset) {
6107 const TGeneModelCluster& cls = *jt;
6108 trusted_splices[strand].push_back(set<int>());
6109 TSplices& splices = trusted_splices[strand].back();
6110 ITERATE(TGeneModelCluster, lt, cls) {
6111 const CGeneModel& align = *lt;
6112 ITERATE(CGeneModel::TExons, e, align.Exons()) {
6113 if(e->m_fsplice)
6114 splices.insert(e->GetFrom());
6115 if(e->m_ssplice)
6116 splices.insert(e->GetTo());
6117 }
6118 }
6119 }
6120 }
6121
6122 for(TGeneModelList::iterator it_loop = clust.begin(); it_loop != clust.end(); ) {
6123 TGeneModelList::iterator it = it_loop++;
6124 if(it->Status()&CGeneModel::eUnmodifiedAlign)
6125 continue;
6126
6127 const CGeneModel& align = *it;
6128 int strand = align.Strand();
6129 const TSplicesList& spl = trusted_splices[strand];
6130
6131 int count = 0;
6132 ITERATE(TSplicesList, jt, spl) {
6133 const TSplices& splices = *jt;
6134 for(unsigned int i = 0; i < align.Exons().size(); ++i) {
6135 const CModelExon& e = align.Exons()[i];
6136 if(splices.find(e.GetFrom()) != splices.end() || splices.find(e.GetTo()) != splices.end()) {
6137 ++count;
6138 break;
6139 }
6140 }
6141 }
6142
6143 if(count > 1) {
6144 cerr << "Chimeric alignment " << align.ID() << endl;
6145 SkipReason(orig_aligns[align.ID()],"Chimera");
6146 clust.erase(it);
6147 }
6148 }
6149 }
6150
6151 struct OverlapsSameAccessionAlignment : public Predicate {
6152 OverlapsSameAccessionAlignment(TAlignModelList& alignments);
6153 virtual bool align_predicate(CAlignModel& align);
GetCommentOverlapsSameAccessionAlignment6154 virtual string GetComment() { return "Overlaps the same alignment";}
6155 };
6156
OverlapsSameAccessionAlignment(TAlignModelList & alignments)6157 OverlapsSameAccessionAlignment::OverlapsSameAccessionAlignment(TAlignModelList& alignments)
6158 {
6159 CScope scope(*CObjectManager::GetInstance());
6160 scope.AddDefaults();
6161
6162 vector<CAlignModel*> alignment_ptrs;
6163 NON_CONST_ITERATE(TAlignModelList, a, alignments) {
6164 if(!(a->Status()&CGeneModel::eUnmodifiedAlign) && a->Type() != CGeneModel::eNotForChaining)
6165 alignment_ptrs.push_back(&*a);
6166 }
6167
6168 if (alignment_ptrs.empty())
6169 return;
6170
6171 sort(alignment_ptrs.begin(), alignment_ptrs.end(), s_ByAccVerLen(scope));
6172
6173 vector<CAlignModel*>::iterator first = alignment_ptrs.begin();
6174 pair<string,int> first_accver = GetAccVer(**first, scope);
6175 vector<CAlignModel*> ::iterator current = first; ++current;
6176 for (; current != alignment_ptrs.end(); ++current) {
6177 pair<string,int> current_accver = GetAccVer(**current, scope);
6178 if (first_accver.first == current_accver.first) {
6179 if ((*current)->Strand() == (*first)->Strand() && (*current)->Limits().IntersectingWith((*first)->Limits())) {
6180 (*current)->Status() |= CGeneModel::eSkipped;
6181 }
6182 } else {
6183 first=current;
6184 first_accver = current_accver;
6185 }
6186 }
6187 }
6188
align_predicate(CAlignModel & align)6189 bool OverlapsSameAccessionAlignment::align_predicate(CAlignModel& align)
6190 {
6191 return align.Status() & CGeneModel::eSkipped;
6192 }
6193
OverlapsSameAccessionAlignment(TAlignModelList & alignments)6194 Predicate* CChainer::OverlapsSameAccessionAlignment(TAlignModelList& alignments)
6195 {
6196 return new gnomon::OverlapsSameAccessionAlignment(alignments);
6197 }
6198
FindMultiplyIncluded(CAlignModel & algn,TAlignModelList & clust)6199 string FindMultiplyIncluded(CAlignModel& algn, TAlignModelList& clust)
6200 {
6201 if ((algn.Type() & CGeneModel::eProt)!=0 && !algn.Continuous()) {
6202 set<string> compatible_evidence;
6203 int len = algn.AlignLen();
6204
6205 static CGeneModel dummy_align;
6206 const CGeneModel* prev_alignp = &dummy_align;
6207
6208 bool prev_is_compatible = false;
6209 NON_CONST_ITERATE(TAlignModelList, jtcl, clust) {
6210 CAlignModel& algnj = *jtcl;
6211 if (algn == algnj)
6212 continue;
6213 if (algnj.AlignLen() < len/4)
6214 continue;
6215
6216 bool same_as_prev = algnj.IdenticalAlign(*prev_alignp);
6217 if (!same_as_prev)
6218 prev_alignp = &algnj;
6219
6220 if ((same_as_prev && prev_is_compatible) || (!same_as_prev && algn.Strand()==algnj.Strand() && algn.isCompatible(algnj))) {
6221 prev_is_compatible = true;
6222 if (!compatible_evidence.insert(algnj.TargetAccession()).second) {
6223 return algnj.TargetAccession();
6224 }
6225 } else {
6226 prev_is_compatible = false;
6227 }
6228 }
6229 }
6230 return kEmptyStr;
6231 }
6232
6233 struct ConnectsParalogs : public Predicate {
ConnectsParalogsConnectsParalogs6234 ConnectsParalogs(TAlignModelList& _alignments)
6235 : alignments(_alignments)
6236 {}
6237 TAlignModelList& alignments;
6238 string paralog;
6239
align_predicateConnectsParalogs6240 virtual bool align_predicate(CAlignModel& align)
6241 {
6242 paralog = FindMultiplyIncluded(align, alignments);
6243 return !paralog.empty();
6244 }
GetCommentConnectsParalogs6245 virtual string GetComment() { return "Connects two "+paralog+" alignments"; }
6246 };
6247
ConnectsParalogs(TAlignModelList & alignments)6248 Predicate* CChainer::ConnectsParalogs(TAlignModelList& alignments)
6249 {
6250 return new gnomon::ConnectsParalogs(alignments);
6251 }
6252
ScoreCDSes_FilterOutPoorAlignments(TGeneModelList & clust)6253 void CChainer::ScoreCDSes_FilterOutPoorAlignments(TGeneModelList& clust)
6254 {
6255 ERASE_ITERATE(TGeneModelList, itcl, clust) {
6256 if(m_data->orig_aligns.find(itcl->ID()) == m_data->orig_aligns.end()) {
6257 clust.erase(itcl);
6258 continue;
6259 }
6260
6261 CGeneModel& algn = *itcl;
6262 if ((algn.Type() & CGeneModel::eProt)!=0 || algn.ConfirmedStart()) { // this includes protein alignments and mRNA with confirmed CDSes
6263
6264 m_gnomon->GetScore(algn);
6265 double ms = m_data->GoodCDNAScore(algn);
6266 CAlignModel* orig = m_data->orig_aligns[algn.ID()];
6267
6268 if (algn.Score() == BadScore() || (algn.Score() < ms && (algn.Type()&CGeneModel::eProt) && !(algn.Status()&CGeneModel::eBestPlacement) && orig->AlignLen() < m_data->minscor.m_minprotfrac*orig->TargetLen())) { // all mRNA with confirmed CDS and best placed or reasonably aligned proteins with known length will get through with any finite score
6269 CNcbiOstrstream ost;
6270 if(algn.AlignLen() <= 75)
6271 ost << "Short alignment " << algn.AlignLen();
6272 else
6273 ost << "Low score " << algn.Score();
6274 m_data->SkipReason(orig, CNcbiOstrstreamToString(ost));
6275 clust.erase(itcl);
6276 }
6277 }
6278 }
6279 }
6280
6281 #define PROT_CLIP 120
6282 #define PROT_CLIP_FRAC 0.20
6283 #define MIN_PART 30
6284
FindSelenoproteinsClipProteinsToStartStop(TGeneModelList & clust)6285 void CChainer::FindSelenoproteinsClipProteinsToStartStop(TGeneModelList& clust) {
6286 CScope scope(*CObjectManager::GetInstance());
6287 scope.AddDefaults();
6288 const CResidueVec& contig = m_gnomon->GetSeq();
6289
6290 ERASE_ITERATE(TGeneModelList, itcl, clust) {
6291 if(!(itcl->Type()&CGeneModel::eProt) || m_data->orig_aligns.find(itcl->ID()) == m_data->orig_aligns.end()) // skip cDNA and 'unmodified' without origaligns
6292 continue;
6293
6294 CGeneModel& align = *itcl;
6295 m_gnomon->GetScore(align);
6296 if(align.Score() == BadScore()) {
6297 clust.erase(itcl);
6298 continue;
6299 }
6300
6301 CAlignModel* orig = m_data->orig_aligns[align.ID()];
6302 CSeqVector protein_seqvec(scope.GetBioseqHandle(*orig->GetTargetId()), CBioseq_Handle::eCoding_Iupac);
6303
6304 CAlignMap amap = align.GetAlignMap();
6305 CAlignMap origmap = orig->GetAlignMap();
6306
6307 //find selenoproteins and stops 'confirmed' on genome
6308 if(align.PStop()) {
6309 CCDSInfo::TPStops pstops = align.GetCdsInfo().PStops();
6310 NON_CONST_ITERATE(CCDSInfo::TPStops, stp, pstops) {
6311 TInDels fs = StrictlyContainedInDels(align.FrameShifts(), *stp);
6312 if(!fs.empty())
6313 continue;
6314 TSignedSeqRange tstop = amap.MapRangeOrigToEdited(*stp,false);
6315 CResidueVec mrna;
6316 amap.EditedSequence(contig, mrna);
6317 if(tstop.GetLength() == 3 && mrna[tstop.GetFrom()] == 'T' && mrna[tstop.GetFrom()+1] == 'G' && mrna[tstop.GetFrom()+2] == 'A') {
6318 TSignedSeqRange ostop = origmap.MapRangeOrigToEdited(*stp,false);
6319 if(ostop.GetLength() == 3 && protein_seqvec[ostop.GetFrom()/3] == 'U') {
6320 stp->m_status = CCDSInfo::eSelenocysteine;
6321 }
6322 }
6323 if(stp->m_status != CCDSInfo::eSelenocysteine) {
6324 TIntMap::iterator conf = m_confirmed_bases_len.upper_bound(stp->GetTo()); // confirmed on the right
6325 if(conf != m_confirmed_bases_len.begin() && (--conf)->first <= stp->GetFrom() && conf->first+conf->second > stp->GetTo())
6326 stp->m_status = CCDSInfo::eGenomeCorrect;
6327 }
6328 }
6329
6330 CCDSInfo cds = align.GetCdsInfo();
6331 cds.ClearPStops();
6332 ITERATE(CCDSInfo::TPStops, stp, pstops)
6333 cds.AddPStop(*stp);
6334 align.SetCdsInfo(cds);
6335 }
6336
6337 if(itcl->Status()&CGeneModel::eUnmodifiedAlign) {
6338 m_data->unmodified_aligns[itcl->ID()] = *itcl;
6339 clust.erase(itcl);
6340 continue;
6341 }
6342
6343 if(align.Limits() == orig->Limits() && (!align.HasStart() || !align.FrameShifts().empty() || align.PStop(false))) {
6344 int maxclip = min(PROT_CLIP, (int)(align.AlignLen()*PROT_CLIP_FRAC+0.5));
6345 TSignedSeqRange tlim = orig->TranscriptLimits();
6346 int fivepclip = 0;
6347 if(protein_seqvec[0] == 'M')
6348 fivepclip = maxclip-tlim.GetFrom();
6349 int threepclip = maxclip-(orig->TargetLen()-tlim.GetTo()-1);
6350
6351 bool skip = false;
6352
6353 int fivepshift = 0;
6354 int threepshift = 0;
6355 int tlen = align.TranscriptLimits().GetTo()+1;
6356 for(TInDels::iterator indl = align.FrameShifts().begin(); !skip && indl != align.FrameShifts().end(); ++indl) {
6357 //project safely in case of tandem frameshifts or exon boundaries
6358 TSignedSeqRange left(align.Limits().GetFrom(),indl->Loc()-1);
6359 left = amap.ShrinkToRealPoints(left,false);
6360 _ASSERT(left.NotEmpty());
6361 TSignedSeqRange right(indl->InDelEnd(),align.Limits().GetTo());
6362 right = amap.ShrinkToRealPoints(right,false);
6363 _ASSERT(right.NotEmpty());
6364
6365 TSignedSeqRange lim = amap.MapRangeOrigToEdited(TSignedSeqRange(left.GetTo(),right.GetFrom()), false);
6366 _ASSERT(lim.GetLength() >= 2);
6367 int tpa = lim.GetFrom()+1;
6368 int tpb = lim.GetTo()-1;
6369 // for deletion tpa,tpb are first and last tposition of the extra sequence on transcript
6370 // for insertion tpa is AFTER the missing sequence and tpb is BEFORE
6371 if(tpb < fivepclip) { // clipable 5' frameshift
6372 if(indl->IsInsertion())
6373 fivepshift += indl->Len();
6374 else if(indl->IsDeletion())
6375 fivepshift -= indl->Len();
6376 } else if(tpa >= tlen-threepclip) { // clipable 3' frameshift
6377 if(indl->IsInsertion())
6378 threepshift += indl->Len();
6379 else if(indl->IsDeletion())
6380 threepshift -= indl->Len();
6381 } else { // frameshift in main body
6382 skip = true;
6383 }
6384 }
6385 if(skip)
6386 continue;
6387
6388 if(fivepshift >= 0)
6389 fivepshift %= 3;
6390 else
6391 fivepshift = 3-(-fivepshift)%3;
6392 if(threepshift >= 0)
6393 threepshift %= 3;
6394 else
6395 threepshift = 3-(-threepshift)%3;
6396
6397 CGeneModel editedm = align;
6398 editedm.FrameShifts().clear();
6399 editedm.SetCdsInfo(CCDSInfo());
6400 //CAlignMap edited_map = editedm.GetAlignMap();
6401 TSignedSeqRange edited_tlim = editedm.TranscriptLimits();
6402 edited_tlim.SetFrom(edited_tlim.GetFrom()+fivepshift);
6403 edited_tlim.SetTo(edited_tlim.GetTo()-threepshift);
6404 TSignedSeqRange edited_lim = editedm.GetAlignMap().MapRangeEditedToOrig(edited_tlim, false);
6405 _ASSERT(edited_lim.NotEmpty());
6406 editedm.Clip(edited_lim, CGeneModel::eRemoveExons);
6407 CCDSInfo edited_cds;
6408 edited_cds.SetReadingFrame(edited_lim, true);
6409 editedm.SetCdsInfo(edited_cds);
6410
6411 string protseq = editedm.GetProtein(contig);
6412 tlen = 3*protseq.size();
6413 int fivep_problem = -1;
6414 int first_stop = tlen;
6415
6416 for(int p = 0; !skip && p < (int)protseq.size(); ++p) {
6417 if(protseq[p] == '*') {
6418 int tpa = p*3;
6419 int tpb = tpa+2;
6420 if(tpb < fivepclip) // clipable 5' stop
6421 fivep_problem = max(fivep_problem, tpb);
6422 else if(tpa >= tlen-threepclip || p == (int)protseq.size()-1) // leftmost 3' stop
6423 first_stop = min(first_stop, tpa);
6424 else // stop in main body
6425 skip = true;
6426 }
6427 }
6428 if(skip)
6429 continue;
6430
6431 int fivep_limit = 0;
6432 size_t m = protseq.find("M", (fivep_problem+1)/3); // first start after possible stop/frameshift
6433 skip = true;
6434 if(m != string::npos && (int)m*3 <= fivepclip) {
6435 fivep_limit = 3*m;
6436 skip = false;
6437 }
6438 if(skip)
6439 continue;
6440
6441 int threep_limit = tlen-1;
6442 skip = true;
6443 if(first_stop+2 < threep_limit) {
6444 threep_limit = first_stop+2;
6445 skip = false;
6446 }
6447 if(skip)
6448 continue;
6449
6450 TSignedSeqRange clip(fivep_limit, threep_limit);
6451 tlen = clip.GetLength();
6452 clip = editedm.GetAlignMap().MapRangeEditedToOrig(clip, false);
6453 _ASSERT(clip.NotEmpty());
6454
6455 editedm.Clip(clip, CGeneModel::eRemoveExons);
6456 if(align.Limits().GetFrom() != editedm.Limits().GetFrom() && !editedm.Exons().front().m_ssplice && editedm.Exons().front().Limits().GetLength() < MIN_PART) // short 5' part
6457 continue;
6458 if(align.Limits().GetTo() != editedm.Limits().GetTo() && !editedm.Exons().back().m_fsplice && editedm.Exons().back().Limits().GetLength() < MIN_PART) // short 3' part
6459 continue;
6460
6461 TSignedSeqRange start(0, 2);
6462 TSignedSeqRange stop(tlen-3, tlen-1);
6463 TSignedSeqRange rf(start.GetTo()+1,stop.GetFrom()-1);
6464 edited_cds.SetReadingFrame(rf,true);
6465 edited_cds.SetStart(start,true);
6466 edited_cds.SetStop(stop,true);
6467 edited_cds.SetScore(align.Score());
6468 edited_cds = edited_cds.MapFromEditedToOrig(editedm.GetAlignMap());
6469 editedm.SetCdsInfo(edited_cds);
6470
6471 #ifdef _DEBUG
6472 protseq = editedm.GetProtein(contig);
6473 _ASSERT(tlen == 3*(int)protseq.size());
6474 _ASSERT(protseq[0] == 'M');
6475 m = protseq.find("*");
6476 _ASSERT(m == protseq.size()-1);
6477 #endif
6478
6479 align = editedm;
6480 }
6481 }
6482 }
6483
6484
6485 struct SFShiftsCluster {
SFShiftsClusterSFShiftsCluster6486 SFShiftsCluster(TSignedSeqRange limits = TSignedSeqRange::GetEmpty()) : m_limits(limits) {}
6487 TSignedSeqRange m_limits;
6488 TInDels m_fshifts;
operator <SFShiftsCluster6489 bool operator<(const SFShiftsCluster& c) const { return m_limits.GetTo() < c.m_limits.GetFrom(); }
6490 };
6491
AddIfCompatible(set<SFShiftsCluster> & fshift_clusters,const CGeneModel & algn)6492 bool CChainer::CChainerImpl::AddIfCompatible(set<SFShiftsCluster>& fshift_clusters, const CGeneModel& algn)
6493 {
6494 typedef vector<SFShiftsCluster> TFShiftsClusterVec;
6495 typedef set<SFShiftsCluster>::iterator TIt;
6496
6497 TFShiftsClusterVec algn_fclusters;
6498 algn_fclusters.reserve(algn.Exons().size());
6499
6500 {
6501 const TInDels& fs = algn.FrameShifts();
6502 TInDels::const_iterator fi = fs.begin();
6503 ITERATE (CGeneModel::TExons, e, algn.Exons()) {
6504 algn_fclusters.push_back(SFShiftsCluster(e->Limits()));
6505 while(fi != fs.end() && fi->IntersectingWith(e->GetFrom(),e->GetTo())) {
6506 algn_fclusters.back().m_fshifts.push_back(*fi++);
6507 }
6508 }
6509 }
6510
6511 ITERATE(TFShiftsClusterVec, exon_cluster, algn_fclusters) {
6512 pair<TIt,TIt> eq_rng = fshift_clusters.equal_range(*exon_cluster);
6513 for(TIt glob_cluster = eq_rng.first; glob_cluster != eq_rng.second; ++glob_cluster) {
6514 ITERATE(TInDels, fi, glob_cluster->m_fshifts)
6515 if (find(exon_cluster->m_fshifts.begin(),exon_cluster->m_fshifts.end(),*fi) == exon_cluster->m_fshifts.end())
6516 if (fi->IntersectingWith(exon_cluster->m_limits.GetFrom(),exon_cluster->m_limits.GetTo()))
6517 return false;
6518 ITERATE(TInDels, fi, exon_cluster->m_fshifts)
6519 if (find(glob_cluster->m_fshifts.begin(),glob_cluster->m_fshifts.end(),*fi) == glob_cluster->m_fshifts.end())
6520 if (fi->IntersectingWith(glob_cluster->m_limits.GetFrom(),glob_cluster->m_limits.GetTo()))
6521 return false;
6522 }
6523 }
6524 NON_CONST_ITERATE(TFShiftsClusterVec, exon_cluster, algn_fclusters) {
6525 pair<TIt,TIt> eq_rng = fshift_clusters.equal_range(*exon_cluster);
6526 for(TIt glob_cluster = eq_rng.first; glob_cluster != eq_rng.second;) {
6527 exon_cluster->m_limits += glob_cluster->m_limits;
6528 exon_cluster->m_fshifts.insert(exon_cluster->m_fshifts.end(),glob_cluster->m_fshifts.begin(),glob_cluster->m_fshifts.end());
6529 fshift_clusters.erase(glob_cluster++);
6530 }
6531 uniq(exon_cluster->m_fshifts);
6532 fshift_clusters.insert(eq_rng.second, *exon_cluster);
6533 }
6534 return true;
6535 }
6536
FsTouch(const TSignedSeqRange & lim,const CInDelInfo & fs)6537 bool CChainer::CChainerImpl::FsTouch(const TSignedSeqRange& lim, const CInDelInfo& fs) {
6538 if(fs.IsInsertion() && fs.Loc()+fs.Len() == lim.GetFrom())
6539 return true;
6540 if(fs.IsDeletion() && fs.Loc() == lim.GetFrom())
6541 return true;
6542 if(fs.Loc() == lim.GetTo()+1)
6543 return true;
6544
6545 return false;
6546 }
6547
SplitAlignmentsByStrand(const TGeneModelList & clust,TGeneModelList & clust_plus,TGeneModelList & clust_minus)6548 void CChainer::CChainerImpl::SplitAlignmentsByStrand(const TGeneModelList& clust, TGeneModelList& clust_plus, TGeneModelList& clust_minus)
6549 {
6550 ITERATE (TGeneModelList, itcl, clust) {
6551 const CGeneModel& algn = *itcl;
6552
6553 if (algn.Strand() == ePlus)
6554 clust_plus.push_back(algn);
6555 else
6556 clust_minus.push_back(algn);
6557 }
6558 }
6559
InframeFraction(const CGeneModel & a,TSignedSeqPos left,TSignedSeqPos right)6560 double InframeFraction(const CGeneModel& a, TSignedSeqPos left, TSignedSeqPos right)
6561 {
6562 if(a.FrameShifts().empty())
6563 return 1.0;
6564
6565 CAlignMap cdsmap(a.GetAlignMap());
6566 int inframelength = 0;
6567 int outframelength = 0;
6568 int frame = 0;
6569 TSignedSeqPos prev = left;
6570 TInDels indels = a.GetInDels(left, right, true);
6571 ITERATE(TInDels, fs, indels) {
6572 int len = cdsmap.FShiftedLen(cdsmap.ShrinkToRealPoints(TSignedSeqRange(prev,fs->Loc()-1)),false);
6573 if(frame == 0) {
6574 inframelength += len;
6575 } else {
6576 outframelength += len;
6577 }
6578
6579 if(fs->IsDeletion()) {
6580 frame = (frame+fs->Len())%3;
6581 } else {
6582 frame = (3+frame-fs->Len()%3)%3;
6583 }
6584 prev = fs->Loc(); // ShrinkToRealPoints will take care if it in insertion or intron
6585 }
6586 int len = cdsmap.FShiftedLen(cdsmap.ShrinkToRealPoints(TSignedSeqRange(prev,right)),false);
6587 if(frame == 0) {
6588 inframelength += len;
6589 } else {
6590 outframelength += len;
6591 }
6592 return double(inframelength)/(inframelength + outframelength);
6593 }
6594
6595 struct ProjectCDS : public TransformFunction {
ProjectCDSProjectCDS6596 ProjectCDS(double _mininframefrac, const CResidueVec& _seq, CScope* _scope, const map<string, TSignedSeqRange>& _mrnaCDS)
6597 : mininframefrac(_mininframefrac), seq(_seq), scope(_scope), mrnaCDS(_mrnaCDS) {}
6598
6599 double mininframefrac;
6600 const CResidueVec& seq;
6601 CScope* scope;
6602 const map<string, TSignedSeqRange>& mrnaCDS;
6603 virtual void transform_align(CAlignModel& align);
6604 };
6605
transform_align(CAlignModel & align)6606 void ProjectCDS::transform_align(CAlignModel& align)
6607 {
6608 if ((align.Type()&CAlignModel::emRNA)==0 || (align.Status()&CGeneModel::eTSA)!=0 || (align.Status()&CGeneModel::eReversed)!=0 || (align.Status()&CGeneModel::eUnknownOrientation)!=0)
6609 return;
6610
6611 TSignedSeqRange cds_on_mrna;
6612
6613 if (scope != NULL) {
6614 SAnnotSelector sel;
6615 sel.SetFeatSubtype(CSeqFeatData::eSubtype_cdregion);
6616 CSeq_loc mrna;
6617 CRef<CSeq_id> target_id(new CSeq_id);
6618 target_id->Assign(*align.GetTargetId());
6619 mrna.SetWhole(*target_id);
6620 CFeat_CI feat_ci(*scope, mrna, sel);
6621 if (feat_ci && !feat_ci->IsSetPartial()) {
6622 const CSeq_loc& cds_loc = feat_ci->GetMappedFeature().GetLocation();
6623 const CSeq_id* cds_loc_seq_id = cds_loc.GetId();
6624 if (cds_loc_seq_id != NULL && sequence::IsSameBioseq(*cds_loc_seq_id, *target_id, scope)) {
6625 TSeqRange feat_range = cds_loc.GetTotalRange();
6626 cds_on_mrna = TSignedSeqRange(feat_range.GetFrom(), feat_range.GetTo());
6627 }
6628 }
6629 } else {
6630 string accession = align.TargetAccession();
6631 map<string,TSignedSeqRange>::const_iterator pos = mrnaCDS.find(accession);
6632 if(pos != mrnaCDS.end()) {
6633 cds_on_mrna = pos->second;
6634 }
6635 }
6636
6637 if (cds_on_mrna.Empty())
6638 return;
6639
6640 CAlignMap alignmap(align.GetAlignMap());
6641 TSignedSeqPos left = alignmap.MapEditedToOrig(cds_on_mrna.GetFrom());
6642 TSignedSeqPos right = alignmap.MapEditedToOrig(cds_on_mrna.GetTo());
6643 if(align.Strand() == eMinus) {
6644 swap(left,right);
6645 }
6646
6647 CGeneModel a = align;
6648
6649 if(left < 0 || right < 0) // start or stop cannot be projected
6650 return;
6651
6652 CAlignMap alignmap_clipped(a.GetAlignMap());
6653 if(alignmap_clipped.MapOrigToEdited(left) < 0 || alignmap_clipped.MapOrigToEdited(right) < 0) // cds is clipped
6654 return;
6655
6656 a.Clip(TSignedSeqRange(left,right),CGeneModel::eRemoveExons);
6657
6658 if(!a.Continuous())
6659 return;
6660
6661 // ITERATE(TInDels, fs, a.FrameShifts()) {
6662 // if(fs->Len()%3 != 0) return; // there is a frameshift
6663 // }
6664
6665 if (InframeFraction(a, left, right) < mininframefrac)
6666 return;
6667
6668 a.FrameShifts().clear(); // clear notshifted indels
6669 CAlignMap cdsmap(a.GetAlignMap());
6670 CResidueVec cds;
6671 cdsmap.EditedSequence(seq, cds);
6672 unsigned int length = cds.size();
6673
6674 if(length%3 != 0)
6675 return;
6676
6677 if(!IsStartCodon(&cds[0]) || !IsStopCodon(&cds[length-3]) ) // start or stop on genome is not right
6678 return;
6679
6680 for(unsigned int i = 0; i < length-3; i += 3) {
6681 if(IsStopCodon(&cds[i]))
6682 return; // premature stop on genome
6683 }
6684
6685 TSignedSeqRange reading_frame = cdsmap.MapRangeEditedToOrig(TSignedSeqRange(3,length-4));
6686 TSignedSeqRange start = cdsmap.MapRangeEditedToOrig(TSignedSeqRange(0,2));
6687 TSignedSeqRange stop = cdsmap.MapRangeEditedToOrig(TSignedSeqRange(length-3,length-1));
6688
6689 CCDSInfo cdsinfo;
6690 cdsinfo.SetReadingFrame(reading_frame,true);
6691 cdsinfo.SetStart(start,true);
6692 cdsinfo.SetStop(stop,true);
6693
6694 // align.FrameShifts().clear();
6695 CGeneModel b = align;
6696 b.FrameShifts().clear();
6697 align = CAlignModel(b, b.GetAlignMap());
6698 align.SetCdsInfo(cdsinfo);
6699 }
6700
FilterOutBadScoreChainsHavingBetterCompatibles(TGeneModelList & chains)6701 void CChainer::CChainerImpl::FilterOutBadScoreChainsHavingBetterCompatibles(TGeneModelList& chains)
6702 {
6703 for(TGeneModelList::iterator it = chains.begin(); it != chains.end();) {
6704 TGeneModelList::iterator itt = it++;
6705 for(TGeneModelList::iterator jt = chains.begin(); jt != itt;) {
6706 TGeneModelList::iterator jtt = jt++;
6707 if(itt->Strand() != jtt->Strand() || (itt->Score() != BadScore() && jtt->Score() != BadScore())) continue;
6708
6709 // at least one score is BadScore
6710 if(itt->Score() != BadScore()) {
6711 if(itt->isCompatible(*jtt) > 1) chains.erase(jtt);
6712 } else if(jtt->Score() != BadScore()) {
6713 if(itt->isCompatible(*jtt) > 1) {
6714 chains.erase(itt);
6715 break;
6716 }
6717
6718 } else if(itt->AlignLen() > jtt->AlignLen()) {
6719 if(itt->isCompatible(*jtt) > 0) chains.erase(jtt);
6720 } else {
6721 if(itt->isCompatible(*jtt) > 0) {
6722 chains.erase(itt);
6723 break;
6724 }
6725 }
6726 }
6727 }
6728 }
6729
6730
6731 struct TrimAlignment : public TransformFunction {
6732 public:
TrimAlignmentTrimAlignment6733 TrimAlignment(int a_trim) : trim(a_trim) {}
6734 int trim;
6735
TrimCodingExonLeftTrimAlignment6736 TSignedSeqPos TrimCodingExonLeft(const CAlignModel& align, const CModelExon& e, int trim)
6737 {
6738 TSignedSeqPos old_from = e.GetFrom();
6739 TSignedSeqPos new_from = align.FShiftedMove(old_from, trim);
6740 _ASSERT( new_from-old_from >= trim && new_from <= e.GetTo() );
6741
6742 return new_from;
6743 }
6744
TrimCodingExonRightTrimAlignment6745 TSignedSeqPos TrimCodingExonRight(const CAlignModel& align, const CModelExon& e, int trim)
6746 {
6747 TSignedSeqPos old_to = e.GetTo();
6748 TSignedSeqPos new_to = align.FShiftedMove(old_to, -trim);
6749 _ASSERT( old_to-new_to >= trim && new_to >= e.GetFrom() );
6750
6751 return new_to;
6752 }
6753
transform_alignTrimAlignment6754 virtual void transform_align(CAlignModel& align)
6755 {
6756 TSignedSeqRange flimits = align.Exons().front().Limits();
6757 TSignedSeqRange blimits = align.Exons().back().Limits();
6758 CAlignMap alignmap(align.GetAlignMap());
6759
6760 if ((align.Type() & CAlignModel::eProt)!=0) {
6761 TrimProtein(align, alignmap);
6762 } else {
6763 TrimTranscript(align, alignmap);
6764 }
6765
6766 // don't mark trimmed if trim was to the next exon
6767 if(align.Limits().GetFrom() > flimits.GetFrom() && align.Limits().GetFrom() <= flimits.GetTo()) align.Status() |= CAlignModel::eLeftTrimmed;
6768 if(align.Limits().GetTo() < blimits.GetTo() && align.Limits().GetTo() >= blimits.GetFrom()) align.Status() |= CAlignModel::eRightTrimmed;
6769 }
6770
TrimProteinTrimAlignment6771 void TrimProtein(CAlignModel& align, CAlignMap& alignmap)
6772 {
6773 for (CAlignModel::TExons::const_iterator piece_begin = align.Exons().begin(); piece_begin != align.Exons().end(); ++piece_begin) {
6774 _ASSERT( !piece_begin->m_fsplice );
6775
6776 CAlignModel::TExons::const_iterator piece_end;
6777 for (piece_end = piece_begin; piece_end != align.Exons().end() && piece_end->m_ssplice; ++piece_end) ;
6778 _ASSERT( piece_end != align.Exons().end() );
6779
6780 TSignedSeqPos a;
6781 if (piece_begin == align.Exons().begin() && align.LeftComplete())
6782 a = align.Limits().GetFrom();
6783 else
6784 a = piece_begin->GetFrom()+trim;
6785
6786 TSignedSeqPos b;
6787 if (piece_end->GetTo() >= align.Limits().GetTo() && align.RightComplete())
6788 b = align.Limits().GetTo();
6789 else
6790 b = piece_end->GetTo()-trim;
6791
6792 if((a != piece_begin->GetFrom() || b != piece_end->GetTo()) && b > a) {
6793 TSignedSeqRange newlimits = alignmap.ShrinkToRealPoints(TSignedSeqRange(a,b),true);
6794 // _ASSERT(newlimits.NotEmpty() && piece_begin->GetTo() >= newlimits.GetFrom() && piece_end->GetFrom() <= newlimits.GetTo());
6795 if(newlimits.NotEmpty() && piece_begin->GetTo() >= newlimits.GetFrom() && piece_end->GetFrom() <= newlimits.GetTo())
6796 align.Clip(newlimits, CAlignModel::eDontRemoveExons);
6797 }
6798
6799 piece_begin = piece_end;
6800 }
6801 }
6802
TrimTranscriptTrimAlignment6803 void TrimTranscript(CAlignModel& align, CAlignMap& alignmap)
6804 {
6805 if(!align.TrustedmRNA().empty())
6806 return;
6807 if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
6808 return;
6809
6810 int a = align.Limits().GetFrom();
6811 int b = align.Limits().GetTo();
6812 if(align.Strand() == ePlus) {
6813 if((align.Status()&CGeneModel::eCap) == 0)
6814 a += trim;
6815 if((align.Status()&CGeneModel::ePolyA) == 0)
6816 b -= trim;
6817 } else {
6818 if((align.Status()&CGeneModel::ePolyA) == 0)
6819 a += trim;
6820 if((align.Status()&CGeneModel::eCap) == 0)
6821 b -= trim;
6822 }
6823
6824 //don't trim gapfillers
6825 if(align.Exons().front().m_ssplice_sig == "XX")
6826 a = align.Limits().GetFrom();
6827 if(align.Exons().back().m_fsplice_sig == "XX")
6828 b = align.Limits().GetTo();
6829
6830 if(!align.ReadingFrame().Empty()) { // avoid trimming confirmed CDSes
6831 TSignedSeqRange cds_on_genome = align.RealCdsLimits();
6832 if(cds_on_genome.GetFrom() < a) {
6833 a = align.Limits().GetFrom();
6834 }
6835 if(b < cds_on_genome.GetTo()) {
6836 b = align.Limits().GetTo();
6837 }
6838 }
6839
6840 TSignedSeqRange newlimits = alignmap.ShrinkToRealPoints(TSignedSeqRange(a,b),false);
6841 _ASSERT(newlimits.NotEmpty() && align.Exons().front().GetTo() >= newlimits.GetFrom() && align.Exons().back().GetFrom() <= newlimits.GetTo());
6842
6843 if(newlimits != align.Limits()) {
6844 align.Clip(newlimits,CAlignModel::eDontRemoveExons); // Clip doesn't change AlignMap
6845 }
6846 }
6847 };
6848
// Creates a gnomon::TrimAlignment transform initialized with m_data->trim.
// The object is allocated with new and returned as a raw pointer.
// NOTE(review): the caller presumably owns the result and must delete it - confirm.
TransformFunction* CChainer::TrimAlignment()
{
    return new gnomon::TrimAlignment(m_data->trim);
}
6853
6854 struct DoNotBelieveShortPolyATail : public TransformFunction {
DoNotBelieveShortPolyATailDoNotBelieveShortPolyATail6855 DoNotBelieveShortPolyATail(int _minpolya) : minpolya(_minpolya) {}
6856
6857 int minpolya;
transform_alignDoNotBelieveShortPolyATail6858 virtual void transform_align(CAlignModel& align)
6859 {
6860 if ((align.Status()&CGeneModel::ePolyA) == 0)
6861 return;
6862
6863 if ((align.Status()&CGeneModel::eUnknownOrientation) != 0 || align.PolyALen() < minpolya)
6864 align.Status() ^= CGeneModel::ePolyA;
6865 }
6866 };
6867
// Creates a gnomon::DoNotBelieveShortPolyATail transform initialized with m_data->minpolya.
// The object is allocated with new and returned as a raw pointer.
// NOTE(review): the caller presumably owns the result and must delete it - confirm.
TransformFunction* CChainer::DoNotBelieveShortPolyATail()
{
    return new gnomon::DoNotBelieveShortPolyATail(m_data->minpolya);
}
6872
6873
// Stores the numbering parameters in the implementation object:
// idnext goes to m_idnext and idinc to m_idinc.
// NOTE(review): presumably the next model id and the step between ids - confirm where they are used.
void CChainer::SetNumbering(int idnext, int idinc)
{
    m_data->m_idnext = idnext;
    m_data->m_idinc = idinc;
}
6879
// Public entry point; forwards to CChainerImpl::SetGenomicRange().
void CChainer::SetGenomicRange(const TAlignModelList& alignments)
{
    m_data->SetGenomicRange(alignments);
}
6884
SetGenomicRange(const TAlignModelList & alignments)6885 void CChainer::CChainerImpl::SetGenomicRange(const TAlignModelList& alignments)
6886 {
6887 TSignedSeqRange range = alignments.empty() ? TSignedSeqRange::GetWhole() : TSignedSeqRange::GetEmpty();
6888
6889 CScope scope(*CObjectManager::GetInstance());
6890 scope.AddDefaults();
6891
6892 ITERATE(TAlignModelList, i, alignments) {
6893 range += i->Limits();
6894
6895 if(i->Type()&CGeneModel::eProt) {
6896 string accession = i->TargetAccession();
6897 if(!prot_complet.count(accession)) {
6898 CSeqVector protein_seqvec(scope.GetBioseqHandle(*i->GetTargetId()), CBioseq_Handle::eCoding_Iupac);
6899 CSeqVector_CI protein_ci(protein_seqvec);
6900 prot_complet[accession] = make_pair(*protein_ci == 'M', true);
6901 }
6902 }
6903 }
6904
6905 _ASSERT(m_gnomon.get() != NULL);
6906 m_gnomon->ResetRange(range);
6907
6908 confirmed_ends.clear();
6909 orig_aligns.clear();
6910 unmodified_aligns.clear();
6911 mrna_count.clear();
6912 est_count.clear();
6913 rnaseq_count.clear();
6914 oriented_introns_plus.clear();
6915 oriented_introns_minus.clear();
6916 }
6917
ProjectCDS(CScope & scope)6918 TransformFunction* CChainer::ProjectCDS(CScope& scope)
6919 {
6920 return new gnomon::ProjectCDS(m_data->mininframefrac, m_gnomon->GetSeq(),
6921 m_data->mrnaCDS.find("use_objmgr")!=m_data->mrnaCDS.end() ? &scope : NULL,
6922 m_data->mrnaCDS);
6923 }
6924
6925 struct DoNotBelieveFrameShiftsWithoutCdsEvidence : public TransformFunction {
transform_alignDoNotBelieveFrameShiftsWithoutCdsEvidence6926 virtual void transform_align(CAlignModel& align)
6927 {
6928 if (align.ReadingFrame().Empty())
6929 align.FrameShifts().clear();
6930 }
6931 };
6932
// Creates a gnomon::DoNotBelieveFrameShiftsWithoutCdsEvidence transform.
// The object is allocated with new and returned as a raw pointer.
// NOTE(review): the caller presumably owns the result and must delete it - confirm.
TransformFunction* CChainer::DoNotBelieveFrameShiftsWithoutCdsEvidence()
{
    return new gnomon::DoNotBelieveFrameShiftsWithoutCdsEvidence();
}
6937
LeftAndLongFirst(const CGeneModel & a,const CGeneModel & b)6938 bool LeftAndLongFirst(const CGeneModel& a, const CGeneModel& b) {
6939 if(a.Limits() == b.Limits()) {
6940 if(a.Type() == b.Type())
6941 return a.ID() < b.ID();
6942 else
6943 return a.Type() > b.Type();
6944 }
6945 else if(a.Limits().GetFrom() == b.Limits().GetFrom())
6946 return a.Limits().GetTo() > b.Limits().GetTo();
6947 else
6948 return a.Limits().GetFrom() < b.Limits().GetFrom();
6949 }
6950
// Public entry point; forwards to CChainerImpl::SetConfirmedStartStopForProteinAlignments().
void CChainer::SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments)
{
    m_data->SetConfirmedStartStopForProteinAlignments(alignments);
}
6955
SetConfirmedStartStopForProteinAlignments(TAlignModelList & alignments)6956 void CChainer::CChainerImpl::SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments)
6957 {
6958 NON_CONST_ITERATE (TAlignModelCluster, i, alignments) {
6959 CAlignModel& algn = *i;
6960 if ((algn.Type() & CGeneModel::eProt)!=0) {
6961 CCDSInfo cds = algn.GetCdsInfo();
6962 TSignedSeqRange alignedlim = algn.GetAlignMap().MapRangeOrigToEdited(algn.Limits(),false);
6963 map<string, pair<bool,bool> >::iterator iter = prot_complet.find(algn.TargetAccession());
6964 _ASSERT(iter != prot_complet.end());
6965 if(iter == prot_complet.end())
6966 continue;
6967
6968 if(cds.HasStart() && iter->second.first && alignedlim.GetFrom() == 0)
6969 cds.SetStart(cds.Start(),true);
6970 if(cds.HasStop() && iter->second.second && alignedlim.GetTo() == algn.TargetLen()-1)
6971 cds.SetStop(cds.Stop(),true);
6972 if(cds.ConfirmedStart() || cds.ConfirmedStop())
6973 algn.SetCdsInfo(cds);
6974 }
6975 }
6976 }
6977
DropAlignmentInfo(TAlignModelList & alignments,TGeneModelList & models)6978 void CChainer::DropAlignmentInfo(TAlignModelList& alignments, TGeneModelList& models)
6979 {
6980 ///////////////////////
6981 // SMatrix blosum;
6982
6983 NON_CONST_ITERATE (TAlignModelCluster, i, alignments) {
6984 if(!(i->Status()&CGeneModel::eUnmodifiedAlign))
6985 m_data->orig_aligns[i->ID()]=&(*i);
6986
6987 CGeneModel aa = *i;
6988
6989 if(!i->TrustedmRNA().empty() && i->Exons().size() > 1) {
6990 auto tlim = i->TranscriptLimits();
6991 if(i->Exons().front().Limits().NotEmpty() && tlim.GetFrom() == 0)
6992 aa.Status() |= CGeneModel::eLeftConfirmed;
6993 if(i->Exons().back().Limits().NotEmpty() && (tlim.GetTo() == i->TargetLen()-1 || (i->Status()&CGeneModel::ePolyA)))
6994 aa.Status() |= CGeneModel::eRightConfirmed;
6995 }
6996
6997 if(aa.Type() & CGeneModel::eProt) {
6998 /*
6999 {{////////////////////// print replacement info for diagnostics
7000 const CResidueVec& contig = m_gnomon->GetSeq();
7001 CScope scope(*CObjectManager::GetInstance());
7002 scope.AddDefaults();
7003 CSeqVector protein_seqvec(scope.GetBioseqHandle(*i->GetTargetId()), CBioseq_Handle::eCoding_Iupac);
7004 CAlignMap amap = i->GetAlignMap();
7005
7006 ITERATE(CGeneModel::TExons, e, i->Exons()) {
7007 TSignedSeqRange exon = m_edited_contig_map.ShrinkToRealPointsOnEdited(e->Limits());
7008 if(exon.Empty())
7009 continue;
7010 exon = m_edited_contig_map.MapRangeEditedToOrig(exon,false);
7011 if(exon.Empty())
7012 continue;
7013 map<int,char>::const_iterator ir = m_replacements.lower_bound(exon.GetFrom()+2); // first definetely internal exon replacement or end()
7014 for( ; ir != m_replacements.end() && ir->first <= exon.GetTo()-2; ++ir) {
7015 int orig_gpos = ir->first;
7016 int edited_gpos = m_edited_contig_map.MapOrigToEdited(orig_gpos);
7017 int tpos = amap.MapOrigToEdited(edited_gpos);
7018 if(tpos < 0)
7019 continue;
7020 int pos_in_codon = tpos%3;
7021
7022 if(i->Strand() == eMinus)
7023 pos_in_codon = 2-pos_in_codon;
7024
7025 cout << tpos << '\t' << pos_in_codon << endl;
7026
7027 int codon_left = edited_gpos-pos_in_codon ;
7028 string edited_codon(contig.begin()+codon_left,contig.begin()+codon_left+3);
7029 string orig_codon = edited_codon;
7030 orig_codon[pos_in_codon] = m_replaced_bases[orig_gpos];
7031 if(i->Strand() == eMinus) {
7032 ReverseComplement(orig_codon.begin(),orig_codon.end());
7033 ReverseComplement(edited_codon.begin(),edited_codon.end());
7034 }
7035 string edited_aa, orig_aa;
7036 objects::CSeqTranslator::Translate(orig_codon, orig_aa, objects::CSeqTranslator::fIs5PrimePartial);
7037 objects::CSeqTranslator::Translate(edited_codon, edited_aa, objects::CSeqTranslator::fIs5PrimePartial);
7038 char prot_aa = (tpos/3 < protein_seqvec.size()) ? protein_seqvec[tpos/3] : '*';
7039 int delta = blosum.matrix[edited_aa[0]][prot_aa] - blosum.matrix[orig_aa[0]][prot_aa];
7040 cout << "Replacement\t" << m_contig_acc << '\t' << orig_gpos << '\t' << orig_codon << '\t' << edited_codon << '\t' << orig_aa << '\t' << edited_aa << '\t' << prot_aa << '\t' << delta << '\t' << i->ID() << endl;
7041 }
7042
7043
7044 }
7045
7046 }}//////////////////
7047 */
7048 TInDels alignfshifts = i->GetInDels(true);
7049 TInDels fshifts;
7050 ITERATE(CGeneModel::TExons, e, aa.Exons()) {
7051 TInDels efshifts;
7052 int len = 0;
7053 ITERATE(TInDels, fs, alignfshifts) {
7054 if(fs->IntersectingWith(e->GetFrom(),e->GetTo())) {
7055 efshifts.push_back(*fs);
7056 len += (fs->IsInsertion() ? fs->Len() : -fs->Len());
7057 }
7058 }
7059 if(efshifts.empty())
7060 continue;
7061
7062 int a = efshifts.front().Loc()-1;
7063 int b = efshifts.back().InDelEnd();
7064 TIntMap::iterator conf = m_confirmed_bases_len.upper_bound(b); // confirmed on the right
7065 bool confirmed_region = (conf != m_confirmed_bases_len.begin() && (--conf)->first <= a && conf->first+conf->second > b);
7066
7067 if(len%3 != 0 || !confirmed_region) {
7068 ITERATE(TInDels, fs, efshifts) {
7069 int l = fs->Len()%3;
7070 if(fs->IsInsertion()) {
7071 fshifts.push_back(CInDelInfo(fs->Loc(), l, CInDelInfo::eIns));
7072 } else {
7073 fshifts.push_back(CInDelInfo(fs->Loc(), l, CInDelInfo::eDel, fs->GetInDelV().substr(0,l)));
7074 }
7075 }
7076 // fshifts.insert(fshifts.end(), efshifts.begin(), efshifts.end());
7077 }
7078 }
7079 aa.FrameShifts() = fshifts;
7080 } else {
7081 aa.FrameShifts().clear();
7082 aa.Status() &= ~CGeneModel::eReversed;
7083 }
7084
7085 models.push_back(aa);
7086 }
7087 }
7088
7089
SetupArgDescriptions(CArgDescriptions * arg_desc)7090 void CChainerArgUtil::SetupArgDescriptions(CArgDescriptions* arg_desc)
7091 {
7092 arg_desc->AddKey("param", "param",
7093 "Organism specific parameters",
7094 CArgDescriptions::eInputFile);
7095
7096 arg_desc->SetCurrentGroup("Alignment modification");
7097 arg_desc->AddDefaultKey("trim", "trim",
7098 "If aligned sequence is partial and includes a small portion of an exon the alignment program "
7099 "usually misses this exon and might erroneously place a few bases from this exon near the previous exon, "
7100 "and this will mess up the chaining. To prevent this we trim small portions of the alignment before chaining. "
7101 "If it is possible, the trimming will be reversed for the 5'/3' ends of the final chain. Must be < minex and "
7102 "multiple of 3",
7103 CArgDescriptions::eInteger, "6");
7104
7105 arg_desc->SetCurrentGroup("Additional information about sequences");
7106 arg_desc->AddOptionalKey("mrnaCDS", "mrnaCDS",
7107 "CDSes annotated on mRNAs. If CDS could be projected on genome with intact "
7108 "Start/Stop and frame the Stop will be accepted as is. The Start could/will "
7109 "be moved further to make the longest possible complete CDS within the chain",
7110 CArgDescriptions::eInputFile);
7111 arg_desc->AddDefaultKey("mininframefrac", "mininframefrac",
7112 "Some mRNA alignments have paired indels which throw a portion of CDS out of frame."
7113 "This parameter regulates how much of the CDS could suffer from this before CDS is considered inaceptable",
7114 CArgDescriptions::eDouble, "0.95");
7115 arg_desc->AddOptionalKey("pinfo", "pinfo",
7116 "Information about protein 5' and 3' completeness",
7117 CArgDescriptions::eInputFile);
7118
7119 arg_desc->SetCurrentGroup("Thresholds");
7120 arg_desc->AddDefaultKey("minscor", "minscor",
7121 "Minimal coding propensity score for valid CDS. This threshold could be ignored depending on "
7122 "-longenoughcds or -protcdslen and -minprotfrac",
7123 CArgDescriptions::eDouble, "25.0");
7124 arg_desc->AddDefaultKey("longenoughcds", "longenoughcds",
7125 "Minimal CDS not supported by protein or annotated mRNA to ignore the score (bp)",
7126 CArgDescriptions::eInteger, "900");
7127 arg_desc->AddDefaultKey("protcdslen", "protcdslen",
7128 "Minimal CDS supported by protein or annotated mRNA to ignore the score (bp)",
7129 CArgDescriptions::eInteger, "300");
7130 arg_desc->AddDefaultKey("minprotfrac", "minprotfrac",
7131 "Minimal fraction of protein aligned to ignore "
7132 "the score and consider for confirmed start",
7133 CArgDescriptions::eDouble, "0.9");
7134 arg_desc->AddDefaultKey("endprotfrac", "endprotfrac",
7135 "Some proteins aligned with better than -minprotfrac coverage are missing Start/Stop. "
7136 "If such an alignment was extended by EST(s) which provided a Start/Stop and we are not missing "
7137 "more than (1-endprotfrac)*proteinlength on either side this chain will be considered to have a confirmed Start/Stop",
7138 CArgDescriptions::eDouble, "0.05");
7139 arg_desc->AddDefaultKey("oep", "oep",
7140 "Minimal overlap length for chaining alignments which don't have introns in the ovrlapping regions",
7141 CArgDescriptions::eInteger, "10");
7142 arg_desc->AddDefaultKey("minsupport", "minsupport",
7143 "Minimal number of mRNA/EST for valid noncoding models",
7144 CArgDescriptions::eInteger, "3");
7145 arg_desc->AddDefaultKey("minsupport_mrna", "minsupport_mrna",
7146 "Minimal number of mRNA for valid noncoding models",
7147 CArgDescriptions::eInteger, "1");
7148 arg_desc->AddDefaultKey("minsupport_rnaseq", "minsupport_rnaseq",
7149 "Minimal number of RNA-Seq for valid noncoding models",
7150 CArgDescriptions::eInteger, "5");
7151 arg_desc->AddDefaultKey("minlen", "minlen",
7152 "Chains with thorter CDS should be supported by protein or satisfy noncoding intron reguirements",
7153 CArgDescriptions::eInteger, "100");
7154 arg_desc->AddDefaultKey("altfrac","altfrac","The CDS length of the principal model in the gene is multiplied by this fraction. Alt variants with the CDS length above "
7155 "this are included in gene",CArgDescriptions::eDouble,"80.0");
7156 arg_desc->AddDefaultKey("composite","composite","Maximal composite number in alts",CArgDescriptions::eInteger,"1");
7157 arg_desc->AddFlag("opposite","Allow overlap of complete multiexon genes with opposite strands");
7158 arg_desc->AddFlag("partialalts","Allows partial alternative variants. In combination with -nognomon will allow partial genes");
7159 arg_desc->AddDefaultKey("tolerance","tolerance","if models exon boundary differ only this much only one model will survive",CArgDescriptions::eInteger,"5");
7160 arg_desc->AddFlag("no5pextension","Don't extend chain CDS to the leftmost start");
7161
7162 arg_desc->SetCurrentGroup("Heuristic parameters for score evaluation");
7163 arg_desc->AddDefaultKey("i5p", "i5p",
7164 "5p intron penalty",
7165 CArgDescriptions::eDouble, "7.0");
7166 arg_desc->AddDefaultKey("i3p", "i3p",
7167 "3p intron penalty",
7168 CArgDescriptions::eDouble, "14.0");
7169 arg_desc->AddDefaultKey("cdsbonus", "cdsbonus",
7170 "Bonus for CDS length",
7171 CArgDescriptions::eDouble, "0.05");
7172 arg_desc->AddDefaultKey("lenpen", "lenpen",
7173 "Penalty for total length",
7174 CArgDescriptions::eDouble, "0.005");
7175 arg_desc->AddDefaultKey("utrclipthreshold", "utrclipthreshold",
7176 "Relative coverage for clipping low support UTRs",
7177 CArgDescriptions::eDouble, "0.01");
7178
7179 arg_desc->SetCurrentGroup("CAGE/PolyA arguments");
7180
7181 arg_desc->AddDefaultKey("min-cap-weight", "MinCapWeight",
7182 "Minimal accepted weight for a capped alignment",
7183 CArgDescriptions::eInteger, "5");
7184 arg_desc->AddDefaultKey("min-cap-blob", "MinCapBlob",
7185 "Minimal cap blob weight for accepted peak",
7186 CArgDescriptions::eInteger, "50");
7187
7188 arg_desc->AddDefaultKey("min-polya-weight", "MinPolyaWeight",
7189 "Minimal accepted weight for polya alignment",
7190 CArgDescriptions::eInteger, "1");
7191 arg_desc->AddDefaultKey("min-polya-blob", "MinPolyaBlob",
7192 "Minimal polya blob weight for accepted peak",
7193 CArgDescriptions::eInteger, "1");
7194
7195 arg_desc->AddDefaultKey("max-dist", "MaxDist",
7196 "Maximal distance between individual cap/polya positions in a blob",
7197 CArgDescriptions::eInteger, "20");
7198 arg_desc->AddDefaultKey("secondary-peak", "SecondaryPeak",
7199 "Minimal weight fraction for a secondary cap/polya peak",
7200 CArgDescriptions::eDouble, "0.5");
7201 arg_desc->AddDefaultKey("tertiary-peak", "TertiaryPeak",
7202 "Last 5' exon is extended to low weight polya peak if there is sufficient rnaseq coverage",
7203 CArgDescriptions::eDouble, "0.2");
7204 arg_desc->AddDefaultKey("tertiary-peak-coverage", "TertiaryPeakCoverage",
7205 "Minimal relative rnaseq coverage for tertiary peak",
7206 CArgDescriptions::eDouble, "0.05");
7207
7208 arg_desc->AddDefaultKey("min-flank-exon", "MinFlankExon",
7209 "The minimal distance of cap/polya to a splice",
7210 CArgDescriptions::eInteger, "25");
7211
7212
7213 arg_desc->AddDefaultKey("minpolya", "minpolya",
7214 "Minimal accepted polyA tale length in transcript alignments",
7215 CArgDescriptions::eInteger, "6");
7216 arg_desc->AddFlag("use_confirmed_ends","Use end exons of trusted transcripts for clippig/extension");
7217
7218 }
7219
// Stores 'params' in m_hmm_params.
// NOTE(review): the type of m_hmm_params is not visible here; if it is a
// reference-counting smart pointer this call shares/takes ownership - confirm.
void CGnomonAnnotator_Base::SetHMMParameters(CHMMParameters* params)
{
    m_hmm_params = params;
}
7224
// Stores 'value' as m_data->intersect_limit
// (ArgsToChainer() passes the value of the "oep" argument here).
void CChainer::SetIntersectLimit(int value)
{
    m_data->intersect_limit = value;
}
// Stores the trim length after truncating it to a multiple of 3
// (integer division, so the value is rounded toward zero).
void CChainer::SetTrim(int trim)
{
    trim = (trim/3)*3;
    m_data->trim = trim;
}
// Stores 'minpolya' as m_data->minpolya; the value is later handed to the
// DoNotBelieveShortPolyATail transform.
void CChainer::SetMinPolyA(int minpolya)
{
    m_data->minpolya = minpolya;
}
// Returns a mutable reference to m_data->minscor so that the caller can fill in its fields.
SMinScor& CChainer::SetMinScor()
{
    return m_data->minscor;
}
// Stores 'mininframefrac' as m_data->mininframefrac; the value is later handed
// to the ProjectCDS transform.
void CChainer::SetMinInframeFrac(double mininframefrac)
{
    m_data->mininframefrac = mininframefrac;
}
// Returns a mutable reference to m_data->prot_complet, a map from a protein id
// string to a pair of flags (ArgsToChainer() fills it as (fivep, threep)).
map<string, pair<bool,bool> >& CChainer::SetProtComplet()
{
    return m_data->prot_complet;
}
// Returns a mutable reference to m_data->mrnaCDS, a map from an mRNA accession
// to its CDS range on the mRNA.
map<string,TSignedSeqRange>& CChainer::SetMrnaCDS()
{
    return m_data->mrnaCDS;
}
7254
ArgsToChainer(CChainer * chainer,const CArgs & args,CScope & scope)7255 void CChainerArgUtil::ArgsToChainer(CChainer* chainer, const CArgs& args, CScope& scope)
7256 {
7257 CNcbiIfstream param_file(args["param"].AsString().c_str());
7258 chainer->SetHMMParameters(new CHMMParameters(param_file));
7259
7260 chainer->SetIntersectLimit(args["oep"].AsInteger());
7261 chainer->SetTrim(args["trim"].AsInteger());
7262
7263 SMinScor& minscor = chainer->SetMinScor();
7264 minscor.m_min = args["minscor"].AsDouble();
7265 minscor.m_i5p_penalty = args["i5p"].AsDouble();
7266 minscor.m_i3p_penalty = args["i3p"].AsDouble();
7267 minscor.m_cds_bonus = args["cdsbonus"].AsDouble();
7268 minscor.m_length_penalty = args["lenpen"].AsDouble();
7269 minscor.m_minprotfrac = args["minprotfrac"].AsDouble();
7270 minscor.m_endprotfrac = args["endprotfrac"].AsDouble();
7271 minscor.m_prot_cds_len = args["protcdslen"].AsInteger();
7272 minscor.m_cds_len = args["longenoughcds"].AsInteger();
7273 minscor.m_utr_clip_threshold = args["utrclipthreshold"].AsDouble();
7274 minscor.m_minsupport = args["minsupport"].AsInteger();
7275 minscor.m_minsupport_mrna = args["minsupport_mrna"].AsInteger();
7276 minscor.m_minsupport_rnaseq = args["minsupport_rnaseq"].AsInteger();
7277 minscor.m_minlen = args["minlen"].AsInteger();
7278
7279 chainer->SetMinInframeFrac(args["mininframefrac"].AsDouble());
7280
7281 chainer->m_data->altfrac = args["altfrac"].AsDouble();
7282 chainer->m_data->composite = args["composite"].AsInteger();
7283 chainer->m_data->allow_opposite_strand = args["opposite"];
7284 chainer->m_data->allow_partialalts = args["partialalts"];
7285 chainer->m_data->tolerance = args["tolerance"].AsInteger();
7286 chainer->m_data->no5pextension = args["no5pextension"];
7287
7288 chainer->m_data->min_cap_weight = args["min-cap-weight"].AsInteger();
7289 chainer->m_data->min_cap_blob = args["min-cap-blob"].AsInteger();
7290 chainer->m_data->min_polya_weight = args["min-polya-weight"].AsInteger();
7291 chainer->m_data->min_polya_blob = args["min-polya-blob"].AsInteger();
7292 chainer->m_data->max_dist = args["max-dist"].AsInteger();
7293 chainer->m_data->secondary_peak = args["secondary-peak"].AsDouble();
7294 chainer->m_data->tertiary_peak = args["tertiary-peak"].AsDouble();
7295 chainer->m_data->tertiary_peak_coverage = args["tertiary-peak-coverage"].AsDouble();
7296 chainer->m_data->min_flank_exon = args["min-flank-exon"].AsInteger();
7297 chainer->SetMinPolyA(args["minpolya"].AsInteger());
7298 chainer->m_data->use_confirmed_ends = args["use_confirmed_ends"];
7299
7300
7301
7302 CIdHandler cidh(scope);
7303
7304 map<string,TSignedSeqRange>& mrnaCDS = chainer->SetMrnaCDS();
7305 if(args["mrnaCDS"]) {
7306 if (args["mrnaCDS"].AsString()=="use_objmgr") {
7307 mrnaCDS[args["mrnaCDS"].AsString()] = TSignedSeqRange();
7308 } else {
7309 CNcbiIfstream cdsfile(args["mrnaCDS"].AsString().c_str());
7310 if (!cdsfile)
7311 NCBI_THROW(CGnomonException, eGenericError, "Cannot open file " + args["mrnaCDS"].AsString());
7312 string accession, tmp;
7313 int a, b;
7314 while(cdsfile >> accession >> a >> b) {
7315 _ASSERT(a > 0 && b > 0 && b > a);
7316 getline(cdsfile,tmp);
7317 accession = CIdHandler::ToString(*cidh.ToCanonical(*CIdHandler::ToSeq_id(accession)));
7318 mrnaCDS[accession] = TSignedSeqRange(a-1,b-1);
7319 }
7320 }
7321 }
7322
7323 map<string, pair<bool,bool> >& prot_complet = chainer->SetProtComplet();
7324 if(args["pinfo"]) {
7325 CNcbiIfstream protfile(args["pinfo"].AsString().c_str());
7326 if (!protfile)
7327 NCBI_THROW(CGnomonException, eGenericError, "Cannot open file " + args["pinfo"].AsString());
7328 string seqid_str;
7329 bool fivep;
7330 bool threep;
7331 while(protfile >> seqid_str >> fivep >> threep) {
7332 seqid_str = CIdHandler::ToString(*CIdHandler::ToSeq_id(seqid_str));
7333 prot_complet[seqid_str] = make_pair(fivep, threep);
7334 }
7335 }
7336 }
7337
OverlappingIndel(int pos,const CInDelInfo & indl)7338 bool OverlappingIndel(int pos, const CInDelInfo& indl) {
7339 if(indl.IsDeletion())
7340 return pos <= indl.InDelEnd();
7341 else
7342 return pos < indl.InDelEnd();
7343 }
7344
7345 //this just copies exona_indels unless genome corrections are used
7346 //extra_left/extra_right insertions at the ends of exon on Agenome
CombineCorrectionsAndIndels(const TSignedSeqRange exona,int extra_left,int extra_right,const TSignedSeqRange exonb,const TInDels & editing_indels_frombtoa,const TInDels & exona_indels)7347 TInDels CombineCorrectionsAndIndels(const TSignedSeqRange exona, int extra_left, int extra_right, const TSignedSeqRange exonb, const TInDels& editing_indels_frombtoa, const TInDels& exona_indels) {
7348 TInDels combined_indels;
7349
7350 TInDels::const_iterator ic = upper_bound(editing_indels_frombtoa.begin(), editing_indels_frombtoa.end(), exonb.GetFrom(), OverlappingIndel); // skip all correction ending before exonb
7351 for( ;ic != editing_indels_frombtoa.end() && ic->GetStatus() != CInDelInfo::eGenomeNotCorrect; ++ic); //skip ggaps and Ns
7352 if((ic == editing_indels_frombtoa.end() || ic->Loc() > exonb.GetTo()+1) && exona_indels.empty())
7353 return combined_indels;
7354
7355 typedef list<char> TCharList;
7356 TCharList edit; // edit for Bgenome->transceipt calculated in two steps: Bgenome->Agenome->transceipt
7357 // M match/mismatch
7358 // - skip one base
7359 // everything else insert this letter
7360
7361 //edit from B genome to A genome
7362 int pb = exonb.GetFrom();
7363 for( ;pb <= exonb.GetTo(); ++pb) {
7364 if(ic != editing_indels_frombtoa.end() && ic->Loc() <= pb) {
7365 if(ic->IsInsertion()) {
7366 int len = min(exonb.GetTo()+1,ic->InDelEnd())-max(exonb.GetFrom(),ic->Loc());
7367 edit.insert(edit.end(),len,'-');
7368 pb = ic->InDelEnd()-1;
7369 } else {
7370 string s = ic->GetInDelV();
7371 if(pb == exonb.GetFrom()) // include extra_left part of deletion
7372 s = s.substr(ic->Len()-extra_left);
7373 edit.insert(edit.end(),s.begin(),s.end());
7374 edit.push_back('M'); // base before deletion
7375 }
7376 ++ic;
7377 } else {
7378 edit.push_back('M');
7379 }
7380 }
7381 if(ic != editing_indels_frombtoa.end() && ic->Loc() == pb && ic->GetStatus() == CInDelInfo::eGenomeNotCorrect && extra_right > 0) { // include extra_right part of deletion
7382 _ASSERT(ic->IsDeletion());
7383 string s = ic->GetInDelV().substr(0,extra_right);
7384 edit.insert(edit.end(),s.begin(),s.end());
7385 }
7386 _ASSERT(exonb.GetLength() == count(edit.begin(),edit.end(),'M')+count(edit.begin(),edit.end(),'-'));
7387 _ASSERT(exona.GetLength() == (int)edit.size()-count(edit.begin(),edit.end(),'-'));
7388
7389 // adding changes from A to transcript
7390 if(!exona_indels.empty()) {
7391 TInDels::const_iterator jleft = exona_indels.begin();
7392 int pa = exona.GetFrom()-1;
7393 int skipsome = 0;
7394 ERASE_ITERATE(TCharList, ip, edit) {
7395 if(*ip == '-')
7396 continue;
7397 else
7398 ++pa;
7399
7400 if(jleft != exona_indels.end() && jleft->Loc() == pa) {
7401 if(jleft->IsInsertion()) { // skip extra bases on edited
7402 _ASSERT(skipsome == 0);
7403 skipsome = jleft->Len();
7404 // don't use reverse iterator for erasing
7405 for(TCharList::iterator ipp = ip; skipsome > 0 && ipp != edit.begin() && *(--ipp) != '-' && *ipp != 'M'; ) { // skip previosly inserted
7406 --skipsome;
7407 ipp = edit.erase(ipp);
7408 }
7409 } else { // insert extra bases in transcript
7410 _ASSERT(skipsome == 0);
7411 int insertsome = jleft->Len();
7412 for(reverse_iterator<TCharList::iterator> ir(ip); insertsome > 0 && ir != edit.rend() && *ir == '-'; ++ir) { // reuse skipped positions
7413 *ir = 'M';
7414 --insertsome;
7415 }
7416 if(insertsome > 0)
7417 edit.insert(ip,insertsome,'N');
7418 }
7419 ++jleft;
7420 }
7421
7422 if(skipsome > 0) {
7423 --skipsome;
7424 if(*ip == 'M')
7425 *ip = '-';
7426 else if(*ip != '-') // looks like *ip is never '-'
7427 edit.erase(ip);
7428 }
7429 }
7430 if(jleft != exona_indels.end()) {
7431 _ASSERT(jleft->IsDeletion() && jleft->Loc() == pa+1);
7432 int insertsome = jleft->Len();
7433 for(TCharList::reverse_iterator ir = edit.rbegin(); insertsome > 0 && ir != edit.rend() && *ir == '-'; ++ir) { // reuse skipped positions
7434 *ir = 'M';
7435 --insertsome;
7436 }
7437 if(insertsome > 0)
7438 edit.insert(edit.end(),insertsome,'N');
7439 }
7440 }
7441 _ASSERT(exonb.GetLength() == count(edit.begin(),edit.end(),'M')+count(edit.begin(),edit.end(),'-'));
7442
7443 //TODO: combine +- indels separated by short spans of Ms
7444 pb = exonb.GetFrom();
7445 for(TCharList::iterator ip = edit.begin(); ip != edit.end(); ) {
7446 if(*ip == 'M') {
7447 ++pb;
7448 ++ip;
7449 } else if(*ip == '-') {
7450 int len = 0;
7451 for( ;ip != edit.end() && *ip == '-'; ++ip, ++len);
7452 int pos = pb;
7453 pb += len;
7454 for( ;len > 0 && ip != edit.end() && *ip != 'M'; ++ip, --len); // we may have ----+++M but not +++--- (can't really happen unless corrections had adjacent -+)
7455 if(len > 0)
7456 combined_indels.push_back(CInDelInfo(pos,len,CInDelInfo::eIns));
7457 } else {
7458 string s;
7459 for( ;ip != edit.end() && *ip != 'M' && *ip != '-'; ++ip)
7460 s.push_back(*ip);
7461 combined_indels.push_back(CInDelInfo(pb, s.size(), CInDelInfo::eDel, s));
7462 }
7463 }
7464 _ASSERT(pb == exonb.GetTo()+1);
7465
7466 return combined_indels;
7467 }
7468
// Projects a model built on the edited contig chunk back to the original contig.
//
// Each exon of srcmodel is handled in order:
//  - an exon that starts exactly at, or ends exactly at the end of, a sequence
//    recorded in m_inserted_seqs becomes a ggap exon carrying the matching part
//    of that sequence and a correspondingly trimmed source range;
//  - any other exon is shrunk to points which exist on the original contig,
//    mapped through m_edited_contig_map, and its indels are recomputed with
//    CombineCorrectionsAndIndels(); entries of m_replacements inside the mapped
//    exon are added as mismatch records.
// Exon and indel coordinates of the result are shifted by m_limits.GetFrom().
//
// Returns an empty CGeneModel when the model cannot be projected: a
// single-exon model consisting of (or overlapping) filling only, or an exon
// with no projectable points.
CGeneModel CGnomonAnnotator_Base::MapOneModelToOrigContig(const CGeneModel& srcmodel) const {
    CGeneModel model = srcmodel;
    model.SetCdsInfo(CCDSInfo());
    model.CutExons(model.Limits());  // empty model with all attributes
    TInDels mapped_indels;

    const int exon_count = (int)srcmodel.Exons().size();
    const int chunk_origin = m_limits.GetFrom();

    for(int ie = 0; ie < exon_count; ++ie) {
        const CModelExon& e = srcmodel.Exons()[ie];
        const int exon_len = e.Limits().GetLength();

        string fill_seq;
        CInDelInfo::SSource fill_src;
        auto ggap = m_inserted_seqs.upper_bound(e.GetTo());  // first ggap on the right or end()
        if(ggap != m_inserted_seqs.begin()) {
            --ggap;                                           // first ggap starting at or left of GetTo()
            const int ggap_from = ggap->first;
            const int ggap_to = ggap->first+(int)ggap->second->GetInDelV().length()-1;
            if(ggap_from == e.GetFrom()) {                    // exon starts with the ggap
                fill_seq = ggap->second->GetInDelV().substr(0,exon_len);
                fill_src = ggap->second->GetSource();
                if(fill_src.m_strand == ePlus)
                    fill_src.m_range.SetTo(fill_src.m_range.GetFrom()+exon_len-1);
                else
                    fill_src.m_range.SetFrom(fill_src.m_range.GetTo()-exon_len+1);
            } else if(ggap_to == e.GetTo()) {                 // exon ends with the ggap
                const string whole = ggap->second->GetInDelV();
                fill_seq = whole.substr(whole.length()-exon_len);
                fill_src = ggap->second->GetSource();
                if(fill_src.m_strand == eMinus)
                    fill_src.m_range.SetTo(fill_src.m_range.GetFrom()+exon_len-1);
                else
                    fill_src.m_range.SetFrom(fill_src.m_range.GetTo()-exon_len+1);
            } else if(ggap_to >= e.GetFrom()) {               // all real alignment and some filling was clipped
                _ASSERT(srcmodel.Exons().size() == 1);
                return CGeneModel();
            }
        }

        if(!fill_seq.empty()) {
            // ggap exon
            if(exon_count == 1)                               // all real alignment was clipped
                return CGeneModel();

            if(model.Strand() == eMinus) {
                ReverseComplement(fill_seq.begin(), fill_seq.end());
                fill_src.m_strand = (fill_src.m_strand == ePlus ? eMinus : ePlus);
            }
            _ASSERT((int)fill_seq.length() == fill_src.m_range.GetLength());
            model.AddGgapExon(0, fill_seq, fill_src, false);
        } else {
            // normal exon
            TSignedSeqRange exon = m_edited_contig_map.ShrinkToRealPointsOnEdited(e.Limits());
            if(exon.Empty())                                  // not projectable exon
                return CGeneModel();

            const int extra_left = exon.GetFrom()-e.GetFrom();
            const int extra_right = e.GetTo()-exon.GetTo();

            exon = m_edited_contig_map.MapRangeEditedToOrig(exon,false);
            _ASSERT(exon.NotEmpty());

            // indels of the source model which touch this exon
            TInDels exon_indels;
            for(const auto& fs : srcmodel.FrameShifts()) {
                if(fs.IntersectingWith(e.GetFrom(),e.GetTo()))
                    exon_indels.push_back(fs);
            }
            TInDels efs = CombineCorrectionsAndIndels(e.Limits(), extra_left, extra_right, exon, m_editing_indels, exon_indels);

            // replaced bases inside the mapped exon become mismatch records;
            // runs of adjacent replacements are merged into one record
            TInDels erepl;
            auto repl = m_replacements.lower_bound(exon.GetFrom());   // first replacement in exon or end()
            for( ; repl != m_replacements.end() && repl->first <= exon.GetTo(); ++repl) {
                const int pos = repl->first;
                const char base = repl->second;
                TInDels::const_iterator hit = upper_bound(efs.begin(), efs.end(), pos, OverlappingIndel);   // skip all indels ending before mismatch
                const bool have_hit = (hit != efs.end());

                if(have_hit && hit->IsInsertion() && hit->Loc() <= pos && hit->InDelEnd() > pos)          // overlapping insertion
                    continue;

                const bool after_deletion = have_hit && hit->IsDeletion() && hit->Loc() == pos;           // deletion right before mismatch
                const bool extends_previous = !erepl.empty() && erepl.back().InDelEnd() == pos;
                if(after_deletion || !extends_previous) {
                    erepl.push_back(CInDelInfo(pos, 1, CInDelInfo::eMism, string(1,base)));
                } else {
                    const int start = erepl.back().Loc();
                    string merged = erepl.back().GetInDelV()+string(1,base);
                    erepl.back() = CInDelInfo(start, merged.size(), CInDelInfo::eMism, merged);
                }
            }
            efs.insert(efs.end(), erepl.begin(), erepl.end());
            sort(efs.begin(), efs.end());
            for(auto& fs : efs) {
                fs.SetLoc(fs.Loc()+chunk_origin);
                mapped_indels.push_back(fs);
            }

            exon.SetFrom(exon.GetFrom()+chunk_origin);
            exon.SetTo(exon.GetTo()+chunk_origin);
            model.AddNormalExon(exon, e.m_fsplice_sig, e.m_ssplice_sig, 0, false);
        }

        const bool has_next = ie < exon_count-1;
        if(has_next && (!e.m_ssplice || !srcmodel.Exons()[ie+1].m_fsplice))   // hole
            model.AddHole();
    }

    model.FrameShifts() = mapped_indels;
    model.SetCdsInfo(srcmodel.GetCdsInfo().MapFromOrigToEdited(srcmodel.GetAlignMap()));

    return model;
}
7573
7574
7575 /*
//currently not used for anything; will need separation of indels and replacements inputs if used
7577 void MapAlignsToOrigContig(TAlignModelList& aligns, const TInDels& corrections, int contig_size) {
7578 CGnomonAnnotator_Base::TGgapInfo inserted_seqs; // not used
7579 TInDels editing_indels;
7580 map<int,char> replacements;
7581
7582 ITERATE(TInDels, i, corrections) {
7583 if(i->IsMismatch()) {
7584 string seq = i->GetInDelV();
7585 for(int l = 0; l < i->Len(); ++l)
7586 replacements[i->Loc()+l] = seq[l];
7587 } else {
7588 editing_indels.push_back(*i);
7589 if(i->IsInsertion())
7590 contig_size += i->Len();
7591 else
7592 contig_size -= i->Len();
7593 }
7594 }
7595 CAlignMap edited_contig_map(0, contig_size-1, editing_indels.begin(), editing_indels.end());
7596
7597 ERASE_ITERATE(TAlignModelList, ia, aligns) {
7598 CAlignModel& align = *ia;
7599 CGeneModel model = MapOneModelToOrigContig(align, editing_indels, replacements, edited_contig_map, inserted_seqs);
7600 if(model.Limits().Empty()) {
7601 aligns.erase(ia);
7602 } else {
7603 _ASSERT(align.Exons().size() == model.Exons().size());
7604 if(align.Type()&CAlignModel::eProt)
7605 model.FrameShifts() = model.GetInDels(false);
7606 vector<TSignedSeqRange> transcript_exons;
7607 for(int i = 0; i < (int)align.Exons().size(); ++i)
7608 transcript_exons.push_back(align.TranscriptExon(i));
7609 CAlignMap amap(model.Exons(), transcript_exons, model.FrameShifts(), align.Orientation(), align.TargetLen());
7610 CConstRef<objects::CSeq_id> id = align.GetTargetId();
7611 *ia = CAlignModel(model,amap);
7612 ia->SetTargetId(*id);
7613 }
7614 }
7615 }
7616 */
7617
MapModelsToOrigContig(TGeneModelList & models) const7618 void CGnomonAnnotator_Base::MapModelsToOrigContig(TGeneModelList& models) const {
7619 ERASE_ITERATE(TGeneModelList, im, models) {
7620 CGeneModel model = MapOneModelToOrigContig(*im);
7621 if(model.Limits().Empty()) {
7622 models.erase(im);
7623 } else {
7624 NON_CONST_ITERATE(TInDels, i, model.FrameShifts()) {
7625 if(i->IsMismatch()) {
7626 i->SetStatus(CInDelInfo::eGenomeNotCorrect);
7627 } else {
7628 TIntMap::const_iterator conf = m_confirmed_bases_orig_len.upper_bound(i->Loc()); // confirmed on the right
7629 bool included = (conf != m_confirmed_bases_orig_len.begin() && (--conf)->first < i->Loc() && conf->first+conf->second >= i->InDelEnd());
7630
7631 TInDels::const_iterator ic = upper_bound(m_editing_indels.begin(), m_editing_indels.end(), i->Loc(), OverlappingIndel); // skip all correction ending before Loc()
7632 if(ic != m_editing_indels.end() && i->GetType() == ic->GetType() && i->Loc() >= ic->Loc() && i->InDelEnd() <= ic->InDelEnd()) {
7633 i->SetStatus(CInDelInfo::eGenomeNotCorrect);
7634 _ASSERT(included);
7635 } else if(included && (ic == m_editing_indels.end() || ic->Loc() > i->InDelEnd())) {
7636 i->SetStatus(CInDelInfo::eGenomeCorrect);
7637 }
7638 }
7639 }
7640 *im = model;
7641 }
7642 }
7643 }
7644
MapOneModelToEditedContig(const CGeneModel & align) const7645 CAlignModel CGnomonAnnotator_Base::MapOneModelToEditedContig(const CGeneModel& align) const
7646 {
7647 CAlignMap amap = align.GetAlignMap();
7648 CCDSInfo acds = align.GetCdsInfo();
7649 if(align.ReadingFrame().NotEmpty() && acds.IsMappedToGenome())
7650 acds = acds.MapFromOrigToEdited(amap);
7651 amap.MoveOrigin(m_limits.GetFrom());
7652
7653 //mismatches are dropped at this point
7654 TInDels aindels = align.GetInDels(false);
7655
7656 //recalculate limits to contig chunk
7657 for(auto& indel : aindels)
7658 indel.SetLoc(indel.Loc()-m_limits.GetFrom());
7659
7660 CGeneModel::TExons aexons = align.Exons();
7661 for(auto& e : aexons) {
7662 if(e.Limits().NotEmpty()) {
7663 e.AddFrom(-m_limits.GetFrom());
7664 e.AddTo(-m_limits.GetFrom());
7665 }
7666 }
7667
7668 CGeneModel editedmodel = align;
7669 editedmodel.ClearExons(); // empty alignment with all atributes
7670
7671 vector<TSignedSeqRange> transcript_exons;
7672 TInDels editedindels;
7673 bool snap_to_codons = (align.Type() == CAlignModel::eProt);
7674
7675 for(int i = 0; i < (int)aexons.size(); ++i) {
7676 const CModelExon& e = aexons[i];
7677
7678 if(e.Limits().NotEmpty()) { // real exon
7679 list<CInDelInfo> exon_indels;
7680 ITERATE(TInDels, indl, aindels) {
7681 if(indl->IntersectingWith(e.GetFrom(), e.GetTo()))
7682 exon_indels.push_back(*indl);
7683 }
7684
7685 int left = e.GetFrom(); //projectable boundary
7686 int left_shrink = 0; //unprojectable touching insertion
7687 int right = e.GetTo(); //projectable boundary
7688 int right_shrink = 0; //unprojectable touching insertion
7689 int left_extend = 0; //both alignment and correction indicate deletion of left_extend bases
7690 int right_extend = 0; //both alignment and correction indicate deletion of right_extend base
7691 CAlignMap::ERangeEnd lend = CAlignMap::eLeftEnd;
7692 CAlignMap::ERangeEnd rend = CAlignMap::eRightEnd;
7693
7694 TSignedSeqRange left_codon;
7695 TSignedSeqRange right_codon;
7696 if(align.Type() == CAlignModel::eProt) {
7697 if(i == 0)
7698 left_codon = (align.Strand() == ePlus ? acds.Start() : acds.Stop());
7699 if(i == (int)aexons.size()-1)
7700 right_codon = (align.Strand() == ePlus ? acds.Stop() : acds.Start());
7701
7702 left_codon = amap.MapRangeEditedToOrig(left_codon, false);
7703 right_codon = amap.MapRangeEditedToOrig(right_codon, false);
7704 }
7705
7706 TInDels::const_iterator ileft = upper_bound(m_editing_indels.begin(), m_editing_indels.end(), left, OverlappingIndel); // skip all correction left of exon (doesn't skip touching deletion)
7707 for( ;ileft != m_editing_indels.end() && ileft->GetStatus() != CInDelInfo::eGenomeNotCorrect; ++ileft); //skip ggaps and Ns
7708
7709 if(ileft != m_editing_indels.end() && ileft->IsDeletion() && ileft->Loc() == left) {
7710 if(!exon_indels.empty() && exon_indels.front().IsDeletion() && exon_indels.front().Loc() == left) {// ileft is touching deletion and there is matching indel in alignmnet
7711 _ASSERT(left_codon.Empty());
7712 left_extend = min(ileft->Len(),exon_indels.front().Len());
7713 }
7714 ++ileft;
7715 }
7716
7717 //adjust left end
7718 int ll = left;
7719 if(left_codon.GetLength() == 3)
7720 ll = left_codon.GetTo();
7721 if(ileft != m_editing_indels.end() && ileft->Loc() <= ll) { // left end is involved
7722 if(e.m_fsplice) { // move splice to projectable point, add indels to keep the texon length
7723 _ASSERT(left_codon.Empty());
7724 left = ileft->Loc()+ileft->Len(); //could be only touching insertion
7725 if(left > right)
7726 return CAlignModel();
7727 left_shrink = left-e.GetFrom();
7728 } else {
7729 // clip to commom projectable point
7730 TSignedSeqRange lim = e.Limits();
7731 if(left_codon.GetLength() == 3)
7732 lim.SetFrom(left_codon.GetTo()+1);
7733 while(ileft != m_editing_indels.end() && ileft->Loc() <= lim.GetFrom()) {
7734 lim.SetFrom(ileft->InDelEnd());
7735 if(lim.NotEmpty())
7736 lim = amap.ShrinkToRealPoints(lim, snap_to_codons); // skip alignment indels
7737 if(lim.Empty())
7738 return CAlignModel();
7739
7740 for( ;ileft != m_editing_indels.end() && ileft->InDelEnd() <= lim.GetFrom(); ++ileft); // skip outside corrections
7741 }
7742
7743 left = lim.GetFrom();
7744 while(!exon_indels.empty() && exon_indels.front().InDelEnd() <= left)
7745 exon_indels.pop_front();
7746 lend = CAlignMap::eSinglePoint; // is used for transcript exon
7747 }
7748 }
7749
7750 TInDels::const_iterator first_outside = ileft;
7751 for( ; first_outside != m_editing_indels.end() && first_outside->Loc() <= (first_outside->IsInsertion() ? right : right+1); ++first_outside); // end() or first completely on right
7752 reverse_iterator<TInDels::const_iterator> iright(first_outside); // previous correction (last which interferes with exon or rend())
7753 for( ;iright != m_editing_indels.rend() && iright->GetStatus() != CInDelInfo::eGenomeNotCorrect; ++iright); //skip ggaps and Ns
7754
7755 if(iright != m_editing_indels.rend() && iright->IsDeletion() && iright->Loc() == right+1) {
7756 if(!exon_indels.empty() && exon_indels.back().IsDeletion() && exon_indels.back().Loc() == right+1) { // touching deletion and there is matching indel in alignmnet
7757 _ASSERT(right_codon.Empty());
7758 right_extend = min(iright->Len(),exon_indels.back().Len());
7759 }
7760 ++iright;
7761 }
7762
7763 //adjust right end
7764 int rr = right;
7765 if(right_codon.GetLength() == 3)
7766 rr = right_codon.GetFrom();
7767 if(iright != m_editing_indels.rend() && iright->InDelEnd() > rr) { // right end is involved
7768 if(e.m_ssplice) { // move splice to projectable point, add indels to keep the texon length
7769 _ASSERT(right_codon.Empty());
7770 right = iright->Loc()-1;
7771 if(right < left)
7772 return CAlignModel();
7773 right_shrink = e.GetTo()-right;
7774 } else {
7775 // clip to commom projectable point
7776 TSignedSeqRange lim = e.Limits();
7777 if(right_codon.GetLength() == 3)
7778 lim.SetTo(right_codon.GetFrom()-1);
7779 while(iright != m_editing_indels.rend() && iright->InDelEnd() > lim.GetTo()) { // iright is insertion including right position
7780 lim.SetTo(iright->Loc()-1);
7781 if(lim.NotEmpty())
7782 lim = amap.ShrinkToRealPoints(lim, snap_to_codons); // skip alignment indels
7783 if(lim.Empty())
7784 return CAlignModel();
7785
7786 for( ; iright != m_editing_indels.rend() && iright->Loc() > lim.GetTo(); ++iright); // skip outside corrections
7787 }
7788
7789 right = lim.GetTo();
7790 while(!exon_indels.empty() && exon_indels.back().Loc() > right)
7791 exon_indels.pop_back();
7792 rend = CAlignMap::eSinglePoint; // is used for transcript exon
7793 }
7794 }
7795
7796 TSignedSeqRange orig_exon(left-left_shrink, right+right_shrink);
7797 TSignedSeqRange texon = amap.MapRangeOrigToEdited(orig_exon, lend, rend);
7798 transcript_exons.push_back(texon);
7799
7800 TSignedSeqRange corrected_exon = m_edited_contig_map.MapRangeOrigToEdited(TSignedSeqRange(left, right), false);
7801 _ASSERT(corrected_exon.NotEmpty());
7802 //TODO: account for left/right shrink? Whe projected back, this will move all isertion inside the exon
7803 corrected_exon.SetFrom(corrected_exon.GetFrom()-left_extend);
7804 corrected_exon.SetTo(corrected_exon.GetTo()+right_extend);
7805 editedmodel.AddExon(corrected_exon, e.m_fsplice_sig, e.m_ssplice_sig, e.m_ident);
7806 if(i < (int)aexons.size()-1 && (!aexons[i].m_ssplice || !aexons[i+1].m_fsplice)) // hole
7807 editedmodel.AddHole();
7808
7809
7810 TInDels efs = CombineCorrectionsAndIndels(orig_exon, left_shrink, right_shrink, corrected_exon, m_reversed_corrections, TInDels(exon_indels.begin(), exon_indels.end()));
7811 editedindels.insert(editedindels.end(), efs.begin(), efs.end());
7812 } else { // gap exon
7813 transcript_exons.push_back(align.TranscriptExon(i));
7814 string gap_seq = e.m_seq;
7815 if(align.Orientation() == eMinus)
7816 ReverseComplement(gap_seq.begin(), gap_seq.end());
7817
7818 TInDels::const_iterator gap = m_editing_indels.end();
7819 ITERATE(TInDels, ig, m_editing_indels) {
7820 if(ig->GetSource().m_range.NotEmpty()) { //ggap
7821 if(i > 0 && ig->Loc() < aexons[i-1].GetTo())
7822 continue;
7823 if(i == 0 && ig->Loc() > aexons[i+1].GetFrom())
7824 break;
7825 if(ig->GetInDelV() == gap_seq) {
7826 gap = ig;
7827 if(i > 0) break; //first available for all exons except the first one
7828 }
7829 }
7830 }
7831 _ASSERT(gap != m_editing_indels.end());
7832
7833 int left_end = m_edited_contig_map.MapOrigToEdited(gap->Loc());
7834 if(left_end >= 0) {
7835 left_end -= gap->Len();
7836 for(TInDels::const_iterator ig = gap+1; ig != m_editing_indels.end() && ig->Loc() == gap->Loc(); ++ig)
7837 left_end -= ig->Len();
7838 } else {
7839 left_end = m_edited_contig_map.MapOrigToEdited(gap->Loc()-1);
7840 _ASSERT(left_end >= 0);
7841 left_end += 1;
7842 for(TInDels::const_iterator ig = gap; ig != m_editing_indels.begin() && (ig-1)->Loc() == gap->Loc(); --ig) {
7843 left_end += (ig-1)->Len();
7844 }
7845 }
7846
7847 editedmodel.AddExon(TSignedSeqRange(left_end,left_end+gap->Len()-1), "XX", "XX", 1);
7848 }
7849 }
7850
7851 CAlignMap editedamap(editedmodel.Exons(), transcript_exons, editedindels, align.Orientation(), amap.TargetLen());
7852
7853 editedmodel.FrameShifts() = editedindels;
7854 CAlignModel editedalign(editedmodel, editedamap);
7855
7856 _ASSERT(align.GetEdgeReadingFrames()->empty());
7857
7858 if(align.ReadingFrame().NotEmpty()) {
7859 double score = acds.Score();
7860 bool open = acds.OpenCds();
7861 acds.Clip(editedalign.TranscriptLimits());
7862 acds.SetScore(score, open);
7863 editedalign.SetCdsInfo(acds.MapFromEditedToOrig(editedamap));
7864 }
7865
7866 return editedalign;
7867 }
7868
MapAlignmentsToEditedContig(TAlignModelList & alignments) const7869 void CGnomonAnnotator_Base::MapAlignmentsToEditedContig(TAlignModelList& alignments) const
7870 {
7871 ERASE_ITERATE(TAlignModelList, ia, alignments) {
7872 CAlignModel a = MapOneModelToEditedContig(*ia);
7873 if(a.Limits().NotEmpty()) {
7874 a.SetTargetId(*ia->GetTargetId());
7875 *ia = a;
7876 } else {
7877 alignments.erase(ia);
7878 }
7879 }
7880 }
7881
MapModelsToEditedContig(TGeneModelList & models) const7882 void CGnomonAnnotator_Base::MapModelsToEditedContig(TGeneModelList& models) const
7883 {
7884 NON_CONST_ITERATE(TGeneModelList, ia, models) {
7885 *ia = MapOneModelToEditedContig(*ia);
7886 _ASSERT(!ia->Exons().empty());
7887 }
7888 }
7889
SetGenomic(const CResidueVec & seq)7890 void CGnomonAnnotator_Base::SetGenomic(const CResidueVec& seq)
7891 {
7892 m_edited_contig_map = CAlignMap(0, seq.size()-1);
7893 m_editing_indels.clear();
7894 m_reversed_corrections.clear();
7895 m_confirmed_bases_len.clear();
7896 m_confirmed_bases_orig_len.clear();
7897 m_replacements.clear();
7898 m_inserted_seqs.clear();
7899 m_notbridgeable_gaps_len.clear();
7900 m_contig_acc.clear();
7901 m_gnomon.reset(new CGnomonEngine(m_hmm_params, seq, TSignedSeqRange::GetWhole()));
7902 }
7903
7904 // SetGenomic for annot - models could be 0
SetGenomic(const CSeq_id & contig,CScope & scope,const string & mask_annots,const TGeneModelList * models)7905 void CGnomonAnnotator_Base::SetGenomic(const CSeq_id& contig, CScope& scope, const string& mask_annots, const TGeneModelList* models) {
7906 SCorrectionData correction_data;
7907 m_notbridgeable_gaps_len.clear();
7908
7909 if(models) {
7910 CBioseq_Handle bh(scope.GetBioseqHandle(contig));
7911 CSeqVector sv (bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac));
7912 int length (sv.size());
7913 string seq_txt;
7914 sv.GetSeqData(0, length, seq_txt);
7915
7916 TIVec exons(length,0);
7917
7918 ITERATE(TGeneModelList, i, *models) {
7919 ITERATE(CGeneModel::TExons, e, i->Exons()) {
7920 if(e->Limits().NotEmpty()) {
7921 int a = e->GetFrom();
7922 // if(a > 0 && !sv.IsInGap(a-1)) --a;
7923 // if(a > 0 && !sv.IsInGap(a-1)) --a;
7924 int b = e->GetTo();
7925 // if(b < length-1 && !sv.IsInGap(b+1)) ++b;
7926 // if(b < length-1 && !sv.IsInGap(b+1)) ++b;
7927 // for(int p = a; p <= b; ++p) { // block all exons and splices
7928 for(int p = a+1; p <= b; ++p) { // block all exons except first base (can't keep splices after all)
7929 exons[p] = 1; // mark positions which cannot be used for deletions
7930 } // !!!!!!!!it is still a problem if gapfilled models are exactly next to each other!!!!!!!!!!!!!!
7931 }
7932 }
7933 }
7934
7935 TIVec model_ranges(length,0);
7936
7937 ITERATE(TGeneModelList, i, *models) {
7938 for(int p = max(0,i->Limits().GetFrom()-2); p <= min(length-1,i->Limits().GetTo()+2); ++p)
7939 model_ranges[p] = 1;
7940
7941 ITERATE(TInDels, indl, i->FrameShifts()) {
7942 if(indl->GetStatus() == CInDelInfo::eGenomeNotCorrect) {
7943 if(indl->IsMismatch()) {
7944 string s = indl->GetInDelV();
7945 for(int l = 0; l < indl->Len(); ++l)
7946 correction_data.m_replacements[indl->Loc()+l] = s[l];
7947 } else {
7948 correction_data.m_correction_indels.push_back(*indl);
7949 }
7950 }
7951 if(indl->GetStatus() != CInDelInfo::eUnknown) {
7952 correction_data.m_confirmed_intervals.push_back(TSignedSeqRange(indl->Loc()-1,indl->InDelEnd()));
7953 _ASSERT(correction_data.m_confirmed_intervals.back().GetFrom() >= 0 && correction_data.m_confirmed_intervals.back().GetTo() < length);
7954 }
7955 }
7956 for(int ie = 0; ie < (int)i->Exons().size(); ++ie) {
7957 const CModelExon& e = i->Exons()[ie];
7958 if(e.Limits().Empty()) {
7959 int pos;
7960 if(ie > 0) {
7961 _ASSERT(i->Exons()[ie-1].Limits().NotEmpty());
7962 for(pos = i->Exons()[ie-1].GetTo()+1; pos < length && exons[pos] > 0; ++pos);
7963 } else {
7964 _ASSERT((int)i->Exons().size() > 1 && i->Exons()[1].Limits().NotEmpty());
7965 // for(pos = i->Exons()[1].GetFrom(); pos > 0 && exons[pos-1] > 0; --pos);
7966 for(pos = i->Exons()[1].GetFrom(); pos > 0 && exons[pos] > 0; --pos);
7967 }
7968 string seq = e.m_seq;
7969 CInDelInfo::SSource source = e.m_source;
7970 if(i->Strand() == eMinus) {
7971 ReverseComplement(seq.begin(),seq.end());
7972 source.m_strand = OtherStrand(source.m_strand);
7973 }
7974 correction_data.m_correction_indels.push_back(CInDelInfo(pos, seq.length(), CInDelInfo::eDel, seq, source));
7975 }
7976 }
7977 }
7978
7979 uniq(correction_data.m_correction_indels); //remove duplicates from altvariants
7980 ERASE_ITERATE(TInDels, indl, correction_data.m_correction_indels) { // remove 'partial' indels
7981 TInDels::iterator next = indl;
7982 if(++next != correction_data.m_correction_indels.end() && indl->Loc() == next->Loc()) {
7983 if(indl->GetSource().m_range.Empty() && next->GetSource().m_range.Empty()) {
7984 _ASSERT(indl->IsDeletion());
7985 _ASSERT(next->IsDeletion());
7986 VECTOR_ERASE(indl, correction_data.m_correction_indels);
7987 }
7988 }
7989 }
7990
7991 TIntMap::iterator current_gap = m_notbridgeable_gaps_len.end();
7992 for(int i = 0; i < length; ++i) {
7993 if(model_ranges[i])
7994 continue;
7995
7996 CConstRef<CSeq_literal> gsl = sv.GetGapSeq_literal(i);
7997 if(gsl && gsl->GetBridgeability() == CSeq_literal::e_NotBridgeable) {
7998 if(current_gap == m_notbridgeable_gaps_len.end())
7999 current_gap = m_notbridgeable_gaps_len.insert(TIntMap::value_type(i,1)).first;
8000 else
8001 ++current_gap->second;
8002 } else {
8003 current_gap = m_notbridgeable_gaps_len.end();
8004 }
8005 }
8006 }
8007
8008 SetGenomic(contig, scope, correction_data, TSignedSeqRange::GetWhole(), mask_annots);
8009 }
8010
SetGenomic(const CSeq_id & contig,CScope & scope,const SCorrectionData & correction_data,TSignedSeqRange limits,const string & mask_annots)8011 void CGnomonAnnotator_Base::SetGenomic(const CSeq_id& contig, CScope& scope, const SCorrectionData& correction_data, TSignedSeqRange limits, const string& mask_annots)
8012 {
8013 m_contig_acc = CIdHandler::ToString(contig);
8014
8015 CResidueVec seq;
8016 int length;
8017
8018 CBioseq_Handle bh(scope.GetBioseqHandle(contig));
8019 {
8020 CSeqVector sv (bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac));
8021 length = sv.size();
8022 if(limits == TSignedSeqRange::GetWhole()) {
8023 limits.SetFrom(0);
8024 limits.SetTo(length-1);
8025 }
8026 int GC_RANGE = 200000;
8027 limits.SetFrom(max(0, limits.GetFrom()-GC_RANGE/2));
8028 limits.SetTo(min(length-1, limits.GetTo()+GC_RANGE/2));
8029 length = limits.GetLength();
8030 m_limits = limits;
8031 seq.reserve(length);
8032 for(int i = limits.GetFrom(); i <= limits.GetTo(); ++i)
8033 seq.push_back(sv[i]);
8034 }
8035
8036 if (m_masking) {
8037 SAnnotSelector sel;
8038 {
8039 list<string> arr;
8040 NStr::Split(mask_annots, " ", arr, NStr::fSplit_MergeDelimiters|NStr::fSplit_Truncate);
8041 ITERATE(list<string>, annot, arr) {
8042 sel.AddNamedAnnots(*annot);
8043 }
8044 }
8045 sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_repeat_region)
8046 .SetResolveAll()
8047 .SetAdaptiveDepth(true);
8048 for (CFeat_CI it(bh, sel); it; ++it) {
8049 TSeqRange range = it->GetLocation().GetTotalRange();
8050 for(unsigned int i = range.GetFrom(); i <= range.GetTo(); ++i) {
8051 if(Include(limits, i))
8052 seq[i-limits.GetFrom()] = tolower(seq[i-limits.GetFrom()]);
8053 }
8054 }
8055 }
8056
8057 m_editing_indels.clear();
8058 m_reversed_corrections.clear();
8059 m_confirmed_bases_len.clear();
8060 m_confirmed_bases_orig_len.clear();
8061 m_replacements.clear();
8062 m_inserted_seqs.clear();
8063
8064 m_replacements = correction_data.m_replacements;
8065 for(map<int,char>::iterator ir = m_replacements.begin(); ir != m_replacements.end(); ++ir) {
8066 if(Include(limits,ir->first)) {
8067 m_replaced_bases[ir->first-limits.GetFrom()] = seq[ir->first-limits.GetFrom()];
8068 seq[ir->first-limits.GetFrom()] = ir->second;
8069 }
8070 }
8071
8072
8073 #define BLOCK_OF_Ns 35
8074 for(auto cor : correction_data.m_correction_indels) {
8075 if(cor.GetSource().m_range.Empty() && Include(limits, cor.Loc())) { // correction indel
8076 cor.SetLoc(cor.Loc()-limits.GetFrom());
8077 m_editing_indels.push_back(cor);
8078 } else if(cor.Loc() >= limits.GetFrom() && cor.Loc() <= limits.GetTo()+1) { // ggap (1bp fake ggaps may be loctated right before or after contig)
8079 int l = cor.Loc()-limits.GetFrom();
8080 CInDelInfo g(l, cor.Len(), cor.GetType(), cor.GetInDelV(), cor.GetSource());
8081 //surround ggap with Ns to satisfy MinIntron
8082 CInDelInfo Ns(l, BLOCK_OF_Ns, CInDelInfo::eDel, string(BLOCK_OF_Ns,'N'));
8083 m_editing_indels.push_back(Ns);
8084 m_editing_indels.push_back(g);
8085 m_editing_indels.push_back(Ns);
8086 }
8087 }
8088
8089 m_edited_contig_map = CAlignMap(0, length-1, m_editing_indels.begin(), m_editing_indels.end());
8090 {
8091 CResidueVec editedseq;
8092 m_edited_contig_map.EditedSequence(seq,editedseq);
8093 swap(seq, editedseq);
8094 }
8095
8096 ITERATE(TInDels, ig, m_editing_indels) {
8097 TInDels::const_iterator nexti = next(ig);
8098 if(nexti != m_editing_indels.end() && nexti->GetSource().m_range.NotEmpty() && nexti->Loc() == ig->Loc()) // block of Ns
8099 continue;
8100
8101 if(ig->GetSource().m_range.NotEmpty()) { //ggap
8102 int left_end = m_edited_contig_map.MapOrigToEdited(ig->Loc());
8103 if(left_end >= 0) {
8104 left_end -= ig->Len();
8105 for(TInDels::const_iterator igg = ig+1; igg != m_editing_indels.end() && igg->Loc() == ig->Loc(); ++igg)
8106 left_end -= igg->Len();
8107 } else {
8108 left_end = m_edited_contig_map.MapOrigToEdited(ig->Loc()-1);
8109 _ASSERT(left_end >= 0);
8110 left_end += 1;
8111 for(TInDels::const_iterator i = ig; i != m_editing_indels.begin() && (i-1)->Loc() == ig->Loc(); --i) {
8112 left_end += (i-1)->Len();
8113 }
8114 }
8115 m_inserted_seqs[left_end] = ig;
8116 ++ig; // skip block of Ns
8117 } else {
8118 int loc = m_edited_contig_map.MapOrigToEdited(ig->InDelEnd());
8119 _ASSERT(loc >= 0);
8120 if(ig->IsInsertion()) {
8121 string s(seq.begin()+ig->Loc(), seq.begin()+ig->Len());
8122 m_reversed_corrections.push_back(CInDelInfo(loc, ig->Len(), CInDelInfo::eDel, NStr::ToUpper(s)));
8123 } else {
8124 m_reversed_corrections.push_back(CInDelInfo(loc-ig->Len(), ig->Len(), CInDelInfo::eIns));
8125 }
8126 m_reversed_corrections.back().SetStatus(ig->GetStatus());
8127 }
8128 }
8129
8130 set<int> confirmed_bases;
8131 for(list<TSignedSeqRange>::const_iterator it = correction_data.m_confirmed_intervals.begin(); it != correction_data.m_confirmed_intervals.end(); ++it) {
8132 TSignedSeqRange lim = *it;
8133 _ASSERT(lim.NotEmpty());
8134 for(int p = lim.GetFrom(); p <= lim.GetTo(); ++p)
8135 confirmed_bases.insert(p);
8136 }
8137 TIntMap::iterator cbase_len = m_confirmed_bases_orig_len.end();
8138 ITERATE(set<int>, ip, confirmed_bases) {
8139 if(cbase_len == m_confirmed_bases_orig_len.end() || *ip != cbase_len->first+cbase_len->second)
8140 cbase_len = m_confirmed_bases_orig_len.insert(TIntMap::value_type(*ip,1)).first;
8141 else
8142 ++cbase_len->second;
8143 }
8144
8145 ITERATE(TIntMap, ic, m_confirmed_bases_orig_len) {
8146 TSignedSeqRange lim(ic->first, ic->first+ic->second-1);
8147 lim = m_edited_contig_map.MapRangeOrigToEdited(lim, false);
8148 _ASSERT(lim.NotEmpty());
8149 m_confirmed_bases_len[lim.GetFrom()] = lim.GetLength();
8150 }
8151
8152 TIntMap notbridgeable_gaps_len;
8153 ITERATE(TIntMap, ig, m_notbridgeable_gaps_len) {
8154 int pos = m_edited_contig_map.MapOrigToEdited(ig->first);
8155 _ASSERT(pos >= 0);
8156 notbridgeable_gaps_len[pos] = ig->second;
8157 }
8158 m_notbridgeable_gaps_len = notbridgeable_gaps_len;
8159
8160
8161 m_gnomon.reset(new CGnomonEngine(m_hmm_params, move(seq), TSignedSeqRange::GetWhole()));
8162 }
8163
GetGnomon()8164 CGnomonEngine& CGnomonAnnotator_Base::GetGnomon()
8165 {
8166 return *m_gnomon;
8167 }
8168
MarkupCappedEst(const set<string> & _caps,int _capgap)8169 MarkupCappedEst::MarkupCappedEst(const set<string>& _caps, int _capgap)
8170 : caps(_caps)
8171 , capgap(_capgap)
8172 {}
8173
transform_align(CAlignModel & align)8174 void MarkupCappedEst::transform_align(CAlignModel& align)
8175 {
8176 string acc = CIdHandler::ToString(*align.GetTargetId());
8177 int fivep = align.TranscriptExon(0).GetFrom();
8178 if(align.Strand() == eMinus)
8179 fivep = align.TranscriptExon(align.Exons().size()-1).GetFrom();
8180 if((align.Status()&CGeneModel::eReversed) == 0 && caps.find(acc) != caps.end() && fivep < capgap)
8181 align.Status() |= CGeneModel::eCap;
8182 }
8183
MarkupTrustedGenes(const set<string> & _trusted_genes)8184 MarkupTrustedGenes::MarkupTrustedGenes(const set<string>& _trusted_genes) : trusted_genes(_trusted_genes) {}
8185
transform_align(CAlignModel & align)8186 void MarkupTrustedGenes::transform_align(CAlignModel& align)
8187 {
8188 string acc = CIdHandler::ToString(*align.GetTargetId());
8189 if(trusted_genes.find(acc) != trusted_genes.end()) {
8190 CRef<CSeq_id> target_id(new CSeq_id);
8191 target_id->Assign(*align.GetTargetId());
8192 if(align.Type() == CGeneModel::eProt)
8193 align.InsertTrustedProt(target_id);
8194 else
8195 align.InsertTrustedmRNA(target_id);
8196 }
8197 }
8198
ProteinWithBigHole(double _hthresh,double _hmaxlen,CGnomonEngine & _gnomon)8199 ProteinWithBigHole::ProteinWithBigHole(double _hthresh, double _hmaxlen, CGnomonEngine& _gnomon)
8200 : hthresh(_hthresh), hmaxlen(_hmaxlen), gnomon(_gnomon) {}
model_predicate(CGeneModel & m)8201 bool ProteinWithBigHole::model_predicate(CGeneModel& m)
8202 {
8203 if ((m.Type() & CGeneModel::eProt)==0)
8204 return false;
8205 int total_hole_len = 0;
8206 for(unsigned int i = 1; i < m.Exons().size(); ++i) {
8207 if(!m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice)
8208 total_hole_len += m.Exons()[i].GetFrom()-m.Exons()[i-1].GetTo()-1;
8209 }
8210 if(total_hole_len < hmaxlen*m.Limits().GetLength())
8211 return false;
8212
8213 for(unsigned int i = 1; i < m.Exons().size(); ++i) {
8214 bool hole = !m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice;
8215 int intron = m.Exons()[i].GetFrom()-m.Exons()[i-1].GetTo()-1;
8216 if (hole && gnomon.GetChanceOfIntronLongerThan(intron) < hthresh) {
8217 return true;
8218 }
8219 }
8220 return false;
8221 }
8222
model_predicate(CGeneModel & m)8223 bool CdnaWithHole::model_predicate(CGeneModel& m)
8224 {
8225 if ((m.Type() & CGeneModel::eProt)!=0)
8226 return false;
8227 return !m.Continuous();
8228 }
8229
HasShortIntron(CGnomonEngine & _gnomon)8230 HasShortIntron::HasShortIntron(CGnomonEngine& _gnomon)
8231 :gnomon(_gnomon) {}
8232
model_predicate(CGeneModel & m)8233 bool HasShortIntron::model_predicate(CGeneModel& m)
8234 {
8235 for(unsigned int i = 1; i < m.Exons().size(); ++i) {
8236 bool hole = !m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice;
8237 int intron = m.Exons()[i].GetFrom()-m.Exons()[i-1].GetTo()-1;
8238 if (!hole && m.Exons()[i].m_fsplice_sig != "XX" && m.Exons()[i-1].m_ssplice_sig != "XX" && intron < gnomon.GetMinIntronLen()) {
8239 return true;
8240 }
8241 }
8242 return false;
8243 }
8244
HasLongIntron(CGnomonEngine & _gnomon)8245 HasLongIntron::HasLongIntron(CGnomonEngine& _gnomon)
8246 :gnomon(_gnomon) {}
8247
model_predicate(CGeneModel & m)8248 bool HasLongIntron::model_predicate(CGeneModel& m)
8249 {
8250 for(unsigned int i = 1; i < m.Exons().size(); ++i) {
8251 bool hole = !m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice;
8252 int intron = m.Exons()[i].GetFrom()-m.Exons()[i-1].GetTo()-1;
8253 if (!hole && intron > gnomon.GetMaxIntronLen()) {
8254 return true;
8255 }
8256 }
8257 return false;
8258 }
8259
CutShortPartialExons(int _minex)8260 CutShortPartialExons::CutShortPartialExons(int _minex)
8261 : minex(_minex) {}
8262
EffectiveExonLength(const CModelExon & e,const CAlignMap & alignmap,bool snap_to_codons)8263 int EffectiveExonLength(const CModelExon& e, const CAlignMap& alignmap, bool snap_to_codons) {
8264 TSignedSeqRange shrinkedexon = alignmap.ShrinkToRealPoints(e,snap_to_codons);
8265 int exonlen = alignmap.FShiftedLen(shrinkedexon,false); // length of the projection on transcript
8266 return min(exonlen,shrinkedexon.GetLength());
8267 }
8268
transform_align(CAlignModel & a)8269 void CutShortPartialExons::transform_align(CAlignModel& a)
8270 {
8271 if (a.Exons().empty())
8272 return;
8273
8274 CAlignMap alignmap(a.GetAlignMap());
8275 if(a.Exons().size() == 1 && min(a.Limits().GetLength(),alignmap.FShiftedLen(alignmap.ShrinkToRealPoints(a.Limits()),false)) < 2*minex) {
8276 // one exon and it is short
8277 a.CutExons(a.Limits());
8278 return;
8279 }
8280
8281 bool snap_to_codons = ((a.Type() & CAlignModel::eProt)!=0);
8282 TSignedSeqPos left = a.Limits().GetFrom();
8283 if ((a.Exons().size() > 1 && !a.Exons().front().m_ssplice) || (a.Type() & CAlignModel::eProt)==0 || !a.LeftComplete()) {
8284 for(unsigned int i = 0; i < a.Exons().size()-1; ++i) {
8285 if(EffectiveExonLength(a.Exons()[i], alignmap, snap_to_codons) >= minex) {
8286 break;
8287 } else {
8288 left = a.Exons()[i+1].GetFrom();
8289 if(a.Strand() == ePlus && (a.Status()&CGeneModel::eCap) != 0)
8290 a.Status() ^= CGeneModel::eCap;
8291 if(a.Strand() == eMinus && (a.Status()&CGeneModel::ePolyA) != 0)
8292 a.Status() ^= CGeneModel::ePolyA;
8293 }
8294 }
8295 }
8296
8297 TSignedSeqPos right = a.Limits().GetTo();
8298 if ((a.Exons().size() > 1 && !a.Exons().back().m_fsplice) || (a.Type() & CAlignModel::eProt)==0 || !a.RightComplete()) {
8299 for(unsigned int i = a.Exons().size()-1; i > 0; --i) {
8300 if(EffectiveExonLength(a.Exons()[i], alignmap, snap_to_codons) >= minex) {
8301 break;
8302 } else {
8303 right = a.Exons()[i-1].GetTo();
8304 if(a.Strand() == eMinus && (a.Status()&CGeneModel::eCap) != 0)
8305 a.Status() ^= CGeneModel::eCap;
8306 if(a.Strand() == ePlus && (a.Status()&CGeneModel::ePolyA) != 0)
8307 a.Status() ^= CGeneModel::ePolyA;
8308 }
8309 }
8310 }
8311
8312 TSignedSeqRange newlimits(left,right);
8313 if(newlimits.NotEmpty()) {
8314 newlimits = alignmap.ShrinkToRealPoints(newlimits,snap_to_codons);
8315 if(newlimits != a.Limits()) {
8316 if(newlimits.GetLength() < 2*minex || alignmap.FShiftedLen(newlimits,false) < 2*minex) {
8317 a.CutExons(a.Limits());
8318 return;
8319 }
8320 a.Clip(newlimits,CAlignModel::eRemoveExons);
8321 }
8322 } else {
8323 a.CutExons(a.Limits());
8324 return;
8325 }
8326
8327
8328 for (size_t i = 1; i < a.Exons().size()-1; ++i) {
8329 const CModelExon* e = &a.Exons()[i];
8330
8331 while (!e->m_ssplice && EffectiveExonLength(*e, alignmap, snap_to_codons) < minex) {
8332
8333 if(i == 0) { //first exon
8334 a.CutExons(*e);
8335 e = &a.Exons()[0]; // we still have at least one exon
8336 break;
8337 }
8338
8339 //this point is not an indel and is a codon boundary for proteins
8340 TSignedSeqPos remainingpoint = alignmap.ShrinkToRealPoints(TSignedSeqRange(a.Exons().front().GetFrom(),a.Exons()[i-1].GetTo()),snap_to_codons).GetTo();
8341 TSignedSeqPos left = e->GetFrom();
8342 if(remainingpoint < a.Exons()[i-1].GetTo())
8343 left = remainingpoint+1;
8344 a.CutExons(TSignedSeqRange(left,e->GetTo()));
8345 --i;
8346 e = &a.Exons()[i];
8347 }
8348
8349 while (!e->m_fsplice && EffectiveExonLength(*e, alignmap, snap_to_codons) < minex) {
8350
8351 if(i == a.Exons().size()-1) { //last exon
8352 a.CutExons(*e);
8353 break;
8354 }
8355
8356 //this point is not an indel and is a codon boundary for proteins
8357 TSignedSeqPos remainingpoint = alignmap.ShrinkToRealPoints(TSignedSeqRange(a.Exons()[i+1].GetFrom(),a.Exons().back().GetTo()),snap_to_codons).GetFrom();
8358 TSignedSeqPos right = e->GetTo();
8359 if(remainingpoint > a.Exons()[i+1].GetFrom())
8360 right = remainingpoint-1;
8361
8362 a.CutExons(TSignedSeqRange(e->GetFrom(),right));
8363 e = &a.Exons()[i];
8364 }
8365 }
8366 return;
8367 }
8368
model_predicate(CGeneModel & m)8369 bool HasNoExons::model_predicate(CGeneModel& m)
8370 {
8371 return m.Exons().empty();
8372 }
8373
model_predicate(CGeneModel & m)8374 bool SingleExon_AllEst::model_predicate(CGeneModel& m)
8375 {
8376 return m.Exons().size() <= 1 && (m.Type() & (CAlignModel::eProt|CAlignModel::emRNA))==0;
8377 }
8378
model_predicate(CGeneModel & m)8379 bool SingleExon_Noncoding::model_predicate(CGeneModel& m)
8380 {
8381 return m.Exons().size() <= 1 && m.Score() == BadScore();
8382 }
8383
LowSupport_Noncoding(int _minsupport)8384 LowSupport_Noncoding::LowSupport_Noncoding(int _minsupport)
8385 : minsupport(_minsupport)
8386 {}
model_predicate(CGeneModel & m)8387 bool LowSupport_Noncoding::model_predicate(CGeneModel& m)
8388 {
8389 return m.Score() == BadScore() && int(m.Support().size()) < minsupport && (m.Type() & (CAlignModel::eProt|CAlignModel::emRNA))==0;
8390 }
8391
8392 END_SCOPE(gnomon)
8393 END_SCOPE(ncbi)
8394
8395
8396