1 /*  $Id: chainer.cpp 635425 2021-08-03 16:41:35Z fukanchi $
2   ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbiargs.hpp>
36 
37 #include <algo/gnomon/chainer.hpp>
38 #include <algo/gnomon/id_handler.hpp>
39 #include <algo/gnomon/gnomon_exception.hpp>
40 #include <algo/gnomon/glb_align.hpp>
41 
42 #include <util/sequtil/sequtil_manip.hpp>
43 
44 #include <algo/gnomon/gnomon_model.hpp>
45 #include <algo/gnomon/gnomon.hpp>
46 #include <algo/gnomon/annot.hpp>
47 
48 #include <map>
49 #include <sstream>
50 #include <tuple>
51 
52 #include <objects/general/Object_id.hpp>
53 #include <objmgr/object_manager.hpp>
54 #include <objmgr/feat_ci.hpp>
55 #include <objmgr/util/sequence.hpp>
56 
57 #include "gnomon_seq.hpp"
58 
59 
60 BEGIN_SCOPE(ncbi)
BEGIN_SCOPE(gnomon)61 BEGIN_SCOPE(gnomon)
62 
63 bool BelongToExon(const CGeneModel::TExons& exons, int pos) {
64     ITERATE(CGeneModel::TExons, i, exons) {
65         if(Include(i->Limits(),pos))
66             return true;
67     }
68     return false;
69 }
70 
71 class CChain;
72 typedef list<CChain> TChainList;
73 typedef list<CChain*> TChainPointerList;
74 
75 
76 struct SChainMember;
77 typedef vector<SChainMember*> TContained;
78 
79 typedef map<Int8,CAlignModel*> TOrigAligns;
80 typedef map<Int8,CGeneModel> TUnmodAligns;
81 struct SFShiftsCluster;
82 class CChainMembers;
83 
84 class CGene;
85 
86 class CChainer::CChainerImpl {
87 
88 private:
89     CChainerImpl(CRef<CHMMParameters>& hmm_params, unique_ptr<CGnomonEngine>& gnomon, const CAlignMap& edited_contig_map, const TSignedSeqRange& limits, const string& m_contig_acc);
90     void SetGenomicRange(const TAlignModelList& alignments);
91     void SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments);
92 
93     void FilterOutChimeras(TGeneModelList& clust);
94 
95     TGeneModelList MakeChains(TGeneModelList& models, bool coding_estimates_only);
96     void FilterOutBadScoreChainsHavingBetterCompatibles(TGeneModelList& chains);
97     void CombineCompatibleChains(TChainList& chains);
98     void SetFlagsForChains(TChainList& chains);
99     SChainMember* FindOptimalChainForProtein(TContained& pointers_all, vector<CGeneModel*>& parts, CGeneModel& palign);
100     void CreateChainsForPartialProteins(TChainList& chains, TContained& pointers, TGeneModelList& unma_aligns, CChainMembers& unma_members);
101     void CutParts(TGeneModelList& clust);
102     bool CanIncludeJinI(const SChainMember& mi, const SChainMember& mj);
103     void IncludeInContained(SChainMember& big, SChainMember& small);
104     void FindContainedAlignments(TContained& pointers);
105     void DuplicateNotOriented(CChainMembers& pointers, TGeneModelList& clust);
106     void Duplicate5pendsAndShortCDSes(CChainMembers& pointers);
107     void ReplicatePStops(CChainMembers& pointers);
108     void ScoreCdnas(CChainMembers& pointers);
109     void DuplicateUTRs(CChainMembers& pointers);
110     void CalculateSpliceWeights(CChainMembers& pointers);
111     bool LRCanChainItoJ(int& delta_cds, double& delta_num, double& delta_splice_num, SChainMember& mi, SChainMember& mj, TContained& contained);
112     void LRIinit(SChainMember& mi);
113     void LeftRight(TContained& pointers);
114     void RightLeft(TContained& pointers);
115     double GoodCDNAScore(const CGeneModel& algn);
116     void RemovePoorCds(CGeneModel& algn, double minscor);
117     void SkipReason(CGeneModel* orig_align, const string& comment);
118     bool AddIfCompatible(set<SFShiftsCluster>& fshift_clusters, const CGeneModel& algn);
119     bool FsTouch(const TSignedSeqRange& lim, const CInDelInfo& fs);
120     void SplitAlignmentsByStrand(const TGeneModelList& clust, TGeneModelList& clust_plus, TGeneModelList& clust_minus);
121 
122     void FindGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
123     void ReplacePseudoGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
124     void FindAltsForGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
125     void PlaceAllYouCan(list<CGene>& alts, TChainPointerList& not_placed_yet, TChainPointerList& rejected);
126     enum ECompat { eNotCompatible, eAlternative, eNested, eExternal, eOtherGene };
127     ECompat CheckCompatibility(const CGene& gene, const CChain& algn);
128     list<CGene> FindGenes(TChainList& cls);
129     void FilterOutSimilarsWithLowerScore(TChainPointerList& not_placed_yet, TChainPointerList& rejected);
130     void FilterOutTandemOverlap(TChainPointerList& not_placed_yet, TChainPointerList& rejected, double fraction);
131     void TrimAlignmentsIncludedInDifferentGenes(list<CGene>& genes);
132 
133 
134     CRef<CHMMParameters>& m_hmm_params;
135     unique_ptr<CGnomonEngine>& m_gnomon;
136     const CAlignMap& m_edited_contig_map;
137     const TSignedSeqRange& m_limits;
138     const string& m_contig_acc;
139 
140 
141     SMinScor minscor;
142     int intersect_limit;
143     int trim;
144     map<string,TSignedSeqRange> mrnaCDS;
145     map<string, pair<bool,bool> > prot_complet;
146     double mininframefrac;
147     bool no5pextension;
148 
149     int min_cap_weight;
150     int min_cap_blob;
151     int min_polya_weight;
152     int min_polya_blob;
153     int max_dist;
154     double secondary_peak;
155     double tertiary_peak;
156     double tertiary_peak_coverage;
157     int min_flank_exon;
158 
159     int minpolya;
160     bool use_confirmed_ends;
161     TIntMap confirmed_ends; // [splice], end
162 
163     TOrigAligns orig_aligns;
164     TUnmodAligns unmodified_aligns;
165 
166     map<TSignedSeqRange,int> mrna_count;
167     map<TSignedSeqRange,int> est_count;
168     map<TSignedSeqRange,int> rnaseq_count;
169     bool has_rnaseq;
170     set<TSignedSeqRange> oriented_introns_plus;
171     set<TSignedSeqRange> oriented_introns_minus;
172 
173     double altfrac;
174     int composite;
175     bool allow_opposite_strand;
176     bool allow_partialalts;
177     int tolerance;
178 
179     int m_idnext;
180     int m_idinc;
181 
182     TInDels all_frameshifts;
183 
184     int flex_len;
185 
186     friend class CChainer;
187     friend class CChainerArgUtil;
188 };
189 
CGnomonAnnotator_Base()190 CGnomonAnnotator_Base::CGnomonAnnotator_Base() : m_masking(false) { }
191 
~CGnomonAnnotator_Base()192 CGnomonAnnotator_Base::~CGnomonAnnotator_Base(){ }
193 
EnableSeqMasking()194 void CGnomonAnnotator_Base::EnableSeqMasking()
195 {
196     m_masking = true;
197 }
198 
CChainer()199 CChainer::CChainer()
200 {
201     m_data.reset( new CChainerImpl(m_hmm_params, m_gnomon, m_edited_contig_map, m_limits, m_contig_acc) );
202 }
203 
~CChainer()204 CChainer::~CChainer()
205 {
206 }
207 
CChainerImpl(CRef<CHMMParameters> & hmm_params,unique_ptr<CGnomonEngine> & gnomon,const CAlignMap & edited_contig_map,const TSignedSeqRange & limits,const string & contig_acc)208 CChainer::CChainerImpl::CChainerImpl(CRef<CHMMParameters>& hmm_params, unique_ptr<CGnomonEngine>& gnomon,  const CAlignMap& edited_contig_map, const TSignedSeqRange& limits, const string& contig_acc)
209     :m_hmm_params(hmm_params), m_gnomon(gnomon), m_edited_contig_map(edited_contig_map), m_limits(limits), m_contig_acc(contig_acc), m_idnext(1), m_idinc(1)
210 {
211 }
212 
MakeChains(TGeneModelList & models,bool coding_estimates_only)213 TGeneModelList CChainer::MakeChains(TGeneModelList& models, bool coding_estimates_only)
214 {
215     return m_data->MakeChains(models, coding_estimates_only);
216 }
217 
218 enum {
219     eCDS,
220     eLeftUTR,
221     eRightUTR
222 };
223 
224 typedef set<SChainMember*> TMemberPtrSet;
225 
226 struct SChainMember
227 {
SChainMemberSChainMember228     SChainMember() :
229         m_align(0), m_cds_info(0), m_align_map(0), m_left_member(0), m_right_member(0), m_sink_for_contained(0),
230         m_copy(0), m_contained(0), m_identical_count(0),
231         m_left_num(0), m_right_num(0), m_num(0),
232         m_splice_weight(0), m_left_splice_num(0), m_right_splice_num(0), m_splice_num(0),
233         m_type(eCDS), m_left_cds(0), m_right_cds(0), m_cds(0), m_included(false),  m_postponed(false),
234         m_marked_for_deletion(false), m_marked_for_retention(false), m_restricted_to_start(false),
235         m_gapped_connection(false), m_fully_connected_to_part(-1), m_not_for_chaining(false),
236         m_rlimb(numeric_limits<int>::max()),  m_llimb(numeric_limits<int>::max()), m_orig_align(0), m_unmd_align(0), m_mem_id(0) {}
237 
238     TContained CollectContainedForChain();
239     void MarkIncludedForChain();
240     void MarkPostponedForChain();
241     void MarkUnwantedCopiesForChain(const TSignedSeqRange& cds);
242     TContained CollectContainedForMemeber();
243     void AddToContained(TContained& contained, TMemberPtrSet& included_in_list);
244 
245     CGeneModel* m_align;
246     const CCDSInfo* m_cds_info;
247     CAlignMap* m_align_map;
248     SChainMember* m_left_member;
249     SChainMember* m_right_member;
250     SChainMember* m_sink_for_contained;
251     TContained* m_copy;      // is used to make sure that the copy of already incuded duplicated alignment is not included in contained and doesn't trigger a new chain genereation
252     TContained* m_contained;
253     int m_identical_count;
254     double m_left_num, m_right_num, m_num;
255     double m_splice_weight;
256     double m_left_splice_num, m_right_splice_num, m_splice_num;
257     int m_type, m_left_cds, m_right_cds, m_cds;
258     bool m_included;
259     bool m_postponed;
260     bool m_marked_for_deletion;
261     bool m_marked_for_retention;
262     bool m_restricted_to_start;
263     bool m_gapped_connection;          // used for gapped proteins
264     int m_fully_connected_to_part;     // used for gapped proteins
265     bool m_not_for_chaining;           // included in other alignmnet(s) or supressed and can't trigger a different chain
266     int m_rlimb;                       // leftmost compatible rexon
267     int m_llimb;                       // leftmost not compatible lexon
268     CAlignModel* m_orig_align;
269     CGeneModel* m_unmd_align;
270     int m_mem_id;
271 };
272 
273 class CChain : public CGeneModel
274 {
275 private:
276     typedef map<int, double> TIDMap;
277     tuple<TIDMap, TSignedSeqRange> PeaksAndLimits(EStatus determinant, int min_blob_weight, int max_empty_dist, int min_splice_dist);
278     tuple<TIVec, TSignedSeqRange> MainPeaks(TIDMap& peak_weights, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage, bool right_end);
279 public:
280     CChain(SChainMember& mbr, CGeneModel* gapped_helper = 0, bool keep_all_evidence = false);
281 
282     void RestoreTrimmedEnds(int trim);
283     void RemoveFshiftsFromUTRs();
284     void RestoreReasonableConfirmedStart(const CGnomonEngine& gnomon, TOrigAligns& orig_aligns);
285     void SetOpenForPartialyAlignedProteins(map<string, pair<bool,bool> >& prot_complet);
286     pair<bool,bool> ValidPolyA(int pos, const CResidueVec& contig);
287     void ClipToCap(int min_cap_blob, int max_dist, int min_flank_exon, double secondary_peak);
288     void ClipToPolyA(const CResidueVec& contig, int min_polya_blob, int max_dist, int min_flank_exon, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage);
289     void CheckSecondaryCapPolyAEnds();
290     void ClipLowCoverageUTR(double utr_clip_threshold);
291     void CalculateDropLimits();
292     void CalculateSupportAndWeightFromMembers(bool keep_all_evidence = false);
293     void ClipChain(TSignedSeqRange limits);
294     bool SetConfirmedEnds(const CGnomonEngine& gnomon, CGnomonAnnotator_Base::TIntMap& confirmed_ends);
295 
296     void SetConfirmedStartStopForCompleteProteins(map<string, pair<bool,bool> >& prot_complet, const SMinScor& minscor);
297     void CollectTrustedmRNAsProts(TOrigAligns& orig_aligns, const SMinScor& minscor, CScope& scope, SMatrix& matrix, const CResidueVec& contig);
298     void SetBestPlacement(TOrigAligns& orig_aligns);
299     void SetConsistentCoverage();
300 
301     bool HarborsNested(const CChain& other_chain, bool check_in_holes) const;
302     bool HarborsNested(const CGene& other_gene, bool check_in_holes) const;
303 
304     bool HasTrustedEvidence(TOrigAligns& orig_aligns) const;
305 
306     TContained m_members;
307     int m_polya_cap_right_soft_limit;
308     int m_polya_cap_left_soft_limit;
309     int m_coverage_drop_left;
310     int m_coverage_drop_right;
311     int m_coverage_bump_left;
312     int m_coverage_bump_right;
313     double m_core_coverage;
314     vector<double> m_coverage;
315     double m_splice_weight;
316     CGeneModel m_gapped_helper_align;
317     TSignedSeqRange m_supported_range;
318     TIVec m_cap_peaks;
319     TIVec m_polya_peaks;
320 };
321 
322 
323 class CGene : public TChainPointerList
324 {
325 public:
CGene()326     CGene() : m_maxscore(BadScore()) {}
327     typedef list<CGeneModel>::iterator TIt;
328     typedef list<CGeneModel>::const_iterator TConstIt;
Limits() const329     TSignedSeqRange Limits() const { return m_limits; }
RealCdsLimits() const330     TSignedSeqRange RealCdsLimits() const { return m_real_cds_limits; }
331     bool IsAlternative(const CChain& a, TOrigAligns& orig_aligns) const;
332     bool IsAllowedAlternative(const ncbi::gnomon::CGeneModel&, int maxcomposite) const;
333     void Insert(CChain& a);
MaxScore() const334     double MaxScore() const { return m_maxscore; }
Nested() const335     bool Nested() const { return !m_nested_in_genes.empty(); }
336     bool LargeCdsOverlap(const CGeneModel& a) const;
337     bool HarborsNested(const CChain& other_chain, bool check_in_holes) const;
338     bool HarborsNested(const CGene& other_gene, bool check_in_holes) const;
339 
AddToHarbored(CGene * p)340     void AddToHarbored(CGene* p) { m_harbors_genes.insert(p); }
AddToNestedIn(CGene * p)341     void AddToNestedIn(CGene* p) {m_nested_in_genes.insert(p); };
342     set<CGene*> RemoveGeneFromOtherGenesSets();
343 
344 
345 private:
346     bool HarborsRange(TSignedSeqRange range, bool check_in_holes) const;
RemoveFromHarbored(CGene * p)347     void RemoveFromHarbored(CGene* p) { m_harbors_genes.erase(p); }
RemoveFromNestedIn(CGene * p)348     void RemoveFromNestedIn(CGene* p) {m_nested_in_genes.erase(p); };
349 
350     TSignedSeqRange m_limits, m_real_cds_limits;
351     double m_maxscore;
352     set<CGene*> m_nested_in_genes;
353     set<CGene*> m_harbors_genes;
354 };
355 
RemoveGeneFromOtherGenesSets()356 set<CGene*> CGene::RemoveGeneFromOtherGenesSets() {
357     NON_CONST_ITERATE(set<CGene*>, i, m_nested_in_genes)
358         (*i)->RemoveFromHarbored(this);
359     NON_CONST_ITERATE(set<CGene*>, i,m_harbors_genes)
360         (*i)->RemoveFromNestedIn(this);
361 
362     return m_harbors_genes;
363 }
364 
365 // if external model is 'open' all 5' introns can harbor
366 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
367 // non coding models in external coding genes have no effect
HarborsRange(TSignedSeqRange range,bool check_in_holes) const368 bool CGene::HarborsRange(TSignedSeqRange range, bool check_in_holes) const {
369     TSignedSeqRange gene_lim_for_nested = Limits();
370     if(RealCdsLimits().NotEmpty())
371         gene_lim_for_nested = front()->OpenCds() ? front()->MaxCdsLimits() : RealCdsLimits();  // 'open' could be only a single variant gene
372     if(!Include(gene_lim_for_nested,range))
373         return false;
374 
375     bool nested = true;
376     ITERATE(CGene, it, *this) {
377         if(RealCdsLimits().NotEmpty() && (*it)->ReadingFrame().Empty())    // non coding model in coding gene
378             continue;
379         TSignedSeqRange model_lim_for_nested = (*it)->Limits();
380         if((*it)->ReadingFrame().NotEmpty())
381             model_lim_for_nested = (*it)->OpenCds() ? (*it)->MaxCdsLimits() : (*it)->RealCdsLimits();   // 'open' could be only a single variant gene
382         if(range.IntersectingWith(model_lim_for_nested) && !CModelCompare::RangeNestedInIntron(range, **it, check_in_holes)) {
383             nested = false;
384             break;
385         }
386     }
387 
388     return nested;
389 }
390 
391 // if external model is 'open' all 5' introns can harbor
392 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
393 // for nested model 'open' is ignored
394 // non coding models in external coding genes have no effect
HarborsNested(const CChain & other_chain,bool check_in_holes) const395 bool CGene::HarborsNested(const CChain& other_chain, bool check_in_holes) const {
396     TSignedSeqRange other_lim_for_nested = other_chain.Limits();
397     if(!other_chain.ReadingFrame().Empty())
398         other_lim_for_nested = other_chain.RealCdsLimits();
399 
400     return HarborsRange(other_lim_for_nested, check_in_holes);
401 }
402 
403 // if external model is 'open' all 5' introns can harbor
404 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
405 // for nested model 'open' is ignored
406 // non coding models in external coding genes have no effect
HarborsNested(const CGene & other_gene,bool check_in_holes) const407 bool CGene::HarborsNested(const CGene& other_gene, bool check_in_holes) const {
408     TSignedSeqRange other_lim_for_nested = other_gene.Limits();
409     if(!other_gene.RealCdsLimits().Empty())
410         other_lim_for_nested = other_gene.RealCdsLimits();
411 
412     return HarborsRange(other_lim_for_nested, check_in_holes);
413 }
414 
415 
LargeCdsOverlap(const CGeneModel & a) const416 bool CGene::LargeCdsOverlap(const CGeneModel& a) const {
417 
418     ITERATE(CGene, it, *this) {
419         const CGeneModel& b = **it;
420         int common_cds = 0;
421         ITERATE(CGeneModel::TExons, ib, b.Exons()) {
422             ITERATE(CGeneModel::TExons, ia, a.Exons()) {
423                 common_cds += (ib->Limits()&b.RealCdsLimits()&ia->Limits()&a.RealCdsLimits()).GetLength();
424             }
425         }
426         if(common_cds > 50)
427             return true;
428     }
429 
430     return false;
431 }
432 
Insert(CChain & a)433 void CGene::Insert(CChain& a)
434 {
435     push_back(&a);
436     m_limits += a.Limits();
437     m_real_cds_limits += a.RealCdsLimits();
438     m_maxscore = max(m_maxscore,a.Score());
439 }
440 
IsAllowedAlternative(const CGeneModel & a,int maxcomposite) const441 bool CGene::IsAllowedAlternative(const CGeneModel& a, int maxcomposite) const
442 {
443     if(a.Exons().size() > 1 && (a.Status()&CGeneModel::ecDNAIntrons) == 0 && a.TrustedmRNA().empty() && a.TrustedProt().empty()) {
444         return false;
445     }
446 
447     if (a.Support().empty()) {
448         return false;
449     }
450 
451     int composite = 0;
452     ITERATE(CSupportInfoSet, s, a.Support()) {
453         if(s->IsCore() && ++composite > maxcomposite) return false;
454     }
455 
456     if(a.PStop(false) || !a.FrameShifts().empty())
457         return false;
458     if(front()->PStop(false) || !front()->FrameShifts().empty())
459         return false;
460 
461     // check for gapfillers
462 
463     vector<TSignedSeqRange> gene_gapfill_exons;
464     ITERATE(CGeneModel::TExons, e, front()->Exons()) {
465         if(e->m_fsplice_sig == "XX" || e->m_ssplice_sig == "XX")
466             gene_gapfill_exons.push_back(e->Limits());
467     }
468     vector<TSignedSeqRange> a_gapfill_exons;
469     ITERATE(CGeneModel::TExons, e, a.Exons()) {
470         if(e->m_fsplice_sig == "XX" || e->m_ssplice_sig == "XX")
471             a_gapfill_exons.push_back(e->Limits());
472     }
473     if(gene_gapfill_exons != a_gapfill_exons)
474         return false;
475 
476     bool a_share_intron = false;
477     ITERATE(CGene, it, *this) {
478         const CGeneModel& b = **it;
479         set<TSignedSeqRange> b_introns;
480         for(int i = 1; i < (int)b.Exons().size(); ++i) {
481             if(b.Exons()[i-1].m_ssplice && b.Exons()[i].m_fsplice) {
482                 TSignedSeqRange intron(b.Exons()[i-1].GetTo()+1,b.Exons()[i].GetFrom()-1);
483                 b_introns.insert(intron);
484             }
485         }
486 
487         bool a_has_new_intron = false;
488         for(int i = 1; i < (int)a.Exons().size(); ++i) {
489             if(a.Exons()[i-1].m_ssplice && a.Exons()[i].m_fsplice && a.Exons()[i-1].m_ssplice_sig != "XX" && a.Exons()[i].m_fsplice_sig != "XX") {
490                 TSignedSeqRange intron(a.Exons()[i-1].GetTo()+1,a.Exons()[i].GetFrom()-1);
491                 if(b_introns.insert(intron).second)
492                     a_has_new_intron = true;
493                 else
494                     a_share_intron = true;
495             }
496         }
497 
498         if(a_has_new_intron) {
499             continue;
500         } else if(!gene_gapfill_exons.empty()) {
501            return false;
502         } else if(a.RealCdsLimits().NotEmpty() && b.RealCdsLimits().NotEmpty() && !a.RealCdsLimits().IntersectingWith(b.RealCdsLimits()) && (!a.TrustedmRNA().empty() || !a.TrustedProt().empty())) {
503 #ifdef _DEBUG
504             const_cast<CGeneModel&>(a).AddComment("Secondary CDS");
505 #endif
506             continue;
507         } else if(a.RealCdsLen() <= b.RealCdsLen()){
508             return false;
509         }
510     }
511 
512     return (a_share_intron || gene_gapfill_exons.empty());
513 }
514 
IsAlternative(const CChain & a,TOrigAligns & orig_aligns) const515 bool CGene::IsAlternative(const CChain& a, TOrigAligns& orig_aligns) const
516 {
517     _ASSERT( size()>0 );
518 
519     if (a.Strand() != front()->Strand())
520         return false;
521 
522     bool has_common_splice = false;
523 
524     ITERATE(CGene, it, *this) {
525         if(CModelCompare::CountCommonSplices(**it, a) > 0) {      // has common splice
526             has_common_splice = true;
527             break;
528         }
529     }
530 
531     if(a.ReadingFrame().NotEmpty() && RealCdsLimits().NotEmpty()) {
532         CAlignMap amap(a.Exons(), a.FrameShifts(), a.Strand(), a.GetCdsInfo().Cds());
533         TIVec acds_map(amap.FShiftedLen(a.GetCdsInfo().Cds()),0);
534         for(unsigned int j = 0; j < a.Exons().size(); ++j) {
535             for(TSignedSeqPos k = max(a.Exons()[j].GetFrom(),a.GetCdsInfo().Cds().GetFrom()); k <= min(a.Exons()[j].GetTo(),a.GetCdsInfo().Cds().GetTo()); ++k) {
536                 TSignedSeqPos p =  amap.MapOrigToEdited(k);
537                 _ASSERT(p < (int)acds_map.size());
538                 if(p >= 0)
539                     acds_map[p] = k;
540             }
541         }
542 
543 
544         bool has_common_cds = false;
545 
546         ITERATE(CGene, it, *this) {
547             CAlignMap gmap((*it)->Exons(), (*it)->FrameShifts(), (*it)->Strand(), (*it)->GetCdsInfo().Cds());
548             TIVec cds_map(gmap.FShiftedLen((*it)->GetCdsInfo().Cds()),0);
549             for(unsigned int j = 0; j < (*it)->Exons().size(); ++j) {
550                 for(TSignedSeqPos k = max((*it)->Exons()[j].GetFrom(),(*it)->GetCdsInfo().Cds().GetFrom()); k <= min((*it)->Exons()[j].GetTo(),(*it)->GetCdsInfo().Cds().GetTo()); ++k) {
551                     TSignedSeqPos p =  gmap.MapOrigToEdited(k);
552                     _ASSERT(p < (int)cds_map.size());
553                     if(p >= 0)
554                         cds_map[p] = k;
555                 }
556             }
557 
558             for(unsigned int i = 0; i < acds_map.size(); ) {
559                 unsigned int j = 0;
560                 for( ; j < cds_map.size() && (acds_map[i] != cds_map[j] || i%3 != j%3); ++j);
561                 if(j == cds_map.size()) {
562                     ++i;
563                     continue;
564                 }
565 
566                 int count = 0;
567                 for( ; j < cds_map.size() && i < acds_map.size() && acds_map[i] == cds_map[j]; ++j, ++i, ++count);
568 
569                 if(count > 30) {        // has common cds
570                     has_common_cds = true;
571                     break;
572                 }
573             }
574         }
575 
576         bool gene_has_trusted = false;
577         ITERATE(CGene, it, *this) {
578             if((*it)->HasTrustedEvidence(orig_aligns)) {
579                 gene_has_trusted = true;
580                 break;
581             }
582         }
583 
584         if(has_common_cds || (has_common_splice && (!gene_has_trusted || !a.HasTrustedEvidence(orig_aligns)))) // separate trusted genes with similar splices if they don't have common cds
585             return true;
586         else
587             return false;
588     }
589 
590     return has_common_splice;
591 }
592 
DescendingModelOrder(const CChain & a,const CChain & b)593 static bool DescendingModelOrder(const CChain& a, const CChain& b)
594 {
595     if (!a.Support().empty() && b.Support().empty())
596         return true;
597     else if (a.Support().empty() && !b.Support().empty())
598         return false;
599 
600 
601     bool atrusted = !a.TrustedmRNA().empty() || !a.TrustedProt().empty();
602     bool btrusted = !b.TrustedmRNA().empty() || !b.TrustedProt().empty();
603     if(atrusted && !btrusted) {                                     // trusted gene is always better
604         return true;
605     } else if(btrusted && !atrusted) {
606         return false;
607     } else if(a.ReadingFrame().NotEmpty() && b.ReadingFrame().Empty()) {       // coding is always better
608         return true;
609     } else if(b.ReadingFrame().NotEmpty() && a.ReadingFrame().Empty()) {
610         return false;
611     } else if(a.ReadingFrame().NotEmpty()) {     // both coding
612 
613         double ds = 0.05*fabs(a.Score());
614         double as = a.Score();
615         if((a.Status()&CGeneModel::ecDNAIntrons) != 0)
616             as += 2*ds;
617         if((a.Status()&CGeneModel::ePolyA) != 0)
618             as += ds;
619         if((a.Status()&CGeneModel::eCap) != 0)
620             as += ds;
621         if(a.isNMD())
622             as -= ds;
623 
624         ds = 0.05*fabs(b.Score());
625         double bs = b.Score();
626         if((b.Status()&CGeneModel::ecDNAIntrons) != 0)
627             bs += 2*ds;
628         if((b.Status()&CGeneModel::ePolyA) != 0)
629             bs += ds;
630         if((b.Status()&CGeneModel::eCap) != 0)
631             bs += ds;
632         if(b.isNMD())
633             bs -= ds;
634 
635         if(as > bs)    // better score
636             return true;
637         else if(bs > as)
638             return false;
639         else if(a.m_splice_weight > b.m_splice_weight) // more splice support
640             return true;
641         else if(a.m_splice_weight < b.m_splice_weight)
642             return false;
643         else if(a.Weight() > b.Weight())       // more alignments is better
644             return true;
645         else if(a.Weight() < b.Weight())
646             return false;
647         else if(a.Limits().GetLength() != b.Limits().GetLength())
648             return (a.Limits().GetLength() < b.Limits().GetLength());   // everything else equal prefer compact model
649         else
650             return a.ID() < b.ID();
651     } else {                       // both noncoding
652         double asize = a.m_splice_weight;
653         double bsize = b.m_splice_weight;
654         double ds = 0.025*(asize+bsize);
655 
656         if((a.Status()&CGeneModel::ePolyA) != 0)
657             asize += ds;
658         if((a.Status()&CGeneModel::eCap) != 0)
659             asize += ds;
660         if(a.isNMD())
661             asize -= ds;
662 
663         if((b.Status()&CGeneModel::ePolyA) != 0)
664             bsize += ds;
665         if((b.Status()&CGeneModel::eCap) != 0)
666             bsize += ds;
667         if(b.isNMD())
668             bsize -= ds;
669 
670         if(asize > bsize)
671             return true;
672         else if(bsize > asize)
673             return false;
674         else if(a.Limits().GetLength() != b.Limits().GetLength())
675             return (a.Limits().GetLength() < b.Limits().GetLength());   // everything else equal prefer compact model
676         else
677             return a.ID() < b.ID();
678     }
679 }
680 
681 typedef CChain* TChainPtr;
DescendingModelOrderP(const TChainPtr & a,const TChainPtr & b)682 static bool DescendingModelOrderP(const TChainPtr& a, const TChainPtr& b)
683 {
684     return DescendingModelOrder(*a, *b);
685 }
DescendingModelOrderPConsistentCoverage(const TChainPtr & a,const TChainPtr & b)686 static bool DescendingModelOrderPConsistentCoverage(const TChainPtr& a, const TChainPtr& b)
687 {
688     if((a->Status()&CGeneModel::eConsistentCoverage) != (b->Status()&CGeneModel::eConsistentCoverage))
689         return (a->Status()&CGeneModel::eConsistentCoverage) > (b->Status()&CGeneModel::eConsistentCoverage);
690     else
691         return DescendingModelOrder(*a, *b);
692 }
693 
// Classifies how chain 'algn' relates to the already assembled 'gene'.
// gene.front() is dereferenced unconditionally, so 'gene' must not be empty.
//
// The checks are applied in the order written and the first one that matches decides the result:
//   1. limits of gene and algn do not intersect                         -> eOtherGene
//   2. gene.IsAlternative(algn, orig_aligns)                            -> eAlternative if the variant is allowed,
//                                                                          algn is good enough and is either trusted or
//                                                                          passes the 'altfrac' thresholds;
//                                                                          eNotCompatible otherwise
//   3. an "XX"-signature intron of the gene intersects one of algn      -> eNotCompatible
//   4. algn.HarborsNested(gene, ...)                                    -> eExternal (or eNotCompatible)
//   5. gene.HarborsNested(algn, ...)                                    -> eNested   (or eNotCompatible)
//   6. both have a CDS range: ranges disjoint                           -> eOtherGene
//                             gene.LargeCdsOverlap(algn)                -> eNotCompatible
//   7. both good enough: opposite-strand / best-placement exceptions    -> eOtherGene
//   8. anything else                                                    -> eNotCompatible
//
// With allow_partialalts set, both gene and algn are always treated as "good enough to be annotation".
// In _DEBUG builds the function also attaches explanatory comments to 'algn' (through a const_cast).
CheckCompatibility(const CGene & gene,const CChain & algn)694 CChainer::CChainerImpl::ECompat CChainer::CChainerImpl::CheckCompatibility(const CGene& gene, const CChain& algn)
695 {
696     bool gene_good_enough_to_be_annotation = allow_partialalts || gene.front()->GoodEnoughToBeAnnotation();
697     bool algn_good_enough_to_be_annotation = allow_partialalts || algn.GoodEnoughToBeAnnotation();
698 
      // CDS ranges used only by the CDS overlap test in step 6.
      // The real CDS limits are used when the model is complete (or the gene already has several variants),
      // or when the *other* side is good enough; otherwise the maximal CDS limits are used.
699     TSignedSeqRange gene_cds = (gene.size() > 1 || gene.front()->CompleteCds() || algn_good_enough_to_be_annotation) ? gene.RealCdsLimits() : gene.front()->MaxCdsLimits();
700     TSignedSeqRange algn_cds = (algn.CompleteCds() || gene_good_enough_to_be_annotation) ? algn.RealCdsLimits() : algn.MaxCdsLimits();
701 
702     if(!gene_good_enough_to_be_annotation && !algn_good_enough_to_be_annotation) { // both need ab initio
          // An exon with "XX" on both splice sides is presumably a gap-filling exon (TODO confirm);
          // if it intersects the CDS range, the range start is pulled left to the end of the preceding exon.
703         const CGeneModel& b = *gene.front();
704         for(int i = 1; i < (int)b.Exons().size(); ++i) {
705             if(b.Exons()[i].m_ssplice_sig == "XX" && b.Exons()[i].m_fsplice_sig == "XX" && b.Exons()[i].Limits().IntersectingWith(gene_cds)) { // gap exon in cds: extend range to the left exon
706                 gene_cds.SetFrom(min(gene_cds.GetFrom(), b.Exons()[i-1].GetTo()));
707             }
708         }
709 
710         for(int i = 1; i < (int)algn.Exons().size(); ++i) {
711             if(algn.Exons()[i].m_ssplice_sig == "XX" && algn.Exons()[i].m_fsplice_sig == "XX" && algn.Exons()[i].Limits().IntersectingWith(algn_cds)) { // gap exon in cds: extend range to the left exon
712                 algn_cds.SetFrom(min(algn_cds.GetFrom(), algn.Exons()[i-1].GetTo()));
713             }
714         }
715     }
716 
717     if(!gene.Limits().IntersectingWith(algn.Limits()))             // limits don't overlap
718         return eOtherGene;
719 
720     if(gene.IsAlternative(algn, orig_aligns)) {   // has common splice or common CDS
721 
          // From here on the answer is either eAlternative or eNotCompatible
722         if(gene.IsAllowedAlternative(algn, composite) && algn_good_enough_to_be_annotation) {
723             if(!algn.TrustedmRNA().empty() || !algn.TrustedProt().empty()) {                   // trusted chain: accepted without thresholds
724                 return eAlternative;
725             } else if(algn.ReadingFrame().Empty() || gene.front()->ReadingFrame().Empty()) {   // at least one is noncoding
726                 if(algn.m_splice_weight > altfrac/100*gene.front()->m_splice_weight)                     // splice weight above altfrac percent of the seed's
727                     return eAlternative;
728                 else
729                     return eNotCompatible;
730             } else if(algn.RealCdsLen() > altfrac/100*gene.front()->RealCdsLen() || algn.Score() > altfrac/100*gene.front()->Score()) {   // cds length or score above altfrac percent of the seed's
731                 return eAlternative;
732             }
733         }
734 
735         return eNotCompatible;
736     }
737 
738     // don't include overlapping gapfill 'introns' in different genes
      // (an 'intron' is collected when either of its flanking splice signatures is "XX")
739     set<TSignedSeqRange> gene_gapfill_introns;
740     set<TSignedSeqRange> align_gapfill_introns;
741     ITERATE(CGene, it, gene) {
742         const CGeneModel& b = **it;
743         for(int i = 1; i < (int)b.Exons().size(); ++i) {
744             if(b.Exons()[i-1].m_ssplice_sig == "XX" || b.Exons()[i].m_fsplice_sig == "XX") {
745                 TSignedSeqRange intron(b.Exons()[i-1].GetTo(),b.Exons()[i].GetFrom());
746                 gene_gapfill_introns.insert(intron);
747             }
748         }
749     }
750     for(int i = 1; i < (int)algn.Exons().size(); ++i) {
751         if(algn.Exons()[i-1].m_ssplice_sig == "XX" || algn.Exons()[i].m_fsplice_sig == "XX") {
752             TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
753             align_gapfill_introns.insert(intron);
754         }
755     }
      // any intersecting pair of such introns makes the two incompatible
756     ITERATE(set<TSignedSeqRange>, ig, gene_gapfill_introns) {
757         ITERATE(set<TSignedSeqRange>, ia, align_gapfill_introns) {
758             if(ig->IntersectingWith(*ia))
759                 return eNotCompatible;
760         }
761     }
762 
763     if(algn.HarborsNested(gene, gene_good_enough_to_be_annotation)) {    // gene is nested in align's intron (could be partial)
          // accepted only if the nested gene is good enough or the harboring chain has trusted evidence
764         if(gene_good_enough_to_be_annotation || algn.HasTrustedEvidence(orig_aligns))
765             return eExternal;
766         else
767             return eNotCompatible;
768     }
769 
770     if(gene.HarborsNested(algn, algn_good_enough_to_be_annotation)) {   // algn is nested in gene (could be partial)
          // accepted only if the nested chain is good enough or has trusted evidence
771         if(algn_good_enough_to_be_annotation || algn.HasTrustedEvidence(orig_aligns))
772             return eNested;
773         else
774             return eNotCompatible;
775     }
776 
777     if(!algn_cds.Empty() && !gene_cds.Empty()) {                          // both coding
778         if (!gene_cds.IntersectingWith(algn_cds)) {          // cds ranges don't overlap
779 #ifdef _DEBUG
              // debug-only note when fewer than 20 positions separate the two cds ranges
780             if((gene_cds+algn_cds).GetLength() < gene_cds.GetLength()+algn_cds.GetLength()+20)
781                 const_cast<CChain&>(algn).AddComment("Close proximity");
782 #endif
783             return eOtherGene;
784         } else if(gene.LargeCdsOverlap(algn)) {
785             return eNotCompatible;
786         }
787     }
788 
      // Remaining overlaps are tolerated only between two models that are both good enough
789     if(gene_good_enough_to_be_annotation && algn_good_enough_to_be_annotation) {
          // opposite strands (when allowed): algn is a best placement, or both models are multi-exon
790         if(gene.front()->Strand() != algn.Strand() && allow_opposite_strand &&
791            ((algn.Status()&CGeneModel::eBestPlacement) || (algn.Exons().size() > 1 && gene.front()->Exons().size() > 1)))
792             return eOtherGene;
          // best placement that is single-exon or has the ecDNAIntrons status
793         else if(algn.Status() & CGeneModel::eBestPlacement && (algn.Exons().size() == 1 || (algn.Status()&CGeneModel::ecDNAIntrons))) {
794 #ifdef _DEBUG
795             const_cast<CChain&>(algn).AddComment("Best placement overlap");
796 #endif
797             return eOtherGene;
798         }
799     }
800 
801     return eNotCompatible;
802 }
803 
FindGeneSeeds(list<CGene> & alts,TChainPointerList & not_placed_yet)804 void CChainer::CChainerImpl::FindGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
805 
806     not_placed_yet.sort(DescendingModelOrderP);
807 
808     for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
809         TChainPointerList::iterator it = itloop++;
810         CChain& algn(**it);
811 
812         if(algn.Score() == BadScore())             // postpone noncoding models
813             continue;
814         else if(algn.Score() < 2*minscor.m_min && algn.GetCdsInfo().ProtReadingFrame().Empty())  // postpone not so good models
815             continue;
816 
817         list<CGene*> possibly_nested;
818 
819         bool good_model = true;
820         for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
821             ECompat cmp = CheckCompatibility(*itl, algn);
822 
823             switch(cmp) {
824             case eExternal:
825                 possibly_nested.push_back(&(*itl));  // already created gene is nested in this model
826             case eOtherGene:
827                 break;
828             default:
829                 good_model = false;
830                 break;
831             }
832         }
833 
834         if(good_model) {
835             alts.push_back(CGene());
836 #ifdef _DEBUG
837             algn.AddComment("Pass1");
838 #endif
839             alts.back().Insert(algn);
840             not_placed_yet.erase(it);
841         }
842 
843         ITERATE(list<CGene*>, itl, possibly_nested) {
844             (*itl)->AddToNestedIn(&alts.back());
845             alts.back().AddToHarbored(*itl);
846         }
847     }
848 }
849 
ReplacePseudoGeneSeeds(list<CGene> & alts,TChainPointerList & not_placed_yet)850 void CChainer::CChainerImpl::ReplacePseudoGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
851 
852     not_placed_yet.sort(DescendingModelOrderP);
853 
854     for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
855         TChainPointerList::iterator it = itloop++;
856         CChain& algn(**it);
857 
858         list<list<CGene>::iterator> included_in;
859         list<CGene*> possibly_nested;   // genes which 'could' become nested
860         list<CGene*> nested_in;
861 
862         bool good_model = true;
863         for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
864             ECompat cmp = CheckCompatibility(*itl, algn);
865 
866             switch(cmp) {
867             case eNested:
868                 nested_in.push_back(&(*itl));
869                 break;
870             case eExternal:
871                 possibly_nested.push_back(&(*itl));  // already created gene is nested in this model
872                 break;
873             case eOtherGene:
874                 break;
875             case eAlternative:
876                 included_in.push_back(itl);
877                 break;
878             case eNotCompatible:
879                 if(itl->IsAlternative(algn, orig_aligns))
880                     included_in.push_back(itl);
881                 else
882                     good_model = false;
883                 break;
884             default:
885                 good_model = false;
886                 break;
887             }
888         }
889 
890         if(!good_model || included_in.size() != 1 || (!(algn.Status()&CGeneModel::ecDNAIntrons) && algn.TrustedmRNA().empty() && algn.TrustedProt().empty()))
891             continue;
892 
893         CGene& gene = *included_in.front();
894         CChain& model = *gene.front();
895         //        if((!model.PStop(false) && model.FrameShifts().empty()) || algn.PStop(false) || !algn.FrameShifts().empty())
896         if(!model.PStop(false) || algn.PStop(false) || !algn.FrameShifts().empty())  // use only for pstops
897             continue;
898 
899         int algn_cds_len = algn.FShiftedLen(algn.GetCdsInfo().Cds(),false);
900         int model_cds_len = model.FShiftedLen(model.GetCdsInfo().Cds(),false);
901         if(algn_cds_len < 0.8*model_cds_len)
902             continue;
903 
904 #ifdef _DEBUG
905         algn.AddComment("Replacing pseudo "+NStr::NumericToString(model.ID()));
906 #endif
907         not_placed_yet.push_back(gene.front()); // position doesn't matter - will go to 'bad' models
908         gene.RemoveGeneFromOtherGenesSets();
909         gene = CGene();
910         gene.Insert(algn);
911         ITERATE(list<CGene*>, itl, nested_in) {
912             gene.AddToNestedIn(*itl);
913             (*itl)->AddToHarbored(&gene);
914         }
915         ITERATE(list<CGene*>, itl, possibly_nested) {
916             (*itl)->AddToNestedIn(&gene);
917             gene.AddToHarbored(*itl);
918         }
919 
920         not_placed_yet.erase(it);
921     }
922 }
923 
// Second placement pass: adds remaining chains to the seed genes as alternative variants.
//
// alts            - seed genes; genes can be merged (and erased) here
// not_placed_yet  - candidate chains; sorted with DescendingModelOrderPConsistentCoverage, placed chains are
//                   removed, chains released from merged genes are put back
//
// A chain is considered only if CheckCompatibility() returns eAlternative for at least one gene and
// eOtherGene/eExternal for all the others, and (unless allow_partialalts) the first chain of the first matching gene
// is good enough to be annotation.
//   - One matching gene: the chain is inserted into it ("Pass2a").
//   - Several matching genes: the chain would connect them. This is allowed only for a chain that is trusted
//     or has the eConsistentCoverage status, see 'allow_connection' below. The chain is inserted into the first
//     gene and all the other matching genes are dissolved.
// A chain which is not accepted simply stays in not_placed_yet.
FindAltsForGeneSeeds(list<CGene> & alts,TChainPointerList & not_placed_yet)924 void CChainer::CChainerImpl::FindAltsForGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
925 
926     not_placed_yet.sort(DescendingModelOrderPConsistentCoverage);
927 
928     for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
          // itloop already points to the next chain, so 'it' can be erased and elements can be inserted after itloop
929         TChainPointerList::iterator it = itloop++;
930         CChain& algn(**it);
931 
932         list<list<CGene>::iterator> included_in;   // genes for which the chain is an alternative
933         list<CGene*> possibly_nested;   // genes which 'could' become nested
934 
935         bool good_model = true;
936         for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
937             ECompat cmp = CheckCompatibility(*itl, algn);
938 
939             switch(cmp) {
940             case eExternal:
941                 possibly_nested.push_back(&(*itl));  // already created gene is nested in this model
                  // intentional fall through
942             case eOtherGene:
943                 break;
944             case eAlternative:
945                 included_in.push_back(itl);
946                 break;
947             default:
948                 good_model = false;
949                 break;
950             }
951         }
952 
953         if(good_model && !included_in.empty() && (allow_partialalts || included_in.front()->front()->GoodEnoughToBeAnnotation())) {
954             if(included_in.size() == 1) {    // alternative to only one seed
955 #ifdef _DEBUG
956                 algn.AddComment("Pass2a");
957 #endif
958 
959                 CGene& gene = *included_in.front();
960                 gene.Insert(algn);
961                 not_placed_yet.erase(it);
962 
                  // nesting is re-checked against the whole gene, not only against the new chain
963                 ITERATE(list<CGene*>, itl, possibly_nested) {
964                     if(gene.HarborsNested(**itl, true)) {
965                         (*itl)->AddToNestedIn(&gene);
966                         gene.AddToHarbored(*itl);
967                     }
968                 }
969             } else {  // connects seeds
970 
971                 bool allow_connection = false;
972 
973                 if(!algn.TrustedmRNA().empty() || !algn.TrustedProt().empty() || (algn.Status()&CGeneModel::eConsistentCoverage)) {   // connects seeds but trusted
                      // cds_overlap: the chain is coding and a copy clipped to its real cds limits
                      // is still an alternative for every one of the connected genes
974                     bool cds_overlap = true;
975                     if(algn.ReadingFrame().Empty()) {
976                         cds_overlap = false;
977                     } else {
978                         CChain a = algn;
979                         a.Clip(a.RealCdsLimits(), CAlignModel::eRemoveExons);
980                         ITERATE(list<list<CGene>::iterator>, k, included_in) {
981                             if(!(*k)->IsAlternative(a, orig_aligns)) {
982                                 cds_overlap = false;
983                                 break;
984                             }
985                         }
986                     }
987 
988                     if(cds_overlap || (algn.Status()&CGeneModel::eConsistentCoverage)) {
989 #ifdef _DEBUG
990                         algn.AddComment("Gene overlap override");
991 #endif
992                         allow_connection = true;
993                     }
994                 }
995 
996                 if(allow_connection) {
                      // the first matching gene receives the chain; all other matching genes are dissolved
997                     CGene& gene = *included_in.front();
998                     gene.Insert(algn);
999 
1000                     ITERATE(list<list<CGene>::iterator>, k, included_in) {
1001                         if(k != included_in.begin()) {
                              // redistribute every chain of the dissolved gene
1002                             ITERATE(CGene, l, **k) {
1003                                 if(itloop == not_placed_yet.end() || !DescendingModelOrder(**itloop, **l)) {  // next is not better
1004                                     if(CheckCompatibility(*included_in.front(), **l) == eAlternative) {  // check that the thresholds are met
1005 #ifdef _DEBUG
1006                                         (*l)->AddComment("Pass2b");
1007 #endif
1008                                         included_in.front()->Insert(**l);
1009                                     } else {
1010                                         not_placed_yet.push_back(*l); // position doesn't matter - will go to 'bad' models
1011                                     }
1012                                 } else {
                                      // some pending chain is better: put this one back after itloop, behind all
                                      // consecutive chains which are better than it
1013                                     TChainPointerList::iterator idest = itloop;
1014                                     for( ;idest != not_placed_yet.end() && DescendingModelOrder(**idest, **l); ++idest);
1015                                     not_placed_yet.insert(idest, *l);
1016                                 }
1017                             }
                              // detach the dissolved gene from the other genes; the genes it returns
                              // are re-checked for nesting in the merged gene below
1018                             set<CGene*> nested_genes = (*k)->RemoveGeneFromOtherGenesSets();
1019                             ITERATE(set<CGene*>, i, nested_genes)
1020                                 possibly_nested.push_back(*i);
1021                             alts.erase(*k);
1022                         }
1023                     }
1024                     not_placed_yet.erase(it);
1025 
1026                     ITERATE(list<CGene*>, itl, possibly_nested) {
1027                         if(gene.HarborsNested(**itl, true)) {
1028                             (*itl)->AddToNestedIn(&gene);
1029                             gene.AddToHarbored(*itl);
1030                         }
1031                     }
1032                 }
1033             }
1034         }
1035     }
1036 }
1037 
PlaceAllYouCan(list<CGene> & alts,TChainPointerList & not_placed_yet,TChainPointerList & rejected)1038 void CChainer::CChainerImpl::PlaceAllYouCan(list<CGene>& alts, TChainPointerList& not_placed_yet, TChainPointerList& rejected) {
1039 
1040     not_placed_yet.sort(DescendingModelOrderP);
1041 
1042     ITERATE(TChainPointerList, it, not_placed_yet) {
1043         CChain& algn(**it);
1044         list<CGene>::iterator included_in(alts.end());
1045         list<CGene*> possibly_nested;
1046         list<CGene*> nested_in;
1047 
1048         bool good_model = true;
1049         for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
1050             ECompat cmp = CheckCompatibility(*itl, algn);
1051             CNcbiOstrstream ost;
1052             switch(cmp) {
1053             case eNotCompatible:
1054                 rejected.push_back(&algn);
1055                 rejected.back()->Status() |= CGeneModel::eSkipped;
1056                 ost << "Trumped by another model " << itl->front()->ID();
1057                 rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1058                 good_model = false;
1059                 break;
1060             case eAlternative:
1061                 if(!allow_partialalts && !itl->front()->GoodEnoughToBeAnnotation()) {
1062                     rejected.push_back(&algn);
1063                     rejected.back()->Status() |= CGeneModel::eSkipped;
1064                     ost << "    Trumped by another model " << itl->front()->ID();
1065                     rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1066                     good_model = false;
1067                 } else if(included_in == alts.end()) {
1068                     included_in = itl;
1069                 } else {  // tries to connect two different genes
1070                     good_model = false;
1071                     rejected.push_back(&algn);
1072                     rejected.back()->Status() |= CGeneModel::eSkipped;
1073                     ost << "Connects two genes " << itl->front()->ID() << " " << included_in->front()->ID();
1074                     rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1075                 }
1076                 break;
1077             case eNested:
1078                 nested_in.push_back(&(*itl));
1079                 break;
1080             case eExternal:
1081                 possibly_nested.push_back(&(*itl));  // already created gene is nested in this model
1082                 break;
1083             case eOtherGene:
1084                 break;
1085             }
1086         }
1087         if(good_model) {
1088             CGene* genep;
1089             if(included_in != alts.end()) {
1090 #ifdef _DEBUG
1091                 algn.AddComment("Pass3a");
1092 #endif
1093                 included_in->Insert(algn);
1094                 genep = &(*included_in);
1095             } else {
1096                 alts.push_back(CGene());
1097                 genep = &alts.back();
1098 #ifdef _DEBUG
1099                 algn.AddComment("Pass3b");
1100 #endif
1101                 alts.back().Insert(algn);
1102             }
1103             ITERATE(list<CGene*>, itl, nested_in) {
1104                 if((*itl)->HarborsNested(*genep, true)) {
1105                    genep->AddToNestedIn(*itl);
1106                    (*itl)->AddToHarbored(genep);
1107                 }
1108             }
1109             ITERATE(list<CGene*>, itl, possibly_nested) {
1110                 if(genep->HarborsNested(**itl, true)) {
1111                     (*itl)->AddToNestedIn(genep);
1112                     genep->AddToHarbored(*itl);
1113                 }
1114             }
1115         }
1116     }
1117 }
1118 
FilterOutSimilarsWithLowerScore(TChainPointerList & not_placed_yet,TChainPointerList & rejected)1119 void CChainer::CChainerImpl::FilterOutSimilarsWithLowerScore(TChainPointerList& not_placed_yet, TChainPointerList& rejected)
1120 {
1121     not_placed_yet.sort(DescendingModelOrderP);
1122 
1123     NON_CONST_ITERATE(TChainPointerList, it, not_placed_yet) {
1124         CChain& ai(**it);
1125         TChainPointerList::iterator jt_loop = it;
1126         for(++jt_loop; jt_loop != not_placed_yet.end();) {
1127             TChainPointerList::iterator jt = jt_loop++;
1128             CChain& aj(**jt);
1129             if (CModelCompare::AreSimilar(ai,aj,tolerance)) {
1130                 CNcbiOstrstream ost;
1131                 ost << "Trumped by similar chain " << ai.ID();
1132                 aj.AddComment(CNcbiOstrstreamToString(ost));
1133                 rejected.push_back(&aj);
1134                 not_placed_yet.erase(jt);
1135             }
1136         }
1137     }
1138 }
1139 
FilterOutTandemOverlap(TChainPointerList & not_placed_yet,TChainPointerList & rejected,double fraction)1140 void CChainer::CChainerImpl::FilterOutTandemOverlap(TChainPointerList& not_placed_yet, TChainPointerList& rejected, double fraction)
1141 {
1142     for(TChainPointerList::iterator it_loop = not_placed_yet.begin(); it_loop != not_placed_yet.end();) {
1143         TChainPointerList::iterator it = it_loop++;
1144         CChain& ai(**it);
1145 
1146         if(!ai.TrustedmRNA().empty() || !ai.TrustedProt().empty() || ai.ReadingFrame().Empty())
1147             continue;
1148         int cds_len = ai.RealCdsLen();
1149 
1150         vector<const CChain*> candidates;
1151         ITERATE(TChainPointerList, jt, not_placed_yet) {
1152             const CChain& aj(**jt);
1153             if(!aj.HasStart() || !aj.HasStop() || aj.Score() < fraction/100*ai.Score() || aj.RealCdsLen() < fraction/100*cds_len || !CModelCompare::HaveCommonExonOrIntron(ai,aj))
1154                 continue;
1155             candidates.push_back(&aj);
1156         }
1157 
1158         bool alive = true;
1159         for (size_t i = 0; alive && i < candidates.size(); ++i) {
1160             for (size_t j = i+1; alive && j < candidates.size(); ++j) {
1161                 if(!candidates[i]->Limits().IntersectingWith(candidates[j]->Limits())) {
1162                     CNcbiOstrstream ost;
1163                     ost << "Overlapping tandem " << candidates[i]->ID() - ai.ID() << " " << candidates[j]->ID() - ai.ID();
1164                     ai.AddComment(CNcbiOstrstreamToString(ost));
1165                     rejected.push_back(*it);
1166                     not_placed_yet.erase(it);
1167                     alive = false;
1168                 }
1169             }
1170         }
1171     }
1172 }
1173 
FindGenes(TChainList & cls)1174 list<CGene> CChainer::CChainerImpl::FindGenes(TChainList& cls)
1175 {
1176     TChainPointerList not_placed_yet;
1177     NON_CONST_ITERATE(TChainList, it, cls) {
1178         if((it->Status()&CGeneModel::eSkipped) == 0) {
1179             if(it->Type()&CGeneModel::eNested)
1180                 it->SetType(it->Type()^CGeneModel::eNested);
1181             it->SetGeneID(it->ID());
1182             it->SetRankInGene(0);
1183             not_placed_yet.push_back(&(*it));
1184         }
1185     }
1186 
1187     list<CGene> alts;
1188     TChainPointerList bad_aligns;
1189 
1190     FilterOutSimilarsWithLowerScore(not_placed_yet, bad_aligns);
1191     FilterOutTandemOverlap(not_placed_yet, bad_aligns, 80);
1192 
1193     FindGeneSeeds(alts, not_placed_yet);
1194     ReplacePseudoGeneSeeds(alts, not_placed_yet);
1195     FindAltsForGeneSeeds(alts, not_placed_yet);
1196     PlaceAllYouCan(alts, not_placed_yet, bad_aligns);
1197 
1198     NON_CONST_ITERATE(list<CGene>, k, alts) {
1199         int rank = 0;
1200         NON_CONST_ITERATE(CGene, l, *k) {
1201             (*l)->SetGeneID(k->front()->ID());
1202             (*l)->SetRankInGene(++rank);
1203             if(k->Nested())
1204                (*l)->SetType((*l)->Type()|CGeneModel::eNested);
1205         }
1206     }
1207 
1208     NON_CONST_ITERATE(TChainPointerList, l, bad_aligns)
1209         (*l)->Status() |= CGeneModel::eSkipped;
1210 
1211     return alts;
1212 }
1213 
1214 
1215 struct GenomeOrderD
1216 {
operator ()GenomeOrderD1217     bool operator()(const SChainMember* ap, const SChainMember* bp)    // left end increasing, long first if left end equal
1218     {
1219         TSignedSeqRange alimits = ap->m_align->Limits();
1220         //ignore flexible ends for sorting
1221         if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1222             alimits.SetFrom(alimits.GetTo());
1223         if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1224             alimits.SetTo(alimits.GetFrom());
1225         TSignedSeqRange blimits = bp->m_align->Limits();
1226         //ignore flexible ends for sorting
1227         if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1228             blimits.SetFrom(blimits.GetTo());
1229         if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1230             blimits.SetTo(blimits.GetFrom());
1231         if(alimits == blimits)
1232             return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1233         else if(alimits.GetFrom() == blimits.GetFrom())
1234             return (alimits.GetTo() > blimits.GetTo());
1235         else
1236             return (alimits.GetFrom() < blimits.GetFrom());
1237     }
1238 };
1239 
1240 
1241 typedef vector< pair<SChainMember*,CGene*> > TMemeberGeneVec;
1242 
1243 typedef tuple<Int8, TSignedSeqRange> TIdLim;
AlignIdLimits(SChainMember * mp)1244 TIdLim AlignIdLimits(SChainMember* mp) {
1245     return make_tuple(mp->m_align->ID(), mp->m_align->Limits());
1246 }
1247 struct AlignIdOrder
1248 {
operator ()AlignIdOrder1249     bool operator()(const TMemeberGeneVec::value_type& a, const TMemeberGeneVec::value_type& b)
1250     {
1251         return AlignIdLimits(a.first) < AlignIdLimits(b.first);
1252     }
1253 };
1254 
1255 
TrimAlignmentsIncludedInDifferentGenes(list<CGene> & genes)1256 void CChainer::CChainerImpl::TrimAlignmentsIncludedInDifferentGenes(list<CGene>& genes) {
1257 
1258     TMemeberGeneVec members_genes;
1259     NON_CONST_ITERATE(list<CGene>, ig, genes) {
1260         CGene& gene = *ig;
1261         TMemberPtrSet gmembers;
1262         ITERATE(CGene, ic, gene) {
1263             CChain& chain = **ic;
1264             ITERATE(TContained, im, chain.m_members) {
1265                 SChainMember& m = **im;
1266                 _ASSERT(m.m_orig_align);
1267                 if(m.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
1268                     continue;
1269                 if(m.m_orig_align->Continuous())
1270                     gmembers.insert(&m);
1271             }
1272         }
1273         ITERATE(TMemberPtrSet, im, gmembers) {
1274             SChainMember& m = **im;
1275             members_genes.push_back(TMemeberGeneVec::value_type(&m,&gene));
1276         }
1277     }
1278 
1279     if(members_genes.empty())
1280         return;
1281 
1282     sort(members_genes.begin(),members_genes.end(),AlignIdOrder());
1283 
1284     typedef map<CGene*,list<SChainMember*> > TGeneToMembers;
1285     typedef map<TIdLim, TGeneToMembers> TMembersInDiffGenes;
1286     TMembersInDiffGenes members_in_different_genes;
1287     {
1288         SChainMember* mp = members_genes.front().first;
1289         TIdLim idlim = AlignIdLimits(mp);
1290         CGene* genep = members_genes.front().second;
1291         members_in_different_genes[idlim][genep].push_back(mp);
1292     }
1293     for(int i = 1; i < (int)members_genes.size(); ++i) {
1294         TIdLim idlim_prev = AlignIdLimits(members_genes[i-1].first);
1295         SChainMember* mp = members_genes[i].first;
1296         TIdLim idlim = AlignIdLimits(mp);
1297         CGene* genep = members_genes[i].second;
1298         if(idlim_prev != idlim) {
1299             TMembersInDiffGenes::iterator it = members_in_different_genes.find(idlim_prev);
1300             if(it->second.size() < 2) // alignment in only one gene
1301                 members_in_different_genes.erase(it);
1302         }
1303         members_in_different_genes[idlim][genep].push_back(mp);
1304     }
1305     {
1306         SChainMember* mp = members_genes.back().first;
1307         TIdLim idlim = AlignIdLimits(mp);
1308         TMembersInDiffGenes::iterator it = members_in_different_genes.find(idlim);
1309         if(it->second.size() < 2) // alignment in only one gene
1310             members_in_different_genes.erase(it);
1311     }
1312 
1313     ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
1314         ITERATE(TGeneToMembers, ig1, imdg->second) {
1315             CGene& gene1 = *ig1->first;
1316             ITERATE(CGene, ic1, gene1) {
1317                 CChain& chain1 = **ic1;
1318                 sort(chain1.m_members.begin(),chain1.m_members.end());
1319             }
1320         }
1321     }
1322 
1323     typedef map<CChain*,TMemberPtrSet> TConflictMemebersInChains;
1324     TConflictMemebersInChains conflict_members_in_chains;
1325 
1326     ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
1327         ITERATE(TGeneToMembers, ig1, imdg->second) {
1328             CGene& gene1 = *ig1->first;
1329             ITERATE(CGene, ic1, gene1) {
1330                 CChain* chain1p_orig = *ic1;
1331                 SChainMember* mbr1p_orig = 0;
1332                 for(list<SChainMember*>::const_iterator im = ig1->second.begin(); im != ig1->second.end() && mbr1p_orig == 0; ++im) {
1333                     if(binary_search(chain1p_orig->m_members.begin(),chain1p_orig->m_members.end(),*im))
1334                        mbr1p_orig = *im;
1335                 }
1336                 for(TGeneToMembers::const_iterator ig2 = imdg->second.begin(); mbr1p_orig != 0 && ig2 != ig1; ++ig2) {
1337                     CGene& gene2 = *ig2->first;
1338                     ITERATE(CGene, ic2, gene2) {
1339                         CChain* chain1p = chain1p_orig;
1340                         SChainMember* mbr1p = mbr1p_orig;
1341                         CChain* chain2p = *ic2;
1342                         SChainMember* mbr2p = 0;
1343                         for(list<SChainMember*>::const_iterator im = ig2->second.begin(); im != ig2->second.end() && mbr2p == 0; ++im) {
1344                             if(binary_search(chain2p->m_members.begin(),chain2p->m_members.end(),*im))
1345                                 mbr2p = *im;
1346                         }
1347 
1348                         if(mbr2p != 0) {    // both chains have alignment
1349 
1350                             TSignedSeqRange core1 = chain1p->RealCdsLimits();
1351                             if(chain1p->Exons().size() > 1)
1352                                 core1 += TSignedSeqRange(chain1p->Exons().front().Limits().GetTo(),chain1p->Exons().back().Limits().GetFrom());
1353                             TSignedSeqRange core2 = chain2p->RealCdsLimits();
1354                             if(chain2p->Exons().size() > 1)
1355                                 core2 += TSignedSeqRange(chain2p->Exons().front().Limits().GetTo(),chain2p->Exons().back().Limits().GetFrom());
1356                             _ASSERT(core1.NotEmpty() && core2.NotEmpty());
1357 
1358                             if(Precede(core2,core1)) {   // chain2 is on the left change them over to simplify coding below
1359                                 swap(chain1p,chain2p);
1360                                 swap(mbr1p,mbr2p);
1361                                 swap(core1,core2);
1362                             }
1363 
1364                             CChain& chain1 = *chain1p;
1365                             CChain& chain2 = *chain2p;
1366                             TSignedSeqRange align_lim = mbr1p->m_align->Limits();
1367 
1368                             if(CModelCompare::RangeNestedInIntron(core2, chain1)) {            // chain2 is nested
1369                                 conflict_members_in_chains[&chain2].insert(mbr2p);
1370                             } else if(CModelCompare::RangeNestedInIntron(core1, chain2)) {     // chain1 is nested
1371                                 conflict_members_in_chains[&chain1].insert(mbr1p);
1372                             }else if(Precede(core1,core2)) {                                   // chain1 on the left
1373                                 if(Precede(align_lim,core1))                                       // alignment on the left of chain1
1374                                     conflict_members_in_chains[&chain2].insert(mbr2p);
1375                                 else if(Precede(core2,align_lim))                                  // alignment on the right of chain2
1376                                     conflict_members_in_chains[&chain1].insert(mbr1p);
1377                                 else {                                                             // alignmnet in between
1378                                     if(chain1.m_coverage_drop_right > 0 && chain2.m_coverage_drop_left > chain1.m_coverage_drop_right) {  // non overlapping drop limits
1379                                         if(align_lim.GetTo() > chain1.m_coverage_drop_right)
1380                                             conflict_members_in_chains[&chain1].insert(mbr1p);
1381                                         if(align_lim.GetFrom() < chain2.m_coverage_drop_left)
1382                                             conflict_members_in_chains[&chain2].insert(mbr2p);
1383                                     } else if(chain1.m_coverage_drop_right > 0 && chain2.m_coverage_drop_left < 0 && chain1.m_core_coverage > 2*chain2.m_core_coverage) {    // only chain1 has drop limit and is more expressed
1384                                         if(align_lim.GetTo() > chain1.m_coverage_drop_right)
1385                                             conflict_members_in_chains[&chain1].insert(mbr1p);
1386                                         if(align_lim.GetFrom() < max(chain2.m_coverage_bump_left,chain1.m_coverage_drop_right+50))
1387                                             conflict_members_in_chains[&chain2].insert(mbr2p);
1388                                     } else if(chain1.m_coverage_drop_right < 0 && chain2.m_coverage_drop_left > 0 && chain2.m_core_coverage > 2*chain1.m_core_coverage) {    // only chain2 has drop limit and is more expressed
1389                                         if(align_lim.GetFrom() < chain2.m_coverage_drop_left)
1390                                             conflict_members_in_chains[&chain2].insert(mbr2p);
1391                                         if(align_lim.GetTo() > chain2.m_coverage_drop_left-50 || (chain1.m_coverage_bump_right > 0 && align_lim.GetTo() > chain1.m_coverage_bump_right))
1392                                             conflict_members_in_chains[&chain1].insert(mbr1p);
1393                                     } else {
1394                                         conflict_members_in_chains[&chain1].insert(mbr1p);
1395                                         conflict_members_in_chains[&chain2].insert(mbr2p);
1396                                     }
1397                                 }
1398                             } else {
1399                                 conflict_members_in_chains[&chain1].insert(mbr1p);
1400                                 conflict_members_in_chains[&chain2].insert(mbr2p);
1401                             }
1402                         }
1403                     }
1404                 }
1405             }
1406         }
1407     }
1408 
1409     ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
1410         ITERATE(TGeneToMembers, ig1, imdg->second) {
1411             CGene& gene1 = *ig1->first;
1412             ITERATE(CGene, ic1, gene1) {
1413                 CChain& chain1 = **ic1;
1414                 sort(chain1.m_members.begin(),chain1.m_members.end(),GenomeOrderD());
1415             }
1416         }
1417     }
1418 
1419 
1420     ITERATE(TConflictMemebersInChains, it, conflict_members_in_chains) {
1421         CChain& chain = *it->first;
1422         const TMemberPtrSet& conflict_members = it->second;
1423 
1424         CAlignMap amap = chain.GetAlignMap();
1425 
1426         TSignedSeqRange hard_limits(chain.Exons().front().Limits().GetTo()-15,chain.Exons().back().Limits().GetFrom()+15);
1427         hard_limits = (hard_limits & chain.Limits());
1428         if(chain.ReadingFrame().NotEmpty())
1429             hard_limits = (chain.OpenCds() ? chain.MaxCdsLimits() : chain.RealCdsLimits());
1430 
1431         TSignedSeqRange noclip_limits = hard_limits;
1432 
1433         /*
1434         int hard_limits_len = amap.FShiftedLen(hard_limits);
1435         ITERATE(TContained, i, chain.m_members) {
1436             const CGeneModel& a = *(*i)->m_align;
1437             if(a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
1438                 continue;
1439             TSignedSeqRange alim(amap.ShrinkToRealPoints(a.Limits()&chain.Limits(),false));
1440             if(Include(alim,hard_limits.GetFrom()) ) {
1441                 TSignedSeqRange l(hard_limits.GetFrom(),alim.GetTo());
1442                 l = amap.ShrinkToRealPoints(l,false);
1443                 int len = 0;
1444                 if(l.NotEmpty())
1445                     len = amap.FShiftedLen(l);
1446                 if(len > 0.75*a.AlignLen() || len > 0.75*hard_limits_len)
1447                     noclip_limits.SetFrom(min(noclip_limits.GetFrom(),alim.GetFrom()));
1448             }
1449             if(Include(alim,hard_limits.GetTo())) {
1450                 TSignedSeqRange l(alim.GetFrom(),hard_limits.GetTo());
1451                 l = amap.ShrinkToRealPoints(l,false);
1452                 int len = 0;
1453                 if(l.NotEmpty())
1454                     len = amap.FShiftedLen(l);
1455                 if(len > 0.75*a.AlignLen() || len > 0.75*hard_limits_len)
1456                     noclip_limits.SetTo(max(noclip_limits.GetTo(),alim.GetTo()));
1457             }
1458         }
1459 
1460         noclip_limits = (noclip_limits & chain.Limits());
1461         */
1462 
1463         if(chain.Status()&CGeneModel::ePolyA) {
1464             if(chain.Strand() == ePlus) {
1465                 if(chain.m_coverage_drop_right < 0)
1466                     noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_polya_cap_right_soft_limit));
1467                 else
1468                     noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_coverage_drop_right));
1469             } else {
1470                 if(chain.m_coverage_drop_left < 0)
1471                     noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_polya_cap_left_soft_limit));
1472                 else
1473                     noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_coverage_drop_left));
1474             }
1475         }
1476         if(chain.Status()&CGeneModel::eCap) {
1477             if(chain.Strand() == ePlus) {
1478                 if(chain.m_coverage_drop_left < 0)
1479                     noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_polya_cap_left_soft_limit));
1480                 else
1481                     noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_coverage_drop_left));
1482             } else {
1483                 if(chain.m_coverage_drop_right < 0)
1484                     noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_polya_cap_right_soft_limit));
1485                 else
1486                     noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_coverage_drop_right));
1487             }
1488         }
1489 
1490         TSignedSeqRange new_limits = chain.Limits();
1491         ITERATE(TMemberPtrSet, im, conflict_members) {
1492             TSignedSeqRange alim = (*im)->m_align->Limits()&chain.Limits();
1493             if(alim.Empty())
1494                 continue;
1495             alim = amap.ShrinkToRealPoints(alim);
1496             if(alim.Empty())
1497                 continue;
1498             if(alim.GetFrom() < noclip_limits.GetFrom()) {
1499                 int to = min(noclip_limits.GetFrom(),alim.GetTo());
1500                 if(chain.m_coverage_drop_left > 0 && Include(alim,chain.m_coverage_drop_left)) {
1501                     to = min(noclip_limits.GetFrom(),chain.m_coverage_drop_left);
1502                 }
1503                 new_limits.SetFrom(max(new_limits.GetFrom(),to));
1504             } else if(alim.GetTo() > noclip_limits.GetTo()) {
1505                 int from = max(noclip_limits.GetTo(),alim.GetFrom());
1506                 if(chain.m_coverage_drop_right > 0 && Include(alim,chain.m_coverage_drop_right)) {
1507                     from = max(noclip_limits.GetTo(),chain.m_coverage_drop_right);
1508                 }
1509                 new_limits.SetTo(min(new_limits.GetTo(),from));
1510             }
1511         }
1512 
1513         int left_splice = -1;
1514         int right_splice = -1;
1515         for(int e = 1; e < (int)chain.Exons().size(); ++e) {
1516             if(left_splice < 0 && chain.Exons()[e-1].m_ssplice && Include(new_limits,chain.Exons()[e-1].GetTo()))
1517                 left_splice = chain.Exons()[e-1].GetTo();
1518             if(chain.Exons()[e].m_fsplice && Include(new_limits,chain.Exons()[e].GetFrom()))
1519                 right_splice = chain.Exons()[e].GetFrom();
1520         }
1521         map<int,double> left_weights;
1522         double left_weights_total = 0.;
1523         map<int,double> right_weights;
1524         double right_weights_total = 0.;
1525         ITERATE(TContained, i, chain.m_members) {
1526             const CGeneModel& a = *(*i)->m_align;
1527             if(a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
1528                 continue;
1529             TSignedSeqRange alim(amap.ShrinkToRealPoints(a.Limits()&chain.Limits(),false));
1530             for(int e = 1; e < (int)a.Exons().size(); ++e) {
1531                 if(a.Exons()[e-1].m_ssplice && a.Exons()[e-1].GetTo() == left_splice) {
1532                     left_weights[alim.GetFrom()] += a.Weight();
1533                     left_weights_total += a.Weight();
1534                 }
1535                 if(a.Exons()[e].m_fsplice && a.Exons()[e].GetFrom() == right_splice) {
1536                     right_weights[alim.GetTo()] += a.Weight();
1537                     right_weights_total += a.Weight();
1538                 }
1539             }
1540         }
1541         if(left_weights_total > 0.) {
1542             int left = numeric_limits<int>::max();
1543             double t = 0;
1544             for(map<int,double>::reverse_iterator it = left_weights.rbegin(); it != left_weights.rend(); ++it) {
1545                 if(t < 0.9*left_weights_total)
1546                     left = it->first;
1547                 t += it->second;
1548             }
1549             if(left < new_limits.GetFrom())
1550                 new_limits.SetFrom(left);
1551         }
1552         if(right_weights_total > 0.) {
1553             int right = 0;
1554             double t = 0;
1555             for(map<int,double>::iterator it = right_weights.begin(); it != right_weights.end(); ++it) {
1556                 if(t < 0.9*right_weights_total)
1557                     right = it->first;
1558                 t += it->second;
1559             }
1560             if(right > new_limits.GetTo())
1561                 new_limits.SetTo(right);
1562         }
1563 
1564         //if has to clip, clip to next cap/polya
1565         if(new_limits.GetFrom() != chain.Limits().GetFrom() && chain.m_polya_cap_left_soft_limit < chain.Limits().GetTo())
1566             new_limits.SetFrom(chain.m_polya_cap_left_soft_limit);
1567         if(new_limits.GetTo() != chain.Limits().GetTo() && chain.m_polya_cap_right_soft_limit > chain.Limits().GetFrom())
1568             new_limits.SetTo(chain.m_polya_cap_right_soft_limit);
1569 
1570         //don't clip confirmed ends
1571         if(chain.Status()&CGeneModel::eLeftConfirmed)
1572             new_limits.SetFrom(chain.Limits().GetFrom());
1573         if(chain.Status()&CGeneModel::eRightConfirmed)
1574             new_limits.SetTo(chain.Limits().GetTo());
1575 
1576         if(new_limits != chain.Limits()) {
1577             string note;
1578             if(new_limits.GetFrom() != chain.Limits().GetFrom())
1579                 note += "Left";
1580             if(new_limits.GetTo() != chain.Limits().GetTo())
1581                 note += "Right";
1582             note += " overlap UTR clip";
1583             chain.AddComment(note);
1584             _ASSERT(new_limits.NotEmpty());
1585 
1586             bool wasopen = chain.OpenCds();
1587             chain.ClipChain(new_limits);
1588             if(chain.Type()&CGeneModel::eNested)
1589                 chain.ClipLowCoverageUTR(0.1);
1590             _ASSERT(chain.Limits().NotEmpty());
1591             if(chain.ReadingFrame().NotEmpty()) {
1592                 m_gnomon->GetScore(chain, !no5pextension);
1593                 CCDSInfo cds = chain.GetCdsInfo();
1594                 if(wasopen != chain.OpenCds() && (wasopen == false || cds.HasStart())) {
1595                     cds.SetScore(cds.Score(),wasopen);
1596                     chain.SetCdsInfo(cds);
1597                 }
1598             }
1599             chain.CalculateDropLimits();
1600         }
1601     }
1602 }
1603 
1604 
1605 
1606 //visits all levels of nested and adds uniquely to contained
AddToContained(TContained & contained,TMemberPtrSet & included_in_list)1607 void SChainMember::AddToContained(TContained& contained, TMemberPtrSet& included_in_list) {
1608 
1609     list<const SChainMember*> not_visited(1,this);
1610     while(!not_visited.empty()) {
1611         const SChainMember* mbr = not_visited.front();
1612         for(int c = 0; c < (int)mbr->m_contained->size(); ++c) {
1613             SChainMember* mi = (*mbr->m_contained)[c];
1614             if(c < mbr->m_identical_count) {
1615                 if(included_in_list.insert(mi).second) {
1616                     contained.push_back(mi);                  //action
1617                     if(mi->m_copy != 0)
1618                         included_in_list.insert(mi->m_copy->begin(),mi->m_copy->end());
1619                 }
1620             } else if(included_in_list.find(mi) == included_in_list.end()) {
1621                 not_visited.push_back(mi);                //store for future
1622             }
1623         }
1624         not_visited.pop_front();
1625     }
1626 }
1627 
CollectContainedForMemeber()1628 TContained SChainMember::CollectContainedForMemeber() {
1629 
1630     TContained contained;
1631     TMemberPtrSet included_in_list;
1632     AddToContained(contained, included_in_list);
1633 
1634     return contained;
1635 }
1636 
CollectContainedForChain()1637 TContained SChainMember::CollectContainedForChain()
1638 {
1639     TContained contained;
1640     TMemberPtrSet included_in_list;
1641 
1642     AddToContained(contained, included_in_list);
1643 
1644     for (SChainMember* left = m_left_member; left != 0; left = left->m_left_member) {
1645         left->AddToContained(contained, included_in_list);
1646     }
1647 
1648     for (SChainMember* right = m_right_member; right != 0; right = right->m_right_member) {
1649         right->AddToContained(contained, included_in_list);
1650     }
1651 
1652     return contained;
1653 }
1654 
// Used below as part of the m_cds threshold (START_BONUS+25) that decides whether a copy of a
// member is marked together with the member.
// NOTE(review): the name suggests a score bonus given for a start codon - confirm against the other uses in this file.
#define START_BONUS 600
1656 
MarkIncludedForChain()1657 void SChainMember::MarkIncludedForChain()
1658 {
1659     TContained contained = CollectContainedForChain();
1660     NON_CONST_ITERATE (TContained, i, contained) {
1661         SChainMember* mi = *i;
1662         mi->m_included = true;
1663         if (mi->m_copy != 0) {
1664             ITERATE(TContained, j, *mi->m_copy) {
1665                 SChainMember* mj = *j;
1666                 if(mj->m_type != eCDS || mj->m_cds < START_BONUS+25 ||
1667                    (mi->m_align->Strand() == mj->m_align->Strand() &&
1668                     (mi->m_cds_info->ReadingFrame().GetFrom() == mj->m_cds_info->ReadingFrame().GetFrom() ||   // same copy or supressed start
1669                      mi->m_cds_info->ReadingFrame().GetTo() == mj->m_cds_info->ReadingFrame().GetTo())))       // same copy or supressed start
1670                     mj->m_included = true;
1671             }
1672         }
1673     }
1674 }
1675 
MarkPostponedForChain()1676 void SChainMember::MarkPostponedForChain()
1677 {
1678     TContained contained = CollectContainedForChain();
1679     NON_CONST_ITERATE (TContained, i, contained) {
1680         SChainMember* mi = *i;
1681         mi->m_postponed = true;
1682         if (mi->m_copy != 0) {
1683             ITERATE(TContained, j, *mi->m_copy) {
1684                 SChainMember* mj = *j;
1685                 if(mj->m_type != eCDS || mj->m_cds < START_BONUS+25 ||
1686                    (mi->m_align->Strand() == mj->m_align->Strand() &&
1687                     (mi->m_cds_info->ReadingFrame().GetFrom() == mj->m_cds_info->ReadingFrame().GetFrom() ||   // same copy or supressed start
1688                      mi->m_cds_info->ReadingFrame().GetTo() == mj->m_cds_info->ReadingFrame().GetTo())))       // same copy or supressed start
1689                     mj->m_postponed = true;
1690             }
1691         }
1692     }
1693 }
1694 
MarkUnwantedCopiesForChain(const TSignedSeqRange & cds)1695 void SChainMember::MarkUnwantedCopiesForChain(const TSignedSeqRange& cds)
1696 {
1697     TContained contained = CollectContainedForChain();
1698     NON_CONST_ITERATE (TContained, i, contained) {
1699         SChainMember* mi = *i;
1700         CGeneModel& algni = *mi->m_align;
1701         const CCDSInfo& cinfoi = *mi->m_cds_info;
1702         if(Include(cds, cinfoi.ReadingFrame())) {
1703             mi->m_marked_for_retention = true;
1704             mi->m_marked_for_deletion = false;
1705             if (mi->m_copy != 0) {
1706                 ITERATE(TContained, j, *mi->m_copy) {
1707                     SChainMember* mj = *j;
1708                     const CCDSInfo& cinfoj = *mj->m_cds_info;
1709                     if(mj->m_marked_for_retention)      // already included in cds
1710                         continue;
1711                     else if(cinfoi.HasStart() || cinfoj.HasStart()) {         // don't delete copy which overrides the start or has the start
1712                         if((algni.Strand() == ePlus && cinfoi.ReadingFrame().GetTo() == cinfoj.ReadingFrame().GetTo()) ||
1713                            (algni.Strand() == eMinus && cinfoi.ReadingFrame().GetFrom() == cinfoj.ReadingFrame().GetFrom()))
1714                             continue;
1715                     }
1716                     mj->m_marked_for_deletion = true;
1717                 }
1718             }
1719         }
1720     }
1721 }
1722 
1723 
1724 struct LeftOrder
1725 {
operator ()LeftOrder1726     bool operator()(const SChainMember* ap, const SChainMember* bp)    // right end increasing, short first if right end equal
1727     {
1728         TSignedSeqRange alimits = ap->m_align->Limits();
1729         //ignore flexible ends for sorting
1730         if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1731             alimits.SetFrom(alimits.GetTo());
1732         if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1733             alimits.SetTo(alimits.GetFrom());
1734         TSignedSeqRange blimits = bp->m_align->Limits();
1735         //ignore flexible ends for sorting
1736         if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1737             blimits.SetFrom(blimits.GetTo());
1738         if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1739             blimits.SetTo(blimits.GetFrom());
1740 
1741         if(alimits.GetTo() == blimits.GetTo())
1742             return (alimits.GetFrom() > blimits.GetFrom());
1743         else
1744             return (alimits.GetTo() < blimits.GetTo());
1745     }
1746 };
1747 
1748 struct LeftOrderD                                                      // use for sorting not for finding
1749 {
operator ()LeftOrderD1750     bool operator()(const SChainMember* ap, const SChainMember* bp)    // right end increasing, short first if right end equal
1751     {
1752         TSignedSeqRange alimits = ap->m_align->Limits();
1753         //ignore flexible ends for sorting
1754         if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1755             alimits.SetFrom(alimits.GetTo());
1756         if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1757             alimits.SetTo(alimits.GetFrom());
1758         TSignedSeqRange blimits = bp->m_align->Limits();
1759         //ignore flexible ends for sorting
1760         if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1761             blimits.SetFrom(blimits.GetTo());
1762         if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1763             blimits.SetTo(blimits.GetFrom());
1764 
1765         if(alimits == blimits)
1766             return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1767         else if(alimits.GetTo() == blimits.GetTo())
1768             return (alimits.GetFrom() > blimits.GetFrom());
1769         else
1770             return (alimits.GetTo() < blimits.GetTo());
1771     }
1772 };
1773 
1774 
1775 struct RightOrder
1776 {
operator ()RightOrder1777     bool operator()(const SChainMember* ap, const SChainMember* bp)   // left end decreasing, short first if left end equal
1778     {
1779         TSignedSeqRange alimits = ap->m_align->Limits();
1780         //ignore flexible ends for sorting
1781         if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1782             alimits.SetFrom(alimits.GetTo());
1783         if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1784             alimits.SetTo(alimits.GetFrom());
1785         TSignedSeqRange blimits = bp->m_align->Limits();
1786         //ignore flexible ends for sorting
1787         if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1788             blimits.SetFrom(blimits.GetTo());
1789         if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1790             blimits.SetTo(blimits.GetFrom());
1791 
1792         if(alimits.GetFrom() == blimits.GetFrom())
1793             return (alimits.GetTo() < blimits.GetTo());
1794         else
1795             return (alimits.GetFrom() > blimits.GetFrom());
1796     }
1797 };
1798 
1799 struct RightOrderD
1800 {
operator ()RightOrderD1801     bool operator()(const SChainMember* ap, const SChainMember* bp)   // left end decreasing, short first if left end equal
1802     {
1803         TSignedSeqRange alimits = ap->m_align->Limits();
1804         //ignore flexible ends for sorting
1805         if(ap->m_align->Status()&CGeneModel::eLeftFlexible)
1806             alimits.SetFrom(alimits.GetTo());
1807         if(ap->m_align->Status()&CGeneModel::eRightFlexible)
1808             alimits.SetTo(alimits.GetFrom());
1809         TSignedSeqRange blimits = bp->m_align->Limits();
1810         //ignore flexible ends for sorting
1811         if(bp->m_align->Status()&CGeneModel::eLeftFlexible)
1812             blimits.SetFrom(blimits.GetTo());
1813         if(bp->m_align->Status()&CGeneModel::eRightFlexible)
1814             blimits.SetTo(blimits.GetFrom());
1815 
1816         if(alimits == blimits)
1817             return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1818         else if(alimits.GetFrom() == blimits.GetFrom())
1819             return (alimits.GetTo() < blimits.GetTo());
1820         else
1821             return (alimits.GetFrom() > blimits.GetFrom());
1822     }
1823 };
1824 
1825 
1826 struct CdsNumOrder
1827 {
operator ()CdsNumOrder1828     bool operator()(const SChainMember* ap, const SChainMember* bp)
1829     {
1830         if(max(ap->m_cds,bp->m_cds) >= 300 && ap->m_cds != bp->m_cds) // only long cdses count
1831             return (ap->m_cds > bp->m_cds);
1832         else if(fabs(ap->m_splice_num - bp->m_splice_num) > 0.001)
1833             return (ap->m_splice_num > bp->m_splice_num);
1834         else if(fabs(ap->m_num - bp->m_num) > 0.001)
1835             return (ap->m_num > bp->m_num);
1836         else
1837             return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1838     }
1839 };
1840 
1841 struct ScoreOrder
1842 {
operator ()ScoreOrder1843     bool operator()(const SChainMember* ap, const SChainMember* bp)
1844     {
1845         if (ap->m_cds_info->Score() == bp->m_cds_info->Score())
1846             return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1847         else
1848             return (ap->m_cds_info->Score() > bp->m_cds_info->Score());
1849     }
1850 };
1851 
// Sorts the container in ascending order (operator<) and removes duplicate elements (operator==),
// leaving each distinct value exactly once.
template <class C>
void uniq(C& container)
{
    std::sort(container.begin(), container.end());
    auto duplicates_begin = std::unique(container.begin(), container.end());
    container.erase(duplicates_begin, container.end());
}
1858 
1859 class CChainMembers : public vector<SChainMember*> {
1860 public:
CChainMembers()1861     CChainMembers() { m_extra_cds.push_back(CCDSInfo()); }   // empty cds for utrs; first in the list
1862     CChainMembers(TGeneModelList& clust, TOrigAligns& orig_aligns, TUnmodAligns& unmodified_aligns);
1863     void InsertMember(CGeneModel& algn, SChainMember* copy_ofp = 0);
1864     void InsertMemberCopyWithCds(const CCDSInfo& cds, SChainMember* copy_ofp);
1865     void InsertMemberCopyAndStoreCds(const CCDSInfo& cds, SChainMember* copy_ofp);
1866     void InsertMemberCopyWithoutCds(SChainMember* copy_ofp);
1867     void InsertMember(SChainMember& m, SChainMember* copy_ofp = 0);
1868     void DuplicateUTR(SChainMember* copy_ofp);
1869     void SpliceFromOther(CChainMembers& other);
1870 private:
1871     CChainMembers(const CChainMembers& object) = delete;
1872     CChainMembers& operator=(const CChainMembers& object) = delete;
1873     list<SChainMember> m_members;
1874     list<TContained> m_copylist;
1875     list<CAlignMap> m_align_maps;
1876     list<TContained> m_containedlist;
1877     list<CCDSInfo> m_extra_cds;
1878 };
1879 
SpliceFromOther(CChainMembers & other)1880 void CChainMembers::SpliceFromOther(CChainMembers& other) {
1881     m_members.splice(m_members.end(),other.m_members);
1882     m_copylist.splice(m_copylist.end(),other.m_copylist);
1883     m_align_maps.splice(m_align_maps.end(),other.m_align_maps);
1884     m_containedlist.splice(m_containedlist.end(),other.m_containedlist);
1885     m_extra_cds.splice(m_extra_cds.end(),other.m_extra_cds);
1886     insert(end(),other.begin(),other.end());
1887 }
1888 
InsertMemberCopyWithCds(const CCDSInfo & cds,SChainMember * copy_ofp)1889 void CChainMembers::InsertMemberCopyWithCds(const CCDSInfo& cds, SChainMember* copy_ofp) {
1890 
1891     SChainMember mbr = *copy_ofp;
1892     mbr.m_cds_info = &cds;
1893     mbr.m_type = eCDS;
1894     InsertMember(mbr, copy_ofp);
1895 }
1896 
InsertMemberCopyAndStoreCds(const CCDSInfo & cds,SChainMember * copy_ofp)1897 void CChainMembers::InsertMemberCopyAndStoreCds(const CCDSInfo& cds, SChainMember* copy_ofp) {
1898 
1899     m_extra_cds.push_back(cds);
1900     InsertMemberCopyWithCds(m_extra_cds.back(), copy_ofp);
1901 }
1902 
InsertMemberCopyWithoutCds(SChainMember * copy_ofp)1903 void CChainMembers::InsertMemberCopyWithoutCds(SChainMember* copy_ofp) {
1904 
1905     SChainMember mbr = *copy_ofp;
1906     mbr.m_cds_info = &m_extra_cds.front(); // empty cds
1907     mbr.m_type = eLeftUTR;
1908     InsertMember(mbr, copy_ofp);
1909 }
1910 
1911 
InsertMember(CGeneModel & algn,SChainMember * copy_ofp)1912 void CChainMembers::InsertMember(CGeneModel& algn, SChainMember* copy_ofp)
1913 {
1914     SChainMember mbr;
1915     mbr.m_align = &algn;
1916     mbr.m_cds_info = &algn.GetCdsInfo();
1917     mbr.m_type = eCDS;
1918     if(algn.Score() == BadScore())
1919         mbr.m_type = eLeftUTR;
1920     if(copy_ofp) {
1921         mbr.m_orig_align = copy_ofp->m_orig_align;
1922         mbr.m_unmd_align = copy_ofp->m_unmd_align;
1923     }
1924     InsertMember(mbr, copy_ofp);
1925 }
1926 
InsertMember(SChainMember & m,SChainMember * copy_ofp)1927 void CChainMembers::InsertMember(SChainMember& m, SChainMember* copy_ofp)
1928 {
1929     m.m_mem_id = size()+1;
1930     m_members.push_back(m);
1931     push_back(&m_members.back());
1932 
1933     m_containedlist.push_back(TContained());
1934     m_members.back().m_contained = &m_containedlist.back();
1935 
1936     _ASSERT(copy_ofp == 0 || (m.m_align->Exons()==copy_ofp->m_align->Exons() && m.m_align->FrameShifts()==copy_ofp->m_align->FrameShifts()));
1937 
1938     if(copy_ofp == 0 || m.m_align->Strand() != copy_ofp->m_align->Strand()) {         // first time or reversed copy
1939         m_align_maps.push_back(CAlignMap(m.m_align->Exons(), m.m_align->FrameShifts(), m.m_align->Strand()));
1940         m_members.back().m_align_map = &m_align_maps.back();
1941     } else {
1942         m_members.back().m_align_map = copy_ofp->m_align_map;
1943     }
1944 
1945     if(copy_ofp != 0) {                                            // we are making a copy of member
1946         if(copy_ofp->m_copy == 0) {
1947             m_copylist.push_back(TContained(1,copy_ofp));
1948             copy_ofp->m_copy = &m_copylist.back();
1949         }
1950         m_members.back().m_copy = copy_ofp->m_copy;
1951         copy_ofp->m_copy->push_back(&m_members.back());
1952     }
1953 }
1954 
DuplicateUTR(SChainMember * copy_ofp)1955 void CChainMembers::DuplicateUTR(SChainMember* copy_ofp)
1956 {
1957     _ASSERT(copy_ofp->m_type == eLeftUTR);
1958     SChainMember new_mbr = *copy_ofp;
1959     new_mbr.m_type = eRightUTR;
1960     InsertMember(new_mbr, copy_ofp);
1961 }
1962 
1963 
CChainMembers(TGeneModelList & clust,TOrigAligns & orig_aligns,TUnmodAligns & unmodified_aligns)1964 CChainMembers::CChainMembers(TGeneModelList& clust, TOrigAligns& orig_aligns, TUnmodAligns& unmodified_aligns)
1965 {
1966     m_extra_cds.push_back(CCDSInfo());      // empty cds for utrs; first in the list
1967     NON_CONST_ITERATE(TGeneModelList, itcl, clust) {
1968         InsertMember(*itcl);
1969         m_members.back().m_orig_align = orig_aligns[itcl->ID()];
1970         if(unmodified_aligns.count(itcl->ID()))
1971             m_members.back().m_unmd_align = &unmodified_aligns[itcl->ID()];
1972     }
1973 }
1974 
1975 
ExtendedMaxCdsLimits(const CGeneModel & a,const CCDSInfo & cds)1976 TSignedSeqRange ExtendedMaxCdsLimits(const CGeneModel& a, const CCDSInfo& cds)
1977 {
1978     TSignedSeqRange limits(a.Limits().GetFrom()-1,a.Limits().GetTo()+1);
1979 
1980     return limits & cds.MaxCdsLimits();
1981 }
1982 
1983 
IncludeInContained(SChainMember & big,SChainMember & small)1984 void CChainer::CChainerImpl::IncludeInContained(SChainMember& big, SChainMember& small)
1985 {
1986     //all identical members are contained in each other; only one of them (with smaller m_mem_id) is contained in other members
1987     TSignedSeqRange big_limits = big.m_align->Limits();
1988     if(big.m_align->Status()&CGeneModel::eLeftFlexible)
1989         big_limits.SetFrom(big_limits.GetTo());
1990     if(big.m_align->Status()&CGeneModel::eRightFlexible)
1991         big_limits.SetTo(big_limits.GetFrom());
1992     TSignedSeqRange small_limits = small.m_align->Limits();
1993     bool small_flex = false;
1994     if(small.m_align->Status()&CGeneModel::eLeftFlexible) {
1995         small_limits.SetFrom(small_limits.GetTo());
1996         small_flex = true;
1997     }
1998     if(small.m_align->Status()&CGeneModel::eRightFlexible) {
1999         small_limits.SetTo(small_limits.GetFrom());
2000         small_flex = true;
2001     }
2002 
2003     if(big_limits == small_limits) {  // identical
2004         ++big.m_identical_count;
2005         big.m_contained->push_back(&small);
2006         return;
2007     } else if(big.m_sink_for_contained != nullptr &&
2008               small_limits.GetTo() <= big.m_sink_for_contained->m_align->Limits().GetTo() &&
2009               CanIncludeJinI(*big.m_sink_for_contained, small)) {
2010         return;  // contained in next level
2011     } else {
2012         big.m_contained->push_back(&small);
2013         if(!small_flex && (big.m_sink_for_contained == nullptr || small_limits.GetTo() > big.m_sink_for_contained->m_align->Limits().GetTo()))
2014             big.m_sink_for_contained = &small;
2015     }
2016 }
2017 
2018 
CutParts(TGeneModelList & models)2019 void CChainer::CutParts(TGeneModelList& models)
2020 {
2021     m_data->CutParts(models);
2022 }
2023 
CutParts(TGeneModelList & models)2024 void CChainer::CChainerImpl::CutParts(TGeneModelList& models) {
2025     ERASE_ITERATE(TGeneModelList, im, models) {
2026         TGeneModelList parts = GetAlignParts(*im, true);
2027         if(!parts.empty()) {
2028             models.splice(models.begin(),parts);
2029             models.erase(im);
2030         }
2031     }
2032 }
2033 
DuplicateNotOriented(CChainMembers & pointers,TGeneModelList & clust)2034 void CChainer::CChainerImpl::DuplicateNotOriented(CChainMembers& pointers, TGeneModelList& clust)
2035 {
2036     unsigned int initial_size = pointers.size();
2037     for(unsigned int i = 0; i < initial_size; ++i) {
2038         SChainMember& mbr = *pointers[i];
2039         CGeneModel& algn = *mbr.m_align;
2040         if((algn.Status()&CGeneModel::eUnknownOrientation) != 0) {
2041             CGeneModel new_algn = algn;
2042             new_algn.ReverseComplementModel();
2043             new_algn.Status() &= ~CGeneModel::eReversed;
2044             clust.push_back(new_algn);
2045             pointers.InsertMember(clust.back(), &mbr);    //reversed copy
2046         }
2047     }
2048 }
2049 
DuplicateUTRs(CChainMembers & pointers)2050 void CChainer::CChainerImpl::DuplicateUTRs(CChainMembers& pointers)
2051 {
2052     unsigned int initial_size = pointers.size();
2053     for(unsigned int i = 0; i < initial_size; ++i) {
2054         SChainMember& mbr = *pointers[i];
2055         if(mbr.m_align->Status()&CGeneModel::eLeftFlexible)
2056             mbr.m_type = eRightUTR;
2057         else if(mbr.m_align->Status()&CGeneModel::eRightFlexible)
2058             mbr.m_type = eLeftUTR;
2059         else if(mbr.m_cds_info->Score() == BadScore())
2060             pointers.DuplicateUTR(&mbr);
2061     }
2062 }
2063 
CalculateSpliceWeights(CChainMembers & pointers)2064 void CChainer::CChainerImpl::CalculateSpliceWeights(CChainMembers& pointers)
2065 {
2066     map<int, set<int> > oriented_splices;
2067     ITERATE(set<TSignedSeqRange>, i, oriented_introns_plus) {
2068         oriented_splices[ePlus].insert(i->GetFrom());
2069         oriented_splices[ePlus].insert(i->GetTo());
2070     }
2071     ITERATE(set<TSignedSeqRange>, i, oriented_introns_minus) {
2072         oriented_splices[eMinus].insert(i->GetFrom());
2073         oriented_splices[eMinus].insert(i->GetTo());
2074     }
2075 
2076     NON_CONST_ITERATE(CChainMembers, i, pointers) {
2077         SChainMember& mbr = **i;
2078         CGeneModel& algn = *mbr.m_align;
2079         if(algn.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
2080             continue;
2081         set<int>& ospl = oriented_splices[algn.Strand()];
2082         ITERATE(CGeneModel::TExons, ie, algn.Exons()) {
2083             TSignedSeqRange exon = *ie;
2084             for(set<int>::iterator spli = ospl.lower_bound(exon.GetFrom()); spli != ospl.end() && *spli <= exon.GetTo(); ++spli)
2085                 mbr.m_splice_weight += algn.Weight();
2086         }
2087     }
2088 }
2089 
// Replicates premature stops with status eSelenocysteine or eGenomeNotCorrect from the
// alignments that carry them to the other alignments of the same strand.
//
// Pass 1 collects such stops (per strand) together with an intron: a stop of length 3 is stored
// with the marker range (0,0); any other stop is stored once for every intron of its alignment.
// Pass 2 visits the non-flexible alignments overlapping the [left, right] span of collected stops:
//   - protein alignments that have a pstop get their own matching pstops overwritten by the
//     collected ones ("assigns status");
//   - non-protein alignments with an empty reading frame get a fresh CCDSInfo containing the
//     collected stops which fit their exons (and, for split stops, their introns).
void CChainer::CChainerImpl::ReplicatePStops(CChainMembers& pointers)
{
    // genomic span covered by all collected stops
    int left = numeric_limits<int>::max();
    int right = 0;
    typedef vector<pair<CCDSInfo::SPStop,TSignedSeqRange> > TPstopIntron;
    TPstopIntron pstops_with_intron_plus;
    TPstopIntron pstops_with_intron_minus;
    ITERATE(CChainMembers, i, pointers) {
        SChainMember& mbr = **i;
        CGeneModel& algn = *mbr.m_align;
        TPstopIntron& pstops_with_intron = (algn.Strand() == ePlus) ? pstops_with_intron_plus : pstops_with_intron_minus;
        ITERATE(CCDSInfo::TPStops, s, algn.GetCdsInfo().PStops()) {
            if(s->m_status == CCDSInfo::eSelenocysteine || s->m_status == CCDSInfo::eGenomeNotCorrect) {
                left = min(left,s->GetFrom());
                right = max(right,s->GetTo());
                if(s->GetLength() == 3) {
                    // (0,0) has length 1 and serves as the "no split" marker tested below
                    pstops_with_intron.push_back(make_pair(*s,TSignedSeqRange(0,0)));
                } else {
                    // stop is longer than 3 on the genome: paired with every intron of this alignment
                    // (this inner 'i' shadows the member iterator of the enclosing loop)
                    for(int i = 1; i < (int)algn.Exons().size(); ++i) {
                        TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
                        pstops_with_intron.push_back(make_pair(*s,intron));
                    }
                }
            }
        }
    }
    // NOTE(review): uniq() is defined elsewhere; presumably it sorts and removes duplicates -
    // the early 'break' in the non-protein branch below relies on ordering by stop position. Confirm.
    uniq(pstops_with_intron_plus);
    uniq(pstops_with_intron_minus);

    ITERATE(CChainMembers, i, pointers) {
        SChainMember& mbr = **i;
        CGeneModel& algn = *mbr.m_align;
        // alignment does not overlap the span of collected stops
        if(algn.Limits().GetFrom() > right || algn.Limits().GetTo() < left)
            continue;
        // protein without any pstop has nothing to update
        if((algn.Type()&CGeneModel::eProt) && !algn.PStop())
            continue;
        if(algn.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
            continue;

        TPstopIntron& pstops_with_intron = (algn.Strand() == ePlus) ? pstops_with_intron_plus : pstops_with_intron_minus;
        if(pstops_with_intron.empty())
            continue;

        if(algn.Type()&CGeneModel::eProt) {
            // work on copies, then rebuild the pstop list of the cds info
            CCDSInfo cds = algn.GetCdsInfo();
            CCDSInfo::TPStops pstops = cds.PStops();
            NON_CONST_ITERATE(CCDSInfo::TPStops, s, pstops) {
                ITERATE(TPstopIntron, si, pstops_with_intron) {
                    if(si->second.GetLength() == 1) {  // no split
                        if(si->first == *s)
                            *s = si->first;            // assigns status
                    } else {
                        // split stop: the alignment must have the very same intron
                        for(int i = 1; i < (int)algn.Exons().size(); ++i) {
                            TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
                            if(si->second == intron && si->first == *s)
                                *s = si->first;        // assigns status
                        }
                    }
                }
            }
            cds.ClearPStops();
            ITERATE(CCDSInfo::TPStops, s, pstops)
                cds.AddPStop(*s);
            algn.SetCdsInfo(cds);
        } else if(algn.ReadingFrame().Empty()) {
            // non-protein alignment without a reading frame: collect the stops that fit its structure
            CCDSInfo cds;
            const CGeneModel::TExons& exons = algn.Exons();
            ITERATE(TPstopIntron, si, pstops_with_intron) {
                if(si->first.GetTo() < algn.Limits().GetFrom())
                    continue;
                if(si->first.GetFrom() > algn.Limits().GetTo())
                    break;
                for(int i = 0; i < (int)exons.size(); ++i) {
                    // exon containing the left end of the stop
                    if(Include(exons[i].Limits(),si->first.GetFrom())) {
                        if(si->second.GetLength() == 1) {  // no split
                            // the whole stop must end within this exon
                            if(si->first.GetTo() <= exons[i].GetTo())
                                cds.AddPStop(si->first);
                        } else {
                            // split stop: needs the same intron right after this exon and
                            // must end within the next exon
                            if(i < (int)exons.size()-1) {
                                TSignedSeqRange intron(exons[i].GetTo(),exons[i+1].GetFrom());
                                if(intron == si->second && si->first.GetTo() <= exons[i+1].GetTo())
                                    cds.AddPStop(si->first);
                            }
                        }
                    }
                }
            }
            // replace the cds info only if at least one stop was added
            if(cds.PStop())
                algn.SetCdsInfo(cds);
        }
    }
}
2182 
ScoreCdnas(CChainMembers & pointers)2183 void CChainer::CChainerImpl::ScoreCdnas(CChainMembers& pointers)
2184 {
2185     NON_CONST_ITERATE(CChainMembers, i, pointers) {
2186         SChainMember& mbr = **i;
2187         CGeneModel& algn = *mbr.m_align;
2188 
2189         if(algn.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
2190             continue;
2191         if((algn.Type() & CGeneModel::eProt)!=0 || algn.ConfirmedStart())
2192             continue;
2193 
2194         m_gnomon->GetScore(algn);
2195         double ms = GoodCDNAScore(algn);
2196         RemovePoorCds(algn,ms);
2197 
2198         if(algn.Score() != BadScore())
2199             mbr.m_type = eCDS;
2200     }
2201 }
2202 
2203 
// Adds alternative copies of members to 'pointers' in two passes.
//
// Pass 1 (members present on entry): for non-flexible members that are not eRightUTR, have an
// empty ProtReadingFrame() and Score() below 5*minscor.m_min, inserts a copy for every edge
// reading frame which differs from the alignment's reading frame and, if the alignment has a
// score, a copy without CDS.
//
// Pass 2 (all members, including those added in pass 1): for members whose cds info has a start
// and whose MaxCdsLimits() are unbounded on the 5' side, restricts the member's own cds info to
// the start (in place) and inserts a copy without the start whose reading frame is extended,
// in frame, towards the 5' end of the alignment.
void CChainer::CChainerImpl::Duplicate5pendsAndShortCDSes(CChainMembers& pointers)
{
    unsigned int initial_size = pointers.size();
    for(unsigned int i = 0; i < initial_size; ++i) {
        SChainMember& mbr = *pointers[i];
        CGeneModel& algn = *mbr.m_align;

        if(algn.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
            continue;

        if(mbr.m_type == eRightUTR)   // avoid copying UTR copies
            continue;

        if(algn.GetCdsInfo().ProtReadingFrame().Empty() && algn.Score() < 5*minscor.m_min) {
            // this inner 'i' shadows the member index of the enclosing loop
            for(int i = 0; i < (int)algn.GetEdgeReadingFrames()->size(); ++i) {
                const CCDSInfo& cds_info = (*algn.GetEdgeReadingFrames())[i];
                if(cds_info.ReadingFrame() != algn.ReadingFrame()) {
                    pointers.InsertMemberCopyWithCds(cds_info, &mbr);    //copy with CDS
                }
            }

            if(algn.Score() != BadScore()) {
                pointers.InsertMemberCopyWithoutCds(&mbr);    //UTR copy
            }
        }
    }


    // second pass also covers the copies inserted above
    initial_size = pointers.size();
    for(unsigned int i = 0; i < initial_size; ++i) {
        SChainMember& mbr = *pointers[i];
        CGeneModel& algn = *mbr.m_align;
        // constness is cast away because the member's cds info is modified in place below
        CCDSInfo& acdsinfo = const_cast<CCDSInfo&>(*mbr.m_cds_info);

        if(acdsinfo.HasStart()) {
            // is the maximal CDS unbounded on the 5' side (left for plus strand, right for minus)?
            bool inf_5prime;
            if (algn.Strand()==ePlus) {
                inf_5prime = acdsinfo.MaxCdsLimits().GetFrom()==TSignedSeqRange::GetWholeFrom();
            } else {
                inf_5prime = acdsinfo.MaxCdsLimits().GetTo()==TSignedSeqRange::GetWholeTo();
            }
            if (inf_5prime) {
                // keep an unrestricted copy before the original is limited to its start
                CCDSInfo cdsinfo = acdsinfo;

                TSignedSeqPos start = (algn.Strand() == ePlus) ? acdsinfo.Start().GetFrom() : acdsinfo.Start().GetTo();
                acdsinfo.Set5PrimeCdsLimit(start);
                mbr.m_restricted_to_start = true;

                // in the copy: drop the start and extend the reading frame to the 5' edge of the
                // alignment, shifted by the remainder of the distance to the start modulo 3
                if(algn.Strand() == ePlus) {
                    int full_rf_left = algn.FShiftedMove(algn.Limits().GetFrom(),(algn.FShiftedLen(algn.Limits().GetFrom(), cdsinfo.Start().GetFrom(), false)-1)%3);
                    cdsinfo.SetStart(TSignedSeqRange::GetEmpty());
                    cdsinfo.SetScore(cdsinfo.Score(),false);
                    cdsinfo.SetReadingFrame(TSignedSeqRange(full_rf_left,cdsinfo.ReadingFrame().GetTo()));
                } else {
                    int full_rf_right = algn.FShiftedMove(algn.Limits().GetTo(),-(algn.FShiftedLen(cdsinfo.Start().GetTo(),algn.Limits().GetTo(),false)-1)%3);
                    cdsinfo.SetStart(TSignedSeqRange::GetEmpty());
                    cdsinfo.SetScore(cdsinfo.Score(),false);
                    cdsinfo.SetReadingFrame(TSignedSeqRange(cdsinfo.ReadingFrame().GetFrom(),full_rf_right));
                }

                // do not insert the copy if the original member of the same strand already has this reading frame
                if(mbr.m_copy != 0) {
                    if(mbr.m_copy->front()->m_align->Strand() == algn.Strand()) {     // first copy is original alignment; for not oriented the second copy is reverse
                        if(mbr.m_copy->front()->m_cds_info->ReadingFrame() == cdsinfo.ReadingFrame())
                            continue;
                    } else if((*mbr.m_copy)[1]->m_cds_info->ReadingFrame() == cdsinfo.ReadingFrame()) {
                        continue;
                    }
                }

                pointers.InsertMemberCopyAndStoreCds(cdsinfo, &mbr);
            }
        }

    }
}
2279 
// Returns, in their original order, the indels from 'indels' which satisfy
// InDelEnd() > lim.GetFrom() and Loc() <= lim.GetTo().
TInDels StrictlyContainedInDels(const TInDels& indels, const TSignedSeqRange& lim) {
    TInDels selected;
    for(const auto& indel : indels) {
        const bool ends_right_of_from = indel.InDelEnd() > lim.GetFrom();
        if(!ends_right_of_from)
            continue;
        if(indel.Loc() > lim.GetTo())
            continue;
        selected.push_back(indel);
    }
    return selected;
}
2288 
// Returns true if member mj may be treated as contained in member mi.
// The checks, in order:
//   - mi must not be flexible;
//   - same strand, and the limits of aj (collapsed to a single point for a flexible aj)
//     must lie inside the limits of ai;
//   - member types must be compatible;
//   - UTR members may intrude into the CDS of mi only within the bounds tested below;
//   - frameshifts of aj must be exactly those of ai which fall into aj;
//   - for CDS in CDS, the cds limits and the reading frames must agree;
//   - if ai is spliced, the introns of aj must coincide with consecutive introns of ai.
bool CChainer::CChainerImpl::CanIncludeJinI(const SChainMember& mi, const SChainMember& mj) {
    const CGeneModel& ai = *mi.m_align;
    const CGeneModel& aj = *mj.m_align;

    if(ai.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
        return false;

    // a flexible end is not fixed: only the opposite, fixed end of aj is used as its position
    bool jflex = false;
    TSignedSeqRange jlimits = aj.Limits();
    if(aj.Status()&CGeneModel::eLeftFlexible) {
        jlimits.SetFrom(jlimits.GetTo());
        jflex = true;
    }
    if(aj.Status()&CGeneModel::eRightFlexible) {
        jlimits.SetTo(jlimits.GetFrom());
        jflex = true;
    }

    if(aj.Strand() != ai.Strand() || !Include(ai.Limits(),jlimits))
        return false;

    if(mi.m_type != eCDS && mj.m_type  != mi.m_type)
        return false;    // avoid including UTR copy and avoid including CDS into UTR because that will change m_type

    const CCDSInfo& ai_cds_info = *mi.m_cds_info;
    TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
    TSignedSeqRange ai_max_cds = ai_cds_info.MaxCdsLimits()&ai.Limits();

    const CCDSInfo& aj_cds_info = *mj.m_cds_info;
    TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();

    // UTR in CDS
    if(mi.m_type == eCDS && mj.m_type == eLeftUTR) {
        if(!jflex && jlimits.GetTo()-ai_max_cds.GetFrom() >= 5)                                                                     // normal UTR don't go into CDS > 5bp
            return false;
        else if(jflex && (aj.Status()&CGeneModel::ePolyA) && (!ai_cds_info.HasStop() || jlimits.GetTo()-ai_max_cds.GetFrom() >= 5)) // flex polyA needs stop and don't go into CDS > 5bp
            return false;
        else if(jflex && (aj.Status()&CGeneModel::eCap) && ai_cds_info.HasStop() && ai_max_cds.GetTo()-jlimits.GetTo() <= 5)         // flex cap is allowed almost up to 3' UTR to be available if start moves
            return false;
    }
    // same rules as above, mirrored for a UTR on the right side of the CDS
    if(mi.m_type == eCDS && mj.m_type == eRightUTR) {
        if(!jflex && ai_max_cds.GetTo()-jlimits.GetFrom() >= 5)
            return false;
        else if(jflex && (aj.Status()&CGeneModel::ePolyA) && (!ai_cds_info.HasStop() || ai_max_cds.GetTo()-jlimits.GetFrom() >= 5))
            return false;
        else if(jflex && (aj.Status()&CGeneModel::eCap) && ai_cds_info.HasStop() && jlimits.GetFrom()-ai_max_cds.GetFrom() <= 5)
            return false;
    }

    if(aj.FrameShifts() != StrictlyContainedInDels(ai.FrameShifts(), aj.Limits()))    // not compatible frameshifts
        return false;

    if(mi.m_type == eCDS && mj.m_type == eCDS) { // CDS in CDS
        // the combined extended cds limits must fit into the intersection of both maximal cds limits
        TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();
        if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
            return false;;
        if(!Include(ai_rf,aj_rf))
            return false;

        // reading frames start at different points: their distance on the edited sequence
        // (mi.m_align_map) must be a multiple of 3
        if(ai_rf.GetFrom() != aj_rf.GetFrom()) {
            TSignedSeqPos j_from = mi.m_align_map->MapOrigToEdited(aj_rf.GetFrom());
            if(j_from < 0)
                return false;
            TSignedSeqPos i_from = mi.m_align_map->MapOrigToEdited(ai_rf.GetFrom());
            if(abs(j_from-i_from)%3 != 0)
                return false;
        }
    }

    int iex = ai.Exons().size();
    int jex = aj.Exons().size();
    if(jex > iex)
        return false;
    if(iex > 1) {                                               // big alignment is spliced
        // fex - first exon of ai which does not end to the left of aj
        // NOTE(review): ai.Exons()[fex] below is read without re-checking fex < iex; this presumably
        // is safe because jlimits was verified to lie inside ai.Limits() - confirm
        int fex = 0;
        while(fex < iex && ai.Exons()[fex].GetTo() < jlimits.GetFrom()) {
            ++fex;
        }
        if(ai.Exons()[fex].GetFrom() > jlimits.GetFrom())   // first aj exon is in ai intron
            return false;

        if(jlimits.GetLength() == 1) // flexible alignment
            return true;

        if(iex-fex < jex)                                       // not enough exons left in ai
            return false;

        if(ai.Exons()[fex+jex-1].GetTo() < jlimits.GetTo()) // last aj exon is in ai intron
            return false;

        // every intron of aj must be identical to the corresponding intron of ai
        for(int j = 0; j < jex-1; ++j) {
            if(aj.Exons()[j].GetTo() != ai.Exons()[fex+j].GetTo() || aj.Exons()[j+1].GetFrom() != ai.Exons()[fex+j+1].GetFrom())  // different intron
                return false;
        }
    }

    return true;
}
2387 
// Fills the containment information of all members and selects the members used for chaining:
//   1. collects the intron-flanking exon ends of all alignments and stores in every member the
//      nearest such positions (m_rlimb, m_llimb);
//   2. sorts the members with GenomeOrderD and, for every member mi, registers itself and every
//      mj accepted by CanIncludeJinI(mi, mj) through IncludeInContained();
//   3. sets m_not_for_chaining for spliced non-CDS members with unknown orientation or
//      non-consensus splices, and for contained members mj which pass all the filters at the
//      end of the inner loop.
void CChainer::CChainerImpl::FindContainedAlignments(TContained& pointers) {

    // exon ends adjacent to introns which have both splice flags set:
    // left_exon_ends - starts of the exons after an intron, right_exon_ends - ends of the exons before it
    set<int> left_exon_ends, right_exon_ends;
    ITERATE(TContained, ip, pointers) {
        const CGeneModel& algn = *(*ip)->m_align;
        for(int i = 1; i < (int)algn.Exons().size(); ++i) {
            if(algn.Exons()[i-1].m_ssplice && algn.Exons()[i].m_fsplice) {
                left_exon_ends.insert(algn.Exons()[i].GetFrom());
                right_exon_ends.insert(algn.Exons()[i-1].GetTo());
            }
        }
    }
    // m_rlimb - smallest collected right exon end >= right limit of the alignment,
    // m_llimb - smallest collected left exon end > left limit of the alignment;
    // both default to the maximal int when there is no such position
    NON_CONST_ITERATE(TContained, ip, pointers) {
        SChainMember& mi = **ip;
        CGeneModel& ai = *mi.m_align;

        set<int>::iterator ri = right_exon_ends.lower_bound(ai.Limits().GetTo()); // leftmost compatible rexon
        mi.m_rlimb =  numeric_limits<int>::max();
        if(ri != right_exon_ends.end())
            mi.m_rlimb = *ri;
        set<int>::iterator li = left_exon_ends.upper_bound(ai.Limits().GetFrom()); // leftmost not compatible lexon
        mi.m_llimb = numeric_limits<int>::max() ;
        if(li != left_exon_ends.end())
            mi.m_llimb = *li;
    }

//  finding contained subalignments (alignment is contained in itself) and selecting longer alignments for chaining

    sort(pointers.begin(),pointers.end(),GenomeOrderD());
    // jfirst - index of the first member of the current run of members with identical limits
    int jfirst = 0;
    for(int i = 0; i < (int)pointers.size(); ++i) {
        SChainMember& mi = *pointers[i];
        CGeneModel& ai = *mi.m_align;
        const CCDSInfo& ai_cds_info = *mi.m_cds_info;

        // knockdown spliced notconsensus UTRs in reads
        if(mi.m_type != eCDS && ai.Exons().size() > 1) {
            if(ai.Status()&CGeneModel::eUnknownOrientation) {
                mi.m_not_for_chaining = true;
            } else {
                // introns with an "XX" signature on either side are not judged
                // (this inner 'i' shadows the member index of the enclosing loop)
                for(int i = 1; i < (int)ai.Exons().size(); ++i) {
                    if(ai.Exons()[i-1].m_ssplice_sig == "XX" || ai.Exons()[i].m_fsplice_sig == "XX")
                        continue;
                    else if(ai.Strand() == ePlus && (ai.Exons()[i-1].m_ssplice_sig != "GT" || ai.Exons()[i].m_fsplice_sig != "AG"))
                        mi.m_not_for_chaining = true;
                    else if(ai.Strand() == eMinus && (ai.Exons()[i-1].m_ssplice_sig != "AG" || ai.Exons()[i].m_fsplice_sig != "GT"))
                        mi.m_not_for_chaining = true;
                }
            }
        }

        //don't use alignments intersection with frameshifts for hiding smaller alignments
        // intersect_with_fs accumulates the span of all frameshifts from all_frameshifts touching exons of ai
        TSignedSeqRange intersect_with_fs;
        ITERATE(TInDels, indl, all_frameshifts) {
            if(indl->InDelEnd() < ai.Limits().GetFrom())
                continue;
            else if(indl->Loc() >  ai.Limits().GetTo()+1)
                break;
            else {
                ITERATE(CGeneModel::TExons, e, ai.Exons()) {
                    if(indl->IntersectingWith(e->GetFrom(), e->GetTo()))
                        intersect_with_fs += TSignedSeqRange(indl->Loc(), indl->InDelEnd());
                }
            }
        }

        if(pointers[jfirst]->m_align->Limits() != ai.Limits())
            jfirst = i;
        // scan the candidates which start not later than the right end of ai
        for(int j = jfirst; j < (int)pointers.size() && pointers[j]->m_align->Limits().GetFrom() <= ai.Limits().GetTo(); ++j) {

            if(i == j) {
                IncludeInContained(mi, mi);          // include self
                continue;
            }

            SChainMember& mj = *pointers[j];
            CGeneModel& aj = *mj.m_align;
            const CCDSInfo& aj_cds_info = *mj.m_cds_info;

            if(CanIncludeJinI(mi, mj))
                IncludeInContained(mi, mj);
            else
                continue;

            // everything below only decides whether the contained mj is excluded from chaining
            if(mi.m_not_for_chaining)
                continue;

            if(intersect_with_fs.NotEmpty() && !Include(aj.Limits(), intersect_with_fs))
               continue;

            if(mj.m_type  != mi.m_type)
                continue;
            if((aj.Status()&CGeneModel::ePolyA) != 0 || (aj.Status()&CGeneModel::eCap) != 0)
                continue;
            if((aj.Type()&CGeneModel::eProt) != 0)                               // proteins (actually only gapped) should be directly available
                continue;
            if(ai.Limits() == aj.Limits())
                continue;
            if(mj.m_rlimb < ai.Limits().GetTo() || mj.m_llimb != mi.m_llimb)      // bigger alignment may interfere with splices
                continue;
            if(mi.m_type == eCDS && mj.m_type == eCDS && !Include(ai_cds_info.MaxCdsLimits(),aj_cds_info.MaxCdsLimits()))  // bigger alignment restricts the cds
                continue;

            mj.m_not_for_chaining = true;
        }
    }
}
2495 
// Amount subtracted from a CDS length score for every intron (both splice flags set) inside the
// reading frame whose combined mrna_count+est_count+rnaseq_count is zero; applied only when has_rnaseq is set
#define NON_CDNA_INTRON_PENALTY 20
2497 
// Decides whether member mi can be connected to member mj in the left-to-right pass
// (LeftRight() calls it with mj preceding mi in its sorted order).
//
// On success returns true and sets the increments to be added to the left totals of mj:
//   delta_cds        - m_cds of mi minus the CDS overlap with mj (including bonus/penalty corrections)
//   delta_num        - summed Weight() of the members of 'contained' counted as new
//   delta_splice_num - summed m_splice_weight of the same members
// 'contained' is the list of members contained in mi; the caller passes it sorted.
// When false is returned the output parameters may be left unset.
bool CChainer::CChainerImpl::LRCanChainItoJ(int& delta_cds, double& delta_num, double& delta_splice_num, SChainMember& mi, SChainMember& mj, TContained& contained) {

    const CGeneModel& ai = *mi.m_align;
    const CGeneModel& aj = *mj.m_align;


    if(aj.Strand() != ai.Strand())
        return false;

    // *_rf - span from start to stop including the reading frame;
    // "complete" refers to the genomic left end of ai and the genomic right end of aj
    const CCDSInfo& ai_cds_info = *mi.m_cds_info;
    TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
    bool ai_left_complete = ai.Strand() == ePlus ? ai_cds_info.HasStart() : ai_cds_info.HasStop();

    const CCDSInfo& aj_cds_info = *mj.m_cds_info;
    TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();
    bool aj_right_complete = aj.Strand() == ePlus ? aj_cds_info.HasStop() : aj_cds_info.HasStart();

    bool j_rflexible = aj.Status()&CGeneModel::eRightFlexible;
    bool i_lflexible = ai.Status()&CGeneModel::eLeftFlexible;
    // allowed combinations of member types (mj is on the left, mi on the right):
    //   CDS      <- not RightUTR; LeftUTR only if ai is complete on the left and, unless aj is
    //               right-flexible, aj overlaps the reading frame of ai by at most 5
    //   LeftUTR  <- LeftUTR only
    //   RightUTR <- not LeftUTR; CDS only if aj is complete on the right and, unless ai is
    //               left-flexible, ai overlaps the reading frame of aj by at most 5
    switch(mi.m_type) {
    case eCDS:
        if(mj.m_type == eRightUTR)
            return false;
        else if(mj.m_type == eLeftUTR && (!ai_left_complete || (!j_rflexible && (aj.Limits()&ai_rf).GetLength() > 5)))
            return false;
        else
            break;
    case eLeftUTR:
        if(mj.m_type != eLeftUTR)
            return false;
        else
            break;
    case eRightUTR:
        if(mj.m_type == eLeftUTR)
            return false;
        else if(mj.m_type == eCDS && (!aj_right_complete || (!i_lflexible && (ai.Limits()&aj_rf).GetLength() > 5)))
            return false;
        else
            break;
    default:
        return false;
    }

    switch(ai.MutualExtension(aj)) {
    case 0:              // not compatible
        return false;
    case 1:              // no introns in intersection
        if(mi.m_type == eCDS && mj.m_type == eCDS)  // no intersecting limit for coding
            break;
        if ((ai.Limits() & aj.Limits()).GetLength() < intersect_limit)
            return false;
        break;
    default:             // one or more introns in intersection
        break;
    }

    TSignedSeqRange overlap = (ai.Limits() & aj.Limits());
    if(StrictlyContainedInDels(ai.FrameShifts(), overlap) !=  StrictlyContainedInDels(aj.FrameShifts(), overlap))   // incompatible frameshifts
        return false;

    int cds_overlap = 0;

    if(mi.m_type == eCDS && mj.m_type == eCDS) {
        // negative value means a gap between the two reading frames
        int genome_overlap =  ai_rf.GetLength()+aj_rf.GetLength()-(ai_rf+aj_rf).GetLength();
        if(genome_overlap < 0)
            return false;

        TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();

        if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
            return false;

        // one reading frame inside the other is accepted only if they share an end
        if((Include(ai_rf,aj_rf) || Include(aj_rf,ai_rf)) && ai_rf.GetFrom() != aj_rf.GetFrom() && ai_rf.GetTo() != aj_rf.GetTo())
            return false;

        // overlap measured through the alignment map of mi must be a whole number of codons
        cds_overlap = mi.m_align_map->FShiftedLen(ai_rf&aj_rf,false);
        if(cds_overlap%3 != 0)
            return false;

        // start bonus is included in both members: count it as shared
        if(ai_cds_info.HasStart() && aj_cds_info.HasStart())
            cds_overlap += START_BONUS;

        if(has_rnaseq) {
            // the penalty for an unsupported intron lying in both reading frames is shared as well
            for(int i = 1; i < (int)ai.Exons().size(); ++i) {
                if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
                    TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
                    if(Include(ai_rf,intron) && Include(aj_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
                        cds_overlap -= NON_CDNA_INTRON_PENALTY;
                    }
                }
            }
        }
    }

    delta_cds = mi.m_cds-cds_overlap;

    // if either side is flexible, all members contained in mi are counted;
    // otherwise only those located by upper_bound relative to mj
    TContained::const_iterator endsp = contained.begin();
    if(!j_rflexible && !i_lflexible)
         endsp = upper_bound(contained.begin(), contained.end(), &mj, LeftOrder()); // first alignment contained in ai and outside aj
    delta_num = 0;
    delta_splice_num = 0;
    for(TContained::const_iterator ic = endsp; ic != contained.end(); ++ic) {
        delta_num += (*ic)->m_align->Weight();
        delta_splice_num += (*ic)->m_splice_weight;
    }

    return true;
}
2606 
2607 
LRIinit(SChainMember & mi)2608 void CChainer::CChainerImpl::LRIinit(SChainMember& mi) {
2609     const CCDSInfo& ai_cds_info = *mi.m_cds_info;
2610     TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
2611 
2612     TContained micontained = mi.CollectContainedForMemeber();
2613     mi.m_num = 0;
2614     mi.m_splice_num = 0;
2615     for(auto p : micontained) {
2616         mi.m_num += p->m_align->Weight();
2617         mi.m_splice_num = p->m_splice_weight;
2618     }
2619 
2620     const CGeneModel& ai = *mi.m_align;
2621     mi.m_cds = mi.m_align_map->FShiftedLen(ai_rf,false);
2622     if(ai_cds_info.HasStart()) {
2623         mi.m_cds += START_BONUS;
2624         _ASSERT((ai.Strand() == ePlus && ai_cds_info.Start().GetFrom() == ai_cds_info.MaxCdsLimits().GetFrom()) ||
2625                 (ai.Strand() == eMinus && ai_cds_info.Start().GetTo() == ai_cds_info.MaxCdsLimits().GetTo()));
2626     }
2627 
2628     if(has_rnaseq) {
2629         for(int i = 1; i < (int)ai.Exons().size(); ++i) {
2630             if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
2631                 TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
2632                 if(Include(ai_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2633                     mi.m_cds -= NON_CDNA_INTRON_PENALTY;
2634                 }
2635             }
2636         }
2637     }
2638 
2639     mi.m_left_member = 0;
2640     mi.m_left_num = mi.m_num;
2641     mi.m_left_splice_num = mi.m_splice_num;
2642     mi.m_left_cds =  mi.m_cds;
2643 
2644     mi.m_gapped_connection = false;
2645     mi.m_fully_connected_to_part = -1;
2646 }
2647 
LeftRight(TContained & pointers)2648 void CChainer::CChainerImpl::LeftRight(TContained& pointers)
2649 {
2650     sort(pointers.begin(),pointers.end(),LeftOrderD());
2651     TIVec right_ends(pointers.size());
2652     for(int k = 0; k < (int)pointers.size(); ++k) {
2653         auto& kalign = *pointers[k]->m_align;
2654         int rend = kalign.Limits().GetTo();
2655         if(kalign.Status()&CGeneModel::eRightFlexible)
2656             rend = kalign.Limits().GetFrom();
2657         right_ends[k] = rend;
2658     }
2659     NON_CONST_ITERATE(TContained, i, pointers) {
2660         SChainMember& mi = **i;
2661         CGeneModel& ai = *mi.m_align;
2662 
2663         LRIinit(mi);
2664         TContained micontained = mi.CollectContainedForMemeber();
2665         sort(micontained.begin(),micontained.end(),LeftOrderD());
2666 
2667         TIVec::iterator lb = lower_bound(right_ends.begin(),right_ends.end(),ai.Limits().GetFrom()-2*flex_len); // give some extra for flexible
2668         TContained::iterator jfirst = pointers.begin();
2669         if(lb != right_ends.end())
2670             jfirst = pointers.begin()+(lb-right_ends.begin()); // skip all on the left side
2671         for(TContained::iterator j = jfirst; j < i; ++j) {
2672             SChainMember& mj = **j;
2673             CGeneModel& aj = *mj.m_align;
2674             if(aj.Limits().GetTo() < ai.Limits().GetFrom())    // skip not overlapping (may exist because of flex_len)
2675                 continue;
2676 
2677             int delta_cds;
2678             double delta_num;
2679             double delta_splice_num;
2680             if(LRCanChainItoJ(delta_cds, delta_num, delta_splice_num, mi, mj, micontained)) {
2681                 int newcds = mj.m_left_cds+delta_cds;
2682                 double newnum = mj.m_left_num+delta_num;
2683                 double newsplicenum = mj.m_left_splice_num+delta_splice_num;
2684 
2685                 bool better_connection = false;
2686                 if(newcds != mi.m_left_cds) {
2687                     better_connection = (newcds > mi.m_left_cds);
2688                 } else if(fabs(newsplicenum - mi.m_left_splice_num) > 0.001) {
2689                     better_connection = (newsplicenum > mi.m_left_splice_num);
2690                 } else if(newnum > mi.m_left_num) {
2691                     better_connection = true;
2692                 }
2693 
2694                 if(better_connection) {
2695                     mi.m_left_cds = newcds;
2696                     mi.m_left_splice_num = newsplicenum;
2697                     mi.m_left_num = newnum;
2698                     mi.m_left_member = &mj;
2699                     _ASSERT(((ai.Status()&CGeneModel::eLeftFlexible) || aj.Limits().GetFrom() < ai.Limits().GetFrom())
2700                             && ((aj.Status()&CGeneModel::eRightFlexible) || aj.Limits().GetTo() < ai.Limits().GetTo()));
2701                 }
2702             }
2703         }
2704     }
2705 }
2706 
// Right-to-left linking pass over chain members.
// For every member of `pointers` this selects the best member to link on its right side
// (stored in m_right_member) and stores the accumulated right-side totals in
// m_right_cds, m_right_splice_num and m_right_num.
// `pointers` is re-sorted in place with RightOrderD(); candidates for a member are only
// taken from the members that precede it in that order, so their right-side totals are
// already final when they are used.
void CChainer::CChainerImpl::RightLeft(TContained& pointers)
{
    sort(pointers.begin(),pointers.end(),RightOrderD());
    // left_ends[k] corresponds to pointers[k]: the left end of the alignment, or its right end
    // when the alignment is right-flexible
    TIVec left_ends(pointers.size());
    for(int k = 0; k < (int)pointers.size(); ++k) {
        auto& kalign = *pointers[k]->m_align;
        int lend = kalign.Limits().GetFrom();
        if(kalign.Status()&CGeneModel::eRightFlexible)
            lend = kalign.Limits().GetTo();
        left_ends[k] = lend;
    }
    NON_CONST_ITERATE(TContained, i, pointers) {
        SChainMember& mi = **i;
        CGeneModel& ai = *mi.m_align;
        const CCDSInfo& ai_cds_info = *mi.m_cds_info;
        // combination of start, reading frame and stop ranges of ai
        TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
        TSignedSeqRange ai_limits = ai.Limits();
        // stop is required on the plus strand, start on the minus strand
        bool ai_right_complete = ai.Strand() == ePlus ? ai_cds_info.HasStop() : ai_cds_info.HasStart();

        // initial state: no right neighbor, totals are the member's own values
        mi.m_right_member = 0;
        mi.m_right_num = mi.m_num;
        mi.m_right_splice_num = mi.m_splice_num;
        mi.m_right_cds =  mi.m_cds;
        TContained micontained = mi.CollectContainedForMemeber();
        sort(micontained.begin(),micontained.end(),RightOrderD());

        // the search value is widened by 2*flex_len; if nothing qualifies the scan starts from the beginning
        TIVec::iterator lb = lower_bound(left_ends.begin(),left_ends.end(),ai.Limits().GetTo()+2*flex_len,greater<int>()); // first potentially intersecting
        TContained::iterator jfirst = pointers.begin();
        if(lb != left_ends.end())
            jfirst = pointers.begin()+(lb-left_ends.begin()); // skip all on the right side
        for(TContained::iterator j = jfirst; j < i; ++j) {
            SChainMember& mj = **j;
            CGeneModel& aj = *mj.m_align;

            if(aj.Strand() != ai.Strand())
                continue;
            if(aj.Limits().GetFrom() > ai.Limits().GetTo())   // skip not overlapping (may exist because of flex_len)
                continue;

            const CCDSInfo& aj_cds_info = *mj.m_cds_info;
            TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();
            // start is required on the plus strand, stop on the minus strand
            bool aj_left_complete = aj.Strand() == ePlus ? aj_cds_info.HasStart() : aj_cds_info.HasStop();

            bool j_lflexible = aj.Status()&CGeneModel::eLeftFlexible;
            bool i_rflexible = ai.Status()&CGeneModel::eRightFlexible;
            // Filter by member types. NB: 'continue' inside this switch skips to the next j,
            // 'break' leaves the switch and lets the pair be examined further.
            switch(mi.m_type)
            {
                case eCDS:
                    // a CDS is never followed by a left UTR; a right UTR is accepted only if ai is complete
                    // on the right and aj is either left-flexible or overlaps ai_rf by no more than 5
                    if(mj.m_type == eLeftUTR)
                        continue;
                    if(mj.m_type == eRightUTR && (!ai_right_complete || (!j_lflexible && (aj.Limits()&ai_rf).GetLength() > 5)))
                        continue;
                    else
                        break;
                case eRightUTR:
                    // a right UTR is followed only by another right UTR
                    if(mj.m_type != eRightUTR)
                        continue;
                    else
                        break;
                case eLeftUTR:
                    // a left UTR is never followed by a right UTR; a CDS is accepted only if aj is complete
                    // on the left and ai is either right-flexible or overlaps aj_rf by no more than 5
                    if(mj.m_type == eRightUTR)
                        continue;
                    if(mj.m_type == eCDS && (!aj_left_complete || (!i_rflexible && (ai.Limits()&aj_rf).GetLength() > 5)))
                        continue;
                    else
                        break;
                default:
                    continue;
            }

            switch(ai.MutualExtension(aj))
            {
                case 0:              // not compatible
                    continue;
                case 1:              // no introns in intersection
                {
                    if(mi.m_type == eCDS && mj.m_type == eCDS)  // no intersecting limit for coding
                        break;

                    // otherwise the intersection must be at least intersect_limit long
                    int intersect = (ai_limits & aj.Limits()).GetLength();
                    if(intersect < intersect_limit) continue;
                    break;
                }
                default:             // one or more introns in intersection
                    break;
            }

            TSignedSeqRange overlap = (ai.Limits() & aj.Limits());
            if(StrictlyContainedInDels(ai.FrameShifts(), overlap) !=  StrictlyContainedInDels(aj.FrameShifts(), overlap))   // incompatible frameshifts
                continue;

            // amount subtracted from mi.m_cds when linking to mj; stays 0 unless both members are eCDS
            int cds_overlap = 0;

            if(mi.m_type == eCDS && mj.m_type == eCDS) {
                // sum of the two lengths minus the length of the combined range; a negative value rejects the pair
                int genome_overlap =  ai_rf.GetLength()+aj_rf.GetLength()-(ai_rf+aj_rf).GetLength();
                if(genome_overlap < 0)
                    continue;

                TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();

                // the combined extended limits must fit into the common maximal CDS limits
                if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
                    continue;

                // reject when one range includes the other and they share neither the left nor the right end
                if((Include(ai_rf,aj_rf) || Include(aj_rf,ai_rf)) && ai_rf.GetFrom() != aj_rf.GetFrom() && ai_rf.GetTo() != aj_rf.GetTo())
                    continue;

                // length of the common part as reported by FShiftedLen must be a multiple of 3
                cds_overlap = mi.m_align_map->FShiftedLen(ai_rf&aj_rf,false);
                if(cds_overlap%3 != 0)
                    continue;

                if(ai_cds_info.HasStart() && aj_cds_info.HasStart())
                    cds_overlap += START_BONUS;

                if(has_rnaseq) {
                    // NB: this 'i' is an exon index; it shadows the outer member iterator inside the loop.
                    // Each spliced intron of ai included in both ai_rf and aj_rf and having a zero combined
                    // mrna/est/rnaseq count reduces cds_overlap by NON_CDNA_INTRON_PENALTY.
                    for(int i = 1; i < (int)ai.Exons().size(); ++i) {
                        if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
                            TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
                            if(Include(ai_rf,intron) && Include(aj_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
                                cds_overlap -= NON_CDNA_INTRON_PENALTY;
                            }
                        }
                    }
                }
            }


            // cds total for mi if mj becomes its right neighbor
            int delta_cds = mi.m_cds-cds_overlap;
            int newcds = mj.m_right_cds+delta_cds;

            // if either aj is left-flexible or ai is right-flexible, all members contained in ai are counted
            TContained::iterator endsp = micontained.begin();
            if(!j_lflexible && !i_rflexible)
                endsp = upper_bound(micontained.begin(),micontained.end(),&mj,RightOrder()); // first alignment contained in ai and outside aj
            double delta_num = 0;
            double delta_splice_num = 0;
            for(TContained::iterator ic = endsp; ic != micontained.end(); ++ic) {
                delta_num += (*ic)->m_align->Weight();
                delta_splice_num += (*ic)->m_splice_weight;
            }
            double newnum = mj.m_right_num+delta_num;
            double newsplicenum = mj.m_right_splice_num+delta_splice_num;

            // compare with the current best: cds first, then splice number (differences up to 0.001
            // are treated as a tie), then number
            bool better_connection = false;
            if(newcds != mi.m_right_cds) {
                better_connection = (newcds > mi.m_right_cds);
            } else if(fabs(newsplicenum - mi.m_right_splice_num) > 0.001) {
                better_connection = (newsplicenum > mi.m_right_splice_num);
            } else if(newnum > mi.m_right_num) {
                better_connection = true;
            }

            if(better_connection) {
                mi.m_right_cds = newcds;
                mi.m_right_splice_num = newsplicenum;
                mi.m_right_num = newnum;
                mi.m_right_member = &mj;
                // unless the corresponding end is flexible, the chosen neighbor must extend further right
                // and start further right than ai
                _ASSERT(((aj.Status()&CGeneModel::eLeftFlexible) || aj.Limits().GetFrom() > ai.Limits().GetFrom())
                        && ((ai.Status()&CGeneModel::eRightFlexible) || aj.Limits().GetTo() > ai.Limits().GetTo()));
            }
        }
    }
}
2868 
2869 
2870 
2871 
2872 #include <stdio.h>
2873 #include <time.h>
2874 /*
2875     time_t seconds0   = time (NULL);
2876     time_t seconds1   = time (NULL);
2877     cerr << "Time1: " << (seconds1-seconds0)/60. << endl;
2878 */
2879 
2880 
MemberIsCoding(const SChainMember * mp)2881 bool MemberIsCoding(const SChainMember* mp) {
2882     return (mp->m_cds_info->Score() != BadScore());
2883 }
2884 
MemberIsMarkedForDeletion(const SChainMember * mp)2885 bool MemberIsMarkedForDeletion(const SChainMember* mp) {
2886     return mp->m_marked_for_deletion;
2887 }
2888 
2889 // returns essential members of the chain for debugging
GetLinkedIdsForMember(const SChainMember & mi)2890 string GetLinkedIdsForMember(const SChainMember& mi) {
2891     vector<const SChainMember*> mal;
2892     mal.push_back(&mi);
2893     for (SChainMember* left = mi.m_left_member; left != 0; left = left->m_left_member) {
2894         mal.push_back(left);
2895     }
2896     for (SChainMember* right = mi.m_right_member; right != 0; right = right->m_right_member) {
2897         mal.push_back(right);
2898     }
2899     sort(mal.begin(),mal.end(),GenomeOrderD());
2900     string note = to_string(mi.m_align->ID());  //+":"+to_string(mi.m_mem_id);;
2901     ITERATE(vector<const SChainMember*>, imal, mal) {
2902         note = note+" "+to_string((*imal)->m_align->ID());  //+":"+to_string((*imal)->m_mem_id);
2903     }
2904     return note;
2905 }
2906 
GoodSupportForIntrons(const CGeneModel & chain,const SMinScor & minscor,map<TSignedSeqRange,int> & mrna_count,map<TSignedSeqRange,int> & est_count,map<TSignedSeqRange,int> & rnaseq_count)2907 bool GoodSupportForIntrons(const CGeneModel& chain, const SMinScor& minscor,
2908                                    map<TSignedSeqRange,int>& mrna_count, map<TSignedSeqRange,int>& est_count, map<TSignedSeqRange,int>& rnaseq_count) {
2909     bool good = true;
2910     for(int i = 1; i < (int)chain.Exons().size() && good; ++i) {
2911         if(chain.Exons()[i-1].m_ssplice && chain.Exons()[i].m_fsplice) {
2912             TSignedSeqRange intron(chain.Exons()[i-1].Limits().GetTo(),chain.Exons()[i].Limits().GetFrom());
2913             if(mrna_count[intron] < minscor.m_minsupport_mrna && mrna_count[intron]+est_count[intron] < minscor.m_minsupport && rnaseq_count[intron] < minscor.m_minsupport_rnaseq)
2914                 good = false;
2915         }
2916     }
2917 
2918     return good;
2919 }
2920 
MarkUnwantedLowSupportIntrons(TContained & pointers,const SMinScor & minscor,map<TSignedSeqRange,int> & mrna_count,map<TSignedSeqRange,int> & est_count,map<TSignedSeqRange,int> & rnaseq_count)2921 void MarkUnwantedLowSupportIntrons(TContained& pointers, const SMinScor& minscor,
2922                                    map<TSignedSeqRange,int>& mrna_count, map<TSignedSeqRange,int>& est_count, map<TSignedSeqRange,int>& rnaseq_count) {
2923 
2924     NON_CONST_ITERATE(TContained, i, pointers)
2925         (*i)->m_marked_for_deletion = !GoodSupportForIntrons(*(*i)->m_align, minscor, mrna_count, est_count, rnaseq_count);
2926 }
2927 
2928 struct GModelOrder
2929 {
GModelOrderGModelOrder2930     GModelOrder(TOrigAligns& oa) : orig_aligns(oa) {}
2931 
2932     TOrigAligns& orig_aligns;
2933 
operator ()GModelOrder2934     bool operator()(const CGeneModel& a, const CGeneModel& b)
2935     {
2936         if(a.Limits() != b.Limits())
2937             return a.Limits() < b.Limits();
2938         else
2939             return *orig_aligns[a.ID()]->GetTargetId() < *orig_aligns[ b.ID()]->GetTargetId(); // to make sort deterministic
2940     }
2941 };
2942 
2943 
MakeChains(TGeneModelList & clust,bool coding_estimates_only)2944 TGeneModelList CChainer::CChainerImpl::MakeChains(TGeneModelList& clust, bool coding_estimates_only)
2945 {
2946     if(clust.empty()) return TGeneModelList();
2947 
2948     clust.sort(GModelOrder(orig_aligns));
2949 
2950     {
2951         map<tuple<int, int>, TGeneModelList::iterator> special_aligns; // [left/right flex|cap/polya, position]
2952         //all known flexible
2953         for(TGeneModelList::iterator it = clust.begin(); it != clust.end(); ++it) {
2954             if(it->Status()&CGeneModel::eLeftFlexible) {
2955                 int status = it->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eCap|CGeneModel::ePolyA);
2956                 special_aligns.emplace(make_tuple(status, it->Limits().GetTo()), it);
2957             }
2958             if(it->Status()&CGeneModel::eRightFlexible) {
2959                 int status = it->Status()&(CGeneModel::eRightFlexible|CGeneModel::eCap|CGeneModel::ePolyA);
2960                 special_aligns.emplace(make_tuple(status, it->Limits().GetFrom()), it);
2961             }
2962         }
2963         //make flexible from normal cap/polya
2964         int contig_len = m_gnomon->GetSeq().size();
2965         int spec_extend = SPECIAL_ALIGN_LEN-1;
2966         for(TGeneModelList::iterator it = clust.begin(); it != clust.end(); ++it) {
2967             if(it->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
2968                 continue;
2969 
2970             if(it->Status()&CGeneModel::eCap) {
2971                 it->Status() &= ~CGeneModel::eCap;
2972                 CGeneModel galign(it->Strand(), it->ID(), CGeneModel::eSR);
2973                 galign.SetWeight(it->Weight());
2974 
2975                 int pos;
2976                 int status = CGeneModel::eCap;
2977                 if(it->Strand() == ePlus) {
2978                     pos = it->Limits().GetFrom();
2979                     galign.AddExon(TSignedSeqRange(pos, pos+spec_extend));
2980                     status |= CGeneModel::eRightFlexible;
2981                 } else {
2982                     pos = it->Limits().GetTo();
2983                     galign.AddExon(TSignedSeqRange(pos-spec_extend, pos));
2984                     status |= CGeneModel::eLeftFlexible;
2985                 }
2986                 if(galign.Limits().GetFrom() >= 0 && galign.Limits().GetTo() < contig_len) {
2987                     galign.Status() |= status;
2988                     clust.push_front(galign);
2989                     auto rslt = special_aligns.emplace(make_tuple(status, pos), clust.begin());
2990                     if(!rslt.second) {  //this position already exists
2991                         auto ialign = rslt.first->second;
2992                         ialign->SetWeight(ialign->Weight()+galign.Weight());
2993                         clust.pop_front();
2994                     }
2995                 }
2996             }
2997             if(it->Status()&CGeneModel::ePolyA) {
2998                 it->Status() &= ~CGeneModel::ePolyA;
2999                 CGeneModel galign(it->Strand(), it->ID(), CGeneModel::eSR);
3000                 galign.SetWeight(it->Weight());
3001 
3002                 int pos;
3003                 int status = CGeneModel::ePolyA;
3004                 if(it->Strand() == eMinus) {
3005                     pos = it->Limits().GetFrom();
3006                     galign.AddExon(TSignedSeqRange(pos, pos+spec_extend));
3007                     status |= CGeneModel::eRightFlexible;
3008                 } else {
3009                     pos = it->Limits().GetTo();
3010                     galign.AddExon(TSignedSeqRange(pos-spec_extend, pos));
3011                     status |= CGeneModel::eLeftFlexible;
3012                 }
3013                 if(galign.Limits().GetFrom() >= 0 && galign.Limits().GetTo() < contig_len) {
3014                     galign.Status() |= status;
3015                     clust.push_front(galign);
3016                     auto rslt = special_aligns.emplace(make_tuple(status, pos), clust.begin());
3017                     if(!rslt.second) {  //this position already exists
3018                         auto ialign = rslt.first->second;
3019                         ialign->SetWeight(ialign->Weight()+galign.Weight());
3020                         clust.pop_front();
3021                     }
3022                 }
3023             }
3024         }
3025 
3026         //remove below threshold and crossing contig boundaries
3027         for(auto& sa : special_aligns) {
3028             auto ialign = sa.second;
3029             double min_pos_weight = ((ialign->Status()&CGeneModel::eCap) ? min_cap_weight : min_polya_weight);
3030             if(ialign->Limits().GetFrom() < 0 || ialign->Limits().GetTo() >= contig_len || ialign->Weight() < min_pos_weight)
3031                 clust.erase(ialign);
3032         }
3033 
3034         clust.sort(GModelOrder(orig_aligns));
3035     }
3036 
3037     confirmed_ends.clear();
3038     ITERATE (TGeneModelList, it, clust) {
3039         const CGeneModel& align = *it;
3040         if(use_confirmed_ends) {
3041             if(align.Status()&CGeneModel::eLeftConfirmed) {
3042                 auto rslt = confirmed_ends.emplace(align.Exons().front().GetTo(), align.Exons().front().GetFrom());
3043                 if(!rslt.second)
3044                     rslt.first->second = min(rslt.first->second, align.Exons().front().GetFrom());
3045             }
3046             if(align.Status()&CGeneModel::eRightConfirmed) {
3047                 auto rslt = confirmed_ends.emplace(align.Exons().back().GetFrom(), align.Exons().back().GetTo());
3048                 if(!rslt.second)
3049                     rslt.first->second = max(rslt.first->second, align.Exons().back().GetTo());
3050             }
3051         }
3052         all_frameshifts.insert(all_frameshifts.end(), align.FrameShifts().begin(), align.FrameShifts().end());
3053         for(int i = 1; i < (int)align.Exons().size(); ++i) {
3054             if(align.Exons()[i-1].m_ssplice && align.Exons()[i].m_fsplice) {
3055                 TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
3056 
3057                 if((align.Status()&CGeneModel::eUnknownOrientation) == 0) {
3058                     if(align.Strand() == ePlus)
3059                         oriented_introns_plus.insert(intron);
3060                     else
3061                         oriented_introns_minus.insert(intron);
3062                 }
3063 
3064                 if(align.Type() == CGeneModel::emRNA)
3065                     mrna_count[intron] += align.Weight();
3066                 else if(align.Type() == CGeneModel::eEST)
3067                     est_count[intron] += align.Weight();
3068                 else if(align.Type() == CGeneModel::eSR)
3069                     rnaseq_count[intron] += align.Weight();
3070             }
3071         }
3072     }
3073 
3074     has_rnaseq = !rnaseq_count.empty();
3075     sort(all_frameshifts.begin(),all_frameshifts.end());
3076     if(!all_frameshifts.empty())
3077         uniq(all_frameshifts);
3078 
3079     flex_len = 0;
3080     NON_CONST_ITERATE (TGeneModelList, it, clust) {
3081         CGeneModel& align = *it;
3082         if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3083             flex_len = max(flex_len, align.Limits().GetLength());
3084 
3085         if(align.Status()&CGeneModel::eUnknownOrientation) {
3086             int pluses = 0;
3087             int minuses = 0;
3088             for(int i = 1; i < (int)align.Exons().size(); ++i) {
3089                 if(align.Exons()[i-1].m_ssplice && align.Exons()[i].m_fsplice) {
3090                     TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
3091                     if(oriented_introns_plus.find(intron) != oriented_introns_plus.end())
3092                         ++pluses;
3093                     if(oriented_introns_minus.find(intron) != oriented_introns_minus.end())
3094                         ++minuses;
3095                 }
3096             }
3097             if(pluses > 0 && minuses == 0) {
3098                 align.Status() ^= CGeneModel::eUnknownOrientation;
3099                 if(align.Strand() == eMinus)
3100                     align.ReverseComplementModel();
3101             } else if(minuses > 0 && pluses == 0) {
3102                 align.Status() ^= CGeneModel::eUnknownOrientation;
3103                 if(align.Strand() == ePlus)
3104                     align.ReverseComplementModel();
3105             }
3106             align.Status() &= ~CGeneModel::eReversed;
3107         }
3108     }
3109 
3110 
3111     CChainMembers allpointers(clust, orig_aligns, unmodified_aligns);
3112 
3113     DuplicateNotOriented(allpointers, clust);
3114     ReplicatePStops(allpointers);
3115     ScoreCdnas(allpointers);
3116     Duplicate5pendsAndShortCDSes(allpointers);
3117     DuplicateUTRs(allpointers);
3118     CalculateSpliceWeights(allpointers);
3119     FindContainedAlignments(allpointers);
3120 
3121     TContained pointers;
3122     ITERATE(TContained, ip, allpointers) {
3123         _ASSERT((*ip)->m_orig_align);
3124         if(!(*ip)->m_not_for_chaining)
3125             pointers.push_back(*ip);
3126     }
3127 
3128     TContained coding_pointers;
3129     ITERATE(CChainMembers, i, pointers) {
3130         if(MemberIsCoding(*i))
3131             coding_pointers.push_back(*i);
3132     }
3133 
3134     LeftRight(coding_pointers);
3135     RightLeft(coding_pointers);
3136 
3137     TChainList tmp_chains;
3138 
3139     set<tuple<int,int,int>> coding_splices; // position, strand, donor/acceptor
3140 
3141     NON_CONST_ITERATE(TContained, i, coding_pointers) {
3142         SChainMember& mi = **i;
3143         mi.m_cds = mi.m_left_cds+mi.m_right_cds-mi.m_cds;
3144         mi.m_splice_num = mi.m_left_splice_num+mi.m_right_splice_num-mi.m_splice_num;
3145         mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
3146     }
3147     sort(coding_pointers.begin(),coding_pointers.end(),CdsNumOrder());
3148     NON_CONST_ITERATE(TContained, i, coding_pointers) {
3149         SChainMember& mi = **i;
3150 
3151         if(mi.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3152             continue;
3153 
3154         if(mi.m_included)
3155             continue;
3156 
3157         CChain chain(mi, 0, coding_estimates_only);
3158         TSignedSeqRange i_rf = chain.ReadingFrame();
3159 
3160         m_gnomon->GetScore(chain, coding_estimates_only);
3161         mi.MarkIncludedForChain();
3162         if(chain.Score() == BadScore())
3163             continue;
3164 
3165         if(coding_estimates_only) {
3166             if(chain.GetCdsInfo().ProtReadingFrame().NotEmpty() || chain.Score() > 30 || chain.FShiftedLen(chain.GetCdsInfo().Cds()) >= 300) {
3167                 chain.SetID(m_idnext);
3168                 chain.SetGeneID(m_idnext);
3169                 m_idnext += m_idinc;
3170                 tmp_chains.push_back(chain);
3171             }
3172 
3173             continue;
3174         } else {
3175             if(chain.Score() == BadScore() || chain.PStop(false))
3176                 continue;
3177 
3178             int cdslen = chain.FShiftedLen(chain.GetCdsInfo().Cds(),true);
3179             if(chain.GetCdsInfo().ProtReadingFrame().Empty() &&
3180                (cdslen < minscor.m_minlen || (chain.Score() < 2*minscor.m_min && cdslen <  2*minscor.m_cds_len)))
3181                 continue;
3182 
3183             TSignedSeqRange real_cds = chain.RealCdsLimits();
3184             for(int i = 1; i < (int)chain.Exons().size(); ++ i) {
3185                 int donor = chain.Exons()[i-1].GetTo();
3186                 if(Include(real_cds, donor))
3187                     coding_splices.emplace(donor, chain.Strand(), 0);
3188                 int acceptor = chain.Exons()[i].GetFrom();
3189                 if(Include(real_cds, acceptor))
3190                     coding_splices.emplace(acceptor, chain.Strand(), 1);
3191             }
3192 
3193             TSignedSeqRange n_rf = chain.ReadingFrame();
3194             if(!i_rf.IntersectingWith(n_rf))
3195                 continue;
3196             int a,b;
3197             if(n_rf.GetFrom() <= i_rf.GetFrom()) {
3198                 a = n_rf.GetFrom();
3199                 b = i_rf.GetTo();
3200             } else {
3201                 a = i_rf.GetFrom();
3202                 b = n_rf.GetTo();
3203             }
3204             if(chain.FShiftedLen(a,b,true)%3 != 0)
3205                 continue;
3206 
3207             mi.MarkUnwantedCopiesForChain(chain.RealCdsLimits());
3208         }
3209     }
3210 
3211     for(auto ip : pointers) {
3212         if(ip->m_align->Type()&CGeneModel::eSR)
3213             continue;
3214 
3215         TSignedSeqRange cds = ip->m_cds_info->Cds();
3216         int strand = ip->m_align->Strand();
3217         for(int i = 1; i < (int)ip->m_align->Exons().size(); ++ i) {
3218             int donor = ip->m_align->Exons()[i-1].GetTo();
3219             if(coding_splices.count(make_tuple(donor, ip->m_align->Strand(), 0)) && !Include(cds, donor)) {
3220                 if(ip->m_restricted_to_start && ((strand == ePlus && donor < cds.GetFrom()) || (strand == eMinus && donor > cds.GetTo())))
3221                     continue;
3222                 ip->m_marked_for_deletion = true;
3223                 break;
3224             }
3225             int acceptor = ip->m_align->Exons()[i].GetFrom();
3226             if(coding_splices.count(make_tuple(acceptor, ip->m_align->Strand(), 1)) && !Include(cds, acceptor)) {
3227                 if(ip->m_restricted_to_start && ((strand == ePlus && acceptor < cds.GetFrom()) || (strand == eMinus && acceptor > cds.GetTo())))
3228                     continue;
3229                 ip->m_marked_for_deletion = true;
3230                 break;
3231             }
3232         }
3233     }
3234 
3235     if(coding_estimates_only) {
3236         TGeneModelList chains;
3237         ITERATE(TChainList, it, tmp_chains) {
3238             chains.push_back(*it);
3239             CGeneModel& chain = chains.back();
3240             int introns = 0;
3241             int weight = 0;
3242             for(int i = 1; i < (int)chain.Exons().size(); ++i) {
3243                 if(chain.Exons()[i-1].m_ssplice && chain.Exons()[i].m_fsplice) {
3244                     TSignedSeqRange intron(chain.Exons()[i-1].Limits().GetTo(),chain.Exons()[i].Limits().GetFrom());
3245                     weight += rnaseq_count[intron];
3246                     ++introns;
3247                 }
3248             }
3249             for(int i = 1; i < (int)chain.Exons().size(); ++i) {
3250                 if(chain.Exons()[i-1].m_ssplice && chain.Exons()[i].m_fsplice) {
3251                     TSignedSeqRange intron(chain.Exons()[i-1].Limits().GetTo(),chain.Exons()[i].Limits().GetFrom());
3252                     if(rnaseq_count[intron] < weight/introns/5) {
3253                         chain.SetSplices(i-1, chain.Exons()[i-1].m_fsplice_sig, "WL"); // set weak link
3254                         chain.SetSplices(i, "WL", chain.Exons()[i].m_ssplice_sig);     // set weak link
3255                     }
3256                 }
3257             }
3258         }
3259 
3260         return chains;
3261     }
3262 
3263     pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsMarkedForDeletion),pointers.end());  // wrong orientaition/UTR/frames are removed
3264 
3265     LeftRight(pointers);
3266     RightLeft(pointers);
3267     NON_CONST_ITERATE(TContained, i, pointers) {
3268         SChainMember& mi = **i;
3269         mi.m_included = false;
3270         mi.m_cds = mi.m_left_cds+mi.m_right_cds-mi.m_cds;
3271         mi.m_splice_num = mi.m_left_splice_num+mi.m_right_splice_num-mi.m_splice_num;
3272         mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
3273     }
3274 
3275     sort(pointers.begin(),pointers.end(),CdsNumOrder());
3276 
3277     NON_CONST_ITERATE(TContained, i, pointers) {
3278         SChainMember& mi = **i;
3279 
3280         if(mi.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3281             continue;
3282 
3283         if(mi.m_included || mi.m_postponed) continue;
3284 
3285         CChain chain(mi);
3286         mi.MarkPostponedForChain();
3287 
3288         if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3289             continue;
3290         m_gnomon->GetScore(chain);
3291         if(chain.Score() == BadScore() || (chain.GetCdsInfo().Cds()&chain.m_supported_range).Empty())
3292             continue;
3293 
3294         chain.RemoveFshiftsFromUTRs();
3295         chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns);
3296         const CResidueVec& contig = m_gnomon->GetSeq();
3297         // alignments clipped below might not be in any chain; clipping may produce redundant chains
3298         chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak);
3299         chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage);
3300         chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold);
3301         if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3302             continue;
3303         m_gnomon->GetScore(chain, !no5pextension); // this will return CDS to best/longest depending on no5pextension
3304         chain.CheckSecondaryCapPolyAEnds();
3305 
3306         double ms = GoodCDNAScore(chain);
3307 
3308         bool has_trusted = chain.HasTrustedEvidence(orig_aligns);
3309 
3310         if(!has_trusted)
3311             RemovePoorCds(chain,ms);
3312         if(chain.Score() != BadScore() && (has_trusted || chain.RealCdsLen() >= minscor.m_minlen)) {
3313             mi.MarkIncludedForChain();
3314 
3315 #ifdef _DEBUG
3316             chain.AddComment("Link1 "+GetLinkedIdsForMember(mi));
3317 #endif
3318             chain.CalculateDropLimits();
3319             tmp_chains.push_back(chain);
3320             _ASSERT( chain.FShiftedLen(chain.GetCdsInfo().Start()+chain.ReadingFrame()+chain.GetCdsInfo().Stop(), false)%3==0 );
3321         }
3322     }
3323 
3324     TGeneModelList unma_aligns;
3325     CChainMembers unma_members;
3326     CreateChainsForPartialProteins(tmp_chains, pointers, unma_aligns, unma_members);
3327 
3328 
3329     pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsCoding),pointers.end());  // only noncoding left
3330 
3331     MarkUnwantedLowSupportIntrons(pointers, minscor, mrna_count, est_count, rnaseq_count);
3332     pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsMarkedForDeletion),pointers.end());  // low support introns removed
3333 
3334     // convert all flexible to left UTRs; copy contained flexible from right UTRs to left UTRs; remove right UTRs
3335     for(auto i : allpointers) {
3336         SChainMember& mi = *i;
3337         if(mi.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) {
3338             mi.m_type = eLeftUTR;
3339         } else if(mi.m_type == eLeftUTR) {
3340             if(mi.m_copy != nullptr) {
3341                 for(auto j : *mi.m_copy) {
3342                     if(j->m_type == eRightUTR && j->m_align->Strand() == mi.m_align->Strand()) {
3343                         for(auto jc : *j->m_contained) {
3344                             if(jc->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3345                                 mi.m_contained->push_back(jc);
3346                         }
3347                     }
3348                 }
3349             }
3350         }
3351     }
3352     pointers.erase(std::remove_if(pointers.begin(),pointers.end(),[](SChainMember* p){ return p->m_type == eRightUTR; }), pointers.end());
3353 
3354     LeftRight(pointers);
3355     RightLeft(pointers);
3356 
3357     ITERATE(TContained, i, pointers) {
3358         SChainMember& mi = **i;
3359         mi.m_splice_num = mi.m_left_splice_num+mi.m_right_splice_num-mi.m_splice_num;
3360         mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
3361         _ASSERT(mi.m_cds == 0);
3362     }
3363 
3364     sort(pointers.begin(),pointers.end(),CdsNumOrder());
3365 
3366     NON_CONST_ITERATE(TContained, i, pointers) {
3367         SChainMember& mi = **i;
3368         if(mi.m_included)
3369             continue;
3370 
3371         if(mi.m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3372             continue;
3373 
3374         CChain chain(mi);
3375         if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3376             continue;
3377 
3378         chain.RemoveFshiftsFromUTRs();
3379         mi.MarkIncludedForChain();
3380         const CResidueVec& contig = m_gnomon->GetSeq();
3381         chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak);
3382         chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage);
3383         chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold);
3384         if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3385             continue;
3386         if(chain.Continuous() && chain.Exons().size() > 1) {
3387 #ifdef _DEBUG
3388             chain.AddComment("Link2  "+GetLinkedIdsForMember(mi));
3389 #endif
3390             chain.CalculateDropLimits();
3391             tmp_chains.push_back(chain);
3392         }
3393     }
3394 
3395     NON_CONST_ITERATE(TChainList, it, tmp_chains) {
3396         CChain& chain = *it;
3397         chain.SetID(m_idnext);
3398         chain.SetGeneID(m_idnext);
3399         m_idnext += m_idinc;
3400     }
3401 
3402     CombineCompatibleChains(tmp_chains);
3403     SetFlagsForChains(tmp_chains);
3404 
3405     list<CGene> genes = FindGenes(tmp_chains);  // assigns geneid, rank, skip, nested
3406 
3407     if(genes.size() > 1) {
3408         TrimAlignmentsIncludedInDifferentGenes(genes);
3409         CombineCompatibleChains(tmp_chains);
3410         SetFlagsForChains(tmp_chains);
3411     }
3412 
3413     if(genes.size() > 1)
3414         FindGenes(tmp_chains);                      // redo genes after trim
3415 
3416 
3417     TGeneModelList chains;
3418     NON_CONST_ITERATE(TChainList, it, tmp_chains) {
3419         it->RestoreTrimmedEnds(trim);
3420         chains.push_back(*it);
3421     }
3422 
3423     enum { eFirstPeak = 1, eSecondPeak = 2, eThirdPeak = 4, eAs = 8};
3424     map<tuple<int, int, int>, int> cap_polya_info; // [cap/polya strand position]
3425     const CResidueVec& contig = m_gnomon->GetSeq();
3426     for(auto& chain : tmp_chains) {
3427         if(chain.Status()&CGeneModel::eSkipped)
3428             continue;
3429         if(chain.Status()&CGeneModel::eCap) {
3430             for(int i = 0; i < (int)chain.m_cap_peaks.size(); ++i) {
3431                 int pos = chain.m_cap_peaks[i];
3432                 if(pos >= 0)
3433                     cap_polya_info[make_tuple(CGeneModel::eCap, chain.Strand(), pos)] |= (1 << i);
3434             }
3435         }
3436         if(chain.Status()&CGeneModel::ePolyA) {
3437             for(int i = 0; i < (int)chain.m_polya_peaks.size(); ++i) {
3438                 int pos = chain.m_polya_peaks[i];
3439                 if(pos >= 0) {
3440                     cap_polya_info[make_tuple(CGeneModel::ePolyA, chain.Strand(), pos)] |= (1 << i);
3441                     if(chain.ValidPolyA(pos, contig).second)
3442                         cap_polya_info[make_tuple(CGeneModel::ePolyA, chain.Strand(), pos)] |= eAs;
3443                 }
3444             }
3445         }
3446     }
3447     for(auto& info : cap_polya_info) {
3448         string determinant = get<0>(info.first) == CGeneModel::eCap ? "Cap" : "PolyA";
3449         char strand = get<1>(info.first) == ePlus ? '+' : '-';
3450         int pos = m_edited_contig_map.MapEditedToOrig(get<2>(info.first))+m_limits.GetFrom()+1;
3451         cerr << m_contig_acc << ' ' << determinant << ' ' << strand << ' ' << pos << ' ';
3452         if(info.second&eFirstPeak)
3453             cerr << ":FirstPeak";
3454         if(info.second&eSecondPeak)
3455             cerr << ":SecondPeak";
3456         if(info.second&eThirdPeak)
3457             cerr << ":ThirdPeak";
3458         if(info.second&eAs)
3459             cerr << ":As";
3460         cerr << ":\n";
3461     }
3462 
3463 
3464     return chains;
3465 }
3466 
3467 struct AlignSeqOrder
3468 {
operator ()AlignSeqOrder3469     bool operator()(const CGeneModel* ap, const CGeneModel* bp)
3470     {
3471         if (ap->Limits().GetFrom() != bp->Limits().GetFrom()) return ap->Limits().GetFrom() < bp->Limits().GetFrom();
3472         if (ap->Limits().GetTo() != bp->Limits().GetTo()) return ap->Limits().GetTo() > bp->Limits().GetTo();
3473         return ap->ID() < bp->ID(); // to make sort deterministic
3474     }
3475 };
3476 
// Finds the best-scoring chain of members that connects all parts of a gapped protein alignment.
//
// pointers - candidate chain members. right_ends and no_gap_members below are indexed in parallel with it.
//            NOTE(review): lower_bound() on right_ends presumes that the order of 'pointers' yields
//            non-decreasing right ends (callers in this file sort with LeftOrderD) - confirm.
// parts    - the separate pieces of the protein alignment (the caller sorts them with AlignSeqOrder).
// palign   - the combined protein alignment; supplies the strand and the overall limits.
//
// For every member two states are tracked: the member itself (best connection, possibly through a gap
// between parts) and a temporary copy in no_gap_members (best connection without a gap).
// A gapped connection is charged PGAP_PENALTY.
//
// Returns the rightmost member of the best chain; the rest of the chain is reached through m_left_member.
// NOTE(review): if _ASSERT compiles to nothing and no member connects all parts, a null pointer is returned.
FindOptimalChainForProtein(TContained & pointers,vector<CGeneModel * > & parts,CGeneModel & palign)3477 SChainMember* CChainer::CChainerImpl::FindOptimalChainForProtein(TContained& pointers, vector<CGeneModel*>& parts, CGeneModel& palign) {
3478     //    Int8 id = parts.front()->ID();
3479 
3480     TIVec right_ends(pointers.size());
3481     vector<SChainMember> no_gap_members(pointers.size());   // temporary helper chain members; will be used for gap filling optimisation
3482     for(int k = 0; k < (int)pointers.size(); ++k) {
3483         SChainMember& mi = *pointers[k];
3484         right_ends[k] = mi.m_align->Limits().GetTo();
3485         no_gap_members[k] = mi;
3486     }
3487 
3488     SChainMember* best_right = 0;
3489 
         // first_member: walk from the last member to the left while members still reach leftpos;
         // leftpos is pushed left by every accepted member, the walk stops at the first member that ends before it
3490     int first_member = pointers.size()-1;
3491     int leftpos = palign.Limits().GetFrom();
3492     for(int i = pointers.size()-1; i >= 0; --i) {
3493         TSignedSeqRange limi = pointers[i]->m_align->Limits();
3494         if(limi.GetTo() >= leftpos) {
3495             first_member = i;
3496             leftpos = min(leftpos,limi.GetFrom());
3497         } else {
3498             break;
3499         }
3500     }
3501 
         // last_member: the last member whose limits include rightpos; rightpos is pushed right by every such member
3502     int last_member = 0;
3503     int rightpos = palign.Limits().GetTo();
3504     for(int i = 0; i < (int)pointers.size(); ++i) {
3505         TSignedSeqRange limi = pointers[i]->m_align->Limits();
3506         if(Include(limi,rightpos)) {
3507             last_member = i;
3508             rightpos = max(rightpos,limi.GetTo());
3509         }
3510     }
3511 
3512     int fully_connected_right = 0;     // rightmost point already connected to all parts
3513 
3514     for(int i = first_member; i <= last_member; ++i) {
3515         SChainMember& mi = *pointers[i];                   // best connection maybe gapped
3516         SChainMember& mi_no_gap = no_gap_members[i];       // best not gapped connection (if any)
3517         CGeneModel& ai = *mi.m_align;
3518         LRIinit(mi);
3519         LRIinit(mi_no_gap);
3520 
3521         if(ai.Strand() != palign.Strand())
3522             continue;
3523 
             // part_to_connect: index of the last part that starts strictly to the left of ai (-1 if there is none)
3524         int part_to_connect =  parts.size()-1;
3525         while(part_to_connect >= 0 && ai.Limits().GetFrom() <= parts[part_to_connect]->Limits().GetFrom())
3526             --part_to_connect;
3527 
3528         if(part_to_connect >=0 && ai.Limits().GetFrom() < parts[part_to_connect]->Limits().GetTo() && !parts[part_to_connect]->isCompatible(ai))  // overlaps with part but not compatible
3529             continue;
3530 
3531         if(fully_connected_right > 0 && ai.Limits().GetFrom() > fully_connected_right)    // can't possibly be connected
3532             continue;
3533 
3534         TContained micontained = mi.CollectContainedForMemeber();
3535         sort(micontained.begin(),micontained.end(),LeftOrderD());
3536 
             // check ai against the parts that start at or to the right of its left end:
             // fully included parts must agree in frame, stop and frameshifts; partially overlapping parts
             // must be compatible and have the same frameshifts in the overlap; stop at the first part not touched by ai
3537         bool compatible_with_included_parts = true;
3538         int last_included_part = -1;
3539         bool includes_first_part = false;
3540         for(int p = part_to_connect+1; p < (int)parts.size(); ++p) {
3541             if(Include(ai.Limits(),parts[p]->Limits())) {
3542                 TSignedSeqRange ai_rf = mi.m_cds_info->ReadingFrame();
3543                 TSignedSeqRange aj_rf = parts[p]->GetCdsInfo().ReadingFrame();
3544                 TSignedSeqRange ai_cds = mi.m_cds_info->Cds();
3545                 TSignedSeqRange aj_cds = parts[p]->GetCdsInfo().Cds();
                     // NOTE(review): the %3==1 test presumably means both CDS starts are in the same frame
                     // (length counted inclusive of both end points) - confirm against FShiftedLen
3546                 bool compatible = (parts[p]->isCompatible(ai) && Include(ai_rf,aj_rf) && mi.m_align_map->FShiftedLen(ai_cds.GetFrom(),aj_cds.GetFrom(),false)%3==1);
3547                 bool samestop = (parts[p]->GetCdsInfo().HasStop() ==  mi.m_cds_info->HasStop() && (!parts[p]->GetCdsInfo().HasStop() || parts[p]->GetCdsInfo().Stop() == mi.m_cds_info->Stop()));
3548                 bool samefshifts = (parts[p]->FrameShifts() == StrictlyContainedInDels(ai.FrameShifts(), parts[p]->Limits()));
3549                 if(compatible && samestop && samefshifts) {
3550                     last_included_part = p;
3551                     if(p == 0)
3552                         includes_first_part = true;
3553                 } else {
3554                     compatible_with_included_parts = false;
3555                     break;
3556                 }
3557             } else if(ai.Limits().IntersectingWith(parts[p]->Limits())) {
3558                 TSignedSeqRange overlap = (ai.Limits() & parts[p]->Limits());
3559                 if(!parts[p]->isCompatible(ai) || StrictlyContainedInDels(ai.FrameShifts(), overlap) !=  StrictlyContainedInDels(parts[p]->FrameShifts(), overlap)) {
3560                     compatible_with_included_parts = false;
3561                     break;
3562                 }
3563             } else {
3564                 break;
3565             }
3566         }
3567 
3568         if(!compatible_with_included_parts)
3569             continue;
3570 
3571         _ASSERT(part_to_connect < 0 || part_to_connect == (int)parts.size()-1 || mi.m_type == eCDS);   // coding if between parts
3572 
             // a member that includes the first part is connected to everything up to its last included part
             // without needing a left neighbour
3573         if(includes_first_part) {
3574             mi.m_fully_connected_to_part = last_included_part;
3575             mi_no_gap.m_fully_connected_to_part = last_included_part;
3576         }
3577 
             // first candidate left neighbour: first member whose right end is not left of the
             // connected part's right end (or of ai's left end when there is no part on the left)
3578         TIVec::iterator lb = lower_bound(right_ends.begin(),right_ends.end(),(part_to_connect >= 0 ? parts[part_to_connect]->Limits().GetTo() : ai.Limits().GetFrom()));
3579         int jfirst = 0;
3580         if(lb != right_ends.end())
3581             jfirst = lb-right_ends.begin(); // skip all on the left side
3582 
3583         for(int j = jfirst; j < i; ++j) {
3584             SChainMember& mj = *pointers[j];                   // best connection maybe gapped
3585             if(part_to_connect >= 0 && mj.m_fully_connected_to_part < part_to_connect)   // alignment is not connected to all previous parts
3586                 continue;
3587             CGeneModel& aj = *mj.m_align;
3588             if( ai.Strand() != aj.Strand())
3589                 continue;
3590 
3591             SChainMember& mj_no_gap = no_gap_members[j];       // best not gapped connection (if any)
3592 
                 // case 1: aj ends before ai starts - connect through a gap, always from the ungapped state of j
3593             if(ai.Limits().GetFrom() > aj.Limits().GetTo() && part_to_connect >= 0 && part_to_connect < (int)parts.size()-1 &&       // gap is not closed
3594                mj_no_gap.m_fully_connected_to_part == part_to_connect &&                                                             // no additional gap
3595                mi.m_type == eCDS && mj.m_type == eCDS &&
3596                mj.m_cds_info->MaxCdsLimits().GetTo() == TSignedSeqRange::GetWholeTo() &&
3597                mi.m_cds_info->MaxCdsLimits().GetFrom() == TSignedSeqRange::GetWholeFrom()) {                                        // reading frame not interrupted
3598 
     // penalty subtracted from the cds score of a gapped connection; the macro stays defined after this function
3599 #define PGAP_PENALTY 120
3600 
3601                 int newcds = mj_no_gap.m_left_cds+mi.m_cds - PGAP_PENALTY;
3602                 double newnum = mj_no_gap.m_left_num+mi.m_num;
3603 
3604                 if(mi.m_left_member == 0 || newcds > mi.m_left_cds || (newcds == mi.m_left_cds && newnum > mi.m_left_num)) {
3605                     mi.m_left_cds = newcds;
3606                     mi.m_left_num = newnum;
3607                     mi.m_left_member = &mj_no_gap;                // points into the temporary vector; fixed up before return
3608                     mi.m_gapped_connection = true;
3609                     mi.m_fully_connected_to_part = part_to_connect;
3610                 }
                 // case 2: ai and aj overlap - try a continuous connection
3611             } else if(ai.Limits().IntersectingWith(aj.Limits())) {
3612                 int delta_cds;
3613                 double delta_num;
3614                 double delta_splice_num;
3615                 if(LRCanChainItoJ(delta_cds, delta_num, delta_splice_num, mi, mj, micontained)) {      // i and j connected continuously
3616                     int newcds = mj.m_left_cds+delta_cds;
3617                     double newnum = mj.m_left_num+delta_num;
3618                     double newsplicenum = mj.m_left_splice_num+delta_splice_num;
3619 
                         // preference order: cds score, then splice number (difference above 0.001), then number
3620                     bool better_connection = false;
3621                     if(newcds != mi.m_left_cds) {
3622                         better_connection = (newcds > mi.m_left_cds);
3623                     } else if(fabs(newsplicenum - mi.m_left_splice_num) > 0.001) {
3624                         better_connection = (newsplicenum > mi.m_left_splice_num);
3625                     } else if(newnum > mi.m_left_num) {
3626                         better_connection = true;
3627                     }
3628 
3629                     if (mi.m_left_member == 0 || better_connection) {
3630                         mi.m_left_cds = newcds;
3631                         mi.m_left_splice_num = newsplicenum;
3632                         mi.m_left_num = newnum;
3633                         mi.m_gapped_connection = mj.m_gapped_connection;
3634                         mi.m_left_member = &mj;
3635                         mi.m_fully_connected_to_part = part_to_connect;
3636                         if(!mi.m_gapped_connection)
3637                             mi_no_gap = mi;
3638                     } else if(mj_no_gap.m_fully_connected_to_part == part_to_connect) {
                             // not better for mi; see whether the ungapped state of j improves the ungapped state of i
3639                         newcds = mj_no_gap.m_left_cds+delta_cds;
3640                         newnum = mj_no_gap.m_left_num+delta_num;
3641                         newsplicenum = mj_no_gap.m_left_splice_num+delta_splice_num;
3642 
3643                         better_connection = false;
3644                         if(newcds != mi_no_gap.m_left_cds) {
3645                             better_connection = (newcds > mi_no_gap.m_left_cds);
3646                         } else if(fabs(newsplicenum - mi_no_gap.m_left_splice_num) > 0.001) {
3647                             better_connection = (newsplicenum > mi_no_gap.m_left_splice_num);
3648                         } else if(newnum > mi_no_gap.m_left_num) {
3649                             better_connection = true;
3650                         }
3651 
3652                         if (mi_no_gap.m_left_member == 0 || better_connection) {
3653                             mi_no_gap.m_left_cds = newcds;
3654                             mi_no_gap.m_left_splice_num = newsplicenum;
3655                             mi_no_gap.m_left_num = newnum;
3656                             mi_no_gap.m_left_member = &mj_no_gap;
3657                             mi_no_gap.m_fully_connected_to_part = part_to_connect;
3658                         }
3659                     }
3660                 }
3661             }
3662         }
3663 
             // a connected member that itself includes parts advances to its last included part;
             // from here on the connection is treated as ungapped and becomes the ungapped state as well
3664         if(mi.m_left_member != 0 && last_included_part >= 0) {
3665             mi.m_fully_connected_to_part = last_included_part;
3666             mi.m_gapped_connection = false;
3667             mi_no_gap = mi;
3668         }
3669 
3670         if(mi.m_fully_connected_to_part == (int)parts.size()-1) {   // includes all parts
3671             fully_connected_right = max(fully_connected_right,mi.m_align->Limits().GetTo());
3672 
                 // keep the best complete chain end: larger m_left_cds, ties broken by larger m_left_num
3673             if(best_right == 0 || (mi.m_left_cds >  best_right->m_left_cds || (mi.m_left_cds ==  best_right->m_left_cds && mi.m_left_num >  best_right->m_left_num)) )
3674                 best_right = &mi;
3675         }
3676     }
3677 
3678     _ASSERT(best_right != 0);
3679 
3680     _ASSERT(best_right < &no_gap_members.front() || best_right > &no_gap_members.back());   // don't point to temporary vector
         // walk the chain to the left; wherever a link points into the temporary vector, copy that
         // temporary state over the real member with the same index and relink to the real member
3681     for (SChainMember* mp = best_right; mp != 0; mp = mp->m_left_member) {
3682         if(mp->m_left_member >= &no_gap_members.front() && mp->m_left_member <= &no_gap_members.back()) { // points to temporary vector
3683             SChainMember* p = pointers[mp->m_left_member-&no_gap_members.front()];
3684             *p = *mp->m_left_member;
3685             mp->m_left_member = p;
3686         }
3687     }
3688 
3689     return best_right;
3690 }
3691 
3692 struct AlignLenOrder
3693 {
AlignLenOrderAlignLenOrder3694     AlignLenOrder(TOrigAligns& oa) : orig_aligns(oa) {}
3695     TOrigAligns& orig_aligns;
3696 
operator ()AlignLenOrder3697     bool operator()(const vector<CGeneModel*>* ap, const vector<CGeneModel*>* bp)
3698     {
3699         const vector<CGeneModel*>& partsa = *ap;
3700         const vector<CGeneModel*>& partsb = *bp;
3701 
3702         int align_lena = 0;
3703         ITERATE(vector<CGeneModel*>, k, partsa)
3704             align_lena += (*k)->AlignLen();
3705 
3706         int align_lenb = 0;
3707         ITERATE(vector<CGeneModel*>, k, partsb)
3708             align_lenb += (*k)->AlignLen();
3709 
3710         if(align_lena != align_lenb) {
3711             return align_lena > align_lenb;
3712         } else {
3713             return *orig_aligns[partsa.front()->ID()]->GetTargetId() < *orig_aligns[partsb.front()->ID()]->GetTargetId(); // to make sort deterministic
3714         }
3715     }
3716 };
3717 
// Builds chains for proteins whose alignment consists of several separate parts (members sharing one
// alignment ID), connecting the parts through other members, and appends the accepted chains to 'chains'.
//
// chains       - existing chains; a protein that is already a sub-alignment of a continuous chain on the
//                same strand is skipped; new chains are appended at the end.
// pointers_all - all chain members; re-sorted here with LeftOrderD.
// unma_aligns  - receives the pieces cut from an unmodified alignment when it is used to bridge holes.
// unma_members - receives the chain members created for those pieces.
//
// NOTE(review): the result of FindOptimalChainForProtein is dereferenced without a null check; that
// function only guards it with _ASSERT.
CreateChainsForPartialProteins(TChainList & chains,TContained & pointers_all,TGeneModelList & unma_aligns,CChainMembers & unma_members)3718 void CChainer::CChainerImpl::CreateChainsForPartialProteins(TChainList& chains, TContained& pointers_all, TGeneModelList& unma_aligns, CChainMembers& unma_members) {
3719 
3720     sort(pointers_all.begin(),pointers_all.end(),LeftOrderD());
3721 
         // group protein members by alignment ID
3722     typedef map<Int8, vector<CGeneModel*> > TIdChainMembermap;
3723     TIdChainMembermap protein_parts;
3724     for(int k = 0; k < (int)pointers_all.size(); ++k) {
3725         SChainMember& mi = *pointers_all[k];
3726 
3727         if((mi.m_align->Type() & CGeneModel::eProt) && (mi.m_copy == 0 || mi.m_cds_info->HasStart())) {  // only prots with start can have copies
3728             protein_parts[mi.m_align->ID()].push_back(mi.m_align);
3729         }
3730     }
3731 
         // keep only proteins with more than one part; order parts along the sequence and proteins by total length
3732     vector<vector<CGeneModel*>*> gapped_sorted_protein_parts;
3733     NON_CONST_ITERATE(TIdChainMembermap, ip, protein_parts) {
3734         vector<CGeneModel*>& parts = ip->second;
3735         if(parts.size() > 1) {
3736             sort(parts.begin(),parts.end(),AlignSeqOrder());
3737             gapped_sorted_protein_parts.push_back(&parts);
3738         }
3739     }
3740     sort(gapped_sorted_protein_parts.begin(),gapped_sorted_protein_parts.end(),AlignLenOrder(orig_aligns));
3741 
3742     NON_CONST_ITERATE(vector<vector<CGeneModel*>*>, ip, gapped_sorted_protein_parts) {  // make chains starting from long proteins
3743         vector<CGeneModel*>& parts = **ip;
3744         Int8 id = parts.front()->ID();
3745 
             // palign: one model assembled from copies of all parts, each with its 5' CDS limit cleared
3746         CGeneModel palign(parts.front()->Strand(), id, CGeneModel::eProt);
3747         ITERATE(vector<CGeneModel*>, k, parts) {
3748             CGeneModel part = **k;
3749             CCDSInfo cds = part.GetCdsInfo();
3750             cds.Clear5PrimeCdsLimit();
3751             part.SetCdsInfo(cds);
3752             palign.Extend(part);
3753         }
3754         m_gnomon->GetScore(palign);
3755 
             // nothing to do if an existing continuous chain already contains this protein
3756         bool connected = false;
3757         NON_CONST_ITERATE(TChainList, k, chains) {
3758             if(k->Continuous() && palign.Strand() == k->Strand() && palign.IsSubAlignOf(*k)) {
3759                 connected = true;
3760 #ifdef _DEBUG
3761                 k->AddComment("Was connected "+orig_aligns[palign.ID()]->TargetAccession());
3762 #endif
3763                 break;
3764             }
3765         }
3766 
3767         if(connected)
3768             continue;
3769 
3770 
             // select the members allowed to take part in the connection; order of pointers_all is preserved
3771         TContained pointers;
3772         for(int k = 0; k < (int)pointers_all.size(); ++k) {
3773             SChainMember* mip = pointers_all[k];
3774 
3775             if(mip->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)) // skip flexible
3776                 continue;
3777 
3778             if((mip->m_type != eCDS || !Include(mip->m_cds_info->MaxCdsLimits(),mip->m_align->Limits())) && Include(palign.Limits(),mip->m_align->Limits())) // skip all not entirely coding inside protein alignment
3779                 continue;
3780 
3781             if(mip->m_align->Exons().front().m_ssplice_sig == "XX" && Include(palign.Limits(),mip->m_align->Exons().front().Limits())) // skip 3'/5' cdna gapfillers inside protein alignment
3782                 continue;
3783 
3784             if(mip->m_align->Exons().back().m_fsplice_sig == "XX" && Include(palign.Limits(),mip->m_align->Exons().back().Limits())) // skip 3'/5' cdna gapfillers inside protein alignment
3785                 continue;
3786 
3787             pointers.push_back(mip);
3788         }
3789 
3790         SChainMember* best_right = FindOptimalChainForProtein(pointers, parts, palign);
3791 
3792         best_right->m_right_member = 0;
3793         CChain chain(*best_right,&palign);
3794 
3795         if(unmodified_aligns.count(id)) {  // some unmodified alignments are deleted if they interfere with a gap
3796             CGeneModel unma = unmodified_aligns[id];
                 // remaining_holes: all unspliced exon junctions left in the chain;
                 // new_holes: those of them that lie strictly inside one spliced piece of the unmodified alignment
3797             vector<TSignedSeqRange> new_holes;
3798             vector<TSignedSeqRange> remaining_holes;
3799             for(int k = 1; k < (int)chain.Exons().size(); ++k) {
3800                 CModelExon exonl = chain.Exons()[k-1];
3801                 CModelExon exonr = chain.Exons()[k];
3802                 if(!(exonl.m_ssplice && exonr.m_fsplice)) {
3803                     TSignedSeqRange h(exonl.GetTo()+1,exonr.GetFrom()-1);
3804                     remaining_holes.push_back(h);
3805                     for(int piece_begin = 0; piece_begin < (int)unma.Exons().size(); ++piece_begin) {
3806                         int piece_end = piece_begin;
                             // NOTE(review): relies on the last exon having m_ssplice == false; otherwise
                             // piece_end reaches Exons().size() and the next line indexes past the end
3807                         for( ; piece_end < (int)unma.Exons().size() && unma.Exons()[piece_end].m_ssplice; ++piece_end);
3808                         if(unma.Exons()[piece_begin].GetFrom() < h.GetFrom() && unma.Exons()[piece_end].GetTo() > h.GetTo()) {
3809                             new_holes.push_back(h);
3810                             break;
3811                         }
3812                         piece_begin = piece_end;
3813                     }
3814                 }
3815             }
3816 
3817             if(!new_holes.empty()) {  // failed to connect all parts - try unsupported introns
3818                 CAlignMap umap = unma.GetAlignMap();
3819                 if(unma.Limits() != palign.Limits()) {
3820                     TSignedSeqRange lim = umap.ShrinkToRealPoints(palign.Limits(), true);
3821                     unma.Clip(lim,CGeneModel::eRemoveExons);
3822                 }
3823 
                     // holes already present in the (clipped) unmodified alignment
3824                 vector<TSignedSeqRange> existed_holes;
3825                 for(int k = 1; k < (int)unma.Exons().size(); ++k) {
3826                     CModelExon exonl = unma.Exons()[k-1];
3827                     CModelExon exonr = unma.Exons()[k];
3828                     if(!(exonl.m_ssplice && exonr.m_fsplice))
3829                         existed_holes.push_back(TSignedSeqRange(exonl.GetTo()+1,exonr.GetFrom()-1));
3830                 }
3831 
3832                 for(int k = 1; k < (int)palign.Exons().size(); ++k) {   // cut holes which were connected or existed
3833                     CModelExon exonl = palign.Exons()[k-1];
3834                     CModelExon exonr = palign.Exons()[k];
3835                     if(!(exonl.m_ssplice && exonr.m_fsplice)) {
3836                         TSignedSeqRange hole(exonl.GetTo()+1,exonr.GetFrom()-1);
                             // this 'connected' shadows the outer flag of the same name; here it means that
                             // no remaining chain hole lies inside this palign hole
3837                         bool connected = true;
3838                         ITERATE(vector<TSignedSeqRange>, h, remaining_holes) {
3839                             _ASSERT(Include(unma.Limits(), *h));
3840                             if(Include(hole, *h)) {
3841                                 connected = false;
3842                                 break;
3843                             }
3844                         }
3845 
3846                         bool existed = false;
3847                         ITERATE(vector<TSignedSeqRange>, h, existed_holes) {
3848                             if(Include(hole, *h)) {
3849                                 existed = true;
3850                                 break;
3851                             }
3852                         }
3853 
3854                         if(connected || existed) {
                                 // cut only when both edges of the hole survive ShrinkToRealPoints unchanged
3855                             TSignedSeqRange left = umap.ShrinkToRealPoints(TSignedSeqRange(unma.Limits().GetFrom(),hole.GetFrom()-1), true);
3856                             TSignedSeqRange right = umap.ShrinkToRealPoints(TSignedSeqRange(hole.GetTo()+1,unma.Limits().GetTo()), true);
3857                             if(left.GetTo()+1 == hole.GetFrom() && right.GetFrom()-1 == hole.GetTo())
3858                                 unma.CutExons(hole);
3859                         }
3860                     }
3861                 }
3862                 m_gnomon->GetScore(unma);
3863 
                     // split the unmodified alignment into parts and score each of them
3864                 TGeneModelList unmacl;
3865                 unmacl.push_back(unma);
3866                 CutParts(unmacl);
3867 
3868                 vector<CGeneModel*> unmaparts;
3869                 NON_CONST_ITERATE(TGeneModelList, im, unmacl) {
3870                     m_gnomon->GetScore(*im);
3871                     unmaparts.push_back(&(*im));
3872                 }
3873 
                     // make chain members for the new parts and fill their contained lists from the selected members
3874                 CChainMembers unmapointers(unmacl, orig_aligns, unmodified_aligns);
3875                 Duplicate5pendsAndShortCDSes(unmapointers);
3876                 sort(pointers.begin(),pointers.end(),GenomeOrderD());
3877                 ITERATE(TContained, ip, unmapointers) {
3878                     SChainMember& mi = **ip;
3879                     IncludeInContained(mi, mi);          // include self
3880                     ITERATE(TContained, jp, pointers) {
3881                         SChainMember& mj = **jp;
3882                         if(CanIncludeJinI(mi, mj))
3883                             IncludeInContained(mi, mj);
3884                     }
3885                 }
3886 
3887                 ITERATE(TContained, ip, unmapointers) {
3888                     _ASSERT((*ip)->m_orig_align);
3889                     (*ip)->m_mem_id = -(*ip)->m_mem_id;   // unique m_mem_id
3890                     pointers.push_back(*ip);
3891                 }
3892 
                     // redo the search with the new members added, using the unmodified alignment as the protein
3893                 sort(pointers.begin(),pointers.end(),LeftOrderD());
3894                 best_right = FindOptimalChainForProtein(pointers, unmaparts, unma);
3895                 ITERATE(TContained, jp, unmapointers) {  // add parts in case they were 'shadowed' by longer or identical alignment
3896                     SChainMember& mj = **jp;
3897                     bool present = false;
3898                     for(SChainMember* ip = best_right; ip != 0 && !present; ip = ip->m_left_member)
3899                         present = ip == &mj;
                         // not in the chain: if some chain member can include it, prepend it as the new chain head
3900                     for(SChainMember* ip = best_right; ip != 0 && !present; ip = ip->m_left_member) {
3901                         SChainMember& mi = *ip;
3902                         if(CanIncludeJinI(mi, mj)) {
3903                             mj.m_left_member = best_right;
3904                             best_right = &mj;
3905                             break;
3906                         }
3907                     }
3908                 }
3909                 chain = CChain(*best_right, &unma);
                     // hand the new alignments and their members over to the caller's containers
3910                 unma_aligns.splice(unma_aligns.end(), unmacl);
3911                 unma_members.SpliceFromOther(unmapointers);
3912             }
3913         }
3914 
             // finalize the chain; it is dropped if its ends cannot be confirmed or its score is bad
3915         if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3916             continue;
3917         m_gnomon->GetScore(chain);
3918         if(chain.Score() == BadScore())
3919             continue;
3920 
3921         chain.RemoveFshiftsFromUTRs();
3922         chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns);
3923         const CResidueVec& contig = m_gnomon->GetSeq();
3924         chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak);
3925         chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage);
3926         chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold);
             // ends are re-checked after clipping
3927         if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3928             continue;
3929         m_gnomon->GetScore(chain, !no5pextension); // this will return CDS to best/longest depending on no5pextension
3930         chain.CheckSecondaryCapPolyAEnds();
3931         chain.CalculateDropLimits();
3932         _ASSERT( chain.FShiftedLen(chain.GetCdsInfo().Start()+chain.ReadingFrame()+chain.GetCdsInfo().Stop(), false)%3==0 );
3933 
3934 #ifdef _DEBUG
3935         chain.AddComment("Connected "+orig_aligns[palign.ID()]->TargetAccession());
3936         chain.AddComment("LinkForGapped  "+GetLinkedIdsForMember(*best_right));
3937 #endif
3938         chains.push_back(chain);
3939     }
3940 }
3941 
SetFlagsForChains(TChainList & chains)3942 void CChainer::CChainerImpl::SetFlagsForChains(TChainList& chains) {
3943 
3944     int left = numeric_limits<int>::max();
3945     int right = 0;
3946     ITERATE(TOrigAligns, it, orig_aligns) {
3947         const CAlignModel& align = *it->second;
3948         left = min(left,align.Limits().GetFrom());
3949         right = max(right,align.Limits().GetTo());
3950     }
3951 
3952     int len = right-left+1;
3953 
3954     vector<int> prot_cov[2][3];
3955     prot_cov[0][0].resize(len,0);
3956     prot_cov[0][1].resize(len,0);
3957     prot_cov[0][2].resize(len,0);
3958     prot_cov[1][0].resize(len,0);
3959     prot_cov[1][1].resize(len,0);
3960     prot_cov[1][2].resize(len,0);
3961     ITERATE(TOrigAligns, it, orig_aligns) {
3962         const CAlignModel& align = *it->second;
3963         if(align.GetCdsInfo().ProtReadingFrame().NotEmpty()) {
3964             CAlignMap amap = align.GetAlignMap();
3965             int cdstr = amap.MapOrigToEdited(align.GetCdsInfo().Cds().GetFrom());
3966             for(int i = 0; i < (int)align.Exons().size(); ++i) {
3967                 TSignedSeqRange rf = (align.Exons()[i].Limits() & align.ReadingFrame());
3968                 if(rf.NotEmpty()) {
3969                     for(int j = rf.GetFrom(); j <= rf.GetTo(); ++j) {
3970                         int jtr = amap.MapOrigToEdited(j);
3971                         if(jtr >= 0)
3972                             ++prot_cov[align.Strand()][abs(cdstr-jtr)%3][j-left];
3973                     }
3974                 }
3975             }
3976         }
3977     }
3978 
3979     CScope scope(*CObjectManager::GetInstance());
3980     scope.AddDefaults();
3981 
3982     SMatrix matrix;
3983 
3984     const CResidueVec& contig = m_gnomon->GetSeq();
3985 
3986     NON_CONST_ITERATE(TChainList, it, chains) {
3987         CChain& chain = *it;
3988         //        chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns);
3989         chain.SetOpenForPartialyAlignedProteins(prot_complet);
3990         chain.SetConfirmedStartStopForCompleteProteins(prot_complet, minscor);
3991         chain.CollectTrustedmRNAsProts(orig_aligns, minscor, scope, matrix, contig);
3992         chain.SetBestPlacement(orig_aligns);
3993         chain.SetConsistentCoverage();
3994         if(chain.Continuous() && chain.Exons().size() > 1) {
3995             bool allcdnaintrons = true;
3996             int num = 0;
3997             for(int i = 1; i < (int)chain.Exons().size() && allcdnaintrons; ++i) {
3998                 if(chain.Exons()[i-1].m_ssplice_sig != "XX" && chain.Exons()[i].m_fsplice_sig != "XX") {
3999                     TSignedSeqRange intron(TSignedSeqRange(chain.Exons()[i-1].GetTo(),chain.Exons()[i].GetFrom()));
4000                     allcdnaintrons = (mrna_count[intron]+est_count[intron]+rnaseq_count[intron] > 0);
4001                     ++num;
4002                 }
4003             }
4004             if(allcdnaintrons && num >0)
4005                 chain.Status() |= CGeneModel::ecDNAIntrons;
4006         }
4007         if (chain.FullCds()) {
4008             chain.Status() |= CGeneModel::eFullSupCDS;
4009         }
4010 
4011         if(chain.GetCdsInfo().ProtReadingFrame().Empty() && chain.ReadingFrame().NotEmpty()) {  // coding chain without protein support
4012             int protcds = 0;
4013             int lrf_from_proteins = numeric_limits<int>::max();
4014             int rrf_from_proteins = 0;
4015             CAlignMap amap = chain.GetAlignMap();
4016             int cdstr = amap.MapOrigToEdited(chain.GetCdsInfo().Cds().GetFrom());
4017             for(int i = 0; i < (int)chain.Exons().size(); ++i) {
4018                 TSignedSeqRange rf = (chain.Exons()[i].Limits() & chain.ReadingFrame());
4019                 if(rf.NotEmpty()) {
4020                     for(int j = rf.GetFrom(); j <= rf.GetTo(); ++j) {
4021                         if(j < left || j > right)
4022                             continue;
4023 
4024                         int jtr = amap.MapOrigToEdited(j);
4025                         int frame = abs(cdstr-jtr)%3;
4026                         if(jtr >= 0 && prot_cov[chain.Strand()][frame][j-left] > 0) {
4027                             if(frame == 0)
4028                                 lrf_from_proteins = min(lrf_from_proteins,j);
4029                             if(frame == 2)
4030                                 rrf_from_proteins = max(rrf_from_proteins,j);
4031                             ++protcds;
4032                         }
4033                     }
4034                 }
4035             }
4036             if(protcds > 0.2*amap.FShiftedLen(chain.GetCdsInfo().Cds()) && rrf_from_proteins > lrf_from_proteins) {
4037                 CCDSInfo cds = chain.GetCdsInfo();
4038                 TSignedSeqRange reading_frame = cds.ReadingFrame();
4039                 cds.SetReadingFrame(reading_frame&TSignedSeqRange(lrf_from_proteins,rrf_from_proteins), true);
4040                 cds.SetReadingFrame(reading_frame);
4041                 chain.SetCdsInfo(cds);
4042                 chain.SetType(chain.Type()|CGeneModel::eProt);
4043 
4044 #ifdef _DEBUG
4045                 chain.AddComment("Added protsupport");
4046 #endif
4047             }
4048         }
4049     }
4050 }
4051 
4052 
CombineCompatibleChains(TChainList & chains)4053 void CChainer::CChainerImpl::CombineCompatibleChains(TChainList& chains) {
4054     for(TChainList::iterator itt = chains.begin(); itt != chains.end(); ++itt) {
4055         if(itt->Status()&CGeneModel::eSkipped)
4056             continue;
4057         CCDSInfo::TPStops istops = itt->GetCdsInfo().PStops();
4058         for(TChainList::iterator jt = chains.begin(); jt != chains.end();) {
4059             TChainList::iterator jtt = jt++;
4060             if(jtt->Status()&CGeneModel::eSkipped)
4061                 continue;
4062 
4063             if(itt != jtt && itt->Strand() == jtt->Strand() && jtt->IsSubAlignOf(*itt) && itt->ReadingFrame().Empty() == jtt->ReadingFrame().Empty()) {
4064                 if(itt->ReadingFrame().NotEmpty()) {
4065                     if(!Include(jtt->GetCdsInfo().MaxCdsLimits(), itt->GetCdsInfo().MaxCdsLimits()))
4066                         continue;
4067 
4068                     if(jtt->FrameShifts() != StrictlyContainedInDels(itt->FrameShifts(), jtt->Limits()))
4069                         continue;
4070 
4071                     if((itt->FShiftedLen(itt->GetCdsInfo().Cds().GetFrom(),jtt->GetCdsInfo().Cds().GetFrom(),false)-1)%3 != 0)
4072                         continue;
4073 
4074                     CCDSInfo::TPStops jstops = jtt->GetCdsInfo().PStops();
4075                     bool same_stops = true;
4076                     ITERATE(CCDSInfo::TPStops, istp, istops) {
4077                         if(Include(jtt->Limits(),*istp) && find(jstops.begin(), jstops.end(), *istp) == jstops.end()) {
4078                             same_stops = false;
4079                             break;
4080                         }
4081                     }
4082                     if(!same_stops)
4083                         continue;
4084                 }
4085 
4086                 TMemberPtrSet support;
4087                 ITERATE(TContained, i, itt->m_members) {
4088                     support.insert(*i);
4089                     if((*i)->m_copy != 0)
4090                         support.insert((*i)->m_copy->begin(),(*i)->m_copy->end());
4091                 }
4092                 ITERATE(TContained, i, jtt->m_members) {
4093                     if(support.insert(*i).second) {
4094                         itt->m_members.push_back(*i);
4095                         if((*i)->m_copy != 0)
4096                             support.insert((*i)->m_copy->begin(),(*i)->m_copy->end());
4097                     }
4098                 }
4099                 sort(itt->m_members.begin(),itt->m_members.end(),GenomeOrderD());
4100                 itt->CalculateSupportAndWeightFromMembers();
4101                 chains.erase(jtt);
4102             }
4103         }
4104     }
4105 }
4106 
GoodCDNAScore(const CGeneModel & algn)4107 double CChainer::CChainerImpl::GoodCDNAScore(const CGeneModel& algn)
4108 {
4109     if(algn.FShiftedLen(algn.GetCdsInfo().Cds(),true) >  minscor.m_cds_len)
4110         return 0.99*BadScore();
4111     if(((algn.Type()&CGeneModel::eProt)!=0 || algn.ConfirmedStart()) && algn.FShiftedLen(algn.GetCdsInfo().ProtReadingFrame(),true) > minscor.m_prot_cds_len) return 0.99*BadScore();
4112 
4113     int intron_left = 0, intron_internal = 0, intron_total =0;
4114     for(int i = 1; i < (int)algn.Exons().size(); ++i) {
4115         if(!algn.Exons()[i-1].m_ssplice || !algn.Exons()[i].m_fsplice) continue;
4116 
4117         ++intron_total;
4118         if(algn.Exons()[i].GetFrom()-1 < algn.RealCdsLimits().GetFrom()) ++intron_left;
4119         if(algn.Exons()[i-1].GetTo()+1 > algn.RealCdsLimits().GetFrom() && algn.Exons()[i].GetFrom()-1 < algn.RealCdsLimits().GetTo()) ++intron_internal;
4120     }
4121 
4122     int intron_3p, intron_5p;
4123     if(algn.Strand() == ePlus) {
4124         intron_5p = intron_left;
4125         intron_3p = intron_total -intron_5p - intron_internal;
4126     } else {
4127         intron_3p = intron_left;
4128         intron_5p = intron_total -intron_3p - intron_internal;
4129     }
4130 
4131     int cdslen = algn.RealCdsLen();
4132     int len = algn.AlignLen();
4133 
4134     //    return  max(0.,25+7*intron_5p+14*intron_3p-0.05*cdslen+0.005*len);
4135     return  max(0.,minscor.m_min+minscor.m_i5p_penalty*intron_5p+minscor.m_i3p_penalty*intron_3p-minscor.m_cds_bonus*cdslen+minscor.m_length_penalty*len);
4136 }
4137 
4138 
RemovePoorCds(CGeneModel & algn,double minscor)4139 void CChainer::CChainerImpl::RemovePoorCds(CGeneModel& algn, double minscor)
4140 {
4141     if (algn.Score() < minscor)
4142         algn.SetCdsInfo(CCDSInfo());
4143 }
4144 
// Width, in mRNA positions, of the sliding window over which CChain's constructor
// averages the raw coverage. The raw profile is shifted by SCAN_WINDOW/2, so an odd
// width keeps the window centred on the position being computed.
4145 #define SCAN_WINDOW 49            // must be an odd number
4146 
// Builds a chain from the members collected from mbr.
//   mbr               - member whose CollectContainedForChain() result becomes m_members
//   gapped_helper     - optional alignment (may be 0); when given, a copy is stored in
//                       m_gapped_helper_align and it is merged into the chain together
//                       with the parts built from the members
//   keep_all_evidence - forwarded to CalculateSupportAndWeightFromMembers()
// After the exon structure is assembled the constructor computes support/weight and
// fills m_coverage with a windowed average of the weights of eSR members along the
// chain's mRNA.
CChain(SChainMember & mbr,CGeneModel * gapped_helper,bool keep_all_evidence)4147 CChain::CChain(SChainMember& mbr, CGeneModel* gapped_helper, bool keep_all_evidence) : m_coverage_drop_left(-1), m_coverage_drop_right(-1), m_coverage_bump_left(-1), m_coverage_bump_right(-1), m_core_coverage(0), m_splice_weight(0), m_cap_peaks(3, -1), m_polya_peaks(3, -1)
4148 {
4149     m_members = mbr.CollectContainedForChain();
4150     _ASSERT(m_members.size()>0);
4151     sort(m_members.begin(),m_members.end(),GenomeOrderD());
4152
         // A list is used for the parts because pointers to its elements are stored in
         // extened_parts_and_gapped while more parts are still being appended.
4153     list<CGeneModel> extened_parts;
4154     vector<CGeneModel*> extened_parts_and_gapped;
4155     if(gapped_helper != 0) {
4156         extened_parts_and_gapped.push_back(gapped_helper);
4157         m_gapped_helper_align = *gapped_helper;
4158     }
4159     //limits extended by cap/polya info alignments without other support
4160     int left = numeric_limits<int>::max();
4161     int right = 0;
4162     ITERATE(TContained, i, m_members) {
4163         SChainMember* mi = *i;
4164         CGeneModel align = *mi->m_align;
             // Flexible members contribute only an end coordinate: eLeftFlexible ones may
             // push the right limit, eRightFlexible ones the left limit.
4165         if(align.Status()&CGeneModel::eLeftFlexible) {
4166             right = max(right, align.Limits().GetTo());
4167             continue;
4168         } else if(align.Status()&CGeneModel::eRightFlexible) {
4169             left = min(left, align.Limits().GetFrom());
4170             continue;
4171         }
             // the working copy carries the CDS info selected for this member
4172         align.SetCdsInfo(*mi->m_cds_info);
             // members are visited in sorted order: start a new part when the member does
             // not intersect the last part, otherwise grow the last part
4173         if(extened_parts.empty() || !align.Limits().IntersectingWith(extened_parts.back().Limits())) {
4174             extened_parts.push_back(align);
4175             _ASSERT(extened_parts.back().Continuous());
4176             extened_parts_and_gapped.push_back(&extened_parts.back());
4177         } else {
4178             extened_parts.back().Extend(align, false);
4179             _ASSERT(extened_parts.back().Continuous());
4180         }
4181     }
         // NOTE(review): front()/back() below (and extened_parts_and_gapped.front() further
         // down) assume at least one non-flexible member was seen - confirm callers guarantee it.
4182     if(left < extened_parts.front().Limits().GetFrom())
4183         extened_parts.front().ExtendLeft(extened_parts.front().Limits().GetFrom()-left);
4184     if(right > extened_parts.back().Limits().GetTo())
4185         extened_parts.back().ExtendRight(right-extened_parts.back().Limits().GetTo());
4186
4187     SetType(eChain);
         // strand is taken from the gapped helper when present, otherwise from the first part
4188     EStrand strand = extened_parts_and_gapped.front()->Strand();
4189     SetStrand(strand);
4190
         // merge the parts (and the gapped helper) into this model in AlignSeqOrder order
4191     sort(extened_parts_and_gapped.begin(),extened_parts_and_gapped.end(),AlignSeqOrder());
4192     ITERATE (vector<CGeneModel*>, it, extened_parts_and_gapped) {
4193         const CGeneModel& align = **it;
4194         Extend(align, false);
4195     }
4196
         // exon ends without a splice keep no splice signature
4197     NON_CONST_ITERATE(TExons, e, MyExons()) {
4198         if(!e->m_fsplice)
4199             e->m_fsplice_sig.clear();
4200         if(!e->m_ssplice)
4201             e->m_ssplice_sig.clear();
4202     }
4203
4204     m_supported_range = Limits();
4205
4206     CalculateSupportAndWeightFromMembers(keep_all_evidence);
4207
         // soft limits start out just outside the opposite chain end
4208     m_polya_cap_left_soft_limit = Limits().GetTo()+1;
4209     m_polya_cap_right_soft_limit = Limits().GetFrom()-1;
4210
         // Raw coverage on mRNA coordinates, shifted right by SCAN_WINDOW/2 and padded to
         // mrna_len+SCAN_WINDOW entries so the averaging window below never leaves the vector.
4211     CAlignMap amap = GetAlignMap();
4212     int mrna_len = amap.FShiftedLen(Limits());
4213     vector<double> coverage_raw(mrna_len+SCAN_WINDOW);
4214     ITERATE (TContained, it, m_members) {
4215         const CGeneModel& align = *(*it)->m_align;
4216         if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
4217             continue;
4218
4219         TSignedSeqRange overlap = Limits()&align.Limits();  // theoretically some ends could be outside (partially trimmed from other chain and combined)
             // only members of type eSR add their weight to the profile
4220         if(align.Type() == CGeneModel::eSR && overlap.NotEmpty()) {
4221             TSignedSeqRange on_mrna = amap.MapRangeOrigToEdited(overlap);        // for align partially in a hole will give the hole boundary
4222             for(int i = on_mrna.GetFrom(); i <= on_mrna.GetTo(); ++i)
4223                 coverage_raw[i+SCAN_WINDOW/2] += align.Weight();
4224         }
4225     }
4226
         // Sliding-window mean: cov holds the average of coverage_raw[i .. i+SCAN_WINDOW-1];
         // each step drops the leftmost entry of the window and adds the next one on the right.
4227     m_coverage.resize(mrna_len);
4228     double cov = 0;
4229     for(int i = 0; i < SCAN_WINDOW; ++i)
4230         cov += coverage_raw[i]/SCAN_WINDOW;
4231     for(int i = 0; i < mrna_len; ++i) {            // will decrease coverage in SCAN_WINDOW/2 end intervals
4232         m_coverage[i] = cov;
4233         cov -= coverage_raw[i]/SCAN_WINDOW;
4234         cov += coverage_raw[i+SCAN_WINDOW]/SCAN_WINDOW;
4235     }
4236 }
4237 
HasTrustedEvidence(TOrigAligns & orig_aligns) const4238 bool CChain::HasTrustedEvidence(TOrigAligns& orig_aligns) const {
4239     ITERATE (TContained, i, m_members) {
4240         const CGeneModel* align = (*i)->m_align;
4241         if(!align->TrustedProt().empty() || (!align->TrustedmRNA().empty() && (*i)->m_cds_info->ProtReadingFrame().NotEmpty())) {
4242             CAlignModel* orig_align = orig_aligns[align->ID()];
4243             if(align->AlignLen() > 0.5*orig_align->TargetLen())
4244                 return true;
4245         }
4246     }
4247 
4248     return false;
4249 }
4250 
SetBestPlacement(TOrigAligns & orig_aligns)4251 void CChain::SetBestPlacement(TOrigAligns& orig_aligns) {
4252 
4253     map<Int8,int> exonnum;
4254     ITERATE (TContained, it, m_members) {
4255         const CGeneModel& align = *(*it)->m_align;
4256 
4257         if(align.GetCdsInfo().ProtReadingFrame().NotEmpty() && (align.Status()&eBestPlacement) && ((*it)->m_copy == 0 || (*it)->m_cds_info->HasStart())) // best placed protein or projected mRNA
4258             exonnum[align.ID()] += align.Exons().size();
4259     }
4260 
4261     for(map<Int8,int>::iterator it = exonnum.begin(); it != exonnum.end(); ++it) {
4262         if(it->second >= (int)orig_aligns[it->first]->Exons().size()) {   // all exons are included in the chain
4263             Status() |= eBestPlacement;
4264             break;
4265         }
4266     }
4267 }
4268 
4269 struct SLinker
4270 {
SLinkerSLinker4271     SLinker() : m_member(0), m_value(0), m_matches(0), m_left(0), m_not_wanted(false), m_count(0), m_not_wanted_count(0), m_matches_count(0), m_connected(false) {}
operator <SLinker4272     bool operator<(const SLinker& sl) const {
4273         if(m_range != sl.m_range)
4274             return m_range < sl.m_range;
4275         else if(!m_member)
4276             return true;
4277         else if (!sl.m_member)
4278             return false;
4279         else
4280             return m_member->m_mem_id < sl.m_member->m_mem_id;  // to make sort deterministic
4281     }
4282 
4283     SChainMember* m_member;
4284     TSignedSeqRange m_range;
4285     TSignedSeqRange m_reading_frame;
4286     int m_value;
4287     int m_matches;
4288     SLinker* m_left;
4289     bool m_not_wanted;
4290     int m_count;
4291     int m_not_wanted_count;
4292     int m_matches_count;
4293     bool m_connected;
4294 };
4295 
4296 typedef vector<SLinker> TLinkers;
4297 
4298 struct RangeOrder {
operator ()RangeOrder4299     bool operator()(const TSignedSeqRange& a, const TSignedSeqRange& b) const {
4300         return Precede(a, b);
4301     }
4302 };
4303 typedef set<TSignedSeqRange,RangeOrder> TRangePrecedeSet;
4304 
// Recomputes the chain's support list, weight, splice weight, type bits and protein
// reading frame from m_members.
//
// Outline:
//   1. Every member is cut into "linkers" - stretches inside m_supported_range where the
//      member's intron structure agrees with the chain.
//   2. Linkers are marked not-wanted when introns of the corresponding original
//      alignment are missing from the chain or the alignment was changed by a filter.
//   3. When a gapped helper alignment is present, member-less linkers are added for its
//      continuous pieces and for the non-spliced exon junctions of the chain.
//   4. A dynamic programming pass over the sorted linkers finds the path from the left
//      to the right end of m_supported_range with the smallest total value, then the
//      smallest not-wanted value, then the largest number of matches. The alignments on
//      that path form the core support.
//   5. Unless keep_all_evidence is set, not-wanted alignments outside the core are
//      excluded; everything else is added to the support and its weight summed up.
//
//   keep_all_evidence - when true no member is excluded from the support
//                       (the default value is declared in the header, not visible here)
CalculateSupportAndWeightFromMembers(bool keep_all_evidence)4305 void CChain::CalculateSupportAndWeightFromMembers(bool keep_all_evidence) {
4306
         // ---- 1. build linkers for every member ----
4307     TLinkers linkers;
4308     ITERATE(TContained, i, m_members) {
4309         SChainMember* mi = *i;
4310         CGeneModel* ai = mi->m_align;
4311         _ASSERT(mi->m_orig_align);
             // matches: alignment length, scaled by identity (rounded) when identity is known
4312         int matches = ai->AlignLen();
4313         if(ai->Ident() > 0.)
4314             matches = ai->Ident()*matches+0.5;
4315         bool not_wanted = false;
4316         TSignedSeqRange alimits = ai->Limits();
4317
             // Right-flexible member: always not-wanted. It is clipped to the chain exon
             // containing its left end; if no exon contains it, the member is kept only when
             // it starts left of the chain (clipped to the first exon), otherwise skipped.
4318         if(ai->Status()&CGeneModel::eRightFlexible) {
4319             matches = 0;
4320             not_wanted = true;
4321             for(auto& exon : Exons()) {
4322                 if(Include(exon.Limits(), alimits.GetFrom())) {
4323                     alimits.SetTo(min(alimits.GetTo(), exon.Limits().GetTo()));
4324                     matches = alimits.GetLength();
4325                     break;
4326                 }
4327             }
4328             if(matches == 0) {
4329                 if(alimits.GetFrom() < Limits().GetFrom()) {
4330                     alimits.SetTo(min(alimits.GetTo(), Exons().front().Limits().GetTo()));
4331                     matches = alimits.GetLength();
4332                 } else {
4333                     continue;
4334                 }
4335             }
4336         }
             // Left-flexible member: mirror image of the case above, anchored at its right end.
4337         if(ai->Status()&CGeneModel::eLeftFlexible) {
4338             matches = 0;
4339             not_wanted = true;
4340             for(auto& exon : Exons()) {
4341                 if(Include(exon.Limits(), alimits.GetTo())) {
4342                     alimits.SetFrom(max(alimits.GetFrom(), exon.Limits().GetFrom()));
4343                     matches = alimits.GetLength();
4344                     break;
4345                 }
4346             }
4347             if(matches == 0) {
4348                 if(alimits.GetTo() > Limits().GetTo()) {
4349                     alimits.SetFrom(max(alimits.GetFrom(), Exons().back().Limits().GetFrom()));
4350                     matches = alimits.GetLength();
4351                 } else {
4352                     continue;
4353                 }
4354             }
4355         }
4356
             // Collect the ranges where member and chain disagree. Start with all chain
             // introns that intersect the member ...
4357         TRangePrecedeSet incompatible_ranges;
4358         for(int j = 1; j < (int)Exons().size(); ++j) {
4359             TSignedSeqRange intron(Exons()[j-1].GetTo()+1,Exons()[j].GetFrom()-1);
4360             if(intron.IntersectingWith(alimits))
4361                 incompatible_ranges.insert(incompatible_ranges.end(),intron);
4362         }
             // ... then go through the member's introns: one identical to a chain intron
             // cancels it out; any other is merged with all collected ranges it overlaps
             // and the union is stored instead.
4363         for(int j = 1; j < (int)ai->Exons().size(); ++j) {
4364             TSignedSeqRange intron(ai->Exons()[j-1].GetTo()+1,ai->Exons()[j].GetFrom()-1);
4365             if(intron.IntersectingWith(m_supported_range)) {
4366                 TRangePrecedeSet::iterator first = incompatible_ranges.lower_bound(TSignedSeqRange(intron.GetFrom(),intron.GetFrom()));
4367                 if(first != incompatible_ranges.end() && *first == intron) { // compatible intron
4368                     incompatible_ranges.erase(first);
4369                     continue;
4370                 }
4371
4372                 TRangePrecedeSet::iterator second = incompatible_ranges.upper_bound(TSignedSeqRange(intron.GetTo(),intron.GetTo()));
4373                 for(TRangePrecedeSet::iterator ir = first; ir != second; ) {
4374                     intron += *ir;
4375                     incompatible_ranges.erase(ir++);
4376                 }
4377                 incompatible_ranges.insert(second,intron);
4378             }
4379         }
4380
4381         if(!incompatible_ranges.empty())
4382             not_wanted = true;
4383
             // Usable span of the member inside the supported range; an incompatible range
             // reaching either end shortens the span and is dropped from the set.
4384         int left = (alimits&m_supported_range).GetFrom();
4385         if(!incompatible_ranges.empty() && incompatible_ranges.begin()->GetFrom() <= left) {
4386             left = incompatible_ranges.begin()->GetTo()+1;
4387             incompatible_ranges.erase(incompatible_ranges.begin());
4388         }
4389         int right = (alimits&m_supported_range).GetTo();
4390         if(!incompatible_ranges.empty()) {
4391             TRangePrecedeSet::iterator last = incompatible_ranges.end();
4392             if((--last)->GetTo() >= right) {
4393                 right = last->GetFrom()-1;
4394                 incompatible_ranges.erase(last);
4395             }
4396         }
             // Emit one linker per compatible stretch between the remaining incompatible ranges.
4397         while(left <= right) {
4398             SLinker sl;
4399             sl.m_not_wanted = not_wanted;
4400             sl.m_member = mi;
4401             sl.m_value = 1;
4402             if(ai->Status()&CGeneModel::eLeftFlexible)
4403                 sl.m_value = (ai->Limits().GetTo() == Limits().GetTo()) ? 1000 : 10000;     // remove from support if possible; keep exact end if needed
4404             if(ai->Status()&CGeneModel::eRightFlexible)
4405                 sl.m_value = (ai->Limits().GetFrom() == Limits().GetFrom()) ? 1000 : 10000; // remove from support if possible; keep exact end if needed
4406             sl.m_matches = matches;
4407             sl.m_range.SetFrom(left);
4408             if(!incompatible_ranges.empty()) {
4409                 sl.m_range.SetTo(incompatible_ranges.begin()->GetFrom()-1);
4410                 left = incompatible_ranges.begin()->GetTo()+1;
4411                 incompatible_ranges.erase(incompatible_ranges.begin());
4412             } else {
4413                 sl.m_range.SetTo(right);
4414                 left = right+1;
4415             }
4416             sl.m_reading_frame = ReadingFrame()&sl.m_range;
4417             linkers.push_back(sl);
4418         }
4419     }
4420
         // ---- 2. mark linkers whose source alignment is not fully represented ----
         // spliced introns of the chain, stored as (last exon base, first exon base) pairs
4421     set<TSignedSeqRange> chain_introns;
4422     for(int i = 1; i < (int)Exons().size(); ++i) {
4423         if(Exons()[i-1].m_ssplice && Exons()[i].m_fsplice)
4424             chain_introns.insert(TSignedSeqRange(Exons()[i-1].GetTo(),Exons()[i].GetFrom()));
4425     }
4426
         // the flag is re-derived below
4427     Status() &= ~CGeneModel::eChangedByFilter;
4428
4429     NON_CONST_ITERATE(TLinkers, l, linkers) {
4430         SLinker& sl = *l;
4431         SChainMember* mi = sl.m_member;
4432         CGeneModel& align = *mi->m_align;
4433         if(mi->m_unmd_align) {
                 // member has an m_unmd_align: all of its spliced introns must be in the chain
4434             CGeneModel& unma = *mi->m_unmd_align;
4435             bool all_introns_included = true;
4436             for(int i = 1; all_introns_included && i < (int)unma.Exons().size(); ++i) {
4437                 if(unma.Exons()[i-1].m_ssplice && unma.Exons()[i].m_fsplice)
4438                     all_introns_included = chain_introns.count(TSignedSeqRange(unma.Exons()[i-1].GetTo(),unma.Exons()[i].GetFrom()));
4439             }
4440             if(!all_introns_included) {   // protein intron was clipped and not restored or part is not in chain
4441                 sl.m_not_wanted = true;
4442                 if(align.ID() == m_gapped_helper_align.ID())
4443                     Status() |= CGeneModel::eChangedByFilter;
4444             }
4445         } else if(align.Status()&CGeneModel::eChangedByFilter) {  // for proteins could be restored
4446             sl.m_not_wanted = true;
4447         } else {
                 // otherwise compare against the spliced introns of the original alignment
4448             CAlignModel& orig_align = *mi->m_orig_align;
4449             bool all_introns_included = true;
4450             for(int i = 1; all_introns_included && i < (int)orig_align.Exons().size(); ++i) {
4451                 if(orig_align.Exons()[i-1].m_ssplice && orig_align.Exons()[i].m_fsplice)
4452                     all_introns_included = chain_introns.count(TSignedSeqRange(orig_align.Exons()[i-1].GetTo(),orig_align.Exons()[i].GetFrom()));
4453             }
4454             if(!all_introns_included) {   // intron was clipped by UTR clip or part is not in chain
4455                 sl.m_not_wanted = true;
4456                 if(align.Type()&eNotForChaining) // if TSA was clipped remove from support if possible
4457                     sl.m_value = 10000;
4458             }
4459         }
4460     }
4461
         // ---- 3. member-less linkers contributed by the gapped helper ----
         // These keep the default m_value of 0 and m_member of 0.
4462     if(m_gapped_helper_align.ID()) {
             // one linker per continuous piece of the helper (a piece ends at an exon
             // without m_ssplice), clipped to the supported range
4463         int left = m_gapped_helper_align.Limits().GetFrom();
4464         for(int i = 0; i < (int)m_gapped_helper_align.Exons().size(); ++i) {
4465             if(!m_gapped_helper_align.Exons()[i].m_ssplice) {
4466                 SLinker sl;
4467                 sl.m_range = TSignedSeqRange(left,m_gapped_helper_align.Exons()[i].GetTo())&m_supported_range;
4468                 sl.m_reading_frame = sl.m_range&ReadingFrame();
4469                 if(sl.m_range.NotEmpty())
4470                     linkers.push_back(sl);
4471
4472                 if(i+1 < (int)m_gapped_helper_align.Exons().size())
4473                     left = m_gapped_helper_align.Exons()[i+1].GetFrom();
4474             }
4475         }
4476
             // one linker bridging every non-spliced junction between chain exons
4477         for(int i = 1; i < (int)Exons().size(); ++i) {
4478             if(!Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice) {
4479                 SLinker sl;
4480                 sl.m_range = TSignedSeqRange(Exons()[i-1].GetTo(),Exons()[i].GetFrom());
4481                 sl.m_reading_frame = sl.m_range&ReadingFrame();
4482                 linkers.push_back(sl);
4483             }
4484         }
4485     }
4486
         // ---- 4. dynamic programming over the sorted linkers ----
         // A linker starting at the left end of the supported range seeds a path. Any other
         // linker takes the best connected predecessor that starts and ends before it and
         // overlaps or abuts it. "Best" means smallest m_count, then smallest
         // m_not_wanted_count, then largest m_matches_count.
4487     sort(linkers.begin(), linkers.end());
4488     for(int i = 0; i < (int)linkers.size(); ++i) {
4489         SLinker& sli = linkers[i];
4490         if(sli.m_range.GetFrom() == m_supported_range.GetFrom()) {
4491             sli.m_count = sli.m_value;
4492             sli.m_matches_count = sli.m_matches;
4493             if(sli.m_not_wanted)
4494                 sli.m_not_wanted_count = sli.m_value;
4495             sli.m_connected = true;
4496         } else {
4497             for(int j = i-1; j >= 0; --j) {
4498                 SLinker& slj = linkers[j];
4499                 if(slj.m_connected &&
4500                    slj.m_range.GetFrom() < sli.m_range.GetFrom() &&
4501                    slj.m_range.GetTo() < sli.m_range.GetTo() &&
4502                    slj.m_range.GetTo() >= sli.m_range.GetFrom()-1) {   //overlaps and extends and connected to the left end
4503
                         // a PStop sitting on the junction must lie within the reading frame
                         // part of at least one of the two linkers
4504                     bool divided_pstop = false;
4505                     for(int is = 0; is < (int)GetCdsInfo().PStops().size() && !divided_pstop; ++is) {
4506                         const TSignedSeqRange& s = GetCdsInfo().PStops()[is];
4507                         divided_pstop = (Include(s,slj.m_range.GetTo()) || Include(s,sli.m_range.GetFrom())) && !Include(slj.m_reading_frame,s) && !Include(sli.m_reading_frame,s);
4508                     }
4509                     if(divided_pstop)  // both alignments just touch the pstop without actually crossing it
4510                         continue;
4511
4512                     int new_count = slj.m_count + sli.m_value;
4513                     int new_matches_count = slj.m_matches_count + sli.m_matches;
4514                     int new_not_wanted_count = slj.m_not_wanted_count;
4515                     if(sli.m_not_wanted)
4516                         new_not_wanted_count += sli.m_value;
4517                     if(!sli.m_connected || new_count < sli.m_count || (new_count == sli.m_count && new_not_wanted_count < sli.m_not_wanted_count) ||
4518                         (new_count == sli.m_count && new_not_wanted_count == sli.m_not_wanted_count && new_matches_count > sli.m_matches_count)) {
4519                         sli.m_count = new_count;
4520                         sli.m_matches_count = new_matches_count;
4521                         sli.m_not_wanted_count = new_not_wanted_count;
4522                         sli.m_connected = true;
4523                         sli.m_left = &slj;
4524                     }
4525                 }
4526             }
4527         }
4528     }
         // best connected linker that reaches the right end of the supported range
4529     SLinker* best_right = 0;
4530     for(int i = 0; i < (int)linkers.size(); ++i) {
4531         SLinker& sli = linkers[i];
4532         if(sli.m_connected && sli.m_range.GetTo() == m_supported_range.GetTo()) {
4533             if(best_right == 0 || sli.m_count < best_right->m_count || (sli.m_count == best_right->m_count && sli.m_not_wanted_count < best_right->m_not_wanted_count) ||
4534                   (sli.m_count == best_right->m_count && sli.m_not_wanted_count == best_right->m_not_wanted_count && sli.m_matches_count >  best_right->m_matches_count))
4535                 best_right = &sli;
4536         }
4537     }
4538
4539     _ASSERT(best_right != 0);
4540
         // ---- 5. core support: alignments on the best path, plus the gapped helper ----
4541     set<Int8> sp_core;
4542     for(SLinker* l = best_right; l != 0; l = l->m_left) {
4543         if(l->m_member)
4544             sp_core.insert(l->m_member->m_align->ID());
4545     }
4546     if(m_gapped_helper_align.ID())
4547         sp_core.insert(m_gapped_helper_align.ID());
4548
         // Not-wanted alignments outside the core are dropped from the support; a
         // not-wanted alignment that had to stay in the core marks the chain eChangedByFilter.
4549     set<Int8> sp_not_wanted;
4550     if(!keep_all_evidence) {
4551         for(int i = 0; i < (int)linkers.size(); ++i) {
4552             SLinker& sli = linkers[i];
4553             if(sli.m_member && sli.m_not_wanted) {
4554                 if(!sp_core.count(sli.m_member->m_align->ID()))
4555                     sp_not_wanted.insert(sli.m_member->m_align->ID());
4556                 else
4557                     Status() |= CGeneModel::eChangedByFilter;
4558             }
4559         }
4560     }
4561
         // ---- 6. rebuild support, type bits, weights and protein reading frame ----
4562     double weight = 0;
4563     m_splice_weight = 0;
4564     set<Int8> sp;
4565     TSignedSeqRange protreadingframe;
4566     ReplaceSupport(CSupportInfoSet());
4567
4568     SetType(Type() & (~(eSR | eEST | emRNA | eProt | eNotForChaining)));
4569     ITERATE (TContained, it, m_members) {
4570         const CGeneModel& align = *(*it)->m_align;
4571         Int8 id = align.ID();
4572         if(!sp_not_wanted.count(id)) {
4573             SetType(Type() | (align.Type() & (eSR | eEST | emRNA | eProt | eNotForChaining)));
4574             protreadingframe += align.GetCdsInfo().ProtReadingFrame();
4575             m_splice_weight += (*it)->m_splice_weight;
4576             if(sp.insert(id).second) {   // avoid counting parts of split aligns
4577                 weight += align.Weight();
4578                 AddSupport(CSupportInfo(id,sp_core.count(id)));
4579             }
4580         }
4581     }
4582
4583
4584
         // ---- 7. store the protein-supported part of the reading frame in the CDS info ----
         // The reading frame is first set to the protein-supported part with the flag on,
         // then reset to the full reading frame with the flag off.
4585     CCDSInfo cds = GetCdsInfo();
4586     TSignedSeqRange readingframe = cds.ReadingFrame();
4587     protreadingframe &= readingframe;
4588     cds.SetReadingFrame(protreadingframe, true);
4589     cds.SetReadingFrame(readingframe, false);
4590
         // NOTE(review): the mapped cds_info below is never used afterwards; the block may
         // only serve as a check that the CDS can be mapped onto the mRNA - confirm before removing.
4591     {
4592         CAlignMap mrnamap(Exons(),FrameShifts(),Strand());
4593         CCDSInfo cds_info = cds;
4594         if(cds_info.IsMappedToGenome())
4595             cds_info = cds_info.MapFromOrigToEdited(mrnamap);
4596     }
4597
4598     SetCdsInfo(cds);
4599
4600     SetWeight(weight);
4601 }
4602 
RestoreTrimmedEnds(int trim)4603 void CChain::RestoreTrimmedEnds(int trim)
4604 {
4605     // add back trimmed off UTRs
4606 
4607     if(((Status()&eLeftConfirmed) == 0) && (!OpenLeftEnd() || ReadingFrame().Empty()) && (Strand() == ePlus || (Status()&ePolyA) == 0) && (Strand() == eMinus || (Status()&eCap) == 0)) {
4608         for(int ia = 0; ia < (int)m_members.size(); ++ia)  {
4609             const CGeneModel a = *m_members[ia]->m_align;
4610             if((a.Type() & eProt)==0 && (a.Status() & CGeneModel::eLeftTrimmed)!=0 &&
4611                a.Exons().size() > 1 && Exons().front().Limits().GetFrom() == a.Limits().GetFrom()) {
4612                 ExtendLeft( trim );
4613                 break;
4614             }
4615         }
4616     }
4617 
4618     if(((Status()&eRightConfirmed) == 0) && (!OpenRightEnd() || ReadingFrame().Empty()) && (Strand() == eMinus || (Status()&ePolyA) == 0) && (Strand() == ePlus || (Status()&eCap) == 0)) {
4619         for(int ia = 0; ia < (int)m_members.size(); ++ia)  {
4620             const CGeneModel a = *m_members[ia]->m_align;
4621             if((a.Type() & eProt)==0 && (a.Status() & CGeneModel::eRightTrimmed)!=0 &&
4622                a.Exons().size() > 1 && Exons().back().Limits().GetTo() == a.Limits().GetTo()) {
4623                 ExtendRight( trim );
4624                 break;
4625             }
4626         }
4627     }
4628 }
4629 
SetOpenForPartialyAlignedProteins(map<string,pair<bool,bool>> & prot_complet)4630 void CChain::SetOpenForPartialyAlignedProteins(map<string, pair<bool,bool> >& prot_complet) {
4631     if(ConfirmedStart() || !HasStart() || !HasStop() || OpenCds() || !Open5primeEnd() || (Type()&CGeneModel::eProt) == 0)
4632         return;
4633 
4634     bool found_length_match = false;
4635     ITERATE (TContained, it, m_members) {
4636         CAlignModel* orig_align = (*it)->m_orig_align;
4637         _ASSERT(orig_align);
4638         if((orig_align->Type() & CGeneModel::eProt) == 0 || orig_align->TargetLen() == 0)   // not a protein or not known length
4639             continue;
4640 
4641         string accession = orig_align->TargetAccession();
4642         map<string, pair<bool,bool> >::iterator iter = prot_complet.find(accession);
4643         _ASSERT(iter != prot_complet.end());
4644         if(iter == prot_complet.end() || !iter->second.first || !iter->second.second) // unknown or partial protein
4645             continue;
4646 
4647         if(orig_align->TargetLen()*0.8 < RealCdsLen()) {
4648             found_length_match = true;
4649             break;
4650         }
4651     }
4652 
4653     if(!found_length_match) {
4654         CCDSInfo cds_info = GetCdsInfo();
4655         cds_info.SetScore(Score(), true);
4656         SetCdsInfo(cds_info);
4657     }
4658 
4659     return;
4660 }
4661 
// Tries to re-establish a confirmed start codon for a coding chain.
//
// Candidate starts are taken first from 'orig_aligns' (same strand, confirmed
// start, trusted protein or trusted mRNA, best placement) and, if none of those
// qualifies, from the original alignments of the chain members. Among the
// candidates the most upstream one is kept. If the part of the protein reading
// frame that would be left upstream of the chosen start is short enough, the
// CDS info is rebuilt around that start, holes in the resulting UTRs are
// clipped away, the chain is rescored and frameshifts in UTRs are removed.
//
// gnomon      - engine used to rescore the chain after the CDS was changed
// orig_aligns - original alignments examined in the first pass
void CChain::RestoreReasonableConfirmedStart(const CGnomonEngine& gnomon, TOrigAligns& orig_aligns)
{
    //    if(ReadingFrame().Empty() || ConfirmedStart())
    if(ReadingFrame().Empty())
        return;

    TSignedSeqRange conf_start;   // selected start codon (genomic range); empty while nothing is selected
    TSignedSeqPos rf=0;           // genomic position of the reading frame end adjacent to the selected start
    bool trusted = false;         // set when the start comes from the first (trusted alignments) pass; passed to GetScore()

    CAlignMap amap = GetAlignMap();
    ITERATE(TOrigAligns, it, orig_aligns) {
        const CAlignModel& align = *it->second;
        // only same-strand, best-placement alignments with a confirmed start and a trusted protein/mRNA
        if(align.Strand() != Strand() || !align.ConfirmedStart() || (align.TrustedProt().empty() && align.TrustedmRNA().empty()) || !(align.Status()&CGeneModel::eBestPlacement))
            continue;

        TSignedSeqRange start = align.GetCdsInfo().Start();

        // both ends of the start must map onto the chain transcript exactly 2 bases apart
        int a = amap.MapOrigToEdited(start.GetFrom());
        int b = amap.MapOrigToEdited(start.GetTo());
        if(a < 0 || b < 0 || abs(a-b) != 2)
            continue;

        // the start must lie within the maximal CDS limits and satisfy the length check relative to the current CDS
        int l = GetCdsInfo().Cds().GetFrom();
        int r = start.GetFrom();
        if(l > r)
            swap(l,r);
        if(!Include(GetCdsInfo().MaxCdsLimits(),start) || amap.FShiftedLen(l,r)%3 != 1)
            continue;

        // introns of the alignment which are inside the start range
        list<TSignedSeqRange> align_introns;
        for(int i = 1; i < (int)align.Exons().size(); ++i) {
            TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
            if(Include(start,intron))
                align_introns.push_back(intron);
        }

        // introns of the chain which are inside the start range; 'hole' flags a non-spliced gap among them
        list<TSignedSeqRange> introns;
        bool hole = false;
        int len = start.GetLength();
        for(int i = 1; i < (int)Exons().size(); ++i) {
            TSignedSeqRange intron(Exons()[i-1].Limits().GetTo(),Exons()[i].Limits().GetFrom());
            if(Include(start,intron)) {
                introns.push_back(intron);
                // NOTE(review): 'intron' spans from the last base of one exon to the first base of the next,
                // so GetLength() already counts the two flanking exon bases; subtracting GetLength()+2 appears
                // to leave len != 3 for every start split by an intron - confirm whether -2 was intended
                len -= intron.GetLength()+2;
                if(!Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice)
                    hole = true;
            }
        }

        // the start must be 3 bases long and be spliced in the chain the same way as in the alignment
        if(len !=3 || hole || align_introns != introns)
            continue;

        if(Strand() == ePlus) {
            if(conf_start.Empty() || start.GetFrom() < conf_start.GetFrom()) {   // keep the leftmost (most upstream) start
                // reading frame begins right after the start or, if the start ends an exon, at the next exon
                bool found = false;
                for(int i = 0; i < (int)Exons().size() && !found; ++i) {
                    if(Include(Exons()[i].Limits(),start.GetTo())) {
                        if(Exons()[i].Limits().GetTo() > start.GetTo()) {
                            rf = start.GetTo()+1;
                            found = true;
                        } else if(i != (int)Exons().size()-1) {
                            rf = Exons()[i+1].Limits().GetFrom();
                            found = true;
                        }
                    }
                }

                // accept only if more than 75 bases remain from rf to the CDS end
                if(found && amap.FShiftedLen(rf,GetCdsInfo().Cds().GetTo()) > 75) {
                    conf_start = start;
                    trusted = true;
                }
            }
        } else {
            if(conf_start.Empty() || start.GetTo() > conf_start.GetTo()) {       // keep the rightmost (most upstream on minus strand) start
                // reading frame ends right before the start or, if the start begins an exon, at the previous exon
                bool found = false;
                for(int i = 0; i < (int)Exons().size() && !found; ++i) {
                    if(Include(Exons()[i].Limits(),start.GetFrom())) {
                        if(Exons()[i].Limits().GetFrom() < start.GetFrom()) {
                            rf = start.GetFrom()-1;
                            found = true;
                        } else if(i != 0) {
                            rf = Exons()[i-1].Limits().GetTo();
                            found = true;
                        }
                    }
                }

                // accept only if more than 75 bases remain from the CDS beginning to rf
                if(found && amap.FShiftedLen(GetCdsInfo().Cds().GetFrom(),rf) > 75) {
                    conf_start = start;
                    trusted = true;
                }
            }
        }
    }


    // second pass: nothing found above, look at the original alignments of the chain members
    if(conf_start.Empty()) {
        ITERATE (TContained, it, m_members) {
            CAlignModel* orig_align = (*it)->m_orig_align;
            _ASSERT(orig_align);

            if(orig_align->ConfirmedStart() && Include((*it)->m_align->Limits(),orig_align->GetCdsInfo().Start())) {    // right part of orig is included
                TSignedSeqRange start = orig_align->GetCdsInfo().Start();
                int l = GetCdsInfo().Cds().GetFrom();
                int r = start.GetFrom();
                if(l > r)
                    swap(l,r);
                if(!Include(GetCdsInfo().MaxCdsLimits(),start) || amap.FShiftedLen(l,r)%3 != 1) // orig_align could be dropped because it was modified and has frameshifts between its start and 'best' start
                    continue;

                if(Strand() == ePlus) {
                    if(conf_start.Empty() || start.GetFrom() < conf_start.GetFrom()) {
                        conf_start = start;
                        rf = orig_align->ReadingFrame().GetFrom();
                    }
                } else {
                    if(conf_start.Empty() || start.GetTo() > conf_start.GetTo()) {
                        conf_start = start;
                        rf = orig_align->ReadingFrame().GetTo();
                    }
                }
            }
        }
    }


    if(conf_start.NotEmpty()) {
        // part of the protein reading frame which would be left upstream of the new start
        TSignedSeqRange extra_cds;
        CCDSInfo cds = GetCdsInfo();
        if(cds.ProtReadingFrame().NotEmpty()) {
            if(Strand() == ePlus && cds.ProtReadingFrame().GetFrom() < conf_start.GetFrom())
                extra_cds = TSignedSeqRange(cds.ProtReadingFrame().GetFrom(), conf_start.GetFrom());
            else if(Strand() == eMinus && cds.ProtReadingFrame().GetTo() > conf_start.GetTo())
                extra_cds = TSignedSeqRange(conf_start.GetTo(), cds.ProtReadingFrame().GetTo());
        }
        // proceed only if that part is absent or shorter than 20% of the real CDS length
        if(extra_cds.Empty() || FShiftedLen(extra_cds) < 0.2*RealCdsLen()) {
            // reading frame now begins at rf
            TSignedSeqRange reading_frame = cds.ReadingFrame();
            if(Strand() == ePlus)
                reading_frame.SetFrom(rf);
            else
                reading_frame.SetTo(rf);
            // remember the parts of the old CDS info which are carried over, then rebuild it
            TSignedSeqRange protreadingframe = cds.ProtReadingFrame();
            TSignedSeqRange stop = cds.Stop();
            bool confirmed_stop = cds.ConfirmedStop();
            CCDSInfo::TPStops pstops = cds.PStops();
            cds.Clear();

            if(protreadingframe.NotEmpty())
                cds.SetReadingFrame(reading_frame&protreadingframe, true);
            cds.SetReadingFrame(reading_frame);
            cds.SetStart(conf_start,true);
            if(stop.NotEmpty())
                cds.SetStop(stop,confirmed_stop);
            // keep only the premature stops which are inside the new reading frame
            ITERATE(CCDSInfo::TPStops, s, pstops) {
                if(Include(reading_frame, *s))
                    cds.AddPStop(*s);
            }
            SetCdsInfo(cds);

            // find non-spliced gaps (holes) on either side of the reading frame and clip the chain at them
            TSignedSeqRange new_lim = Limits();
            for(int i = 1; i < (int)Exons().size(); ++i) {
                if(!Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice) {
                    TSignedSeqRange hole(Exons()[i-1].GetTo(),Exons()[i].GetFrom());
                    if(Precede(hole,reading_frame)) {
                        new_lim.SetFrom(hole.GetTo());
                    } else if(Precede(reading_frame,hole)) {
                        new_lim.SetTo(hole.GetFrom());
                        break;
                    }
                }
            }
            if(new_lim != Limits())
                ClipChain(new_lim);   // remove holes from new UTRs

            gnomon.GetScore(*this, false, trusted);
            RemoveFshiftsFromUTRs();
            AddComment("Restored confirmed start");
        }
    }
}
4843 
RemoveFshiftsFromUTRs()4844 void CChain::RemoveFshiftsFromUTRs()
4845 {
4846     TInDels fs;
4847     ITERATE(TInDels, i, FrameShifts()) {   // removing fshifts in UTRs
4848         TSignedSeqRange cds = GetCdsInfo().Cds();
4849         if(OpenCds())
4850             cds = MaxCdsLimits();
4851         if(Include(cds,i->Loc()))
4852             fs.push_back(*i);
4853     }
4854     if(FrameShifts().size() != fs.size()) {
4855         FrameShifts() = fs;
4856         int mrna_len = AlignLen();
4857         m_coverage.resize(mrna_len, m_coverage.back());   // this will slightly shift values compared to recalculation from scratch but will keep better ends
4858     }
4859 }
4860 
4861 
ClipChain(TSignedSeqRange limits)4862 void CChain::ClipChain(TSignedSeqRange limits) {
4863 
4864     _ASSERT(Include(Limits(),limits) && (RealCdsLimits().Empty() || Include(limits,RealCdsLimits())));
4865 
4866     TSignedSeqRange limits_on_mrna = GetAlignMap().MapRangeOrigToEdited(limits,false);
4867     _ASSERT(limits_on_mrna.NotEmpty());
4868 
4869     TContained new_members;
4870     ITERATE (TContained, it, m_members) {
4871         auto ai = (*it)->m_align;
4872         TSignedSeqRange alimits = ai->Limits();
4873         if(limits.IntersectingWith(alimits))   // not clipped
4874             new_members.push_back(*it);
4875     }
4876     m_members = new_members;
4877 
4878     if(limits.GetFrom() > Limits().GetFrom()) {
4879         TSignedSeqRange clip_range(Limits().GetFrom(),limits.GetFrom()-1);
4880         CutExons(clip_range);
4881         RecalculateLimits();
4882     }
4883     if(limits.GetTo() < Limits().GetTo()) {
4884         TSignedSeqRange clip_range(limits.GetTo()+1,Limits().GetTo());
4885         CutExons(clip_range);
4886         RecalculateLimits();
4887     }
4888 
4889     if(limits_on_mrna.GetFrom() > 0)
4890         m_coverage.erase(m_coverage.begin(),m_coverage.begin()+limits_on_mrna.GetFrom());
4891     m_coverage.resize(limits_on_mrna.GetLength());
4892 
4893     if(RealCdsLimits().NotEmpty()) {
4894         CCDSInfo cds = GetCdsInfo();
4895         bool changed = false;
4896         if((Strand() == ePlus && cds.MaxCdsLimits().GetFrom() < Limits().GetFrom()) ||
4897            (Strand() == eMinus && cds.MaxCdsLimits().GetTo() > Limits().GetTo())) {
4898             cds.Clear5PrimeCdsLimit();
4899             changed = true;
4900         }
4901         if(cds.PStop()) {
4902             CCDSInfo::TPStops pstops;
4903             for(auto& pstop : cds.PStops()) {
4904                 if(Include(limits, pstop))
4905                    pstops.push_back(pstop);
4906             }
4907             if(pstops.size() != cds.PStops().size()) {
4908                 cds.ClearPStops();
4909                 for(auto& pstop : pstops)
4910                     cds.AddPStop(pstop);
4911                 changed = true;
4912             }
4913         }
4914 
4915         if(changed)
4916             SetCdsInfo(cds);
4917     }
4918 
4919     if(limits.GetFrom() > m_supported_range.GetFrom())
4920         m_supported_range.SetFrom(limits.GetFrom());
4921     if(limits.GetTo() < m_supported_range.GetTo())
4922         m_supported_range.SetTo(limits.GetTo());
4923 
4924     CalculateSupportAndWeightFromMembers();
4925 }
4926 
SetConfirmedEnds(const CGnomonEngine & gnomon,CGnomonAnnotator_Base::TIntMap & confirmed_ends)4927 bool CChain::SetConfirmedEnds(const CGnomonEngine& gnomon, CGnomonAnnotator_Base::TIntMap& confirmed_ends) {
4928     if(Exons().size() < 2)
4929         return true;
4930 
4931     auto old_limits = Limits();
4932     auto new_limits = old_limits;
4933     bool left_confirmed = false;
4934     bool right_confirmed = false;
4935 
4936     auto rslt = confirmed_ends.find(Exons().front().GetTo());
4937     if(rslt != confirmed_ends.end() && rslt->second < Exons().front().GetTo()) {
4938         left_confirmed = true;
4939         new_limits.SetFrom(rslt->second);
4940     }
4941     rslt = confirmed_ends.find(Exons().back().GetFrom());
4942     if(rslt != confirmed_ends.end() && rslt->second > Exons().back().GetFrom()) {
4943         right_confirmed = true;
4944         new_limits.SetTo(rslt->second);
4945     }
4946 
4947     if(!left_confirmed && !right_confirmed)
4948         return true;
4949     else if(!Continuous())
4950         return false;
4951 
4952     CCDSInfo cds_info = GetCdsInfo();
4953     bool left_complete = LeftComplete();   // has start/stop on left
4954     bool right_complete = RightComplete(); // has start/stop on right
4955 
4956     SetCdsInfo(CCDSInfo()); //we will deal with CDS separately
4957 
4958     //extend chain
4959     if(new_limits.GetFrom() < old_limits.GetFrom()) {
4960         int delta = old_limits.GetFrom()-new_limits.GetFrom();
4961         ExtendLeft(delta);
4962         m_coverage.insert(m_coverage.begin(), delta, 0);
4963     }
4964     if(new_limits.GetTo() > old_limits.GetTo()) {
4965         int delta = new_limits.GetTo()-old_limits.GetTo();
4966         ExtendRight(delta);
4967         m_coverage.insert(m_coverage.end(), delta, 0);
4968     }
4969 
4970     CAlignMap amap = GetAlignMap(); //includes extended ends and keeps clipped ends
4971 
4972     {   // removing fshifts outside of clip
4973         TInDels fs;
4974         ITERATE(TInDels, i, FrameShifts()) {
4975             if(i->Loc() > new_limits.GetFrom() && i->InDelEnd() < new_limits.GetTo())
4976                 fs.push_back(*i);
4977         }
4978 
4979         if(FrameShifts().size() != fs.size()) {
4980             FrameShifts() = fs;
4981             int mrna_len = AlignLen();
4982             m_coverage.resize(mrna_len, m_coverage.back());   // this will slightly shift values compared to recalculation from scratch but will keep better ends
4983         }
4984     }
4985 
4986     //clip chain
4987     if(Limits() != new_limits)
4988         ClipChain(new_limits);
4989 
4990     //set limits
4991     m_polya_cap_left_soft_limit = max(m_polya_cap_left_soft_limit, new_limits.GetFrom());
4992     m_polya_cap_right_soft_limit = min(m_polya_cap_right_soft_limit, new_limits.GetTo());
4993 
4994     //set status
4995     if(left_confirmed) {
4996         Status() |= eLeftConfirmed;
4997         if(new_limits.GetFrom() < old_limits.GetFrom())
4998             AddComment("Extended to confirmed left");
4999         else if(new_limits.GetFrom() > old_limits.GetFrom())
5000             AddComment("Clipped to confirmed left");
5001     }
5002     if(right_confirmed) {
5003         Status() |= eRightConfirmed;
5004         if(new_limits.GetTo() > old_limits.GetTo())
5005             AddComment("Extended to confirmed right");
5006         else if(new_limits.GetTo() < old_limits.GetTo())
5007             AddComment("Clipped to confirmed right");
5008     }
5009 
5010     if(cds_info.ReadingFrame().Empty()) //non coding chain
5011         return true;
5012 
5013     if(!Include(new_limits, cds_info.Cds()) || (left_confirmed && !left_complete) || (right_confirmed && !right_complete)) { //CDS may need clipping to expose startstop
5014         auto cds_info_t = cds_info.MapFromOrigToEdited(amap);
5015         int frame = cds_info_t.ReadingFrame().GetFrom()%3;
5016 
5017         //project new_limits to transcript and align to frame
5018         auto cds_limits_t = amap.ShrinkToRealPoints(new_limits);
5019         cds_limits_t = amap.MapRangeOrigToEdited(cds_limits_t, CAlignMap::eSinglePoint, CAlignMap::eSinglePoint);
5020         for(int i = cds_limits_t.GetFrom(); i <= cds_limits_t.GetTo(); ++i) {
5021             cds_limits_t.SetFrom(i);
5022             if(i%3 == frame && amap.MapEditedToOrig(i) >= 0)
5023                 break;
5024         }
5025         for(int i = cds_limits_t.GetTo(); i >= cds_limits_t.GetFrom(); --i) {
5026             cds_limits_t.SetTo(i);
5027             if((i+1)%3 == frame && amap.MapEditedToOrig(i) >= 0)
5028                 break;
5029         }
5030         if(cds_limits_t.Empty())
5031             return false;
5032         cds_info_t.Clip(cds_limits_t); // remove extra CDS
5033 
5034 
5035         bool fivep_confirmed = (Strand() == ePlus) ? left_confirmed : right_confirmed;
5036         bool threep_confirmed = (Strand() == ePlus) ? right_confirmed : left_confirmed;
5037         bool has_start = cds_info_t.HasStart();
5038         bool has_stop = cds_info_t.HasStop();
5039         auto prot_rf = cds_info_t.ProtReadingFrame();
5040         if(prot_rf.NotEmpty() && ((fivep_confirmed && !has_start) || (threep_confirmed && !has_stop))) { //CDS may need some additional clipping to expose starts/stops
5041             const CResidueVec& contig = gnomon.GetSeq();
5042             CResidueVec mrna;
5043             amap.EditedSequence(contig, mrna);
5044 
5045             auto IndelInCodon = [this](int i, CAlignMap& map) {
5046                 int a = map.MapEditedToOrig(i);
5047                 int b = map.MapEditedToOrig(i+2);
5048                 if(Strand() == eMinus)
5049                     swap(a, b);
5050                 return (a < 0 || b < 0 || map.MapEditedToOrig(i+1) < 0 || !GetInDels(a, b, false).empty()); // genomic indels inside, if true
5051             };
5052 
5053             if(fivep_confirmed && !has_start) {
5054                 for(int i = prot_rf.GetFrom(); !has_start && i >= cds_limits_t.GetFrom() && !IsStopCodon(&mrna[i]); i =- 3) { //find start outside protein (no clip will be needed)
5055                     has_start =  IsStartCodon(&mrna[i]) && !IndelInCodon(i, amap);
5056                 }
5057                 for(int i = prot_rf.GetFrom(); !has_start && i < cds_limits_t.GetTo(); i += 3) {                              //find start inside protein (clip will be needed)
5058                     if(i > prot_rf.GetTo() && IsStopCodon(&mrna[i]))
5059                         break;
5060                     has_start =  IsStartCodon(&mrna[i]) && !IndelInCodon(i, amap);
5061                     cds_limits_t.SetFrom(i);
5062                 }
5063                 if(!has_start)
5064                     return false;
5065             }
5066             if(threep_confirmed && !has_stop) {
5067                 for(int i = prot_rf.GetTo()+1; !has_stop && i < cds_limits_t.GetTo(); i += 3) //find stop outside protein (no clip will be needed)
5068                     has_stop = IsStopCodon(&mrna[i]) && !IndelInCodon(i, amap);
5069                 if(!has_stop && cds_info_t.PStop(false)) {                                    //find stop inside protein (clip will be needed)
5070                     CCDSInfo::TPStops pstops = cds_info_t.PStops();
5071                     sort(pstops.begin(), pstops.end());
5072                     for(auto& stp : pstops) {
5073                         if(stp.m_status != CCDSInfo::eGenomeNotCorrect && stp.m_status != CCDSInfo::eSelenocysteine && !IndelInCodon(stp.GetFrom(), amap)) {
5074                             has_stop = true;
5075                             cds_limits_t.SetTo(stp.GetFrom()-1);
5076                         }
5077                     }
5078                     if(!has_stop)
5079                         return false;
5080                 }
5081             }
5082 
5083             if(cds_limits_t.Empty())
5084                 return false;
5085 
5086             cds_info_t.Clip(cds_limits_t);
5087         }
5088 
5089         cds_info = cds_info_t.MapFromEditedToOrig(amap);
5090     }
5091 
5092     {   // removing fshifts in UTRs
5093         auto cds = cds_info.Cds();
5094         TInDels fs;
5095         ITERATE(TInDels, i, FrameShifts()) {
5096             if(Include(cds, i->Loc()))
5097                 fs.push_back(*i);
5098         }
5099 
5100         if(FrameShifts().size() != fs.size()) {
5101             FrameShifts() = fs;
5102             int mrna_len = AlignLen();
5103             m_coverage.resize(mrna_len, m_coverage.back());   // this will slightly shift values compared to recalculation from scratch but will keep better ends
5104         }
5105     }
5106 
5107     SetCdsInfo(cds_info);
5108 
5109     return true;
5110 }
5111 
5112 // valid, found As
ValidPolyA(int pos,const CResidueVec & contig)5113 pair<bool, bool> CChain::ValidPolyA(int pos, const CResidueVec& contig) {
5114     string motif1 = "AATAAA";
5115     string motif2 = "ATTAAA";
5116     string motif3 = "AGTAAA";
5117     int block_of_As_len = 6;
5118     CResidueVec block_of_As;
5119     if(Strand() == ePlus)
5120         block_of_As.assign(block_of_As_len, 'A');
5121     else
5122         block_of_As.assign(block_of_As_len, 'T');
5123 
5124     int a = max(0, pos-block_of_As_len);
5125     int b = min((int)contig.size()-1, pos+block_of_As_len);
5126     if(b-a+1 < block_of_As_len)
5127         return make_pair(false, false);
5128     if(search(contig.begin()+a, contig.begin()+b+1, block_of_As.begin(), block_of_As.end()) != contig.begin()+b+1) {  // found As
5129         int left;
5130         int right;
5131         if(Strand() == ePlus) {
5132             left = pos-35;
5133             right = pos-18;
5134         } else {
5135             left = pos+18;
5136             right = pos+35;
5137         }
5138         if(left < 0 || right >= (int)contig.size())
5139             return make_pair(false, false);
5140 
5141         string segment(contig.begin()+left, contig.begin()+right+1);
5142         if(Strand() == eMinus)
5143             ReverseComplement(segment.begin(), segment.end());
5144 
5145         if(segment.find(motif1) != string::npos || segment.find(motif2) != string::npos || segment.find(motif3) != string::npos)
5146             return make_pair(true, true);
5147         else
5148             return make_pair(false, true);
5149     } else {
5150         return make_pair(true, false);
5151     }
5152 }
5153 
// Number of bases of each terminal exon which CChain::MainPeaks() keeps in the 'core'
// range of a multi-exon chain without a real CDS; a terminal exon shorter than this
// doesn't change the corresponding core limit.
#define MIN_UTR_EXON 15
5155 
PeaksAndLimits(EStatus determinant,int min_blob_weight,int max_empty_dist,int min_splice_dist)5156 tuple<CChain::TIDMap, TSignedSeqRange> CChain::PeaksAndLimits(EStatus determinant, int min_blob_weight, int max_empty_dist, int min_splice_dist) {
5157     bool right_end = (determinant == ePolyA && Strand() == ePlus) || (determinant == eCap && Strand() == eMinus); // determinant is on the right gene side
5158     bool coding = ReadingFrame().NotEmpty();
5159 
5160     TIDMap peak_weights;
5161     TSignedSeqRange real_limits;
5162 
5163     int flex_len = 0;
5164     TIDMap raw_weights;
5165     for(auto& mi : m_members) {
5166         const CGeneModel& align = *mi->m_align;
5167         if(align.Status()&determinant) {
5168             if(right_end) {
5169                 int rlimit = (coding ? RealCdsLimits().GetTo() : Exons().back().Limits().GetFrom());      // look in the last exon of notcoding or right UTR of coding
5170                 bool belong_to_exon = false;
5171                 int pos = align.Limits().GetTo();
5172                 for(auto& exon : Exons()) {
5173                     if(pos >= exon.Limits().GetFrom()+min_splice_dist && pos <= exon.Limits().GetTo()) {
5174                         belong_to_exon = true;
5175                         break;
5176                     }
5177                 }
5178                 if(rlimit < pos && belong_to_exon)
5179                     raw_weights[align.Limits().GetTo()] += align.Weight();
5180             } else {
5181                 int llimit = (coding ? RealCdsLimits().GetFrom() : Exons().front().Limits().GetTo());     // look in the first exon of notcoding or left UTR of coding
5182                 bool belong_to_exon = false;
5183                 int pos = align.Limits().GetFrom();
5184                 for(auto& exon : Exons()) {
5185                     if(pos >= exon.Limits().GetFrom() && pos <= exon.Limits().GetTo()-min_splice_dist) {
5186                         belong_to_exon = true;
5187                         break;
5188                     }
5189                 }
5190                 if(llimit > pos && belong_to_exon)
5191                     raw_weights[-align.Limits().GetFrom()] += align.Weight();                             // negative position, so the map is in convinient order
5192             }
5193         }
5194         if(align.Status()&(eLeftFlexible|eRightFlexible))
5195             flex_len = max(flex_len, align.Limits().GetLength());
5196         else
5197             real_limits += (align.Limits()&Limits());
5198     }
5199     if(raw_weights.empty())
5200         return make_tuple(peak_weights,real_limits);
5201 
5202     int last_allowed = right_end ? real_limits.GetTo()+flex_len : -(real_limits.GetFrom()-flex_len);
5203     auto ipeak = raw_weights.begin();
5204     double w = ipeak->second;
5205     for(auto it = next(raw_weights.begin()); it != raw_weights.end(); ++it) {
5206         if(it->first > prev(it)->first+1+max_empty_dist) {           // next blob
5207             if(ipeak->first > last_allowed)
5208                 break;
5209             if(w >= min_blob_weight) {
5210                 auto still_good = ipeak;
5211                 for(auto i = ipeak; i != it && i->first <= last_allowed; ++i) { // shift position to furthest 50% within blob
5212                     if(i->second >= 0.5*ipeak->second)
5213                         still_good = i;
5214                 }
5215                 peak_weights.emplace(still_good->first, w);      // peak position, blob weight
5216             }
5217             ipeak = it;
5218             w = it->second;
5219         } else {
5220             w += it->second;
5221             if(it->second > ipeak->second)            // new peak position; first for equals
5222                 ipeak = it;
5223         }
5224     }
5225     if(ipeak->first <= last_allowed && w >= min_blob_weight) { // last peak
5226         auto still_good = ipeak;
5227         for(auto i = ipeak; i != raw_weights.end() && i->first <= last_allowed; ++i) { // shift position to furthest 50% within blob
5228             if(i->second >= 0.5*ipeak->second)
5229                 still_good = i;
5230         }
5231         peak_weights.emplace(still_good->first, w);      // peak position, blob weight
5232     }
5233 
5234     return make_tuple(peak_weights,real_limits);
5235 }
5236 
MainPeaks(TIDMap & peak_weights,double secondary_peak,double tertiary_peak,double tertiary_peak_coverage,bool right_end)5237 tuple<TIVec, TSignedSeqRange> CChain::MainPeaks(TIDMap& peak_weights, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage, bool right_end) {
5238     TIVec peaks(3, -1);
5239     auto limits = Limits();
5240     auto ifirst_peak = max_element(peak_weights.begin(), peak_weights.end(), [](const TIDMap::value_type& a, const TIDMap::value_type& b) { return a.second < b.second; });
5241     peaks[0] = abs(ifirst_peak->first);
5242     if(right_end) {
5243         int first_peak = ifirst_peak->first;
5244         limits.SetTo(first_peak);
5245         m_polya_cap_right_soft_limit = first_peak;
5246     } else {
5247         int first_peak = -ifirst_peak->first;
5248         limits.SetFrom(first_peak);
5249         m_polya_cap_left_soft_limit = first_peak;
5250     }
5251     auto isecond_peak = prev(peak_weights.end());
5252     for( ; isecond_peak != ifirst_peak && isecond_peak->second < secondary_peak*ifirst_peak->second; --isecond_peak);
5253     if(isecond_peak != ifirst_peak)
5254         peaks[1] = abs(isecond_peak->first);
5255 
5256     if(tertiary_peak > 0) {
5257         CAlignMap amap = GetAlignMap();
5258         TSignedSeqRange genome_core_lim = RealCdsLimits();
5259         if(genome_core_lim.Empty()) {
5260             genome_core_lim = Limits();
5261             if(Exons().size() > 1) {
5262                 if(Exons().front().Limits().GetLength() >= MIN_UTR_EXON)
5263                     genome_core_lim.SetFrom(Exons().front().Limits().GetTo()-MIN_UTR_EXON+1);
5264                 if(Exons().back().Limits().GetLength() >= MIN_UTR_EXON)
5265                     genome_core_lim.SetTo(Exons().back().Limits().GetFrom()+MIN_UTR_EXON-1);
5266             }
5267         }
5268         genome_core_lim = amap.ShrinkToRealPoints(genome_core_lim);
5269         TSignedSeqRange core_lim = amap.MapRangeOrigToEdited(genome_core_lim);
5270         double core_coverage = 0;
5271         for (int i = core_lim.GetFrom(); i <= core_lim.GetTo(); ++i) {
5272             core_coverage += m_coverage[i];
5273         }
5274         core_coverage /= core_lim.GetLength();
5275 
5276         TSignedSeqRange fpeak_exon;
5277         for(auto& exon : Exons()) {
5278             if(Include(exon.Limits(), abs(ifirst_peak->first))) {
5279                 fpeak_exon = exon.Limits();
5280                 break;
5281             }
5282         }
5283 
5284         auto ithird_peak = prev(peak_weights.end());
5285         for( ; ithird_peak != isecond_peak; --ithird_peak) {
5286             if(Include(fpeak_exon, abs(ithird_peak->first))) {
5287                 int p = amap.MapOrigToEdited(abs(ithird_peak->first));
5288                 if(p < 0)
5289                     continue;
5290                 if(ithird_peak->second >= tertiary_peak*ifirst_peak->second && m_coverage[p] > tertiary_peak_coverage*core_coverage)
5291                     break;
5292             }
5293         }
5294         if(ithird_peak != isecond_peak)
5295             peaks[2] = abs(ithird_peak->first);
5296         isecond_peak = ithird_peak;
5297     }
5298 
5299     if(isecond_peak != ifirst_peak) {
5300         if(right_end) {
5301             int second_peak = isecond_peak->first;
5302             limits.SetTo(second_peak);
5303         } else {
5304             int second_peak = -isecond_peak->first;
5305             limits.SetFrom(second_peak);
5306         }
5307     }
5308 
5309     return make_tuple(peaks, limits);
5310 }
5311 
ClipToCap(int min_cap_blob,int max_dist,int min_flank_exon,double secondary_peak)5312 void CChain::ClipToCap(int min_cap_blob, int max_dist, int min_flank_exon, double secondary_peak) {
5313     bool right_end = Strand() == eMinus; // cap is on the right gene side
5314     if((Status()&eLeftConfirmed) && !right_end)
5315         return;
5316     if((Status()&eRightConfirmed) && right_end)
5317         return;
5318 
5319     bool coding = ReadingFrame().NotEmpty();
5320     if(!HasStart() && coding)
5321         return;
5322 
5323     auto rslt = PeaksAndLimits(eCap, min_cap_blob, max_dist, min_flank_exon);
5324     TIDMap& peak_weights(get<0>(rslt));
5325     TSignedSeqRange real_limits(get<1>(rslt));
5326 
5327     if(peak_weights.empty()) {
5328         TSignedSeqRange limits = Limits();
5329         Status() &= ~eCap;
5330         if(right_end && real_limits.GetTo() < Limits().GetTo())
5331             limits.SetTo(real_limits.GetTo());
5332         else if(!right_end && real_limits.GetFrom() > Limits().GetFrom())
5333             limits.SetFrom(real_limits.GetFrom());
5334 
5335         if (limits != Limits()) {
5336             if(!coding || Include(limits,RealCdsLimits())) {
5337                 AddComment("capsupressed");
5338                 ClipChain(limits);
5339             } else {
5340                 AddComment("capoverlapcds");
5341             }
5342         }
5343 
5344         if(right_end)
5345             m_polya_cap_right_soft_limit = Limits().GetFrom()-1;
5346         else
5347             m_polya_cap_left_soft_limit = Limits().GetTo()+1;
5348 
5349         return;
5350     }
5351 
5352     Status() |= eCap;
5353     auto rslt1 = MainPeaks(peak_weights, secondary_peak, 0., 0., right_end);
5354     m_cap_peaks = get<0>(rslt1);
5355     TSignedSeqRange limits = get<1>(rslt1);
5356 
5357     if (limits != Limits()) {
5358         AddComment("capclip");
5359         ClipChain(limits);
5360     }
5361 }
ClipToPolyA(const CResidueVec & contig,int min_polya_blob,int max_dist,int min_flank_exon,double secondary_peak,double tertiary_peak,double tertiary_peak_coverage)5362 void CChain::ClipToPolyA(const CResidueVec& contig, int min_polya_blob, int max_dist, int min_flank_exon, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage) {
5363     bool right_end = Strand() == ePlus; // polya is on the right gene side
5364     if((Status()&eLeftConfirmed) && !right_end)
5365         return;
5366     if((Status()&eRightConfirmed) && right_end)
5367         return;
5368 
5369     bool coding = ReadingFrame().NotEmpty();
5370     if(!HasStop() && coding)
5371         return;
5372 
5373     auto rslt = PeaksAndLimits(ePolyA, min_polya_blob, max_dist, min_flank_exon);
5374     TIDMap& peak_weights(get<0>(rslt));
5375     TSignedSeqRange real_limits(get<1>(rslt));
5376     //check for As
5377     for(auto ip_loop = peak_weights.begin(); ip_loop != peak_weights.end(); ) {
5378         auto ip = ip_loop++;
5379         if(!ValidPolyA(abs(ip->first), contig).first)
5380             peak_weights.erase(ip);
5381     }
5382 
5383     if(peak_weights.empty()) {
5384         TSignedSeqRange limits = Limits();
5385         Status() &= ~ePolyA;
5386         if(right_end && real_limits.GetTo() < Limits().GetTo())
5387             limits.SetTo(real_limits.GetTo());
5388         else if(!right_end && real_limits.GetFrom() > Limits().GetFrom())
5389             limits.SetFrom(real_limits.GetFrom());
5390 
5391         if (limits != Limits()) {
5392             if(!coding || Include(limits,RealCdsLimits())) {
5393                 AddComment("polyasupressed");
5394                 ClipChain(limits);
5395             } else {
5396                 AddComment("polyaoverlapcds");
5397             }
5398         }
5399 
5400         if(right_end)
5401             m_polya_cap_right_soft_limit = Limits().GetFrom()-1;
5402         else
5403             m_polya_cap_left_soft_limit = Limits().GetTo()+1;
5404 
5405         return;
5406     }
5407 
5408     Status() |= ePolyA;
5409     auto rslt1 = MainPeaks(peak_weights, secondary_peak, tertiary_peak, tertiary_peak_coverage, right_end);
5410     m_polya_peaks = get<0>(rslt1);
5411     TSignedSeqRange limits = get<1>(rslt1);
5412 
5413     if (limits != Limits()) {
5414         AddComment("polyaclip");
5415         ClipChain(limits);
5416     }
5417 }
5418 
CheckSecondaryCapPolyAEnds()5419 void CChain::CheckSecondaryCapPolyAEnds() {
5420     if(m_polya_cap_left_soft_limit < Limits().GetTo() && Include(RealCdsLimits(), m_polya_cap_left_soft_limit))
5421         m_polya_cap_left_soft_limit = Limits().GetFrom();
5422 
5423     if(m_polya_cap_right_soft_limit > Limits().GetFrom() && Include(RealCdsLimits(), m_polya_cap_right_soft_limit))
5424         m_polya_cap_right_soft_limit = Limits().GetTo();
5425 }
5426 
// Thresholds relative to m_core_coverage used by CChain::CalculateDropLimits():
// the outward UTR scan continues only while coverage stays above COVERAGE_DROP*core coverage
// (or long-sequence coverage is present) ...
#define COVERAGE_DROP 0.1
// ... and a "bump" is recorded once the running maximum exceeds COVERAGE_BUMP*core coverage.
#define COVERAGE_BUMP 3
// CChain::ClipLowCoverageUTR() zeroes the coverage of a first/last exon that borders a gap
// ("XX" splice signature), does not overlap the CDS and is shorter than this length.
#define SMALL_GAP_UTR 100
5430 
ClipLowCoverageUTR(double utr_clip_threshold)5431 void CChain::ClipLowCoverageUTR(double utr_clip_threshold)
5432 {
5433     if((Type()&CGeneModel::eSR) == 0)   // don't have SR coverage
5434         return;
5435 
5436     CAlignMap amap = GetAlignMap();
5437 
5438     int mrna_len = amap.FShiftedLen(Limits());
5439 
5440     TSignedSeqRange genome_core_lim;
5441     if(ReadingFrame().NotEmpty()) {
5442         if(OpenCds())
5443             genome_core_lim = MaxCdsLimits();
5444         else
5445             genome_core_lim = RealCdsLimits();
5446         ITERATE (CGeneModel::TExons, e, Exons()) {
5447             if(Include(e->Limits(),genome_core_lim.GetFrom()))
5448                 genome_core_lim.SetFrom(max(genome_core_lim.GetFrom()-MIN_UTR_EXON,e->GetFrom()));
5449             if(Include(e->Limits(),genome_core_lim.GetTo()))
5450                 genome_core_lim.SetTo(min(genome_core_lim.GetTo()+MIN_UTR_EXON,e->GetTo()));
5451         }
5452     } else {
5453         genome_core_lim = Limits();
5454         if(Exons().size() > 1) {
5455             if(Exons().front().Limits().GetLength() >= MIN_UTR_EXON)
5456                 genome_core_lim.SetFrom(Exons().front().Limits().GetTo()-MIN_UTR_EXON+1);
5457             if(Exons().back().Limits().GetLength() >= MIN_UTR_EXON)
5458                 genome_core_lim.SetTo(Exons().back().Limits().GetFrom()+MIN_UTR_EXON-1);
5459         }
5460     }
5461 
5462     TSignedSeqRange core_lim = amap.MapRangeOrigToEdited(genome_core_lim);
5463 
5464     vector<double> coverage = m_coverage;
5465     _ASSERT((int)coverage.size() == mrna_len && core_lim.GetFrom() >= 0 && core_lim.GetTo() < mrna_len);
5466 
5467     double core_coverage = 0;
5468     for (int i = core_lim.GetFrom(); i <= core_lim.GetTo(); ++i) {
5469         core_coverage += coverage[i];
5470     }
5471     core_coverage /= core_lim.GetLength();
5472     m_core_coverage = core_coverage;
5473 
5474     if(core_lim.GetFrom() <= 0 &&  core_lim.GetTo() >= mrna_len-1)   //nothing to clip
5475         return;
5476 
5477     if(core_lim.GetTo()-core_lim.GetFrom() < SCAN_WINDOW)      // too short
5478         return;
5479 
5480     map<int,double> intron_coverage;   // in transcript space
5481     vector<double> longseq_coverage(mrna_len);
5482     ITERATE (TContained, it, m_members) {
5483         const CGeneModel& align = *(*it)->m_align;
5484         if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
5485             continue;
5486         TSignedSeqRange overlap = Limits()&align.Limits();
5487         if(overlap.Empty())   // some could be cut by polya clip
5488             continue;
5489 
5490         for(int i = 1; i < (int)align.Exons().size(); ++i) {
5491             if(align.Exons()[i-1].m_ssplice && align.Exons()[i].m_fsplice && align.Exons()[i-1].m_ssplice_sig != "XX" && align.Exons()[i].m_fsplice_sig != "XX") {
5492                 TSignedSeqRange intr(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
5493                 bool valid_intron = false;                                     // some introns might be clipped by previous UTR clips but still be in members
5494                 for(int j = 1; j < (int)Exons().size() && !valid_intron; ++j) {
5495                     if(Exons()[j-1].m_ssplice && Exons()[j].m_fsplice) {
5496                         TSignedSeqRange jntr(Exons()[j-1].Limits().GetTo(),Exons()[j].Limits().GetFrom());
5497                         valid_intron = (intr == jntr);
5498                     }
5499                 }
5500                 if(valid_intron) {
5501                     int intron = 0;   // donor in transcript space
5502                     if(Strand() == ePlus) {
5503                         intron = amap.MapRangeOrigToEdited(Limits()&align.Exons()[i-1].Limits()).GetTo();
5504                     } else {
5505                         intron = amap.MapRangeOrigToEdited(Limits()&align.Exons()[i].Limits()).GetTo();
5506                     }
5507                     intron_coverage[intron] += (align.Type() == CGeneModel::eSR) ? align.Weight() : 0;
5508                 }
5509             }
5510         }
5511 
5512 
5513         TSignedSeqRange overlap_on_mrna = amap.MapRangeOrigToEdited(overlap);
5514 
5515         if(align.Type() == CGeneModel::emRNA || align.Type() == CGeneModel::eEST || align.Type() == CGeneModel::eNotForChaining) {   //OK to clip protein in UTR
5516             for(int i = overlap_on_mrna.GetFrom(); i <= overlap_on_mrna.GetTo(); ++i)
5517                 longseq_coverage[i] += align.Weight();
5518         }
5519     }
5520 
5521     //don't save short gap utrs
5522     TSignedSeqRange cds = GetCdsInfo().Cds();
5523     if(Exons().front().m_ssplice_sig == "XX" && (cds&Exons().front().Limits()).Empty() && Exons().front().Limits().GetLength() < SMALL_GAP_UTR) {
5524         TSignedSeqRange texon = TranscriptExon(0);
5525         for(int i = texon.GetFrom(); i <= texon.GetTo(); ++i) {
5526             coverage[i] = 0;
5527             longseq_coverage[i] = 0;
5528         }
5529     }
5530     if(Exons().back().m_fsplice_sig == "XX" && (cds&Exons().back().Limits()).Empty() && Exons().back().Limits().GetLength() < SMALL_GAP_UTR) {
5531         TSignedSeqRange texon = TranscriptExon(Exons().size()-1);
5532         for(int i = texon.GetFrom(); i <= texon.GetTo(); ++i) {
5533             coverage[i] = 0;
5534             longseq_coverage[i] = 0;
5535         }
5536     }
5537 
5538     double core_inron_coverage = 0;
5539     int core_introns = 0;
5540     for(int i = 1; i < (int)Exons().size(); ++i) {
5541         if(Exons()[i-1].m_ssplice && Exons()[i].m_fsplice) {
5542             int intron;   // donor in transcript space
5543             if(Strand() == ePlus)
5544                 intron = amap.MapRangeOrigToEdited(Exons()[i-1].Limits(), true).GetTo();
5545             else
5546                 intron = amap.MapRangeOrigToEdited(Exons()[i].Limits(), true).GetTo();
5547             if(Include(core_lim, intron)) {
5548                 ++core_introns;
5549                 core_inron_coverage += intron_coverage[intron];
5550             }
5551         }
5552     }
5553     if(core_introns > 0)
5554         core_inron_coverage /= core_introns;
5555     else
5556         core_inron_coverage = 0.5*core_coverage;
5557 
5558     // 5' UTR
5559     bool fivep_confirmed = (Strand() == ePlus) ? (Status()&eLeftConfirmed) : (Status()&eRightConfirmed);
5560     if(!fivep_confirmed && !(Status()&eCap) && core_lim.GetFrom() > SCAN_WINDOW/2) {
5561         int left_limit = core_lim.GetFrom(); // cds/splice
5562         int right_limit = core_lim.GetTo();  // cds/splice
5563         int len = right_limit-left_limit+1;
5564         double wlen = 0;
5565         for(int i = left_limit; i <= right_limit; ++i)
5566             wlen += coverage[i];
5567 
5568         while(left_limit > 0 && (longseq_coverage[left_limit] > 0 ||
5569                (coverage[left_limit] > max(core_coverage,wlen/len)*utr_clip_threshold &&
5570                (intron_coverage.find(left_limit-1) == intron_coverage.end() || intron_coverage[left_limit-1] > core_inron_coverage*utr_clip_threshold)))) {
5571 
5572             ++len;
5573             --left_limit;
5574             wlen += coverage[left_limit];
5575         }
5576 
5577         if(left_limit > 0) {
5578             AddComment("5putrclip");
5579             ClipChain(amap.MapRangeEditedToOrig(TSignedSeqRange(left_limit,mrna_len-1)));
5580             if(Strand() == ePlus && Exons().front().Limits().GetLength() < MIN_UTR_EXON && Exons().front().Limits().GetTo() < genome_core_lim.GetFrom())
5581                 ClipChain(TSignedSeqRange(Exons()[1].Limits().GetFrom(),Limits().GetTo()));
5582             else if(Strand() == eMinus && Exons().back().Limits().GetLength() < MIN_UTR_EXON && Exons().back().Limits().GetFrom() > genome_core_lim.GetTo())
5583                 ClipChain(TSignedSeqRange(Limits().GetFrom(),Exons()[Exons().size()-2].GetTo()));
5584         }
5585     }
5586 
5587 
5588     // 3' UTR
5589     bool threep_confirmed = (Strand() == ePlus) ? (Status()&eRightConfirmed) : (Status()&eLeftConfirmed);
5590     if(!threep_confirmed && !(Status()&ePolyA) && core_lim.GetTo() < mrna_len-1-SCAN_WINDOW/2) {
5591         int right_limit = core_lim.GetTo();     // cds/splice
5592         int left_limit = core_lim.GetFrom();    // cds/splice
5593         int len = right_limit-left_limit+1;
5594         double wlen = 0;
5595         for(int i = left_limit; i <= right_limit; ++i)
5596             wlen += coverage[i];
5597 
5598         double window_wlen = 0;
5599         for(int i = right_limit-SCAN_WINDOW/2; i <= right_limit+SCAN_WINDOW/2; ++i)
5600             window_wlen += coverage[i];
5601 
5602         while(right_limit < mrna_len-1 && (longseq_coverage[right_limit] > 0 ||
5603               (coverage[right_limit] > wlen/len*utr_clip_threshold &&
5604               (intron_coverage.find(right_limit) == intron_coverage.end() || intron_coverage[right_limit] > core_inron_coverage*utr_clip_threshold)))) {
5605 
5606             ++len;
5607             ++right_limit;
5608             wlen += coverage[right_limit];
5609         }
5610 
5611         if(right_limit < mrna_len-1) {
5612             AddComment("3putrclip");
5613             int new_5p = amap.MapRangeOrigToEdited(Limits()).GetFrom();
5614             ClipChain(amap.MapRangeEditedToOrig(TSignedSeqRange(new_5p,right_limit)));
5615             if(Strand() == ePlus && Exons().back().Limits().GetLength() < MIN_UTR_EXON && Exons().back().Limits().GetFrom() > genome_core_lim.GetTo())
5616                 ClipChain(TSignedSeqRange(Limits().GetFrom(),Exons()[Exons().size()-2].GetTo()));
5617             else if(Strand() == eMinus && Exons().front().Limits().GetLength() < MIN_UTR_EXON && Exons().front().Limits().GetTo() < genome_core_lim.GetFrom())
5618                 ClipChain(TSignedSeqRange(Exons()[1].Limits().GetFrom(),Limits().GetTo()));
5619         }
5620     }
5621 }
5622 
CalculateDropLimits()5623 void CChain::CalculateDropLimits() {
5624 
5625     m_coverage_drop_left = -1;
5626     m_coverage_drop_right = -1;
5627     m_coverage_bump_left = -1;
5628     m_coverage_bump_right = -1;
5629 
5630     bool fivep_confirmed = (Strand() == ePlus) ? (Status()&eLeftConfirmed) : (Status()&eRightConfirmed);
5631     bool threep_confirmed = (Strand() == ePlus) ? (Status()&eRightConfirmed) : (Status()&eLeftConfirmed);
5632 
5633     if(fivep_confirmed && threep_confirmed)
5634         return;
5635 
5636     CAlignMap amap = GetAlignMap();
5637 
5638     int mrna_len = amap.FShiftedLen(Limits());
5639 
5640     vector<double> longseq_coverage(mrna_len);
5641     ITERATE (TContained, it, m_members) {
5642         const CGeneModel& align = *(*it)->m_align;
5643         if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
5644             continue;
5645         TSignedSeqRange overlap = Limits()&align.Limits();
5646         if(overlap.Empty())
5647             continue;
5648 
5649         TSignedSeqRange overlap_on_mrna = amap.MapRangeOrigToEdited(overlap);
5650 
5651         if(align.Type() == CGeneModel::emRNA || align.Type() == CGeneModel::eEST) {   //OK to clip protein in UTR
5652             for(int i = overlap_on_mrna.GetFrom(); i <= overlap_on_mrna.GetTo(); ++i)
5653                 longseq_coverage[i] += align.Weight();
5654         }
5655     }
5656 
5657     TSignedSeqRange sfl(Exons().front().Limits().GetTo(),Exons().back().Limits().GetFrom());
5658     if(ReadingFrame().NotEmpty()) {
5659         TSignedSeqRange cds = (OpenCds() ? MaxCdsLimits() : RealCdsLimits());
5660         sfl.SetFrom(min(sfl.GetFrom(),cds.GetFrom()));
5661         sfl.SetTo(max(sfl.GetTo(),cds.GetTo()));
5662     }
5663     TSignedSeqRange soft_limit = sfl;
5664     ITERATE(TContained, i, m_members) {
5665         if((*i)->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
5666             continue;
5667         TSignedSeqRange overlap = ((*i)->m_align->Limits() & Limits());
5668         if(Include(overlap,sfl.GetFrom()+1))
5669             soft_limit.SetFrom(min(soft_limit.GetFrom(),overlap.GetFrom()));
5670         if(Include(overlap,sfl.GetTo()-1))
5671             soft_limit.SetTo(max(soft_limit.GetTo(),overlap.GetTo()));
5672     }
5673     soft_limit.SetFrom(min(soft_limit.GetFrom(),m_polya_cap_left_soft_limit));
5674     soft_limit.SetTo(max(soft_limit.GetTo(),m_polya_cap_right_soft_limit));
5675 
5676     soft_limit = amap.MapRangeOrigToEdited(soft_limit);
5677 
5678     // 5' UTR
5679     if(!fivep_confirmed) {
5680         int left_limit = soft_limit.GetFrom();
5681         int first_bump = -1;
5682         double max_cov = 0;
5683         while(left_limit > 0 && first_bump < 0 && (longseq_coverage[left_limit] > 0 || m_coverage[left_limit] > m_core_coverage*COVERAGE_DROP)) {
5684             max_cov = max(max_cov,m_coverage[left_limit]);
5685             if(max_cov > m_core_coverage*COVERAGE_BUMP)
5686                 first_bump = left_limit;
5687 
5688             --left_limit;
5689         }
5690 
5691         if(first_bump > 0) {
5692             for( ; first_bump < soft_limit.GetFrom()-SCAN_WINDOW && m_coverage[first_bump+SCAN_WINDOW] < m_coverage[first_bump]; ++first_bump);
5693             if(Strand() == ePlus)
5694                 m_coverage_bump_left = amap.MapEditedToOrig(first_bump);
5695             else
5696                 m_coverage_bump_right = amap.MapEditedToOrig(first_bump);
5697         } else if(left_limit > 0 || m_coverage[left_limit] <= m_core_coverage*COVERAGE_DROP) {
5698             int first_drop = left_limit;
5699             if(first_drop+SCAN_WINDOW/2 < mrna_len) {
5700                 for( ; first_drop-SCAN_WINDOW/2 > 0; --first_drop) {
5701                     if(m_coverage[first_drop-SCAN_WINDOW/2] >= m_coverage[first_drop+SCAN_WINDOW/2])  // check for negative gradient
5702                         break;
5703                     if(m_coverage[first_drop-SCAN_WINDOW/2]+m_coverage[first_drop+SCAN_WINDOW/2]-2*m_coverage[first_drop] >= 0)  // check for decrease of gradient
5704                         break;
5705                 }
5706             }
5707             if(Strand() == ePlus)
5708                 m_coverage_drop_left = amap.MapEditedToOrig(first_drop);
5709             else
5710                 m_coverage_drop_right = amap.MapEditedToOrig(first_drop);
5711         }
5712     }
5713 
5714     // 3' UTR
5715     if(!threep_confirmed) {
5716         int right_limit = soft_limit.GetTo();
5717         int first_bump = -1;
5718         double max_cov = 0;
5719         while(right_limit < mrna_len-1 && first_bump < 0 && (longseq_coverage[right_limit] > 0 || m_coverage[right_limit] > m_core_coverage*COVERAGE_DROP)) {
5720             max_cov = max(max_cov,m_coverage[right_limit]);
5721             if(first_bump < 0 && max_cov > m_core_coverage*COVERAGE_BUMP)
5722                 first_bump = right_limit;
5723 
5724             ++right_limit;
5725         }
5726         if(first_bump > 0) {
5727             for( ; first_bump > soft_limit.GetTo()+SCAN_WINDOW && m_coverage[first_bump-SCAN_WINDOW] < m_coverage[first_bump]; --first_bump);
5728             if(Strand() == ePlus)
5729                 m_coverage_bump_right = amap.MapEditedToOrig(first_bump);
5730             else
5731                 m_coverage_bump_left = amap.MapEditedToOrig(first_bump);
5732         } else if(right_limit < mrna_len-1 || m_coverage[right_limit] <= m_core_coverage*COVERAGE_DROP) {  // garanteed that right_limit <= mrna_len-1
5733             int first_drop = right_limit;
5734             if(first_drop-SCAN_WINDOW/2 > 0) {
5735                 for( ; first_drop < mrna_len-SCAN_WINDOW/2; ++first_drop) {
5736                     if(m_coverage[first_drop+SCAN_WINDOW/2] >= m_coverage[first_drop-SCAN_WINDOW/2])  // check for negative gradient
5737                         break;
5738                     if(m_coverage[first_drop-SCAN_WINDOW/2]+m_coverage[first_drop+SCAN_WINDOW/2]-2*m_coverage[first_drop] >= 0)  // check for decrease of gradient
5739                         break;
5740                 }
5741             }
5742             if(Strand() == ePlus)
5743                 m_coverage_drop_right = amap.MapEditedToOrig(first_drop);
5744             else
5745                 m_coverage_drop_left = amap.MapEditedToOrig(first_drop);
5746         }
5747     }
5748 }
5749 
SetConsistentCoverage()5750 void CChain::SetConsistentCoverage()
5751 {
5752     if(!(Type()&CGeneModel::eSR))
5753         return;
5754 
5755     CAlignMap amap = GetAlignMap();
5756     int mrna_len = amap.FShiftedLen(Limits());
5757     map<TSignedSeqRange,double> intron_coverage;
5758     vector<double> coverage(mrna_len);
5759     ITERATE (TContained, it, m_members) {
5760         const CGeneModel& align = *(*it)->m_align;
5761         if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
5762             continue;
5763         TSignedSeqRange overlap = Limits()&align.Limits();
5764         if(overlap.Empty())   // some could be cut by polya clip
5765             continue;
5766 
5767         if(align.Type() == CGeneModel::eSR) {
5768             TSignedSeqRange overlap_on_mrna = amap.MapRangeOrigToEdited(overlap);
5769             for(int i = overlap_on_mrna.GetFrom(); i <= overlap_on_mrna.GetTo(); ++i)
5770                 coverage[i] += align.Weight();
5771         }
5772 
5773         for(int i = 1; i < (int)align.Exons().size(); ++i) {
5774             if(align.Exons()[i-1].m_ssplice_sig != "XX" && align.Exons()[i].m_fsplice_sig != "XX") {
5775                 TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
5776                 if(Include(Limits(),intron))
5777                     intron_coverage[intron] += (align.Type() == CGeneModel::eSR) ? align.Weight() : 0;
5778             }
5779         }
5780     }
5781 
5782     double minintroncount = numeric_limits<double>::max();
5783     double maxintroncount = 0;
5784     for(map<TSignedSeqRange,double>::iterator it = intron_coverage.begin(); it != intron_coverage.end(); ++it) {
5785         minintroncount = min(minintroncount,it->second);
5786         maxintroncount = max(maxintroncount,it->second);
5787     }
5788     if(minintroncount < 0.1*maxintroncount)
5789         return;
5790 
5791     vector<int> dips(mrna_len,0);
5792     double maxsofar = 0;
5793     for(int i = 0; i < mrna_len; ++i) {
5794         if(coverage[i] < 0.1*maxsofar)
5795             dips[i] = 1;
5796         maxsofar = max(maxsofar,coverage[i]);
5797     }
5798     for(int i = 0; i < (int)Exons().size(); ++i) {
5799         if(Exons()[i].m_fsplice_sig == "XX" || Exons()[i].m_ssplice_sig == "XX") {
5800             TSignedSeqRange te = amap.MapRangeOrigToEdited(Exons()[i].Limits(),false);
5801             _ASSERT(te.NotEmpty());
5802             for(int p = max(0,te.GetFrom()-50); p <= min(mrna_len-1,te.GetTo()+50); ++p)
5803                 dips[p] = 0;
5804         }
5805     }
5806     maxsofar = 0;
5807     for(int i = mrna_len-1; i >= 0; --i) {
5808         if(coverage[i] < 0.1*maxsofar && dips[i] > 0)
5809             return;
5810         maxsofar = max(maxsofar,coverage[i]);
5811     }
5812 
5813     if(intron_coverage.size() > 1)
5814         Status() |= eConsistentCoverage;
5815 }
5816 
SetConfirmedStartStopForCompleteProteins(map<string,pair<bool,bool>> & prot_complet,const SMinScor & minscor)5817 void CChain::SetConfirmedStartStopForCompleteProteins(map<string, pair<bool,bool> >& prot_complet, const SMinScor& minscor)
5818 {
5819     if(ConfirmedStart() && ConfirmedStop())
5820         return;
5821 
5822     bool setconfstart = false;
5823     bool setconfstop = false;
5824 
5825     CAlignMap mrnamap = GetAlignMap();
5826     ITERATE(TContained, i, m_members) {
5827 
5828         if((*i)->m_align->GetCdsInfo().ProtReadingFrame().Empty())            // not known CDS
5829             continue;
5830 
5831         if((*i)->m_align->Type() & emRNA) {
5832             if(!ConfirmedStart() && HasStart())
5833                 setconfstart = true;
5834             if(!ConfirmedStop() && HasStop())
5835                 setconfstop = true;
5836         } else {
5837             CAlignModel* orig_align = (*i)->m_orig_align;
5838             if(orig_align->TargetLen() == 0)   // protein of not known length
5839                 continue;
5840 
5841             string accession = orig_align->TargetAccession();
5842             map<string, pair<bool,bool> >::iterator iter = prot_complet.find(accession);
5843             _ASSERT(iter != prot_complet.end());
5844             if(iter == prot_complet.end())
5845                 continue;
5846 
5847             TSignedSeqRange fivep_exon = orig_align->Exons().front().Limits();
5848             TSignedSeqRange threep_exon = orig_align->Exons().back().Limits();
5849             if((*i)->m_align->Strand() == eMinus)
5850                 swap(fivep_exon,threep_exon);
5851 
5852             if(!ConfirmedStart() && HasStart() && fivep_exon.IntersectingWith((*i)->m_align->Limits()) &&
5853                iter->second.first && Include(Limits(),(*i)->m_align->Limits())) {  // protein has start
5854 
5855                 TSignedSeqPos not_aligned =  orig_align->GetAlignMap().MapRangeOrigToEdited((*i)->m_align->Limits(),false).GetFrom()-1;
5856                 if(not_aligned <= (1.-minscor.m_minprotfrac)*orig_align->TargetLen()) {                                                         // well aligned
5857                     TSignedSeqPos fivep = mrnamap.MapOrigToEdited(Strand() == ePlus ? (*i)->m_align->Limits().GetFrom() : (*i)->m_align->Limits().GetTo());
5858                     if(fivep > 0) {  // the end is still in chain
5859                         TSignedSeqPos extra_length = fivep-mrnamap.MapRangeOrigToEdited(GetCdsInfo().Start(),false).GetFrom()-1;
5860                         if(extra_length > not_aligned-minscor.m_endprotfrac*orig_align->TargetLen()) {
5861                             setconfstart = true;
5862                         }
5863                     }
5864                 }
5865             }
5866 
5867             if(!ConfirmedStop() && HasStop() && threep_exon.IntersectingWith((*i)->m_align->Limits()) &&
5868                iter->second.second && Include(Limits(),(*i)->m_align->Limits())) {  // protein has stop
5869 
5870                 TSignedSeqPos not_aligned = orig_align->TargetLen()-orig_align->GetAlignMap().MapRangeOrigToEdited((*i)->m_align->Limits(),false).GetTo();
5871                 if(not_aligned <= (1.-minscor.m_minprotfrac)*orig_align->TargetLen()) {                                                         // well aligned
5872                     TSignedSeqPos threep = mrnamap.MapOrigToEdited(Strand() == ePlus ? (*i)->m_align->Limits().GetTo() : (*i)->m_align->Limits().GetFrom());
5873                     if(threep >= 0) {  // the end is still in chain
5874                         TSignedSeqPos extra_length = mrnamap.MapRangeOrigToEdited(GetCdsInfo().Stop(),false).GetTo()-threep;
5875                         if(extra_length > not_aligned-minscor.m_endprotfrac*orig_align->TargetLen()) {
5876                             setconfstop = true;
5877                         }
5878                     }
5879                 }
5880             }
5881         }
5882     }
5883 
5884     CCDSInfo cds_info = GetCdsInfo();
5885     double score = cds_info.Score();
5886     if((setconfstart || ConfirmedStart()) && (setconfstop || ConfirmedStop()) && Continuous()) {
5887         score += max(1.,0.3*score);
5888         cds_info.SetScore(score, false);   // not open
5889     }
5890 
5891     if(setconfstart) {
5892         cds_info.SetScore(score, false);   // not open
5893         cds_info.SetStart(cds_info.Start(), true);    // confirmed start
5894     }
5895 
5896     if(setconfstop) {
5897         cds_info.SetStop(cds_info.Stop(), true);    // confirmed stop
5898     }
5899 
5900     SetCdsInfo(cds_info);
5901 }
5902 
CollectTrustedmRNAsProts(TOrigAligns & orig_aligns,const SMinScor & minscor,CScope & scope,SMatrix & delta,const CResidueVec & contig)5903 void CChain::CollectTrustedmRNAsProts(TOrigAligns& orig_aligns, const SMinScor& minscor, CScope& scope, SMatrix& delta, const CResidueVec& contig)
5904 {
5905     ClearTrustedmRNA();
5906     ClearTrustedProt();
5907 
5908     if(HasStart() && HasStop()) {
5909         typedef map<Int8, set<TSignedSeqRange> > Tint8range;
5910         Tint8range aexons;
5911         Tint8range uexons;
5912         ITERATE(TContained, i, m_members) {
5913             if(IntersectingWith(*(*i)->m_align)) {                                                                   // just in case we clipped this alignment
5914                 if(!(*i)->m_align->TrustedProt().empty()) {
5915                     ITERATE(TExons, e, (*i)->m_align->Exons()) {
5916                         if((*i)->m_mem_id > 0)
5917                             aexons[(*i)->m_align->ID()].insert(e->Limits());
5918                         else
5919                             uexons[(*i)->m_align->ID()].insert(e->Limits());
5920                     }
5921                 }
5922                 else if(!(*i)->m_align->TrustedmRNA().empty() && (*i)->m_align->ConfirmedStart() && (*i)->m_align->ConfirmedStop())  // trusted mRNA with aligned CDS (correctly checks not duplicated cds)
5923                     InsertTrustedmRNA(*(*i)->m_align->TrustedmRNA().begin());                                      // could be only one 'part'
5924             }
5925         }
5926         typedef map<Int8, int> Tint8int;
5927         Tint8int palignedlen;
5928         ITERATE(Tint8range, i, aexons) {
5929             int len = 0;
5930             ITERATE(set<TSignedSeqRange>, e, i->second)
5931                 len += e->GetLength();
5932             palignedlen[i->first] = len;
5933         }
5934         ITERATE(Tint8range, i, uexons) {
5935             int len = 0;
5936             ITERATE(set<TSignedSeqRange>, e, i->second)
5937                 len += e->GetLength();
5938             palignedlen[i->first] = max(len,palignedlen[i->first]);
5939         }
5940 
5941         if(ConfirmedStart() && ConfirmedStop()) {
5942             ITERATE(Tint8int, i, palignedlen) {
5943                 CAlignModel* orig_align = orig_aligns[i->first];
5944                 if((Continuous() && i->second > 0.8*orig_align->TargetLen()) || i->second > minscor.m_minprotfrac*orig_align->TargetLen())                                 // well aligned trusted protein
5945                     InsertTrustedProt(*orig_align->TrustedProt().begin());
5946             }
5947         }
5948 
5949         if(Continuous() && TrustedmRNA().empty() && TrustedProt().empty() && !palignedlen.empty()) {
5950             TSignedSeqRange cds = RealCdsLimits();
5951             int gap_cds = 0;
5952             ITERATE(CGeneModel::TExons, ie, Exons()) {
5953                 if(ie->m_fsplice_sig == "XX" || ie->m_ssplice_sig == "XX")
5954                     gap_cds += (cds&ie->Limits()).GetLength();
5955             }
5956 
5957             if(gap_cds > 0) {
5958                 string mprotein = GetProtein(contig);
5959                 ITERATE(Tint8int, i, palignedlen) {
5960                     CAlignModel* orig_align = orig_aligns[i->first];
5961                     if(i->second+gap_cds > 0.8*orig_align->TargetLen()) { //realign proteins if close enough
5962                         CSeqVector protein_seqvec(scope.GetBioseqHandle(*orig_align->GetTargetId()), CBioseq_Handle::eCoding_Iupac);
5963                         string tprotein(protein_seqvec.begin(),protein_seqvec.end());
5964                         CCigar cigar = LclAlign(mprotein.c_str(), mprotein.size(), tprotein.c_str(), tprotein.size(), 10, 1, delta.matrix);
5965                         if(cigar.SubjectRange().GetLength() > 0.8*tprotein.size()) {
5966                             InsertTrustedProt(*orig_align->TrustedProt().begin());
5967                             break;
5968                         }
5969                     }
5970                 }
5971             }
5972         }
5973     }
5974 }
5975 
5976 // if external model is 'open' all 5' introns can harbor
5977 // for nested model 'open' is ignored
HarborsNested(const CChain & other_chain,bool check_in_holes) const5978 bool CChain::HarborsNested(const CChain& other_chain, bool check_in_holes) const {
5979     TSignedSeqRange lim_for_nested = Limits();
5980     if(!ReadingFrame().Empty())
5981         lim_for_nested = OpenCds() ? MaxCdsLimits() : RealCdsLimits();
5982 
5983     TSignedSeqRange other_lim_for_nested = other_chain.Limits();
5984     if(!other_chain.ReadingFrame().Empty())
5985         other_lim_for_nested = other_chain.RealCdsLimits();
5986 
5987     if(lim_for_nested.IntersectingWith(other_lim_for_nested))
5988         return CModelCompare::RangeNestedInIntron(other_lim_for_nested, *this, check_in_holes);
5989     else
5990         return false;
5991 }
5992 
5993 // if external model is 'open' all 5' introns can harbor
5994 // for nested model 'open' is ignored
HarborsNested(const CGene & other_gene,bool check_in_holes) const5995 bool CChain::HarborsNested(const CGene& other_gene, bool check_in_holes) const {
5996     TSignedSeqRange lim_for_nested = Limits();
5997     if(!ReadingFrame().Empty())
5998         lim_for_nested = OpenCds() ? MaxCdsLimits() : RealCdsLimits();
5999 
6000     TSignedSeqRange other_lim_for_nested = other_gene.Limits();
6001     if(!other_gene.RealCdsLimits().Empty())
6002         other_lim_for_nested = other_gene.RealCdsLimits();
6003 
6004     if(lim_for_nested.IntersectingWith(other_lim_for_nested))
6005         return CModelCompare::RangeNestedInIntron(other_lim_for_nested, *this, check_in_holes);
6006     else
6007         return false;
6008 }
6009 
GetAccVer(const CAlignModel & a,CScope & scope)6010 pair<string,int> GetAccVer(const CAlignModel& a, CScope& scope)
6011 {
6012     if((a.Type()&CGeneModel::eProt) == 0)
6013         return make_pair(a.TargetAccession(), 0);
6014 
6015     try {
6016         CSeq_id_Handle idh = sequence::GetId(*a.GetTargetId(), scope,
6017                                              sequence::eGetId_ForceAcc);
6018         if (idh) {
6019             CConstRef<CSeq_id> acc = idh.GetSeqId();
6020             const CTextseq_id* txtid = acc->GetTextseq_Id();
6021             return (txtid  &&  txtid->IsSetAccession() && txtid->IsSetVersion()) ?
6022                 make_pair(txtid->GetAccession(), txtid->GetVersion()) : make_pair(idh.AsString(), 0);
6023         }
6024     }
6025     catch (sequence::CSeqIdFromHandleException&) {
6026     }
6027     return make_pair(a.TargetAccession(), 0);
6028 }
6029 
6030 static int s_ExonLen(const CGeneModel& a);
6031 
6032 struct s_ByAccVerLen {
s_ByAccVerLens_ByAccVerLen6033     s_ByAccVerLen(CScope& scope_) : scope(scope_) {}
6034     CScope& scope;
operator ()s_ByAccVerLen6035     bool operator()(const CAlignModel* a, const CAlignModel* b)
6036     {
6037         pair<string,int> a_acc = GetAccVer(*a, scope);
6038         pair<string,int> b_acc = GetAccVer(*b, scope);
6039         int acc_cmp = NStr::CompareCase(a_acc.first,b_acc.first);
6040         if (acc_cmp != 0)
6041             return acc_cmp<0;
6042         if (a_acc.second != b_acc.second)
6043             return a_acc.second > b_acc.second;
6044 
6045         int a_stt = a->HasStart()+a->HasStop();
6046         int b_stt = b->HasStart()+b->HasStop();
6047         if (a_stt != b_stt)
6048             return a_stt > b_stt;
6049 
6050         int a_len = s_ExonLen(*a);
6051         int b_len = s_ExonLen(*b);
6052         if (a_len!=b_len)
6053             return a_len > b_len;
6054 
6055         if((a->Status()&CGeneModel::eBestPlacement) != (b->Status()&CGeneModel::eBestPlacement))
6056             return (a->Status()&CGeneModel::eBestPlacement) > (b->Status()&CGeneModel::eBestPlacement);
6057 
6058         return a->ID() < b->ID(); // to make sort deterministic
6059     }
6060 };
s_ExonLen(const CGeneModel & a)6061 static int s_ExonLen(const CGeneModel& a)
6062     {
6063         int len = 0;
6064         ITERATE(CGeneModel::TExons, e, a.Exons())
6065             len += e->Limits().GetLength();
6066         return len;
6067     }
6068 
SkipReason(CGeneModel * orig_align,const string & comment)6069 void CChainer::CChainerImpl::SkipReason(CGeneModel* orig_align, const string& comment)
6070 {
6071     orig_align->Status() |= CGeneModel::eSkipped;
6072     orig_align->AddComment(comment);
6073 }
6074 
FilterOutChimeras(TGeneModelList & clust)6075 void CChainer::FilterOutChimeras(TGeneModelList& clust)
6076 {
6077     m_data->FilterOutChimeras(clust);
6078 }
6079 
FilterOutChimeras(TGeneModelList & clust)6080 void CChainer::CChainerImpl::FilterOutChimeras(TGeneModelList& clust)
6081 {
6082     typedef map<int,TGeneModelClusterSet> TClustersByStrand;
6083     TClustersByStrand trusted_aligns;
6084     ITERATE(TGeneModelList, it, clust) {
6085         if(it->Status()&CGeneModel::eUnmodifiedAlign)
6086             continue;
6087 
6088         CAlignModel* orig_align = orig_aligns[it->ID()];
6089         if(orig_align->Continuous() && (!it->TrustedmRNA().empty() || !it->TrustedProt().empty())
6090                             && it->AlignLen() > minscor.m_minprotfrac*orig_aligns[it->ID()]->TargetLen()) {
6091             trusted_aligns[it->Strand()].Insert(*it);
6092         }
6093     }
6094 
6095     if(trusted_aligns[ePlus].size() < 2 && trusted_aligns[eMinus].size() < 2)
6096         return;
6097 
6098     typedef set<int> TSplices;
6099     typedef list<TSplices> TSplicesList;
6100     typedef map<int,TSplicesList> TSplicesByStrand;
6101     TSplicesByStrand trusted_splices;
6102 
6103     ITERATE(TClustersByStrand, it, trusted_aligns) {
6104         int strand = it->first;
6105         const TGeneModelClusterSet& clset = it->second;
6106         ITERATE(TGeneModelClusterSet, jt, clset) {
6107             const TGeneModelCluster& cls = *jt;
6108             trusted_splices[strand].push_back(set<int>());
6109             TSplices& splices = trusted_splices[strand].back();
6110             ITERATE(TGeneModelCluster, lt, cls) {
6111                 const CGeneModel& align = *lt;
6112                 ITERATE(CGeneModel::TExons, e, align.Exons()) {
6113                     if(e->m_fsplice)
6114                         splices.insert(e->GetFrom());
6115                     if(e->m_ssplice)
6116                         splices.insert(e->GetTo());
6117                 }
6118             }
6119         }
6120     }
6121 
6122     for(TGeneModelList::iterator it_loop = clust.begin(); it_loop != clust.end(); ) {
6123         TGeneModelList::iterator it = it_loop++;
6124         if(it->Status()&CGeneModel::eUnmodifiedAlign)
6125             continue;
6126 
6127         const CGeneModel& align = *it;
6128         int strand = align.Strand();
6129         const TSplicesList& spl = trusted_splices[strand];
6130 
6131         int count = 0;
6132         ITERATE(TSplicesList, jt, spl) {
6133             const TSplices& splices = *jt;
6134             for(unsigned int i = 0; i < align.Exons().size(); ++i) {
6135                 const CModelExon& e = align.Exons()[i];
6136                 if(splices.find(e.GetFrom()) != splices.end() || splices.find(e.GetTo()) != splices.end()) {
6137                     ++count;
6138                     break;
6139                 }
6140             }
6141         }
6142 
6143         if(count > 1) {
6144             cerr << "Chimeric alignment " << align.ID() << endl;
6145             SkipReason(orig_aligns[align.ID()],"Chimera");
6146             clust.erase(it);
6147         }
6148     }
6149 }
6150 
6151 struct OverlapsSameAccessionAlignment : public Predicate {
6152     OverlapsSameAccessionAlignment(TAlignModelList& alignments);
6153     virtual bool align_predicate(CAlignModel& align);
GetCommentOverlapsSameAccessionAlignment6154     virtual string GetComment() { return "Overlaps the same alignment";}
6155 };
6156 
OverlapsSameAccessionAlignment(TAlignModelList & alignments)6157 OverlapsSameAccessionAlignment::OverlapsSameAccessionAlignment(TAlignModelList& alignments)
6158 {
6159     CScope scope(*CObjectManager::GetInstance());
6160     scope.AddDefaults();
6161 
6162     vector<CAlignModel*> alignment_ptrs;
6163     NON_CONST_ITERATE(TAlignModelList, a, alignments) {
6164         if(!(a->Status()&CGeneModel::eUnmodifiedAlign) && a->Type() != CGeneModel::eNotForChaining)
6165             alignment_ptrs.push_back(&*a);
6166     }
6167 
6168     if (alignment_ptrs.empty())
6169         return;
6170 
6171     sort(alignment_ptrs.begin(), alignment_ptrs.end(), s_ByAccVerLen(scope));
6172 
6173     vector<CAlignModel*>::iterator first = alignment_ptrs.begin();
6174     pair<string,int> first_accver = GetAccVer(**first, scope);
6175     vector<CAlignModel*> ::iterator current = first; ++current;
6176     for (; current != alignment_ptrs.end(); ++current) {
6177         pair<string,int> current_accver = GetAccVer(**current, scope);
6178         if (first_accver.first == current_accver.first) {
6179             if ((*current)->Strand() == (*first)->Strand() && (*current)->Limits().IntersectingWith((*first)->Limits())) {
6180                 (*current)->Status() |= CGeneModel::eSkipped;
6181             }
6182         } else {
6183             first=current;
6184             first_accver = current_accver;
6185         }
6186     }
6187 }
6188 
align_predicate(CAlignModel & align)6189 bool OverlapsSameAccessionAlignment::align_predicate(CAlignModel& align)
6190 {
6191     return align.Status() & CGeneModel::eSkipped;
6192 }
6193 
OverlapsSameAccessionAlignment(TAlignModelList & alignments)6194 Predicate* CChainer::OverlapsSameAccessionAlignment(TAlignModelList& alignments)
6195 {
6196     return new gnomon::OverlapsSameAccessionAlignment(alignments);
6197 }
6198 
FindMultiplyIncluded(CAlignModel & algn,TAlignModelList & clust)6199 string FindMultiplyIncluded(CAlignModel& algn, TAlignModelList& clust)
6200 {
6201     if ((algn.Type() & CGeneModel::eProt)!=0 && !algn.Continuous()) {
6202         set<string> compatible_evidence;
6203         int len = algn.AlignLen();
6204 
6205         static CGeneModel dummy_align;
6206         const CGeneModel* prev_alignp = &dummy_align;
6207 
6208         bool prev_is_compatible = false;
6209         NON_CONST_ITERATE(TAlignModelList, jtcl, clust) {
6210             CAlignModel& algnj = *jtcl;
6211             if (algn == algnj)
6212                 continue;
6213             if (algnj.AlignLen() < len/4)
6214                 continue;
6215 
6216             bool same_as_prev = algnj.IdenticalAlign(*prev_alignp);
6217             if (!same_as_prev)
6218                 prev_alignp = &algnj;
6219 
6220             if ((same_as_prev && prev_is_compatible) || (!same_as_prev && algn.Strand()==algnj.Strand() && algn.isCompatible(algnj))) {
6221                 prev_is_compatible = true;
6222                 if (!compatible_evidence.insert(algnj.TargetAccession()).second) {
6223                     return algnj.TargetAccession();
6224                 }
6225             } else {
6226                 prev_is_compatible = false;
6227             }
6228         }
6229     }
6230     return kEmptyStr;
6231 }
6232 
6233 struct ConnectsParalogs : public Predicate {
ConnectsParalogsConnectsParalogs6234     ConnectsParalogs(TAlignModelList& _alignments)
6235         : alignments(_alignments)
6236     {}
6237     TAlignModelList& alignments;
6238     string paralog;
6239 
align_predicateConnectsParalogs6240     virtual bool align_predicate(CAlignModel& align)
6241     {
6242         paralog = FindMultiplyIncluded(align, alignments);
6243         return !paralog.empty();
6244     }
GetCommentConnectsParalogs6245     virtual string GetComment() { return "Connects two "+paralog+" alignments"; }
6246 };
6247 
ConnectsParalogs(TAlignModelList & alignments)6248 Predicate* CChainer::ConnectsParalogs(TAlignModelList& alignments)
6249 {
6250     return new gnomon::ConnectsParalogs(alignments);
6251 }
6252 
ScoreCDSes_FilterOutPoorAlignments(TGeneModelList & clust)6253 void CChainer::ScoreCDSes_FilterOutPoorAlignments(TGeneModelList& clust)
6254 {
6255     ERASE_ITERATE(TGeneModelList, itcl, clust) {
6256         if(m_data->orig_aligns.find(itcl->ID()) == m_data->orig_aligns.end()) {
6257             clust.erase(itcl);
6258             continue;
6259         }
6260 
6261         CGeneModel& algn = *itcl;
6262         if ((algn.Type() & CGeneModel::eProt)!=0 || algn.ConfirmedStart()) {   // this includes protein alignments and mRNA with confirmed CDSes
6263 
6264             m_gnomon->GetScore(algn);
6265             double ms = m_data->GoodCDNAScore(algn);
6266             CAlignModel* orig = m_data->orig_aligns[algn.ID()];
6267 
6268             if (algn.Score() == BadScore() || (algn.Score() < ms && (algn.Type()&CGeneModel::eProt) && !(algn.Status()&CGeneModel::eBestPlacement) && orig->AlignLen() < m_data->minscor.m_minprotfrac*orig->TargetLen())) { // all mRNA with confirmed CDS and best placed or reasonably aligned proteins with known length will get through with any finite score
6269                 CNcbiOstrstream ost;
6270                 if(algn.AlignLen() <= 75)
6271                     ost << "Short alignment " << algn.AlignLen();
6272                 else
6273                     ost << "Low score " << algn.Score();
6274                 m_data->SkipReason(orig, CNcbiOstrstreamToString(ost));
6275                 clust.erase(itcl);
6276             }
6277         }
6278     }
6279 }
6280 
6281 #define PROT_CLIP 120
6282 #define PROT_CLIP_FRAC 0.20
6283 #define MIN_PART 30
6284 
FindSelenoproteinsClipProteinsToStartStop(TGeneModelList & clust)6285 void CChainer::FindSelenoproteinsClipProteinsToStartStop(TGeneModelList& clust) {
6286     CScope scope(*CObjectManager::GetInstance());
6287     scope.AddDefaults();
6288     const CResidueVec& contig = m_gnomon->GetSeq();
6289 
6290     ERASE_ITERATE(TGeneModelList, itcl, clust) {
6291         if(!(itcl->Type()&CGeneModel::eProt) || m_data->orig_aligns.find(itcl->ID()) == m_data->orig_aligns.end())  // skip cDNA and 'unmodified' without origaligns
6292             continue;
6293 
6294         CGeneModel& align = *itcl;
6295         m_gnomon->GetScore(align);
6296         if(align.Score() == BadScore()) {
6297             clust.erase(itcl);
6298             continue;
6299         }
6300 
6301         CAlignModel* orig = m_data->orig_aligns[align.ID()];
6302         CSeqVector protein_seqvec(scope.GetBioseqHandle(*orig->GetTargetId()), CBioseq_Handle::eCoding_Iupac);
6303 
6304         CAlignMap amap = align.GetAlignMap();
6305         CAlignMap origmap = orig->GetAlignMap();
6306 
6307         //find selenoproteins and stops 'confirmed' on genome
6308         if(align.PStop()) {
6309             CCDSInfo::TPStops pstops = align.GetCdsInfo().PStops();
6310             NON_CONST_ITERATE(CCDSInfo::TPStops, stp, pstops) {
6311                 TInDels fs = StrictlyContainedInDels(align.FrameShifts(), *stp);
6312                 if(!fs.empty())
6313                     continue;
6314                 TSignedSeqRange tstop = amap.MapRangeOrigToEdited(*stp,false);
6315                 CResidueVec mrna;
6316                 amap.EditedSequence(contig, mrna);
6317                 if(tstop.GetLength() == 3 && mrna[tstop.GetFrom()] == 'T' && mrna[tstop.GetFrom()+1] == 'G' && mrna[tstop.GetFrom()+2] == 'A') {
6318                     TSignedSeqRange ostop = origmap.MapRangeOrigToEdited(*stp,false);
6319                     if(ostop.GetLength() == 3 && protein_seqvec[ostop.GetFrom()/3] == 'U') {
6320                         stp->m_status = CCDSInfo::eSelenocysteine;
6321                     }
6322                 }
6323                 if(stp->m_status != CCDSInfo::eSelenocysteine) {
6324                     TIntMap::iterator conf = m_confirmed_bases_len.upper_bound(stp->GetTo()); // confirmed on the right
6325                     if(conf != m_confirmed_bases_len.begin() && (--conf)->first <= stp->GetFrom() && conf->first+conf->second > stp->GetTo())
6326                         stp->m_status =  CCDSInfo::eGenomeCorrect;
6327                 }
6328             }
6329 
6330             CCDSInfo cds = align.GetCdsInfo();
6331             cds.ClearPStops();
6332             ITERATE(CCDSInfo::TPStops, stp, pstops)
6333                 cds.AddPStop(*stp);
6334             align.SetCdsInfo(cds);
6335         }
6336 
6337         if(itcl->Status()&CGeneModel::eUnmodifiedAlign) {
6338             m_data->unmodified_aligns[itcl->ID()] = *itcl;
6339             clust.erase(itcl);
6340             continue;
6341         }
6342 
6343         if(align.Limits() == orig->Limits() && (!align.HasStart() || !align.FrameShifts().empty() || align.PStop(false))) {
6344             int maxclip = min(PROT_CLIP, (int)(align.AlignLen()*PROT_CLIP_FRAC+0.5));
6345             TSignedSeqRange tlim = orig->TranscriptLimits();
6346             int fivepclip = 0;
6347             if(protein_seqvec[0] == 'M')
6348                 fivepclip = maxclip-tlim.GetFrom();
6349             int threepclip = maxclip-(orig->TargetLen()-tlim.GetTo()-1);
6350 
6351             bool skip = false;
6352 
6353             int fivepshift = 0;
6354             int threepshift = 0;
6355             int tlen = align.TranscriptLimits().GetTo()+1;
6356             for(TInDels::iterator indl = align.FrameShifts().begin(); !skip && indl != align.FrameShifts().end(); ++indl) {
6357                 //project safely in case of tandem frameshifts or exon boundaries
6358                 TSignedSeqRange left(align.Limits().GetFrom(),indl->Loc()-1);
6359                 left = amap.ShrinkToRealPoints(left,false);
6360                 _ASSERT(left.NotEmpty());
6361                 TSignedSeqRange right(indl->InDelEnd(),align.Limits().GetTo());
6362                 right = amap.ShrinkToRealPoints(right,false);
6363                 _ASSERT(right.NotEmpty());
6364 
6365                 TSignedSeqRange lim = amap.MapRangeOrigToEdited(TSignedSeqRange(left.GetTo(),right.GetFrom()), false);
6366                 _ASSERT(lim.GetLength() >= 2);
6367                 int tpa = lim.GetFrom()+1;
6368                 int tpb = lim.GetTo()-1;
6369                 // for deletion tpa,tpb are first and last tposition of the extra sequence on transcript
6370                 // for insertion tpa is AFTER the missing sequence and tpb is BEFORE
6371                 if(tpb < fivepclip) {                           // clipable 5' frameshift
6372                     if(indl->IsInsertion())
6373                         fivepshift += indl->Len();
6374                     else if(indl->IsDeletion())
6375                         fivepshift -= indl->Len();
6376                 } else if(tpa >= tlen-threepclip) {             // clipable 3' frameshift
6377                     if(indl->IsInsertion())
6378                         threepshift += indl->Len();
6379                     else if(indl->IsDeletion())
6380                         threepshift -= indl->Len();
6381                 } else {                                        // frameshift in main body
6382                     skip = true;
6383                 }
6384             }
6385             if(skip)
6386                 continue;
6387 
6388             if(fivepshift >= 0)
6389                 fivepshift %= 3;
6390             else
6391                 fivepshift = 3-(-fivepshift)%3;
6392             if(threepshift >= 0)
6393                 threepshift %= 3;
6394             else
6395                 threepshift = 3-(-threepshift)%3;
6396 
6397             CGeneModel editedm = align;
6398             editedm.FrameShifts().clear();
6399             editedm.SetCdsInfo(CCDSInfo());
6400             //CAlignMap edited_map = editedm.GetAlignMap();
6401             TSignedSeqRange edited_tlim = editedm.TranscriptLimits();
6402             edited_tlim.SetFrom(edited_tlim.GetFrom()+fivepshift);
6403             edited_tlim.SetTo(edited_tlim.GetTo()-threepshift);
6404             TSignedSeqRange edited_lim = editedm.GetAlignMap().MapRangeEditedToOrig(edited_tlim, false);
6405             _ASSERT(edited_lim.NotEmpty());
6406             editedm.Clip(edited_lim, CGeneModel::eRemoveExons);
6407             CCDSInfo edited_cds;
6408             edited_cds.SetReadingFrame(edited_lim, true);
6409             editedm.SetCdsInfo(edited_cds);
6410 
6411             string protseq = editedm.GetProtein(contig);
6412             tlen = 3*protseq.size();
6413             int fivep_problem = -1;
6414             int first_stop = tlen;
6415 
6416             for(int p = 0; !skip && p < (int)protseq.size(); ++p) {
6417                 if(protseq[p] == '*') {
6418                     int tpa = p*3;
6419                     int tpb = tpa+2;
6420                     if(tpb < fivepclip)                                           // clipable 5' stop
6421                         fivep_problem = max(fivep_problem, tpb);
6422                     else if(tpa >= tlen-threepclip || p == (int)protseq.size()-1) // leftmost 3' stop
6423                         first_stop = min(first_stop, tpa);
6424                     else                                                          // stop in main body
6425                         skip = true;
6426                 }
6427             }
6428             if(skip)
6429                 continue;
6430 
6431             int fivep_limit = 0;
6432             size_t m = protseq.find("M", (fivep_problem+1)/3);   // first start after possible stop/frameshift
6433             skip = true;
6434             if(m != string::npos && (int)m*3 <= fivepclip) {
6435                 fivep_limit = 3*m;
6436                 skip = false;
6437             }
6438             if(skip)
6439                 continue;
6440 
6441             int threep_limit = tlen-1;
6442             skip = true;
6443             if(first_stop+2 < threep_limit) {
6444                 threep_limit =  first_stop+2;
6445                 skip = false;
6446             }
6447             if(skip)
6448                 continue;
6449 
6450             TSignedSeqRange clip(fivep_limit, threep_limit);
6451             tlen = clip.GetLength();
6452             clip = editedm.GetAlignMap().MapRangeEditedToOrig(clip, false);
6453             _ASSERT(clip.NotEmpty());
6454 
6455             editedm.Clip(clip, CGeneModel::eRemoveExons);
6456             if(align.Limits().GetFrom() != editedm.Limits().GetFrom() && !editedm.Exons().front().m_ssplice && editedm.Exons().front().Limits().GetLength() < MIN_PART) // short 5' part
6457                 continue;
6458             if(align.Limits().GetTo() != editedm.Limits().GetTo() && !editedm.Exons().back().m_fsplice && editedm.Exons().back().Limits().GetLength() < MIN_PART) // short 3' part
6459                 continue;
6460 
6461             TSignedSeqRange start(0, 2);
6462             TSignedSeqRange stop(tlen-3, tlen-1);
6463             TSignedSeqRange rf(start.GetTo()+1,stop.GetFrom()-1);
6464             edited_cds.SetReadingFrame(rf,true);
6465             edited_cds.SetStart(start,true);
6466             edited_cds.SetStop(stop,true);
6467             edited_cds.SetScore(align.Score());
6468             edited_cds = edited_cds.MapFromEditedToOrig(editedm.GetAlignMap());
6469             editedm.SetCdsInfo(edited_cds);
6470 
6471 #ifdef _DEBUG
6472             protseq = editedm.GetProtein(contig);
6473             _ASSERT(tlen == 3*(int)protseq.size());
6474             _ASSERT(protseq[0] == 'M');
6475             m = protseq.find("*");
6476             _ASSERT(m == protseq.size()-1);
6477 #endif
6478 
6479             align = editedm;
6480         }
6481     }
6482 }
6483 
6484 
6485 struct SFShiftsCluster {
SFShiftsClusterSFShiftsCluster6486     SFShiftsCluster(TSignedSeqRange limits = TSignedSeqRange::GetEmpty()) : m_limits(limits) {}
6487     TSignedSeqRange m_limits;
6488     TInDels    m_fshifts;
operator <SFShiftsCluster6489     bool operator<(const SFShiftsCluster& c) const { return m_limits.GetTo() < c.m_limits.GetFrom(); }
6490 };
6491 
AddIfCompatible(set<SFShiftsCluster> & fshift_clusters,const CGeneModel & algn)6492 bool CChainer::CChainerImpl::AddIfCompatible(set<SFShiftsCluster>& fshift_clusters, const CGeneModel& algn)
6493 {
6494     typedef vector<SFShiftsCluster> TFShiftsClusterVec;
6495     typedef set<SFShiftsCluster>::iterator TIt;
6496 
6497     TFShiftsClusterVec algn_fclusters;
6498     algn_fclusters.reserve(algn.Exons().size());
6499 
6500     {
6501         const TInDels& fs = algn.FrameShifts();
6502         TInDels::const_iterator fi = fs.begin();
6503         ITERATE (CGeneModel::TExons, e, algn.Exons()) {
6504             algn_fclusters.push_back(SFShiftsCluster(e->Limits()));
6505             while(fi != fs.end() && fi->IntersectingWith(e->GetFrom(),e->GetTo())) {
6506                 algn_fclusters.back().m_fshifts.push_back(*fi++);
6507             }
6508         }
6509     }
6510 
6511     ITERATE(TFShiftsClusterVec, exon_cluster, algn_fclusters) {
6512         pair<TIt,TIt> eq_rng = fshift_clusters.equal_range(*exon_cluster);
6513         for(TIt glob_cluster = eq_rng.first; glob_cluster != eq_rng.second; ++glob_cluster) {
6514             ITERATE(TInDels, fi, glob_cluster->m_fshifts)
6515                 if (find(exon_cluster->m_fshifts.begin(),exon_cluster->m_fshifts.end(),*fi) == exon_cluster->m_fshifts.end())
6516                     if (fi->IntersectingWith(exon_cluster->m_limits.GetFrom(),exon_cluster->m_limits.GetTo()))
6517                         return false;
6518             ITERATE(TInDels, fi, exon_cluster->m_fshifts)
6519                 if (find(glob_cluster->m_fshifts.begin(),glob_cluster->m_fshifts.end(),*fi) == glob_cluster->m_fshifts.end())
6520                     if (fi->IntersectingWith(glob_cluster->m_limits.GetFrom(),glob_cluster->m_limits.GetTo()))
6521                         return false;
6522         }
6523     }
6524     NON_CONST_ITERATE(TFShiftsClusterVec, exon_cluster, algn_fclusters) {
6525         pair<TIt,TIt> eq_rng = fshift_clusters.equal_range(*exon_cluster);
6526         for(TIt glob_cluster = eq_rng.first; glob_cluster != eq_rng.second;) {
6527             exon_cluster->m_limits += glob_cluster->m_limits;
6528             exon_cluster->m_fshifts.insert(exon_cluster->m_fshifts.end(),glob_cluster->m_fshifts.begin(),glob_cluster->m_fshifts.end());
6529             fshift_clusters.erase(glob_cluster++);
6530         }
6531         uniq(exon_cluster->m_fshifts);
6532         fshift_clusters.insert(eq_rng.second, *exon_cluster);
6533     }
6534     return true;
6535 }
6536 
FsTouch(const TSignedSeqRange & lim,const CInDelInfo & fs)6537 bool CChainer::CChainerImpl::FsTouch(const TSignedSeqRange& lim, const CInDelInfo& fs) {
6538     if(fs.IsInsertion() && fs.Loc()+fs.Len() == lim.GetFrom())
6539         return true;
6540     if(fs.IsDeletion() && fs.Loc() == lim.GetFrom())
6541         return true;
6542     if(fs.Loc() == lim.GetTo()+1)
6543         return true;
6544 
6545     return false;
6546 }
6547 
SplitAlignmentsByStrand(const TGeneModelList & clust,TGeneModelList & clust_plus,TGeneModelList & clust_minus)6548 void CChainer::CChainerImpl::SplitAlignmentsByStrand(const TGeneModelList& clust, TGeneModelList& clust_plus, TGeneModelList& clust_minus)
6549 {
6550     ITERATE (TGeneModelList, itcl, clust) {
6551         const CGeneModel& algn = *itcl;
6552 
6553         if (algn.Strand() == ePlus)
6554             clust_plus.push_back(algn);
6555         else
6556             clust_minus.push_back(algn);
6557     }
6558 }
6559 
InframeFraction(const CGeneModel & a,TSignedSeqPos left,TSignedSeqPos right)6560 double InframeFraction(const CGeneModel& a, TSignedSeqPos left, TSignedSeqPos right)
6561 {
6562     if(a.FrameShifts().empty())
6563         return 1.0;
6564 
6565     CAlignMap cdsmap(a.GetAlignMap());
6566     int inframelength = 0;
6567     int outframelength = 0;
6568     int frame = 0;
6569     TSignedSeqPos prev = left;
6570     TInDels indels = a.GetInDels(left, right, true);
6571     ITERATE(TInDels, fs, indels) {
6572         int len = cdsmap.FShiftedLen(cdsmap.ShrinkToRealPoints(TSignedSeqRange(prev,fs->Loc()-1)),false);
6573         if(frame == 0) {
6574             inframelength += len;
6575         } else {
6576             outframelength += len;
6577         }
6578 
6579         if(fs->IsDeletion()) {
6580             frame = (frame+fs->Len())%3;
6581         } else {
6582             frame = (3+frame-fs->Len()%3)%3;
6583         }
6584         prev = fs->Loc();    //  ShrinkToRealPoints will take care if it in insertion or intron
6585     }
6586     int len = cdsmap.FShiftedLen(cdsmap.ShrinkToRealPoints(TSignedSeqRange(prev,right)),false);
6587     if(frame == 0) {
6588         inframelength += len;
6589     } else {
6590         outframelength += len;
6591     }
6592     return double(inframelength)/(inframelength + outframelength);
6593 }
6594 
6595 struct ProjectCDS : public TransformFunction {
ProjectCDSProjectCDS6596     ProjectCDS(double _mininframefrac, const CResidueVec& _seq, CScope* _scope, const map<string, TSignedSeqRange>& _mrnaCDS)
6597         : mininframefrac(_mininframefrac), seq(_seq), scope(_scope), mrnaCDS(_mrnaCDS) {}
6598 
6599     double mininframefrac;
6600     const CResidueVec& seq;
6601     CScope* scope;
6602     const map<string, TSignedSeqRange>& mrnaCDS;
6603     virtual void transform_align(CAlignModel& align);
6604 };
6605 
transform_align(CAlignModel & align)6606 void ProjectCDS::transform_align(CAlignModel& align)
6607 {
6608     if ((align.Type()&CAlignModel::emRNA)==0 || (align.Status()&CGeneModel::eTSA)!=0 || (align.Status()&CGeneModel::eReversed)!=0 || (align.Status()&CGeneModel::eUnknownOrientation)!=0)
6609         return;
6610 
6611     TSignedSeqRange cds_on_mrna;
6612 
6613     if (scope != NULL) {
6614         SAnnotSelector sel;
6615         sel.SetFeatSubtype(CSeqFeatData::eSubtype_cdregion);
6616         CSeq_loc mrna;
6617         CRef<CSeq_id> target_id(new CSeq_id);
6618         target_id->Assign(*align.GetTargetId());
6619         mrna.SetWhole(*target_id);
6620         CFeat_CI feat_ci(*scope, mrna, sel);
6621         if (feat_ci && !feat_ci->IsSetPartial()) {
6622             const CSeq_loc& cds_loc = feat_ci->GetMappedFeature().GetLocation();
6623             const CSeq_id* cds_loc_seq_id  = cds_loc.GetId();
6624             if (cds_loc_seq_id != NULL && sequence::IsSameBioseq(*cds_loc_seq_id, *target_id, scope)) {
6625                 TSeqRange feat_range = cds_loc.GetTotalRange();
6626                 cds_on_mrna = TSignedSeqRange(feat_range.GetFrom(), feat_range.GetTo());
6627             }
6628         }
6629     } else {
6630         string accession = align.TargetAccession();
6631         map<string,TSignedSeqRange>::const_iterator pos = mrnaCDS.find(accession);
6632         if(pos != mrnaCDS.end()) {
6633             cds_on_mrna = pos->second;
6634         }
6635     }
6636 
6637     if (cds_on_mrna.Empty())
6638         return;
6639 
6640     CAlignMap alignmap(align.GetAlignMap());
6641     TSignedSeqPos left = alignmap.MapEditedToOrig(cds_on_mrna.GetFrom());
6642     TSignedSeqPos right = alignmap.MapEditedToOrig(cds_on_mrna.GetTo());
6643     if(align.Strand() == eMinus) {
6644         swap(left,right);
6645     }
6646 
6647     CGeneModel a = align;
6648 
6649     if(left < 0 || right < 0)     // start or stop cannot be projected
6650         return;
6651 
6652     CAlignMap alignmap_clipped(a.GetAlignMap());
6653     if(alignmap_clipped.MapOrigToEdited(left) < 0 || alignmap_clipped.MapOrigToEdited(right) < 0)     // cds is clipped
6654         return;
6655 
6656     a.Clip(TSignedSeqRange(left,right),CGeneModel::eRemoveExons);
6657 
6658     if(!a.Continuous())
6659         return;
6660 
6661     //            ITERATE(TInDels, fs, a.FrameShifts()) {
6662     //                if(fs->Len()%3 != 0) return;          // there is a frameshift
6663     //            }
6664 
6665     if (InframeFraction(a, left, right) < mininframefrac)
6666         return;
6667 
6668     a.FrameShifts().clear();                       // clear notshifted indels
6669     CAlignMap cdsmap(a.GetAlignMap());
6670     CResidueVec cds;
6671     cdsmap.EditedSequence(seq, cds);
6672     unsigned int length = cds.size();
6673 
6674     if(length%3 != 0)
6675         return;
6676 
6677     if(!IsStartCodon(&cds[0]) || !IsStopCodon(&cds[length-3]) )   // start or stop on genome is not right
6678         return;
6679 
6680     for(unsigned int i = 0; i < length-3; i += 3) {
6681         if(IsStopCodon(&cds[i]))
6682             return;                // premature stop on genome
6683     }
6684 
6685     TSignedSeqRange reading_frame = cdsmap.MapRangeEditedToOrig(TSignedSeqRange(3,length-4));
6686     TSignedSeqRange start = cdsmap.MapRangeEditedToOrig(TSignedSeqRange(0,2));
6687     TSignedSeqRange stop = cdsmap.MapRangeEditedToOrig(TSignedSeqRange(length-3,length-1));
6688 
6689     CCDSInfo cdsinfo;
6690     cdsinfo.SetReadingFrame(reading_frame,true);
6691     cdsinfo.SetStart(start,true);
6692     cdsinfo.SetStop(stop,true);
6693 
6694     //    align.FrameShifts().clear();
6695     CGeneModel b = align;
6696     b.FrameShifts().clear();
6697     align = CAlignModel(b, b.GetAlignMap());
6698     align.SetCdsInfo(cdsinfo);
6699 }
6700 
FilterOutBadScoreChainsHavingBetterCompatibles(TGeneModelList & chains)6701 void CChainer::CChainerImpl::FilterOutBadScoreChainsHavingBetterCompatibles(TGeneModelList& chains)
6702 {
6703             for(TGeneModelList::iterator it = chains.begin(); it != chains.end();) {
6704                 TGeneModelList::iterator itt = it++;
6705                 for(TGeneModelList::iterator jt = chains.begin(); jt != itt;) {
6706                     TGeneModelList::iterator jtt = jt++;
6707                     if(itt->Strand() != jtt->Strand() || (itt->Score() != BadScore() && jtt->Score() != BadScore())) continue;
6708 
6709                     // at least one score is BadScore
6710                     if(itt->Score() != BadScore()) {
6711                         if(itt->isCompatible(*jtt) > 1) chains.erase(jtt);
6712                     } else if(jtt->Score() != BadScore()) {
6713                         if(itt->isCompatible(*jtt) > 1) {
6714                             chains.erase(itt);
6715                             break;
6716                         }
6717 
6718                     } else if(itt->AlignLen() > jtt->AlignLen()) {
6719                         if(itt->isCompatible(*jtt) > 0) chains.erase(jtt);
6720                     } else {
6721                         if(itt->isCompatible(*jtt) > 0) {
6722                             chains.erase(itt);
6723                             break;
6724                         }
6725                     }
6726                 }
6727             }
6728 }
6729 
6730 
6731 struct TrimAlignment : public TransformFunction {
6732 public:
TrimAlignmentTrimAlignment6733     TrimAlignment(int a_trim) : trim(a_trim)  {}
6734     int trim;
6735 
TrimCodingExonLeftTrimAlignment6736     TSignedSeqPos TrimCodingExonLeft(const CAlignModel& align, const CModelExon& e, int trim)
6737     {
6738         TSignedSeqPos old_from = e.GetFrom();
6739         TSignedSeqPos new_from = align.FShiftedMove(old_from, trim);
6740         _ASSERT( new_from-old_from >= trim && new_from <= e.GetTo() );
6741 
6742         return new_from;
6743     }
6744 
TrimCodingExonRightTrimAlignment6745     TSignedSeqPos TrimCodingExonRight(const CAlignModel& align, const CModelExon& e, int trim)
6746     {
6747         TSignedSeqPos old_to = e.GetTo();
6748         TSignedSeqPos new_to = align.FShiftedMove(old_to, -trim);
6749         _ASSERT( old_to-new_to >= trim && new_to >= e.GetFrom() );
6750 
6751         return new_to;
6752     }
6753 
transform_alignTrimAlignment6754     virtual void transform_align(CAlignModel& align)
6755     {
6756         TSignedSeqRange flimits = align.Exons().front().Limits();
6757         TSignedSeqRange blimits = align.Exons().back().Limits();
6758         CAlignMap alignmap(align.GetAlignMap());
6759 
6760         if ((align.Type() & CAlignModel::eProt)!=0) {
6761             TrimProtein(align, alignmap);
6762         } else {
6763             TrimTranscript(align, alignmap);
6764         }
6765 
6766         // don't mark trimmed if trim was to the next exon
6767         if(align.Limits().GetFrom() > flimits.GetFrom() && align.Limits().GetFrom() <= flimits.GetTo()) align.Status() |= CAlignModel::eLeftTrimmed;
6768         if(align.Limits().GetTo() < blimits.GetTo() && align.Limits().GetTo() >= blimits.GetFrom()) align.Status() |= CAlignModel::eRightTrimmed;
6769     }
6770 
TrimProteinTrimAlignment6771     void TrimProtein(CAlignModel& align, CAlignMap& alignmap)
6772     {
6773         for (CAlignModel::TExons::const_iterator piece_begin = align.Exons().begin(); piece_begin != align.Exons().end(); ++piece_begin) {
6774             _ASSERT( !piece_begin->m_fsplice );
6775 
6776             CAlignModel::TExons::const_iterator piece_end;
6777             for (piece_end = piece_begin; piece_end != align.Exons().end() && piece_end->m_ssplice; ++piece_end) ;
6778             _ASSERT( piece_end != align.Exons().end() );
6779 
6780             TSignedSeqPos a;
6781             if (piece_begin == align.Exons().begin() && align.LeftComplete())
6782                 a = align.Limits().GetFrom();
6783             else
6784                 a = piece_begin->GetFrom()+trim;
6785 
6786             TSignedSeqPos b;
6787             if (piece_end->GetTo() >= align.Limits().GetTo() && align.RightComplete())
6788                 b = align.Limits().GetTo();
6789             else
6790                 b = piece_end->GetTo()-trim;
6791 
6792             if((a != piece_begin->GetFrom() || b != piece_end->GetTo()) && b > a) {
6793                 TSignedSeqRange newlimits = alignmap.ShrinkToRealPoints(TSignedSeqRange(a,b),true);
6794                 //                _ASSERT(newlimits.NotEmpty() && piece_begin->GetTo() >= newlimits.GetFrom() && piece_end->GetFrom() <= newlimits.GetTo());
6795                 if(newlimits.NotEmpty() && piece_begin->GetTo() >= newlimits.GetFrom() && piece_end->GetFrom() <= newlimits.GetTo())
6796                     align.Clip(newlimits, CAlignModel::eDontRemoveExons);
6797             }
6798 
6799             piece_begin = piece_end;
6800         }
6801     }
6802 
TrimTranscriptTrimAlignment6803     void TrimTranscript(CAlignModel& align, CAlignMap& alignmap)
6804     {
6805         if(!align.TrustedmRNA().empty())
6806             return;
6807         if(align.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
6808             return;
6809 
6810         int a = align.Limits().GetFrom();
6811         int b = align.Limits().GetTo();
6812         if(align.Strand() == ePlus) {
6813             if((align.Status()&CGeneModel::eCap) == 0)
6814                 a += trim;
6815             if((align.Status()&CGeneModel::ePolyA) == 0)
6816                 b -= trim;
6817         } else {
6818             if((align.Status()&CGeneModel::ePolyA) == 0)
6819                 a += trim;
6820             if((align.Status()&CGeneModel::eCap) == 0)
6821                 b -= trim;
6822         }
6823 
6824         //don't trim gapfillers
6825         if(align.Exons().front().m_ssplice_sig == "XX")
6826             a = align.Limits().GetFrom();
6827         if(align.Exons().back().m_fsplice_sig == "XX")
6828             b = align.Limits().GetTo();
6829 
6830         if(!align.ReadingFrame().Empty()) {  // avoid trimming confirmed CDSes
6831             TSignedSeqRange cds_on_genome = align.RealCdsLimits();
6832             if(cds_on_genome.GetFrom() < a) {
6833                 a = align.Limits().GetFrom();
6834             }
6835             if(b < cds_on_genome.GetTo()) {
6836                 b = align.Limits().GetTo();
6837             }
6838         }
6839 
6840         TSignedSeqRange newlimits = alignmap.ShrinkToRealPoints(TSignedSeqRange(a,b),false);
6841         _ASSERT(newlimits.NotEmpty() && align.Exons().front().GetTo() >= newlimits.GetFrom() && align.Exons().back().GetFrom() <= newlimits.GetTo());
6842 
6843         if(newlimits != align.Limits()) {
6844             align.Clip(newlimits,CAlignModel::eDontRemoveExons);    // Clip doesn't change AlignMap
6845         }
6846     }
6847 };
6848 
TrimAlignment()6849 TransformFunction* CChainer::TrimAlignment()
6850 {
6851     return new gnomon::TrimAlignment(m_data->trim);
6852 }
6853 
6854 struct DoNotBelieveShortPolyATail : public TransformFunction {
DoNotBelieveShortPolyATailDoNotBelieveShortPolyATail6855     DoNotBelieveShortPolyATail(int _minpolya) : minpolya(_minpolya) {}
6856 
6857     int minpolya;
transform_alignDoNotBelieveShortPolyATail6858     virtual void transform_align(CAlignModel& align)
6859     {
6860         if ((align.Status()&CGeneModel::ePolyA) == 0)
6861             return;
6862 
6863         if ((align.Status()&CGeneModel::eUnknownOrientation) != 0 || align.PolyALen() < minpolya)
6864             align.Status() ^= CGeneModel::ePolyA;
6865     }
6866 };
6867 
DoNotBelieveShortPolyATail()6868 TransformFunction* CChainer::DoNotBelieveShortPolyATail()
6869 {
6870     return new gnomon::DoNotBelieveShortPolyATail(m_data->minpolya);
6871 }
6872 
6873 
SetNumbering(int idnext,int idinc)6874 void CChainer::SetNumbering(int idnext, int idinc)
6875 {
6876     m_data->m_idnext = idnext;
6877     m_data->m_idinc = idinc;
6878 }
6879 
SetGenomicRange(const TAlignModelList & alignments)6880 void CChainer::SetGenomicRange(const TAlignModelList& alignments)
6881 {
6882     m_data->SetGenomicRange(alignments);
6883 }
6884 
SetGenomicRange(const TAlignModelList & alignments)6885 void CChainer::CChainerImpl::SetGenomicRange(const TAlignModelList& alignments)
6886 {
6887     TSignedSeqRange range = alignments.empty() ? TSignedSeqRange::GetWhole() : TSignedSeqRange::GetEmpty();
6888 
6889     CScope scope(*CObjectManager::GetInstance());
6890     scope.AddDefaults();
6891 
6892     ITERATE(TAlignModelList, i, alignments) {
6893         range += i->Limits();
6894 
6895         if(i->Type()&CGeneModel::eProt) {
6896             string accession = i->TargetAccession();
6897             if(!prot_complet.count(accession)) {
6898                 CSeqVector protein_seqvec(scope.GetBioseqHandle(*i->GetTargetId()), CBioseq_Handle::eCoding_Iupac);
6899                 CSeqVector_CI protein_ci(protein_seqvec);
6900                 prot_complet[accession] = make_pair(*protein_ci == 'M', true);
6901             }
6902         }
6903     }
6904 
6905     _ASSERT(m_gnomon.get() != NULL);
6906     m_gnomon->ResetRange(range);
6907 
6908     confirmed_ends.clear();
6909     orig_aligns.clear();
6910     unmodified_aligns.clear();
6911     mrna_count.clear();
6912     est_count.clear();
6913     rnaseq_count.clear();
6914     oriented_introns_plus.clear();
6915     oriented_introns_minus.clear();
6916 }
6917 
ProjectCDS(CScope & scope)6918 TransformFunction* CChainer::ProjectCDS(CScope& scope)
6919 {
6920     return new gnomon::ProjectCDS(m_data->mininframefrac, m_gnomon->GetSeq(),
6921                                   m_data->mrnaCDS.find("use_objmgr")!=m_data->mrnaCDS.end() ? &scope : NULL,
6922                                   m_data->mrnaCDS);
6923 }
6924 
6925 struct DoNotBelieveFrameShiftsWithoutCdsEvidence : public TransformFunction {
transform_alignDoNotBelieveFrameShiftsWithoutCdsEvidence6926     virtual void transform_align(CAlignModel& align)
6927     {
6928         if (align.ReadingFrame().Empty())
6929             align.FrameShifts().clear();
6930     }
6931 };
6932 
DoNotBelieveFrameShiftsWithoutCdsEvidence()6933 TransformFunction* CChainer::DoNotBelieveFrameShiftsWithoutCdsEvidence()
6934 {
6935     return new gnomon::DoNotBelieveFrameShiftsWithoutCdsEvidence();
6936 }
6937 
LeftAndLongFirst(const CGeneModel & a,const CGeneModel & b)6938 bool LeftAndLongFirst(const CGeneModel& a, const CGeneModel& b) {
6939     if(a.Limits() == b.Limits()) {
6940         if(a.Type() == b.Type())
6941             return a.ID() < b.ID();
6942         else
6943             return a.Type() > b.Type();
6944     }
6945     else if(a.Limits().GetFrom() == b.Limits().GetFrom())
6946         return a.Limits().GetTo() > b.Limits().GetTo();
6947     else
6948         return a.Limits().GetFrom() < b.Limits().GetFrom();
6949 }
6950 
SetConfirmedStartStopForProteinAlignments(TAlignModelList & alignments)6951 void CChainer::SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments)
6952 {
6953     m_data->SetConfirmedStartStopForProteinAlignments(alignments);
6954 }
6955 
SetConfirmedStartStopForProteinAlignments(TAlignModelList & alignments)6956 void CChainer::CChainerImpl::SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments)
6957 {
6958     NON_CONST_ITERATE (TAlignModelCluster, i, alignments) {
6959         CAlignModel& algn = *i;
6960         if ((algn.Type() & CGeneModel::eProt)!=0) {
6961             CCDSInfo cds = algn.GetCdsInfo();
6962             TSignedSeqRange alignedlim = algn.GetAlignMap().MapRangeOrigToEdited(algn.Limits(),false);
6963             map<string, pair<bool,bool> >::iterator iter = prot_complet.find(algn.TargetAccession());
6964             _ASSERT(iter != prot_complet.end());
6965             if(iter == prot_complet.end())
6966                 continue;
6967 
6968             if(cds.HasStart() && iter->second.first && alignedlim.GetFrom() == 0)
6969                 cds.SetStart(cds.Start(),true);
6970             if(cds.HasStop() && iter->second.second && alignedlim.GetTo() == algn.TargetLen()-1)
6971                 cds.SetStop(cds.Stop(),true);
6972             if(cds.ConfirmedStart() || cds.ConfirmedStop())
6973                 algn.SetCdsInfo(cds);
6974         }
6975     }
6976 }
6977 
DropAlignmentInfo(TAlignModelList & alignments,TGeneModelList & models)6978 void CChainer::DropAlignmentInfo(TAlignModelList& alignments, TGeneModelList& models)
6979 {
6980     ///////////////////////
6981     //    SMatrix blosum;
6982 
6983     NON_CONST_ITERATE (TAlignModelCluster, i, alignments) {
6984         if(!(i->Status()&CGeneModel::eUnmodifiedAlign))
6985             m_data->orig_aligns[i->ID()]=&(*i);
6986 
6987         CGeneModel aa = *i;
6988 
6989         if(!i->TrustedmRNA().empty() && i->Exons().size() > 1) {
6990             auto tlim = i->TranscriptLimits();
6991             if(i->Exons().front().Limits().NotEmpty() && tlim.GetFrom() == 0)
6992                 aa.Status() |= CGeneModel::eLeftConfirmed;
6993             if(i->Exons().back().Limits().NotEmpty() && (tlim.GetTo() == i->TargetLen()-1 || (i->Status()&CGeneModel::ePolyA)))
6994                 aa.Status() |= CGeneModel::eRightConfirmed;
6995         }
6996 
6997         if(aa.Type() & CGeneModel::eProt) {
6998             /*
6999             {{//////////////////////  print replacement info for diagnostics
7000                     const CResidueVec& contig = m_gnomon->GetSeq();
7001                     CScope scope(*CObjectManager::GetInstance());
7002                     scope.AddDefaults();
7003                     CSeqVector protein_seqvec(scope.GetBioseqHandle(*i->GetTargetId()), CBioseq_Handle::eCoding_Iupac);
7004                     CAlignMap amap = i->GetAlignMap();
7005 
7006                     ITERATE(CGeneModel::TExons, e, i->Exons()) {
7007                         TSignedSeqRange exon = m_edited_contig_map.ShrinkToRealPointsOnEdited(e->Limits());
7008                         if(exon.Empty())
7009                             continue;
7010                         exon = m_edited_contig_map.MapRangeEditedToOrig(exon,false);
7011                         if(exon.Empty())
7012                             continue;
7013                         map<int,char>::const_iterator ir = m_replacements.lower_bound(exon.GetFrom()+2);  // first definetely internal exon replacement or end()
7014                         for( ; ir != m_replacements.end() && ir->first <= exon.GetTo()-2; ++ir) {
7015                             int orig_gpos = ir->first;
7016                             int edited_gpos = m_edited_contig_map.MapOrigToEdited(orig_gpos);
7017                             int tpos = amap.MapOrigToEdited(edited_gpos);
7018                             if(tpos < 0)
7019                                 continue;
7020                             int pos_in_codon = tpos%3;
7021 
7022                             if(i->Strand() == eMinus)
7023                                 pos_in_codon = 2-pos_in_codon;
7024 
7025                             cout << tpos << '\t' <<  pos_in_codon << endl;
7026 
7027                             int codon_left = edited_gpos-pos_in_codon ;
7028                             string edited_codon(contig.begin()+codon_left,contig.begin()+codon_left+3);
7029                             string orig_codon = edited_codon;
7030                             orig_codon[pos_in_codon] = m_replaced_bases[orig_gpos];
7031                             if(i->Strand() == eMinus) {
7032                                 ReverseComplement(orig_codon.begin(),orig_codon.end());
7033                                 ReverseComplement(edited_codon.begin(),edited_codon.end());
7034                             }
7035                             string edited_aa, orig_aa;
7036                             objects::CSeqTranslator::Translate(orig_codon, orig_aa, objects::CSeqTranslator::fIs5PrimePartial);
7037                             objects::CSeqTranslator::Translate(edited_codon, edited_aa, objects::CSeqTranslator::fIs5PrimePartial);
7038                             char prot_aa = (tpos/3 < protein_seqvec.size()) ? protein_seqvec[tpos/3] : '*';
7039                             int delta = blosum.matrix[edited_aa[0]][prot_aa] - blosum.matrix[orig_aa[0]][prot_aa];
7040                             cout << "Replacement\t" << m_contig_acc << '\t' << orig_gpos << '\t' << orig_codon << '\t' << edited_codon << '\t' << orig_aa << '\t' << edited_aa << '\t' << prot_aa << '\t' << delta << '\t' << i->ID() << endl;
7041                         }
7042 
7043 
7044                     }
7045 
7046                 }}//////////////////
7047             */
7048             TInDels alignfshifts = i->GetInDels(true);
7049             TInDels fshifts;
7050             ITERATE(CGeneModel::TExons, e, aa.Exons()) {
7051                 TInDels efshifts;
7052                 int len = 0;
7053                 ITERATE(TInDels, fs, alignfshifts) {
7054                     if(fs->IntersectingWith(e->GetFrom(),e->GetTo())) {
7055                         efshifts.push_back(*fs);
7056                         len += (fs->IsInsertion() ? fs->Len() : -fs->Len());
7057                     }
7058                 }
7059                 if(efshifts.empty())
7060                     continue;
7061 
7062                 int a = efshifts.front().Loc()-1;
7063                 int b = efshifts.back().InDelEnd();
7064                 TIntMap::iterator conf = m_confirmed_bases_len.upper_bound(b); // confirmed on the right
7065                 bool confirmed_region = (conf != m_confirmed_bases_len.begin() && (--conf)->first <= a && conf->first+conf->second > b);
7066 
7067                 if(len%3 != 0 || !confirmed_region) {
7068                     ITERATE(TInDels, fs, efshifts) {
7069                         int l = fs->Len()%3;
7070                         if(fs->IsInsertion()) {
7071                             fshifts.push_back(CInDelInfo(fs->Loc(), l, CInDelInfo::eIns));
7072                         } else {
7073                             fshifts.push_back(CInDelInfo(fs->Loc(), l, CInDelInfo::eDel, fs->GetInDelV().substr(0,l)));
7074                         }
7075                     }
7076                     //                    fshifts.insert(fshifts.end(), efshifts.begin(), efshifts.end());
7077                 }
7078             }
7079             aa.FrameShifts() = fshifts;
7080         } else {
7081             aa.FrameShifts().clear();
7082             aa.Status() &= ~CGeneModel::eReversed;
7083         }
7084 
7085         models.push_back(aa);
7086     }
7087 }
7088 
7089 
SetupArgDescriptions(CArgDescriptions * arg_desc)7090 void CChainerArgUtil::SetupArgDescriptions(CArgDescriptions* arg_desc)
7091 {
7092     arg_desc->AddKey("param", "param",
7093                      "Organism specific parameters",
7094                      CArgDescriptions::eInputFile);
7095 
7096     arg_desc->SetCurrentGroup("Alignment modification");
7097     arg_desc->AddDefaultKey("trim", "trim",
7098                             "If aligned sequence is partial and includes a small portion of an exon the alignment program "
7099                             "usually misses this exon and might erroneously place a few bases from this exon near the previous exon, "
7100                             "and this will mess up the chaining. To prevent this we trim small portions of the alignment before chaining. "
7101                             "If it is possible, the trimming will be reversed for the 5'/3' ends of the final chain. Must be < minex and "
7102                             "multiple of 3",
7103                             CArgDescriptions::eInteger, "6");
7104 
7105     arg_desc->SetCurrentGroup("Additional information about sequences");
7106     arg_desc->AddOptionalKey("mrnaCDS", "mrnaCDS",
7107                              "CDSes annotated on mRNAs. If CDS could be projected on genome with intact "
7108                              "Start/Stop and frame the Stop will be accepted as is. The Start could/will "
7109                              "be moved further to make the longest possible complete CDS within the chain",
7110                              CArgDescriptions::eInputFile);
7111     arg_desc->AddDefaultKey("mininframefrac", "mininframefrac",
7112                             "Some mRNA alignments have paired indels which throw a portion of CDS out of frame."
7113                             "This parameter regulates how much of the CDS could suffer from this before CDS is considered inaceptable",
7114                             CArgDescriptions::eDouble, "0.95");
7115     arg_desc->AddOptionalKey("pinfo", "pinfo",
7116                              "Information about protein 5' and 3' completeness",
7117                              CArgDescriptions::eInputFile);
7118 
7119     arg_desc->SetCurrentGroup("Thresholds");
7120     arg_desc->AddDefaultKey("minscor", "minscor",
7121                             "Minimal coding propensity score for valid CDS. This threshold could be ignored depending on "
7122                             "-longenoughcds or -protcdslen and -minprotfrac",
7123                             CArgDescriptions::eDouble, "25.0");
7124     arg_desc->AddDefaultKey("longenoughcds", "longenoughcds",
7125                             "Minimal CDS not supported by protein or annotated mRNA to ignore the score (bp)",
7126                             CArgDescriptions::eInteger, "900");
7127     arg_desc->AddDefaultKey("protcdslen", "protcdslen",
7128                             "Minimal CDS supported by protein or annotated mRNA to ignore the score (bp)",
7129                             CArgDescriptions::eInteger, "300");
7130     arg_desc->AddDefaultKey("minprotfrac", "minprotfrac",
7131                             "Minimal fraction of protein aligned to ignore "
7132                             "the score and consider for confirmed start",
7133                             CArgDescriptions::eDouble, "0.9");
7134     arg_desc->AddDefaultKey("endprotfrac", "endprotfrac",
7135                             "Some proteins aligned with better than -minprotfrac coverage are missing Start/Stop. "
7136                             "If such an alignment was extended by EST(s) which provided a Start/Stop and we are not missing "
7137                             "more than (1-endprotfrac)*proteinlength on either side this chain will be considered to have a confirmed Start/Stop",
7138                             CArgDescriptions::eDouble, "0.05");
7139     arg_desc->AddDefaultKey("oep", "oep",
7140                             "Minimal overlap length for chaining alignments which don't have introns in the ovrlapping regions",
7141                             CArgDescriptions::eInteger, "10");
7142     arg_desc->AddDefaultKey("minsupport", "minsupport",
7143                             "Minimal number of mRNA/EST for valid noncoding models",
7144                             CArgDescriptions::eInteger, "3");
7145     arg_desc->AddDefaultKey("minsupport_mrna", "minsupport_mrna",
7146                             "Minimal number of mRNA for valid noncoding models",
7147                             CArgDescriptions::eInteger, "1");
7148     arg_desc->AddDefaultKey("minsupport_rnaseq", "minsupport_rnaseq",
7149                             "Minimal number of RNA-Seq for valid noncoding models",
7150                             CArgDescriptions::eInteger, "5");
7151     arg_desc->AddDefaultKey("minlen", "minlen",
7152                             "Chains with thorter CDS should be supported by protein or satisfy noncoding intron reguirements",
7153                             CArgDescriptions::eInteger, "100");
7154     arg_desc->AddDefaultKey("altfrac","altfrac","The CDS length of the principal model in the gene is multiplied by this fraction. Alt variants with the CDS length above "
7155                             "this are included in gene",CArgDescriptions::eDouble,"80.0");
7156     arg_desc->AddDefaultKey("composite","composite","Maximal composite number in alts",CArgDescriptions::eInteger,"1");
7157     arg_desc->AddFlag("opposite","Allow overlap of complete multiexon genes with opposite strands");
7158     arg_desc->AddFlag("partialalts","Allows partial alternative variants. In combination with -nognomon will allow partial genes");
7159     arg_desc->AddDefaultKey("tolerance","tolerance","if models exon boundary differ only this much only one model will survive",CArgDescriptions::eInteger,"5");
7160     arg_desc->AddFlag("no5pextension","Don't extend chain CDS to the leftmost start");
7161 
7162     arg_desc->SetCurrentGroup("Heuristic parameters for score evaluation");
7163     arg_desc->AddDefaultKey("i5p", "i5p",
7164                             "5p intron penalty",
7165                             CArgDescriptions::eDouble, "7.0");
7166     arg_desc->AddDefaultKey("i3p", "i3p",
7167                             "3p intron penalty",
7168                             CArgDescriptions::eDouble, "14.0");
7169     arg_desc->AddDefaultKey("cdsbonus", "cdsbonus",
7170                             "Bonus for CDS length",
7171                             CArgDescriptions::eDouble, "0.05");
7172     arg_desc->AddDefaultKey("lenpen", "lenpen",
7173                             "Penalty for total length",
7174                             CArgDescriptions::eDouble, "0.005");
7175     arg_desc->AddDefaultKey("utrclipthreshold", "utrclipthreshold",
7176                             "Relative coverage for clipping low support UTRs",
7177                             CArgDescriptions::eDouble, "0.01");
7178 
7179     arg_desc->SetCurrentGroup("CAGE/PolyA arguments");
7180 
7181     arg_desc->AddDefaultKey("min-cap-weight", "MinCapWeight",
7182                             "Minimal accepted weight for a capped alignment",
7183                             CArgDescriptions::eInteger, "5");
7184     arg_desc->AddDefaultKey("min-cap-blob", "MinCapBlob",
7185                             "Minimal cap blob weight for accepted peak",
7186                             CArgDescriptions::eInteger, "50");
7187 
7188     arg_desc->AddDefaultKey("min-polya-weight", "MinPolyaWeight",
7189                             "Minimal accepted weight for polya alignment",
7190                             CArgDescriptions::eInteger, "1");
7191     arg_desc->AddDefaultKey("min-polya-blob", "MinPolyaBlob",
7192                             "Minimal polya blob weight for accepted peak",
7193                             CArgDescriptions::eInteger, "1");
7194 
7195     arg_desc->AddDefaultKey("max-dist", "MaxDist",
7196                             "Maximal distance between individual cap/polya positions in a blob",
7197                             CArgDescriptions::eInteger, "20");
7198     arg_desc->AddDefaultKey("secondary-peak", "SecondaryPeak",
7199                             "Minimal weight fraction for a secondary cap/polya peak",
7200                             CArgDescriptions::eDouble, "0.5");
7201     arg_desc->AddDefaultKey("tertiary-peak", "TertiaryPeak",
7202                             "Last 5' exon is extended to low weight polya peak if there is sufficient rnaseq coverage",
7203                             CArgDescriptions::eDouble, "0.2");
7204     arg_desc->AddDefaultKey("tertiary-peak-coverage", "TertiaryPeakCoverage",
7205                             "Minimal relative rnaseq coverage for tertiary peak",
7206                             CArgDescriptions::eDouble, "0.05");
7207 
7208     arg_desc->AddDefaultKey("min-flank-exon", "MinFlankExon",
7209                             "The minimal distance of cap/polya to a splice",
7210                             CArgDescriptions::eInteger, "25");
7211 
7212 
7213     arg_desc->AddDefaultKey("minpolya", "minpolya",
7214                             "Minimal accepted polyA tale length in transcript alignments",
7215                             CArgDescriptions::eInteger, "6");
7216     arg_desc->AddFlag("use_confirmed_ends","Use end exons of trusted transcripts for clippig/extension");
7217 
7218 }
7219 
SetHMMParameters(CHMMParameters * params)7220 void CGnomonAnnotator_Base::SetHMMParameters(CHMMParameters* params)
7221 {
7222     m_hmm_params = params;
7223 }
7224 
SetIntersectLimit(int value)7225 void CChainer::SetIntersectLimit(int value)
7226 {
7227     m_data->intersect_limit = value;
7228 }
SetTrim(int trim)7229 void CChainer::SetTrim(int trim)
7230 {
7231     trim = (trim/3)*3;
7232     m_data->trim = trim;
7233 }
SetMinPolyA(int minpolya)7234 void CChainer::SetMinPolyA(int minpolya)
7235 {
7236     m_data->minpolya = minpolya;
7237 }
SetMinScor()7238 SMinScor& CChainer::SetMinScor()
7239 {
7240     return m_data->minscor;
7241 }
SetMinInframeFrac(double mininframefrac)7242 void CChainer::SetMinInframeFrac(double mininframefrac)
7243 {
7244     m_data->mininframefrac = mininframefrac;
7245 }
SetProtComplet()7246 map<string, pair<bool,bool> >& CChainer::SetProtComplet()
7247 {
7248     return m_data->prot_complet;
7249 }
SetMrnaCDS()7250 map<string,TSignedSeqRange>& CChainer::SetMrnaCDS()
7251 {
7252     return m_data->mrnaCDS;
7253 }
7254 
ArgsToChainer(CChainer * chainer,const CArgs & args,CScope & scope)7255 void CChainerArgUtil::ArgsToChainer(CChainer* chainer, const CArgs& args, CScope& scope)
7256 {
7257     CNcbiIfstream param_file(args["param"].AsString().c_str());
7258     chainer->SetHMMParameters(new CHMMParameters(param_file));
7259 
7260     chainer->SetIntersectLimit(args["oep"].AsInteger());
7261     chainer->SetTrim(args["trim"].AsInteger());
7262 
7263     SMinScor& minscor = chainer->SetMinScor();
7264     minscor.m_min = args["minscor"].AsDouble();
7265     minscor.m_i5p_penalty = args["i5p"].AsDouble();
7266     minscor.m_i3p_penalty = args["i3p"].AsDouble();
7267     minscor.m_cds_bonus = args["cdsbonus"].AsDouble();
7268     minscor.m_length_penalty = args["lenpen"].AsDouble();
7269     minscor.m_minprotfrac = args["minprotfrac"].AsDouble();
7270     minscor.m_endprotfrac = args["endprotfrac"].AsDouble();
7271     minscor.m_prot_cds_len = args["protcdslen"].AsInteger();
7272     minscor.m_cds_len = args["longenoughcds"].AsInteger();
7273     minscor.m_utr_clip_threshold = args["utrclipthreshold"].AsDouble();
7274     minscor.m_minsupport = args["minsupport"].AsInteger();
7275     minscor.m_minsupport_mrna = args["minsupport_mrna"].AsInteger();
7276     minscor.m_minsupport_rnaseq = args["minsupport_rnaseq"].AsInteger();
7277     minscor.m_minlen = args["minlen"].AsInteger();
7278 
7279     chainer->SetMinInframeFrac(args["mininframefrac"].AsDouble());
7280 
7281     chainer->m_data->altfrac = args["altfrac"].AsDouble();
7282     chainer->m_data->composite = args["composite"].AsInteger();
7283     chainer->m_data->allow_opposite_strand = args["opposite"];
7284     chainer->m_data->allow_partialalts = args["partialalts"];
7285     chainer->m_data->tolerance = args["tolerance"].AsInteger();
7286     chainer->m_data->no5pextension =  args["no5pextension"];
7287 
7288     chainer->m_data->min_cap_weight = args["min-cap-weight"].AsInteger();
7289     chainer->m_data->min_cap_blob = args["min-cap-blob"].AsInteger();
7290     chainer->m_data->min_polya_weight = args["min-polya-weight"].AsInteger();
7291     chainer->m_data->min_polya_blob = args["min-polya-blob"].AsInteger();
7292     chainer->m_data->max_dist = args["max-dist"].AsInteger();
7293     chainer->m_data->secondary_peak = args["secondary-peak"].AsDouble();
7294     chainer->m_data->tertiary_peak = args["tertiary-peak"].AsDouble();
7295     chainer->m_data->tertiary_peak_coverage = args["tertiary-peak-coverage"].AsDouble();
7296     chainer->m_data->min_flank_exon = args["min-flank-exon"].AsInteger();
7297     chainer->SetMinPolyA(args["minpolya"].AsInteger());
7298     chainer->m_data->use_confirmed_ends = args["use_confirmed_ends"];
7299 
7300 
7301 
7302     CIdHandler cidh(scope);
7303 
7304     map<string,TSignedSeqRange>& mrnaCDS = chainer->SetMrnaCDS();
7305     if(args["mrnaCDS"]) {
7306         if (args["mrnaCDS"].AsString()=="use_objmgr") {
7307             mrnaCDS[args["mrnaCDS"].AsString()] = TSignedSeqRange();
7308         } else {
7309             CNcbiIfstream cdsfile(args["mrnaCDS"].AsString().c_str());
7310             if (!cdsfile)
7311                 NCBI_THROW(CGnomonException, eGenericError, "Cannot open file " + args["mrnaCDS"].AsString());
7312             string accession, tmp;
7313             int a, b;
7314             while(cdsfile >> accession >> a >> b) {
7315                 _ASSERT(a > 0 && b > 0 && b > a);
7316                 getline(cdsfile,tmp);
7317                 accession = CIdHandler::ToString(*cidh.ToCanonical(*CIdHandler::ToSeq_id(accession)));
7318                 mrnaCDS[accession] = TSignedSeqRange(a-1,b-1);
7319             }
7320         }
7321     }
7322 
7323     map<string, pair<bool,bool> >& prot_complet = chainer->SetProtComplet();
7324     if(args["pinfo"]) {
7325         CNcbiIfstream protfile(args["pinfo"].AsString().c_str());
7326             if (!protfile)
7327                 NCBI_THROW(CGnomonException, eGenericError, "Cannot open file " + args["pinfo"].AsString());
7328         string seqid_str;
7329         bool fivep;
7330         bool threep;
7331         while(protfile >> seqid_str >> fivep >> threep) {
7332             seqid_str = CIdHandler::ToString(*CIdHandler::ToSeq_id(seqid_str));
7333             prot_complet[seqid_str] = make_pair(fivep, threep);
7334         }
7335     }
7336 }
7337 
OverlappingIndel(int pos,const CInDelInfo & indl)7338 bool OverlappingIndel(int pos, const CInDelInfo& indl) {
7339     if(indl.IsDeletion())
7340         return pos <= indl.InDelEnd();
7341     else
7342         return pos < indl.InDelEnd();
7343 }
7344 
// Composes two edits for one exon - B genome -> A genome (editing_indels_frombtoa) and
// A genome -> transcript (exona_indels) - into one list of indels located in exonb coordinates.
//   exona                   exon limits on the A genome
//   extra_left/extra_right  how many bases of a deletion touching the left/right end of exonb are taken into the exon
//   exonb                   limits of the same exon on the B genome
//   editing_indels_frombtoa corrections B->A; searched with upper_bound(), so expected to be sorted (NOTE(review): confirm with callers)
//   exona_indels            indels of the exon in A coordinates
// Returns an empty list when no correction reaches exonb and exona_indels is empty.
7345 //this just copies exona_indels unless genome corrections are used
7346 //extra_left/extra_right insertions at the ends of exon on Agenome
CombineCorrectionsAndIndels(const TSignedSeqRange exona,int extra_left,int extra_right,const TSignedSeqRange exonb,const TInDels & editing_indels_frombtoa,const TInDels & exona_indels)7347 TInDels CombineCorrectionsAndIndels(const TSignedSeqRange exona, int extra_left, int extra_right, const TSignedSeqRange exonb, const TInDels& editing_indels_frombtoa, const TInDels& exona_indels) {
7348     TInDels combined_indels;
7349
7350     TInDels::const_iterator ic = upper_bound(editing_indels_frombtoa.begin(), editing_indels_frombtoa.end(), exonb.GetFrom(), OverlappingIndel);  // skip all corrections ending before exonb
7351     for( ;ic != editing_indels_frombtoa.end() && ic->GetStatus() != CInDelInfo::eGenomeNotCorrect; ++ic);   //skip ggaps and Ns
         // nothing to combine: the first relevant correction is beyond the exon and there are no exon indels
7352     if((ic == editing_indels_frombtoa.end() || ic->Loc() > exonb.GetTo()+1) && exona_indels.empty())
7353         return combined_indels;
7354
7355     typedef list<char> TCharList;
7356     TCharList edit; // edit for Bgenome->transcript calculated in two steps: Bgenome->Agenome->transcript
7357     // M match/mismatch
7358     // - skip one base
7359     // everything else insert this letter
7360
7361     //edit from B genome to A genome
7362     int pb = exonb.GetFrom();
7363     for( ;pb <= exonb.GetTo(); ++pb) {
7364         if(ic != editing_indels_frombtoa.end() && ic->Loc() <= pb) {
7365             if(ic->IsInsertion()) {
                     // only the part of the insertion inside exonb is skipped; pb jumps to its last base
7366                 int len = min(exonb.GetTo()+1,ic->InDelEnd())-max(exonb.GetFrom(),ic->Loc());
7367                 edit.insert(edit.end(),len,'-');
7368                 pb = ic->InDelEnd()-1;
7369             } else {
7370                 string s = ic->GetInDelV();
7371                 if(pb == exonb.GetFrom())       // include extra_left part of deletion
7372                     s = s.substr(ic->Len()-extra_left);
7373                 edit.insert(edit.end(),s.begin(),s.end());
7374                 edit.push_back('M');            // base before deletion
7375             }
7376             ++ic;
7377         } else {
7378             edit.push_back('M');
7379         }
7380     }
7381     if(ic != editing_indels_frombtoa.end() && ic->Loc() == pb && ic->GetStatus() == CInDelInfo::eGenomeNotCorrect && extra_right > 0) { // include extra_right part of deletion
7382         _ASSERT(ic->IsDeletion());
7383         string s = ic->GetInDelV().substr(0,extra_right);
7384         edit.insert(edit.end(),s.begin(),s.end());
7385     }
         // consistency checks: 'M' and '-' consume exonb bases; everything except '-' is an exona base
7386     _ASSERT(exonb.GetLength() == count(edit.begin(),edit.end(),'M')+count(edit.begin(),edit.end(),'-'));
7387     _ASSERT(exona.GetLength() == (int)edit.size()-count(edit.begin(),edit.end(),'-'));
7388
7389     // adding changes from A to transcript
7390     if(!exona_indels.empty()) {
7391         TInDels::const_iterator jleft = exona_indels.begin();
7392         int pa = exona.GetFrom()-1;   // A-genome position of the current edit element ('-' elements do not advance it)
7393         int skipsome = 0;             // A bases still to be removed because of an insertion in exona_indels
7394         ERASE_ITERATE(TCharList, ip, edit) {
7395             if(*ip == '-')
7396                 continue;
7397             else
7398                 ++pa;
7399
7400             if(jleft != exona_indels.end() && jleft->Loc() == pa) {
7401                 if(jleft->IsInsertion()) {  // skip extra bases on edited
7402                     _ASSERT(skipsome == 0);
7403                     skipsome = jleft->Len();
7404                     // don't use reverse iterator for erasing
7405                     for(TCharList::iterator ipp = ip; skipsome > 0 && ipp != edit.begin() && *(--ipp) != '-' && *ipp != 'M'; ) {  // skip previously inserted
7406                         --skipsome;
7407                         ipp = edit.erase(ipp);
7408                     }
7409                 } else {                    // insert extra bases in transcript
7410                     _ASSERT(skipsome == 0);
7411                     int insertsome = jleft->Len();
7412                     for(reverse_iterator<TCharList::iterator> ir(ip); insertsome > 0 && ir != edit.rend() && *ir == '-'; ++ir) { // reuse skipped positions
7413                         *ir = 'M';
7414                         --insertsome;
7415                     }
7416                     if(insertsome > 0)
7417                         edit.insert(ip,insertsome,'N');
7418                 }
7419                 ++jleft;
7420             }
7421
7422             if(skipsome > 0) {
7423                 --skipsome;
7424                 if(*ip == 'M')
7425                     *ip = '-';
7426                 else if(*ip != '-') // looks like *ip is never '-'
7427                     edit.erase(ip);
7428             }
7429         }
             // an indel left over after the loop must be a deletion right after the last exona base
7430         if(jleft != exona_indels.end()) {
7431             _ASSERT(jleft->IsDeletion() && jleft->Loc() == pa+1);
7432             int insertsome = jleft->Len();
7433             for(TCharList::reverse_iterator ir = edit.rbegin(); insertsome > 0 && ir != edit.rend() && *ir == '-'; ++ir) { // reuse skipped positions
7434                 *ir = 'M';
7435                 --insertsome;
7436             }
7437             if(insertsome > 0)
7438                 edit.insert(edit.end(),insertsome,'N');
7439         }
7440     }
7441     _ASSERT(exonb.GetLength() == count(edit.begin(),edit.end(),'M')+count(edit.begin(),edit.end(),'-'));
7442
         // convert the edit script into indels: a run of '-' becomes eIns, a run of letters becomes eDel carrying the letters
7443     //TODO: combine +- indels separated by short spans of Ms
7444     pb = exonb.GetFrom();
7445     for(TCharList::iterator ip = edit.begin(); ip != edit.end(); ) {
7446         if(*ip == 'M') {
7447             ++pb;
7448             ++ip;
7449         } else if(*ip == '-') {
7450             int len = 0;
7451             for( ;ip != edit.end() && *ip == '-'; ++ip, ++len);
7452             int pos = pb;
7453             pb += len;
                 // letters directly following the skipped run cancel an equal number of skipped bases
7454             for( ;len > 0 && ip != edit.end() && *ip != 'M'; ++ip, --len);   // we may have ----+++M but not +++--- (can't really happen unless corrections had adjacent -+)
7455             if(len > 0)
7456                 combined_indels.push_back(CInDelInfo(pos,len,CInDelInfo::eIns));
7457         } else {
7458             string s;
7459             for( ;ip != edit.end() && *ip != 'M' && *ip != '-'; ++ip)
7460                 s.push_back(*ip);
7461             combined_indels.push_back(CInDelInfo(pb, s.size(), CInDelInfo::eDel, s));
7462         }
7463     }
7464     _ASSERT(pb == exonb.GetTo()+1);
7465
7466     return combined_indels;
7467 }
7468 
// Rebuilds srcmodel exon by exon, mapping each exon through m_edited_contig_map (edited -> original)
// and shifting the result by m_limits.GetFrom().
// Exons that start or end with a sequence stored in m_inserted_seqs are re-created as ggap exons.
// The frameshifts of the returned model are the combination of m_editing_indels, the source
// frameshifts and the mismatches recorded in m_replacements.
// Returns an empty CGeneModel() when the model cannot be projected.
MapOneModelToOrigContig(const CGeneModel & srcmodel) const7469 CGeneModel CGnomonAnnotator_Base::MapOneModelToOrigContig(const CGeneModel& srcmodel) const {
7470     CGeneModel model = srcmodel;
7471     model.SetCdsInfo(CCDSInfo());
7472     model.CutExons(model.Limits());  // empty model with all attributes
7473     TInDels editedframeshifts;       // indels collected from all normal exons
7474
7475     for(int ie = 0; ie < (int)srcmodel.Exons().size(); ++ie) {
7476         const CModelExon& e = srcmodel.Exons()[ie];
7477
7478         string seq;                  // stays empty unless the exon is found to be a ggap
7479         CInDelInfo::SSource src;
7480         CGnomonAnnotator_Base::TGgapInfo::const_iterator i = m_inserted_seqs.upper_bound(e.GetTo());          // first ggap on right or end()
7481         if(i != m_inserted_seqs.begin()) {
7482             --i;                                                                       // first ggap left or equal GetTo()
7483             int ggapa = i->first;
7484             int ggapb = i->first+(int)i->second->GetInDelV().length()-1;
7485             if(ggapa == e.GetFrom()) {                                                 // exons starts with ggap
7486                 seq = i->second->GetInDelV().substr(0,e.Limits().GetLength());
7487                 src = i->second->GetSource();
                     // shorten the source range to the exon length, keeping the end that matches the source strand
7488                 if(src.m_strand == ePlus)
7489                     src.m_range.SetTo(src.m_range.GetFrom()+e.Limits().GetLength()-1);
7490                 else
7491                     src.m_range.SetFrom(src.m_range.GetTo()-e.Limits().GetLength()+1);
7492             } else if(ggapb == e.GetTo()) {                                            // exon ends by ggap
7493                 string s = i->second->GetInDelV();
7494                 seq = s.substr(s.length()-e.Limits().GetLength());
7495                 src = i->second->GetSource();
7496                 if(src.m_strand == eMinus)
7497                     src.m_range.SetTo(src.m_range.GetFrom()+e.Limits().GetLength()-1);
7498                 else
7499                     src.m_range.SetFrom(src.m_range.GetTo()-e.Limits().GetLength()+1);
7500             } else if(ggapb >= e.GetFrom()) {                                          // all real alignment and some filling was clipped
7501                 _ASSERT(srcmodel.Exons().size() == 1);
7502                 return CGeneModel();
7503             }
7504         }
7505
7506         if(!seq.empty()) {  // ggap
7507             if((int)srcmodel.Exons().size() == 1){ // all real alignment was clipped
7508                 return CGeneModel();
7509             }
7510             if(model.Strand() == eMinus) {
7511                 ReverseComplement(seq.begin(), seq.end());
7512                 src.m_strand = (src.m_strand == ePlus ? eMinus : ePlus);
7513             }
7514             _ASSERT((int)seq.length() == src.m_range.GetLength());
7515             model.AddGgapExon(0, seq, src, false);
7516         } else {  // normal exon
7517             TSignedSeqRange exon = m_edited_contig_map.ShrinkToRealPointsOnEdited(e.Limits());
7518             if(exon.Empty()) {   // not projectable exon
7519                 return CGeneModel();
7520             }
                 // number of exon bases cut off at each end by the shrinking above
7521             int extra_left = exon.GetFrom()-e.GetFrom();
7522             int extra_right = e.GetTo()-exon.GetTo();
7523
7524             exon = m_edited_contig_map.MapRangeEditedToOrig(exon,false);
7525             _ASSERT(exon.NotEmpty());
7526
                 // frameshifts of the source model which intersect this exon
7527             TInDels exon_indels;
7528             ITERATE(TInDels, indl, srcmodel.FrameShifts()) {
7529                 if(indl->IntersectingWith(e.GetFrom(),e.GetTo()))
7530                     exon_indels.push_back(*indl);
7531             }
7532             TInDels efs = CombineCorrectionsAndIndels(e.Limits(), extra_left, extra_right, exon, m_editing_indels, exon_indels);
7533
                 // turn replacements located in the exon into eMism entries; consecutive positions are merged into one entry
7534             TInDels erepl;
7535             map<int,char>::const_iterator ir = m_replacements.lower_bound(exon.GetFrom());  // first exon replacement or end()
7536             for( ;ir != m_replacements.end() && ir->first <= exon.GetTo(); ++ir) {
7537                 int loc = ir->first;
7538                 char c = ir->second;
7539                 TInDels::const_iterator ic = upper_bound(efs.begin(), efs.end(), loc, OverlappingIndel);  // skip all indels ending before mismatch
7540                 if(ic != efs.end() && ic->IsInsertion() && ic->Loc() <= loc && ic->InDelEnd() > loc)   // overlapping insertion
7541                     continue;
7542                 else if(ic != efs.end() && ic->IsDeletion() && ic->Loc() == loc)                       // deletion right before mismatch
7543                     erepl.push_back(CInDelInfo(loc, 1, CInDelInfo::eMism, string(1,c)));
7544                 else if(erepl.empty() || erepl.back().InDelEnd() != loc)                               // not extension of previous
7545                     erepl.push_back(CInDelInfo(loc, 1, CInDelInfo::eMism, string(1,c)));
7546                 else {
7547                     loc = erepl.back().Loc();
7548                     string s = erepl.back().GetInDelV()+string(1,c);
7549                     erepl.back() = CInDelInfo(loc, s.size(), CInDelInfo::eMism, s);
7550                 }
7551             }
7552             efs.insert(efs.end(), erepl.begin(), erepl.end());
7553             sort(efs.begin(), efs.end());
                 // shift indels and exon limits by the start of m_limits
7554             for(auto& indl : efs) {
7555                 indl.SetLoc(indl.Loc()+m_limits.GetFrom());
7556                 editedframeshifts.push_back(indl);
7557             }
7558
7559             exon.SetFrom(exon.GetFrom()+m_limits.GetFrom());
7560             exon.SetTo(exon.GetTo()+m_limits.GetFrom());
7561             model.AddNormalExon(exon, e.m_fsplice_sig, e.m_ssplice_sig, 0, false);
7562         }
7563
7564         if(ie < (int)srcmodel.Exons().size()-1 && (!e.m_ssplice || !srcmodel.Exons()[ie+1].m_fsplice)) // hole
7565             model.AddHole();
7566     }
7567
7568     model.FrameShifts() = editedframeshifts;
         // the CDS is set only after exons and frameshifts are in place
7569     model.SetCdsInfo(srcmodel.GetCdsInfo().MapFromOrigToEdited(srcmodel.GetAlignMap()));
7570
7571     return model;
7572 }
7573 
7574 
7575 /*
7576 //currently not used for anything; will need separation of indels and replacements inputs if used
7577 void MapAlignsToOrigContig(TAlignModelList& aligns, const TInDels& corrections, int contig_size) {
7578     CGnomonAnnotator_Base::TGgapInfo inserted_seqs;  // not used
7579     TInDels editing_indels;
7580     map<int,char> replacements;
7581 
7582     ITERATE(TInDels, i, corrections) {
7583         if(i->IsMismatch()) {
7584             string seq = i->GetInDelV();
7585             for(int l = 0; l < i->Len(); ++l)
7586                 replacements[i->Loc()+l] = seq[l];
7587         } else {
7588             editing_indels.push_back(*i);
7589             if(i->IsInsertion())
7590                 contig_size += i->Len();
7591             else
7592                 contig_size -= i->Len();
7593         }
7594     }
7595     CAlignMap edited_contig_map(0, contig_size-1, editing_indels.begin(), editing_indels.end());
7596 
7597     ERASE_ITERATE(TAlignModelList, ia, aligns) {
7598         CAlignModel& align = *ia;
7599         CGeneModel model = MapOneModelToOrigContig(align, editing_indels, replacements, edited_contig_map, inserted_seqs);
7600         if(model.Limits().Empty()) {
7601             aligns.erase(ia);
7602         } else {
7603             _ASSERT(align.Exons().size() == model.Exons().size());
7604             if(align.Type()&CAlignModel::eProt)
7605                 model.FrameShifts() = model.GetInDels(false);
7606             vector<TSignedSeqRange> transcript_exons;
7607             for(int i = 0; i < (int)align.Exons().size(); ++i)
7608                 transcript_exons.push_back(align.TranscriptExon(i));
7609             CAlignMap amap(model.Exons(), transcript_exons, model.FrameShifts(), align.Orientation(), align.TargetLen());
7610             CConstRef<objects::CSeq_id> id = align.GetTargetId();
7611             *ia = CAlignModel(model,amap);
7612             ia->SetTargetId(*id);
7613         }
7614     }
7615 }
7616 */
7617 
MapModelsToOrigContig(TGeneModelList & models) const7618 void CGnomonAnnotator_Base::MapModelsToOrigContig(TGeneModelList& models) const {
7619     ERASE_ITERATE(TGeneModelList, im, models) {
7620         CGeneModel model = MapOneModelToOrigContig(*im);
7621         if(model.Limits().Empty()) {
7622             models.erase(im);
7623         } else {
7624             NON_CONST_ITERATE(TInDels, i, model.FrameShifts()) {
7625                 if(i->IsMismatch()) {
7626                     i->SetStatus(CInDelInfo::eGenomeNotCorrect);
7627                 } else {
7628                     TIntMap::const_iterator conf = m_confirmed_bases_orig_len.upper_bound(i->Loc()); // confirmed on the right
7629                     bool included = (conf != m_confirmed_bases_orig_len.begin() && (--conf)->first < i->Loc() &&  conf->first+conf->second >= i->InDelEnd());
7630 
7631                     TInDels::const_iterator ic = upper_bound(m_editing_indels.begin(), m_editing_indels.end(), i->Loc(), OverlappingIndel);  // skip all correction ending before Loc()
7632                     if(ic != m_editing_indels.end() && i->GetType() == ic->GetType() && i->Loc() >= ic->Loc() && i->InDelEnd() <= ic->InDelEnd()) {
7633                         i->SetStatus(CInDelInfo::eGenomeNotCorrect);
7634                         _ASSERT(included);
7635                     } else if(included && (ic == m_editing_indels.end() || ic->Loc() > i->InDelEnd())) {
7636                         i->SetStatus(CInDelInfo::eGenomeCorrect);
7637                     }
7638                 }
7639             }
7640             *im = model;
7641         }
7642     }
7643 }
7644 
// Projects one model/alignment onto the edited contig.
// Exon limits and indels are first shifted by -m_limits.GetFrom(). Each real exon is then adjusted
// so that both ends avoid the corrections in m_editing_indels and is mapped through m_edited_contig_map.
// Its indels are recomputed by CombineCorrectionsAndIndels() using m_reversed_corrections.
// Gap exons are placed using the matching ggap entry of m_editing_indels.
// Returns an empty CAlignModel() when some exon has no projectable part left.
MapOneModelToEditedContig(const CGeneModel & align) const7645 CAlignModel CGnomonAnnotator_Base::MapOneModelToEditedContig(const CGeneModel& align) const
7646 {
7647     CAlignMap amap = align.GetAlignMap();
7648     CCDSInfo acds = align.GetCdsInfo();
7649     if(align.ReadingFrame().NotEmpty() && acds.IsMappedToGenome())
7650         acds = acds.MapFromOrigToEdited(amap);
7651     amap.MoveOrigin(m_limits.GetFrom());
7652
7653     //mismatches are dropped at this point
7654     TInDels aindels = align.GetInDels(false);
7655
7656     //recalculate limits to contig chunk
7657     for(auto& indel : aindels)
7658         indel.SetLoc(indel.Loc()-m_limits.GetFrom());
7659
7660     CGeneModel::TExons aexons = align.Exons();
7661     for(auto& e : aexons) {
7662         if(e.Limits().NotEmpty()) {
7663             e.AddFrom(-m_limits.GetFrom());
7664             e.AddTo(-m_limits.GetFrom());
7665         }
7666     }
7667
7668     CGeneModel editedmodel = align;
7669     editedmodel.ClearExons();  // empty alignment with all attributes
7670
7671     vector<TSignedSeqRange> transcript_exons;   // transcript range for every exon, in exon order
7672     TInDels editedindels;                       // indels of all exons after combining with corrections
7673     bool snap_to_codons = (align.Type() == CAlignModel::eProt);
7674
7675     for(int i = 0; i < (int)aexons.size(); ++i) {
7676         const CModelExon& e = aexons[i];
7677
7678         if(e.Limits().NotEmpty()) {   // real exon
                 // alignment indels which intersect this exon
7679             list<CInDelInfo> exon_indels;
7680             ITERATE(TInDels, indl, aindels) {
7681                 if(indl->IntersectingWith(e.GetFrom(), e.GetTo()))
7682                     exon_indels.push_back(*indl);
7683             }
7684
7685             int left = e.GetFrom();  //projectable boundary
7686             int left_shrink = 0;     //unprojectable touching insertion
7687             int right = e.GetTo();   //projectable boundary
7688             int right_shrink = 0;    //unprojectable touching insertion
7689             int left_extend = 0;     //both alignment and correction indicate deletion of left_extend bases
7690             int right_extend = 0;    //both alignment and correction indicate deletion of right_extend base
7691             CAlignMap::ERangeEnd lend = CAlignMap::eLeftEnd;
7692             CAlignMap::ERangeEnd rend = CAlignMap::eRightEnd;
7693
                 // for protein alignments take the start/stop codons of the first/last exon (strand decides which is which)
7694             TSignedSeqRange left_codon;
7695             TSignedSeqRange right_codon;
7696             if(align.Type() == CAlignModel::eProt) {
7697                 if(i == 0)
7698                     left_codon = (align.Strand() == ePlus ? acds.Start() :  acds.Stop());
7699                 if(i == (int)aexons.size()-1)
7700                     right_codon = (align.Strand() == ePlus ? acds.Stop() :  acds.Start());
7701
7702                 left_codon = amap.MapRangeEditedToOrig(left_codon, false);
7703                 right_codon = amap.MapRangeEditedToOrig(right_codon, false);
7704             }
7705
7706             TInDels::const_iterator ileft = upper_bound(m_editing_indels.begin(), m_editing_indels.end(), left, OverlappingIndel);  // skip all correction left of exon (doesn't skip touching deletion)
7707             for( ;ileft != m_editing_indels.end() && ileft->GetStatus() != CInDelInfo::eGenomeNotCorrect; ++ileft);   //skip ggaps and Ns
7708
7709             if(ileft != m_editing_indels.end() && ileft->IsDeletion() && ileft->Loc() == left) {
7710                 if(!exon_indels.empty() && exon_indels.front().IsDeletion() && exon_indels.front().Loc() == left) {// ileft is touching deletion and there is matching indel in alignment
7711                     _ASSERT(left_codon.Empty());
7712                     left_extend = min(ileft->Len(),exon_indels.front().Len());
7713                 }
7714                 ++ileft;
7715             }
7716
7717             //adjust left end
7718             int ll = left;
7719             if(left_codon.GetLength() == 3)
7720                 ll = left_codon.GetTo();
7721             if(ileft != m_editing_indels.end() && ileft->Loc() <= ll) {  // left end is involved
7722                 if(e.m_fsplice) {  // move splice to projectable point, add indels to keep the texon length
7723                     _ASSERT(left_codon.Empty());
7724                     left = ileft->Loc()+ileft->Len();  //could be only touching insertion
7725                     if(left > right)
7726                         return CAlignModel();
7727                     left_shrink = left-e.GetFrom();
7728                 } else {
7729                     // clip to common projectable point
7730                     TSignedSeqRange lim = e.Limits();
7731                     if(left_codon.GetLength() == 3)
7732                         lim.SetFrom(left_codon.GetTo()+1);
7733                     while(ileft != m_editing_indels.end() && ileft->Loc() <= lim.GetFrom()) {
7734                         lim.SetFrom(ileft->InDelEnd());
7735                         if(lim.NotEmpty())
7736                             lim = amap.ShrinkToRealPoints(lim, snap_to_codons);  // skip alignment indels
7737                         if(lim.Empty())
7738                             return CAlignModel();
7739
7740                         for( ;ileft != m_editing_indels.end() && ileft->InDelEnd() <= lim.GetFrom(); ++ileft); // skip outside corrections
7741                     }
7742
7743                     left = lim.GetFrom();
7744                     while(!exon_indels.empty() && exon_indels.front().InDelEnd() <= left)
7745                         exon_indels.pop_front();
7746                     lend = CAlignMap::eSinglePoint;  // is used for transcript exon
7747                 }
7748             }
7749
                 // same procedure for the right end, walking the corrections backwards
7750             TInDels::const_iterator first_outside = ileft;
7751             for( ; first_outside != m_editing_indels.end() && first_outside->Loc() <= (first_outside->IsInsertion() ? right : right+1); ++first_outside); // end() or first completely on right
7752             reverse_iterator<TInDels::const_iterator> iright(first_outside);  // previous correction (last which interferes with exon or rend())
7753             for( ;iright != m_editing_indels.rend() && iright->GetStatus() != CInDelInfo::eGenomeNotCorrect; ++iright);   //skip ggaps and Ns
7754
7755             if(iright != m_editing_indels.rend() && iright->IsDeletion() && iright->Loc() == right+1) {
7756                 if(!exon_indels.empty() && exon_indels.back().IsDeletion() && exon_indels.back().Loc() == right+1) { // touching deletion and there is matching indel in alignment
7757                     _ASSERT(right_codon.Empty());
7758                     right_extend = min(iright->Len(),exon_indels.back().Len());
7759                 }
7760                 ++iright;
7761             }
7762
7763             //adjust right end
7764             int rr = right;
7765             if(right_codon.GetLength() == 3)
7766                 rr = right_codon.GetFrom();
7767             if(iright != m_editing_indels.rend() && iright->InDelEnd() > rr) {  // right end is involved
7768                 if(e.m_ssplice) { // move splice to projectable point, add indels to keep the texon length
7769                     _ASSERT(right_codon.Empty());
7770                     right = iright->Loc()-1;
7771                     if(right < left)
7772                         return CAlignModel();
7773                     right_shrink = e.GetTo()-right;
7774                 } else {
7775                     // clip to common projectable point
7776                     TSignedSeqRange lim = e.Limits();
7777                     if(right_codon.GetLength() == 3)
7778                         lim.SetTo(right_codon.GetFrom()-1);
7779                     while(iright != m_editing_indels.rend() && iright->InDelEnd() > lim.GetTo()) { // iright is insertion including right position
7780                         lim.SetTo(iright->Loc()-1);
7781                         if(lim.NotEmpty())
7782                             lim = amap.ShrinkToRealPoints(lim, snap_to_codons);  // skip alignment indels
7783                         if(lim.Empty())
7784                             return CAlignModel();
7785
7786                         for( ; iright != m_editing_indels.rend() && iright->Loc() > lim.GetTo(); ++iright);  // skip outside corrections
7787                     }
7788
7789                     right = lim.GetTo();
7790                     while(!exon_indels.empty() && exon_indels.back().Loc() > right)
7791                         exon_indels.pop_back();
7792                     rend = CAlignMap::eSinglePoint;  // is used for transcript exon
7793                 }
7794             }
7795
                 // the transcript exon is computed from the exon with the shrunk parts put back
7796             TSignedSeqRange orig_exon(left-left_shrink, right+right_shrink);
7797             TSignedSeqRange texon = amap.MapRangeOrigToEdited(orig_exon, lend, rend);
7798             transcript_exons.push_back(texon);
7799
7800             TSignedSeqRange corrected_exon = m_edited_contig_map.MapRangeOrigToEdited(TSignedSeqRange(left, right), false);
7801             _ASSERT(corrected_exon.NotEmpty());
7802             //TODO: account for left/right shrink? When projected back, this will move all insertions inside the exon
7803             corrected_exon.SetFrom(corrected_exon.GetFrom()-left_extend);
7804             corrected_exon.SetTo(corrected_exon.GetTo()+right_extend);
7805             editedmodel.AddExon(corrected_exon, e.m_fsplice_sig, e.m_ssplice_sig, e.m_ident);
7806             if(i < (int)aexons.size()-1 && (!aexons[i].m_ssplice || !aexons[i+1].m_fsplice))  // hole
7807                 editedmodel.AddHole();
7808
7809
7810             TInDels efs = CombineCorrectionsAndIndels(orig_exon, left_shrink, right_shrink, corrected_exon, m_reversed_corrections, TInDels(exon_indels.begin(), exon_indels.end()));
7811             editedindels.insert(editedindels.end(), efs.begin(), efs.end());
7812         } else {                     // gap exon
7813             transcript_exons.push_back(align.TranscriptExon(i));
7814             string gap_seq = e.m_seq;
7815             if(align.Orientation() == eMinus)
7816                 ReverseComplement(gap_seq.begin(), gap_seq.end());
7817
                 // find the ggap correction with the same sequence: for the first exon the last match before
                 // the next exon is used, for the other exons the first match not before the previous exon
                 // NOTE(review): aexons[i+1] below is read when i == 0; presumably a gap exon is never the only exon - confirm
7818             TInDels::const_iterator gap = m_editing_indels.end();
7819             ITERATE(TInDels, ig, m_editing_indels) {
7820                 if(ig->GetSource().m_range.NotEmpty()) {  //ggap
7821                     if(i > 0 && ig->Loc() < aexons[i-1].GetTo())
7822                         continue;
7823                     if(i == 0 && ig->Loc() > aexons[i+1].GetFrom())
7824                         break;
7825                     if(ig->GetInDelV() == gap_seq) {
7826                         gap = ig;
7827                         if(i > 0) break;  //first available  for all exons except the first one
7828                     }
7829                 }
7830             }
7831             _ASSERT(gap != m_editing_indels.end());
7832
                 // left end of the gap on the edited contig; corrections sharing the same Loc() are taken into account
7833             int left_end = m_edited_contig_map.MapOrigToEdited(gap->Loc());
7834             if(left_end >= 0) {
7835                 left_end -= gap->Len();
7836                 for(TInDels::const_iterator ig = gap+1; ig != m_editing_indels.end() && ig->Loc() == gap->Loc(); ++ig)
7837                     left_end -= ig->Len();
7838             } else {
7839                 left_end = m_edited_contig_map.MapOrigToEdited(gap->Loc()-1);
7840                 _ASSERT(left_end >= 0);
7841                 left_end += 1;
7842                 for(TInDels::const_iterator ig = gap; ig != m_editing_indels.begin() && (ig-1)->Loc() == gap->Loc(); --ig) {
7843                     left_end += (ig-1)->Len();
7844                 }
7845             }
7846
7847             editedmodel.AddExon(TSignedSeqRange(left_end,left_end+gap->Len()-1), "XX", "XX", 1);
7848         }
7849     }
7850
7851     CAlignMap editedamap(editedmodel.Exons(), transcript_exons, editedindels, align.Orientation(), amap.TargetLen());
7852
7853     editedmodel.FrameShifts() = editedindels;
7854     CAlignModel editedalign(editedmodel, editedamap);
7855
7856     _ASSERT(align.GetEdgeReadingFrames()->empty());
7857
         // restore the CDS: clip it to the new transcript limits, keeping the original score and open flag
7858     if(align.ReadingFrame().NotEmpty()) {
7859         double score = acds.Score();
7860         bool open = acds.OpenCds();
7861         acds.Clip(editedalign.TranscriptLimits());
7862         acds.SetScore(score, open);
7863         editedalign.SetCdsInfo(acds.MapFromEditedToOrig(editedamap));
7864     }
7865
7866     return editedalign;
7867 }
7868 
MapAlignmentsToEditedContig(TAlignModelList & alignments) const7869 void CGnomonAnnotator_Base::MapAlignmentsToEditedContig(TAlignModelList& alignments) const
7870 {
7871     ERASE_ITERATE(TAlignModelList, ia, alignments) {
7872         CAlignModel a = MapOneModelToEditedContig(*ia);
7873         if(a.Limits().NotEmpty()) {
7874             a.SetTargetId(*ia->GetTargetId());
7875             *ia = a;
7876         } else {
7877             alignments.erase(ia);
7878         }
7879     }
7880 }
7881 
MapModelsToEditedContig(TGeneModelList & models) const7882 void CGnomonAnnotator_Base::MapModelsToEditedContig(TGeneModelList& models) const
7883 {
7884     NON_CONST_ITERATE(TGeneModelList, ia, models) {
7885         *ia = MapOneModelToEditedContig(*ia);
7886         _ASSERT(!ia->Exons().empty());
7887     }
7888 }
7889 
SetGenomic(const CResidueVec & seq)7890 void CGnomonAnnotator_Base::SetGenomic(const CResidueVec& seq)
7891 {
7892     m_edited_contig_map = CAlignMap(0, seq.size()-1);
7893     m_editing_indels.clear();
7894     m_reversed_corrections.clear();
7895     m_confirmed_bases_len.clear();
7896     m_confirmed_bases_orig_len.clear();
7897     m_replacements.clear();
7898     m_inserted_seqs.clear();
7899     m_notbridgeable_gaps_len.clear();
7900     m_contig_acc.clear();
7901     m_gnomon.reset(new CGnomonEngine(m_hmm_params, seq, TSignedSeqRange::GetWhole()));
7902 }
7903 
7904 // SetGenomic for annot - models could be 0
SetGenomic(const CSeq_id & contig,CScope & scope,const string & mask_annots,const TGeneModelList * models)7905 void CGnomonAnnotator_Base::SetGenomic(const CSeq_id& contig, CScope& scope, const string& mask_annots, const TGeneModelList* models) {
7906     SCorrectionData correction_data;
7907     m_notbridgeable_gaps_len.clear();
7908 
7909     if(models) {
7910         CBioseq_Handle bh(scope.GetBioseqHandle(contig));
7911         CSeqVector sv (bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac));
7912         int length (sv.size());
7913         string seq_txt;
7914         sv.GetSeqData(0, length, seq_txt);
7915 
7916         TIVec exons(length,0);
7917 
7918         ITERATE(TGeneModelList, i, *models) {
7919             ITERATE(CGeneModel::TExons, e, i->Exons()) {
7920                 if(e->Limits().NotEmpty()) {
7921                     int a = e->GetFrom();
7922                     //                    if(a > 0 && !sv.IsInGap(a-1)) --a;
7923                     //                    if(a > 0 && !sv.IsInGap(a-1)) --a;
7924                     int b = e->GetTo();
7925                     //                    if(b < length-1 && !sv.IsInGap(b+1)) ++b;
7926                     //                    if(b < length-1 && !sv.IsInGap(b+1)) ++b;
7927                     //                    for(int p = a; p <= b; ++p) {  // block all exons and splices
7928                     for(int p = a+1; p <= b; ++p) {  // block all exons except first base (can't keep splices after all)
7929                         exons[p] = 1;                // mark positions which cannot be used for deletions
7930                     }                                // !!!!!!!!it is still a problem if gapfilled models are exactly next to each other!!!!!!!!!!!!!!
7931                 }
7932             }
7933         }
7934 
7935         TIVec model_ranges(length,0);
7936 
7937         ITERATE(TGeneModelList, i, *models) {
7938             for(int p = max(0,i->Limits().GetFrom()-2); p <= min(length-1,i->Limits().GetTo()+2); ++p)
7939                 model_ranges[p] = 1;
7940 
7941             ITERATE(TInDels, indl, i->FrameShifts()) {
7942                 if(indl->GetStatus() == CInDelInfo::eGenomeNotCorrect) {
7943                     if(indl->IsMismatch()) {
7944                         string s = indl->GetInDelV();
7945                         for(int l = 0; l < indl->Len(); ++l)
7946                             correction_data.m_replacements[indl->Loc()+l] = s[l];
7947                     } else {
7948                         correction_data.m_correction_indels.push_back(*indl);
7949                     }
7950                 }
7951                 if(indl->GetStatus() != CInDelInfo::eUnknown) {
7952                     correction_data.m_confirmed_intervals.push_back(TSignedSeqRange(indl->Loc()-1,indl->InDelEnd()));
7953                     _ASSERT(correction_data.m_confirmed_intervals.back().GetFrom() >= 0 && correction_data.m_confirmed_intervals.back().GetTo() < length);
7954                 }
7955             }
7956             for(int ie = 0; ie < (int)i->Exons().size(); ++ie) {
7957                 const CModelExon& e = i->Exons()[ie];
7958                 if(e.Limits().Empty()) {
7959                     int pos;
7960                     if(ie > 0) {
7961                         _ASSERT(i->Exons()[ie-1].Limits().NotEmpty());
7962                         for(pos = i->Exons()[ie-1].GetTo()+1; pos < length && exons[pos] > 0; ++pos);
7963                     } else {
7964                         _ASSERT((int)i->Exons().size() > 1 && i->Exons()[1].Limits().NotEmpty());
7965                         //                        for(pos = i->Exons()[1].GetFrom(); pos > 0 && exons[pos-1] > 0; --pos);
7966                         for(pos = i->Exons()[1].GetFrom(); pos > 0 && exons[pos] > 0; --pos);
7967                     }
7968                     string seq = e.m_seq;
7969                     CInDelInfo::SSource source = e.m_source;
7970                     if(i->Strand() == eMinus) {
7971                         ReverseComplement(seq.begin(),seq.end());
7972                         source.m_strand = OtherStrand(source.m_strand);
7973                     }
7974                     correction_data.m_correction_indels.push_back(CInDelInfo(pos, seq.length(), CInDelInfo::eDel, seq, source));
7975                 }
7976             }
7977         }
7978 
7979         uniq(correction_data.m_correction_indels);  //remove duplicates from altvariants
7980         ERASE_ITERATE(TInDels, indl, correction_data.m_correction_indels) {  // remove 'partial' indels
7981             TInDels::iterator next = indl;
7982             if(++next != correction_data.m_correction_indels.end() && indl->Loc() == next->Loc()) {
7983                 if(indl->GetSource().m_range.Empty() && next->GetSource().m_range.Empty()) {
7984                     _ASSERT(indl->IsDeletion());
7985                     _ASSERT(next->IsDeletion());
7986                     VECTOR_ERASE(indl, correction_data.m_correction_indels);
7987                 }
7988             }
7989         }
7990 
7991         TIntMap::iterator current_gap = m_notbridgeable_gaps_len.end();
7992         for(int i = 0; i < length; ++i) {
7993             if(model_ranges[i])
7994                 continue;
7995 
7996             CConstRef<CSeq_literal> gsl = sv.GetGapSeq_literal(i);
7997             if(gsl && gsl->GetBridgeability() == CSeq_literal::e_NotBridgeable) {
7998                 if(current_gap == m_notbridgeable_gaps_len.end())
7999                     current_gap = m_notbridgeable_gaps_len.insert(TIntMap::value_type(i,1)).first;
8000                 else
8001                     ++current_gap->second;
8002             } else {
8003                 current_gap = m_notbridgeable_gaps_len.end();
8004             }
8005         }
8006     }
8007 
8008     SetGenomic(contig, scope, correction_data,  TSignedSeqRange::GetWhole(), mask_annots);
8009 }
8010 
SetGenomic(const CSeq_id & contig,CScope & scope,const SCorrectionData & correction_data,TSignedSeqRange limits,const string & mask_annots)8011 void CGnomonAnnotator_Base::SetGenomic(const CSeq_id& contig, CScope& scope, const SCorrectionData& correction_data, TSignedSeqRange limits, const string& mask_annots)
8012 {
8013     m_contig_acc = CIdHandler::ToString(contig);
8014 
8015     CResidueVec seq;
8016     int length;
8017 
8018     CBioseq_Handle bh(scope.GetBioseqHandle(contig));
8019     {
8020         CSeqVector sv (bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac));
8021         length = sv.size();
8022         if(limits == TSignedSeqRange::GetWhole()) {
8023             limits.SetFrom(0);
8024             limits.SetTo(length-1);
8025         }
8026         int GC_RANGE = 200000;
8027         limits.SetFrom(max(0, limits.GetFrom()-GC_RANGE/2));
8028         limits.SetTo(min(length-1, limits.GetTo()+GC_RANGE/2));
8029         length = limits.GetLength();
8030         m_limits = limits;
8031         seq.reserve(length);
8032         for(int i = limits.GetFrom(); i <= limits.GetTo(); ++i)
8033             seq.push_back(sv[i]);
8034     }
8035 
8036     if (m_masking) {
8037         SAnnotSelector sel;
8038         {
8039             list<string> arr;
8040             NStr::Split(mask_annots, " ", arr, NStr::fSplit_MergeDelimiters|NStr::fSplit_Truncate);
8041             ITERATE(list<string>, annot, arr) {
8042                 sel.AddNamedAnnots(*annot);
8043             }
8044         }
8045         sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_repeat_region)
8046             .SetResolveAll()
8047             .SetAdaptiveDepth(true);
8048         for (CFeat_CI it(bh, sel);  it;  ++it) {
8049             TSeqRange range = it->GetLocation().GetTotalRange();
8050             for(unsigned int i = range.GetFrom(); i <= range.GetTo(); ++i) {
8051                 if(Include(limits, i))
8052                     seq[i-limits.GetFrom()] = tolower(seq[i-limits.GetFrom()]);
8053             }
8054         }
8055     }
8056 
8057     m_editing_indels.clear();
8058     m_reversed_corrections.clear();
8059     m_confirmed_bases_len.clear();
8060     m_confirmed_bases_orig_len.clear();
8061     m_replacements.clear();
8062     m_inserted_seqs.clear();
8063 
8064     m_replacements = correction_data.m_replacements;
8065     for(map<int,char>::iterator ir = m_replacements.begin(); ir != m_replacements.end(); ++ir) {
8066         if(Include(limits,ir->first)) {
8067             m_replaced_bases[ir->first-limits.GetFrom()] = seq[ir->first-limits.GetFrom()];
8068             seq[ir->first-limits.GetFrom()] = ir->second;
8069         }
8070     }
8071 
8072 
8073 #define     BLOCK_OF_Ns 35
8074     for(auto cor :  correction_data.m_correction_indels) {
8075         if(cor.GetSource().m_range.Empty() && Include(limits, cor.Loc())) { // correction indel
8076             cor.SetLoc(cor.Loc()-limits.GetFrom());
8077             m_editing_indels.push_back(cor);
8078         } else if(cor.Loc() >= limits.GetFrom() && cor.Loc() <= limits.GetTo()+1) {     // ggap (1bp fake ggaps may be loctated right before or after contig)
8079             int l = cor.Loc()-limits.GetFrom();
8080             CInDelInfo g(l, cor.Len(), cor.GetType(), cor.GetInDelV(), cor.GetSource());
8081             //surround ggap with Ns to satisfy MinIntron
8082             CInDelInfo Ns(l, BLOCK_OF_Ns, CInDelInfo::eDel, string(BLOCK_OF_Ns,'N'));
8083             m_editing_indels.push_back(Ns);
8084             m_editing_indels.push_back(g);
8085             m_editing_indels.push_back(Ns);
8086         }
8087     }
8088 
8089     m_edited_contig_map = CAlignMap(0, length-1, m_editing_indels.begin(), m_editing_indels.end());
8090     {
8091         CResidueVec editedseq;
8092         m_edited_contig_map.EditedSequence(seq,editedseq);
8093         swap(seq, editedseq);
8094     }
8095 
8096     ITERATE(TInDels, ig, m_editing_indels) {
8097         TInDels::const_iterator nexti = next(ig);
8098         if(nexti != m_editing_indels.end() && nexti->GetSource().m_range.NotEmpty() && nexti->Loc() == ig->Loc())  // block of Ns
8099             continue;
8100 
8101         if(ig->GetSource().m_range.NotEmpty()) {  //ggap
8102             int left_end = m_edited_contig_map.MapOrigToEdited(ig->Loc());
8103             if(left_end >= 0) {
8104                 left_end -= ig->Len();
8105                 for(TInDels::const_iterator igg = ig+1; igg != m_editing_indels.end() && igg->Loc() == ig->Loc(); ++igg)
8106                     left_end -= igg->Len();
8107             } else {
8108                 left_end = m_edited_contig_map.MapOrigToEdited(ig->Loc()-1);
8109                 _ASSERT(left_end >= 0);
8110                 left_end += 1;
8111                 for(TInDels::const_iterator i = ig; i != m_editing_indels.begin() && (i-1)->Loc() == ig->Loc(); --i) {
8112                     left_end += (i-1)->Len();
8113                 }
8114             }
8115             m_inserted_seqs[left_end] = ig;
8116             ++ig;   // skip  block of Ns
8117         } else {
8118             int loc = m_edited_contig_map.MapOrigToEdited(ig->InDelEnd());
8119             _ASSERT(loc >= 0);
8120             if(ig->IsInsertion()) {
8121                 string s(seq.begin()+ig->Loc(), seq.begin()+ig->Len());
8122                 m_reversed_corrections.push_back(CInDelInfo(loc, ig->Len(), CInDelInfo::eDel, NStr::ToUpper(s)));
8123             } else {
8124                 m_reversed_corrections.push_back(CInDelInfo(loc-ig->Len(), ig->Len(), CInDelInfo::eIns));
8125             }
8126             m_reversed_corrections.back().SetStatus(ig->GetStatus());
8127         }
8128     }
8129 
8130     set<int> confirmed_bases;
8131     for(list<TSignedSeqRange>::const_iterator it = correction_data.m_confirmed_intervals.begin(); it != correction_data.m_confirmed_intervals.end(); ++it) {
8132         TSignedSeqRange lim = *it;
8133         _ASSERT(lim.NotEmpty());
8134         for(int p = lim.GetFrom(); p <= lim.GetTo(); ++p)
8135             confirmed_bases.insert(p);
8136     }
8137     TIntMap::iterator cbase_len = m_confirmed_bases_orig_len.end();
8138     ITERATE(set<int>, ip, confirmed_bases) {
8139         if(cbase_len == m_confirmed_bases_orig_len.end() || *ip != cbase_len->first+cbase_len->second)
8140             cbase_len = m_confirmed_bases_orig_len.insert(TIntMap::value_type(*ip,1)).first;
8141         else
8142             ++cbase_len->second;
8143     }
8144 
8145     ITERATE(TIntMap, ic,  m_confirmed_bases_orig_len) {
8146         TSignedSeqRange lim(ic->first, ic->first+ic->second-1);
8147         lim = m_edited_contig_map.MapRangeOrigToEdited(lim, false);
8148         _ASSERT(lim.NotEmpty());
8149         m_confirmed_bases_len[lim.GetFrom()] = lim.GetLength();
8150     }
8151 
8152     TIntMap notbridgeable_gaps_len;
8153     ITERATE(TIntMap, ig, m_notbridgeable_gaps_len) {
8154         int pos = m_edited_contig_map.MapOrigToEdited(ig->first);
8155         _ASSERT(pos >= 0);
8156         notbridgeable_gaps_len[pos] = ig->second;
8157     }
8158     m_notbridgeable_gaps_len = notbridgeable_gaps_len;
8159 
8160 
8161     m_gnomon.reset(new CGnomonEngine(m_hmm_params, move(seq), TSignedSeqRange::GetWhole()));
8162 }
8163 
GetGnomon()8164 CGnomonEngine& CGnomonAnnotator_Base::GetGnomon()
8165 {
8166     return *m_gnomon;
8167 }
8168 
MarkupCappedEst(const set<string> & _caps,int _capgap)8169 MarkupCappedEst::MarkupCappedEst(const set<string>& _caps, int _capgap)
8170     : caps(_caps)
8171     , capgap(_capgap)
8172 {}
8173 
transform_align(CAlignModel & align)8174 void MarkupCappedEst::transform_align(CAlignModel& align)
8175 {
8176     string acc = CIdHandler::ToString(*align.GetTargetId());
8177     int fivep = align.TranscriptExon(0).GetFrom();
8178     if(align.Strand() == eMinus)
8179         fivep = align.TranscriptExon(align.Exons().size()-1).GetFrom();
8180     if((align.Status()&CGeneModel::eReversed) == 0 && caps.find(acc) != caps.end() && fivep < capgap)
8181         align.Status() |= CGeneModel::eCap;
8182 }
8183 
MarkupTrustedGenes(const set<string> & _trusted_genes)8184 MarkupTrustedGenes::MarkupTrustedGenes(const set<string>& _trusted_genes) : trusted_genes(_trusted_genes) {}
8185 
transform_align(CAlignModel & align)8186 void MarkupTrustedGenes::transform_align(CAlignModel& align)
8187 {
8188     string acc = CIdHandler::ToString(*align.GetTargetId());
8189     if(trusted_genes.find(acc) != trusted_genes.end()) {
8190         CRef<CSeq_id> target_id(new CSeq_id);
8191         target_id->Assign(*align.GetTargetId());
8192         if(align.Type() == CGeneModel::eProt)
8193             align.InsertTrustedProt(target_id);
8194         else
8195             align.InsertTrustedmRNA(target_id);
8196     }
8197 }
8198 
ProteinWithBigHole(double _hthresh,double _hmaxlen,CGnomonEngine & _gnomon)8199 ProteinWithBigHole::ProteinWithBigHole(double _hthresh, double _hmaxlen, CGnomonEngine& _gnomon)
8200     : hthresh(_hthresh), hmaxlen(_hmaxlen), gnomon(_gnomon) {}
model_predicate(CGeneModel & m)8201 bool ProteinWithBigHole::model_predicate(CGeneModel& m)
8202 {
8203     if ((m.Type() & CGeneModel::eProt)==0)
8204         return false;
8205     int total_hole_len = 0;
8206     for(unsigned int i = 1; i < m.Exons().size(); ++i) {
8207         if(!m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice)
8208             total_hole_len += m.Exons()[i].GetFrom()-m.Exons()[i-1].GetTo()-1;
8209     }
8210     if(total_hole_len < hmaxlen*m.Limits().GetLength())
8211         return false;
8212 
8213     for(unsigned int i = 1; i < m.Exons().size(); ++i) {
8214         bool hole = !m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice;
8215         int intron = m.Exons()[i].GetFrom()-m.Exons()[i-1].GetTo()-1;
8216         if (hole && gnomon.GetChanceOfIntronLongerThan(intron) < hthresh) {
8217             return true;
8218         }
8219     }
8220     return false;
8221 }
8222 
model_predicate(CGeneModel & m)8223 bool CdnaWithHole::model_predicate(CGeneModel& m)
8224 {
8225     if ((m.Type() & CGeneModel::eProt)!=0)
8226         return false;
8227     return !m.Continuous();
8228 }
8229 
HasShortIntron(CGnomonEngine & _gnomon)8230 HasShortIntron::HasShortIntron(CGnomonEngine& _gnomon)
8231     :gnomon(_gnomon) {}
8232 
model_predicate(CGeneModel & m)8233 bool HasShortIntron::model_predicate(CGeneModel& m)
8234 {
8235     for(unsigned int i = 1; i < m.Exons().size(); ++i) {
8236         bool hole = !m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice;
8237         int intron = m.Exons()[i].GetFrom()-m.Exons()[i-1].GetTo()-1;
8238         if (!hole && m.Exons()[i].m_fsplice_sig != "XX" && m.Exons()[i-1].m_ssplice_sig != "XX" && intron < gnomon.GetMinIntronLen()) {
8239             return true;
8240         }
8241     }
8242     return false;
8243 }
8244 
HasLongIntron(CGnomonEngine & _gnomon)8245 HasLongIntron::HasLongIntron(CGnomonEngine& _gnomon)
8246     :gnomon(_gnomon) {}
8247 
model_predicate(CGeneModel & m)8248 bool HasLongIntron::model_predicate(CGeneModel& m)
8249 {
8250     for(unsigned int i = 1; i < m.Exons().size(); ++i) {
8251         bool hole = !m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice;
8252         int intron = m.Exons()[i].GetFrom()-m.Exons()[i-1].GetTo()-1;
8253         if (!hole && intron > gnomon.GetMaxIntronLen()) {
8254             return true;
8255         }
8256     }
8257     return false;
8258 }
8259 
CutShortPartialExons(int _minex)8260 CutShortPartialExons::CutShortPartialExons(int _minex)
8261     : minex(_minex) {}
8262 
EffectiveExonLength(const CModelExon & e,const CAlignMap & alignmap,bool snap_to_codons)8263 int EffectiveExonLength(const CModelExon& e, const CAlignMap& alignmap, bool snap_to_codons) {
8264     TSignedSeqRange shrinkedexon = alignmap.ShrinkToRealPoints(e,snap_to_codons);
8265     int exonlen = alignmap.FShiftedLen(shrinkedexon,false);  // length of the projection on transcript
8266     return min(exonlen,shrinkedexon.GetLength());
8267 }
8268 
transform_align(CAlignModel & a)8269 void CutShortPartialExons::transform_align(CAlignModel& a)
8270 {
8271     if (a.Exons().empty())
8272         return;
8273 
8274     CAlignMap alignmap(a.GetAlignMap());
8275     if(a.Exons().size() == 1 && min(a.Limits().GetLength(),alignmap.FShiftedLen(alignmap.ShrinkToRealPoints(a.Limits()),false)) < 2*minex) {
8276         // one exon and it is short
8277         a.CutExons(a.Limits());
8278         return;
8279     }
8280 
8281     bool snap_to_codons = ((a.Type() & CAlignModel::eProt)!=0);
8282     TSignedSeqPos left  = a.Limits().GetFrom();
8283     if ((a.Exons().size() > 1 && !a.Exons().front().m_ssplice) || (a.Type() & CAlignModel::eProt)==0 || !a.LeftComplete()) {
8284         for(unsigned int i = 0; i < a.Exons().size()-1; ++i) {
8285             if(EffectiveExonLength(a.Exons()[i], alignmap, snap_to_codons) >= minex) {
8286                 break;
8287             } else {
8288                 left = a.Exons()[i+1].GetFrom();
8289                 if(a.Strand() == ePlus && (a.Status()&CGeneModel::eCap) != 0)
8290                     a.Status() ^= CGeneModel::eCap;
8291                 if(a.Strand() == eMinus && (a.Status()&CGeneModel::ePolyA) != 0)
8292                     a.Status() ^= CGeneModel::ePolyA;
8293             }
8294         }
8295     }
8296 
8297     TSignedSeqPos right = a.Limits().GetTo();
8298     if ((a.Exons().size() > 1 && !a.Exons().back().m_fsplice) || (a.Type() & CAlignModel::eProt)==0 || !a.RightComplete()) {
8299         for(unsigned int i = a.Exons().size()-1; i > 0; --i) {
8300             if(EffectiveExonLength(a.Exons()[i], alignmap, snap_to_codons) >= minex) {
8301                 break;
8302             } else {
8303                 right = a.Exons()[i-1].GetTo();
8304                 if(a.Strand() == eMinus && (a.Status()&CGeneModel::eCap) != 0)
8305                     a.Status() ^= CGeneModel::eCap;
8306                 if(a.Strand() == ePlus && (a.Status()&CGeneModel::ePolyA) != 0)
8307                     a.Status() ^= CGeneModel::ePolyA;
8308             }
8309         }
8310     }
8311 
8312     TSignedSeqRange newlimits(left,right);
8313     if(newlimits.NotEmpty()) {
8314         newlimits = alignmap.ShrinkToRealPoints(newlimits,snap_to_codons);
8315         if(newlimits != a.Limits()) {
8316             if(newlimits.GetLength() < 2*minex || alignmap.FShiftedLen(newlimits,false) < 2*minex) {
8317                 a.CutExons(a.Limits());
8318                 return;
8319             }
8320             a.Clip(newlimits,CAlignModel::eRemoveExons);
8321         }
8322     } else {
8323         a.CutExons(a.Limits());
8324         return;
8325     }
8326 
8327 
8328     for (size_t i = 1; i < a.Exons().size()-1; ++i) {
8329         const CModelExon* e = &a.Exons()[i];
8330 
8331         while (!e->m_ssplice && EffectiveExonLength(*e, alignmap, snap_to_codons) < minex) {
8332 
8333             if(i == 0) { //first exon
8334                 a.CutExons(*e);
8335                 e = &a.Exons()[0];    // we still have at least one exon
8336                 break;
8337             }
8338 
8339             //this point is not an indel and is a codon boundary for proteins
8340             TSignedSeqPos remainingpoint = alignmap.ShrinkToRealPoints(TSignedSeqRange(a.Exons().front().GetFrom(),a.Exons()[i-1].GetTo()),snap_to_codons).GetTo();
8341             TSignedSeqPos left = e->GetFrom();
8342             if(remainingpoint < a.Exons()[i-1].GetTo())
8343                 left = remainingpoint+1;
8344             a.CutExons(TSignedSeqRange(left,e->GetTo()));
8345             --i;
8346             e = &a.Exons()[i];
8347         }
8348 
8349         while (!e->m_fsplice && EffectiveExonLength(*e, alignmap, snap_to_codons) < minex) {
8350 
8351             if(i == a.Exons().size()-1) { //last exon
8352                 a.CutExons(*e);
8353                 break;
8354             }
8355 
8356             //this point is not an indel and is a codon boundary for proteins
8357             TSignedSeqPos remainingpoint = alignmap.ShrinkToRealPoints(TSignedSeqRange(a.Exons()[i+1].GetFrom(),a.Exons().back().GetTo()),snap_to_codons).GetFrom();
8358             TSignedSeqPos right = e->GetTo();
8359             if(remainingpoint > a.Exons()[i+1].GetFrom())
8360                 right = remainingpoint-1;
8361 
8362             a.CutExons(TSignedSeqRange(e->GetFrom(),right));
8363             e = &a.Exons()[i];
8364         }
8365     }
8366     return;
8367 }
8368 
model_predicate(CGeneModel & m)8369 bool HasNoExons::model_predicate(CGeneModel& m)
8370 {
8371     return m.Exons().empty();
8372 }
8373 
model_predicate(CGeneModel & m)8374 bool SingleExon_AllEst::model_predicate(CGeneModel& m)
8375 {
8376     return m.Exons().size() <= 1 && (m.Type() & (CAlignModel::eProt|CAlignModel::emRNA))==0;
8377 }
8378 
model_predicate(CGeneModel & m)8379 bool SingleExon_Noncoding::model_predicate(CGeneModel& m)
8380 {
8381     return m.Exons().size() <= 1 && m.Score() == BadScore();
8382 }
8383 
LowSupport_Noncoding(int _minsupport)8384 LowSupport_Noncoding::LowSupport_Noncoding(int _minsupport)
8385     : minsupport(_minsupport)
8386 {}
model_predicate(CGeneModel & m)8387 bool LowSupport_Noncoding::model_predicate(CGeneModel& m)
8388 {
8389     return m.Score() == BadScore() && int(m.Support().size()) < minsupport && (m.Type() & (CAlignModel::eProt|CAlignModel::emRNA))==0;
8390 }
8391 
8392 END_SCOPE(gnomon)
8393 END_SCOPE(ncbi)
8394 
8395 
8396