1 #ifndef ALGO_GNOMON___CHAINER__HPP
2 #define ALGO_GNOMON___CHAINER__HPP
3
4 /* $Id: chainer.hpp 626285 2021-02-25 14:08:44Z ivanov $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Authors: Alexandre Souvorov
30 *
31 * File Description:
32 *
33 */
34
35 #include <algo/gnomon/gnomon_model.hpp>
36 #include <algo/gnomon/aligncollapser.hpp>
37
38 BEGIN_SCOPE(ncbi)
39
40 class CArgDescriptions;
41 class CArgs;
42
43 BEGIN_SCOPE(gnomon)
44
45 class CHMMParameters;
46
47 struct SMinScor {
48 double m_min;
49 double m_i5p_penalty;
50 double m_i3p_penalty;
51 double m_cds_bonus;
52 double m_length_penalty;
53 double m_minprotfrac;
54 double m_endprotfrac;
55 int m_prot_cds_len;
56 int m_cds_len;
57 double m_utr_clip_threshold;
58 int m_minsupport;
59 int m_minsupport_mrna;
60 int m_minsupport_rnaseq;
61 int m_minlen;
62 };
63
64
65 // redefine STL algorithms to take Function Object pointer to allow for inheritance
66
67 template <class Container, class Predicate>
remove_if(Container & c,Predicate * __pred)68 void remove_if(Container& c, Predicate* __pred)
69 {
70 typedef typename Container::iterator Iterator;
71 Iterator __first = c.begin();
72 Iterator __last = c.end();
73 while (__first != __last) {
74 if ((*__pred)(*__first)) {
75 (__first)->Status() |= CGeneModel::eSkipped;
76 (__first)->AddComment(__pred->GetComment());
77 __first = c.erase(__first);
78 } else
79 ++__first;
80 }
81 delete __pred;
82 }
83
84 template <class Container, class UnaryFunction>
transform(Container & c,UnaryFunction * op)85 void transform(Container& c, UnaryFunction* op)
86 {
87 typedef typename Container::iterator Iterator;
88 Iterator __first = c.begin();
89 Iterator __last = c.end();
90 for (;__first != __last;++__first)
91 (*op)(*__first);
92 delete op;
93 }
94
95 struct TransformFunction {
~TransformFunctionTransformFunction96 virtual ~TransformFunction() {}
operator ()TransformFunction97 void operator()(CGeneModel& a) { transform_model(a); }
operator ()TransformFunction98 void operator()(CAlignModel& a) { transform_align(a); }
99
transform_modelTransformFunction100 virtual void transform_model(CGeneModel& /*a*/) {}
transform_alignTransformFunction101 virtual void transform_align(CAlignModel& a) { transform_model(a); }
102 };
103 struct Predicate {
~PredicatePredicate104 virtual ~Predicate() {}
GetCommentPredicate105 virtual string GetComment() { return "reason not given"; }
operator ()Predicate106 bool operator()(CGeneModel& a) { return model_predicate(a); }
operator ()Predicate107 bool operator()(CAlignModel& a) { return align_predicate(a); }
108
model_predicatePredicate109 virtual bool model_predicate(CGeneModel& /*a*/) { return false; }
align_predicatePredicate110 virtual bool align_predicate(CAlignModel& a) { return model_predicate(a); }
111 };
112
113 class NCBI_XALGOGNOMON_EXPORT CGnomonAnnotator_Base {
114 public:
115 CGnomonAnnotator_Base();
116 virtual ~CGnomonAnnotator_Base();
117
118 void SetHMMParameters(CHMMParameters* params);
119 void EnableSeqMasking();
120 void SetGenomic(const CResidueVec& seq);
121 void SetGenomic(const CSeq_id& seqid, objects::CScope& scope, const string& mask_annots = kEmptyStr, const TGeneModelList* models = 0);
122 void SetGenomic(const CSeq_id& seqid, objects::CScope& scope, const SCorrectionData& correction_data, TSignedSeqRange range = TSignedSeqRange::GetWhole(), const string& mask_annots = kEmptyStr);
123
124 /*
125 //for compatibilty with 'pre-correction' worker node
126 NCBI_DEPRECATED
127 static TInDels GetGenomicGaps(const TGeneModelList& ) { return TInDels(); }
128 NCBI_DEPRECATED
129 void SetGenomic(const CSeq_id& seqid, CScope& scope, const string& mask_annots, const TInDels* contig_fix_indels, const TGeneModelList* models = 0) {
130 if(models != 0 || contig_fix_indels == 0) {
131 SetGenomic(seqid, scope, mask_annots, models);
132 } else {
133 SCorrectionData correction_data;
134 correction_data.m_correction_indels = *contig_fix_indels;
135 SetGenomic(seqid, scope, correction_data, TSignedSeqRange::GetWhole(), mask_annots);
136 }
137 }
138 */
139
140 CGnomonEngine& GetGnomon();
141 void MapAlignmentsToEditedContig(TAlignModelList& alignments) const;
142 void MapModelsToEditedContig(TGeneModelList& models) const;
143 void MapModelsToOrigContig(TGeneModelList& models) const;
144
145 typedef map<int,TInDels::const_iterator> TGgapInfo;
146 typedef map<int,int> TIntMap;
147
148 protected:
149 CAlignModel MapOneModelToEditedContig(const CGeneModel& align) const;
150 CGeneModel MapOneModelToOrigContig(const CGeneModel& srcmodel) const;
151
152 bool m_masking;
153 CRef<CHMMParameters> m_hmm_params;
154 unique_ptr<CGnomonEngine> m_gnomon;
155 CAlignMap m_edited_contig_map;
156 TInDels m_editing_indels; // in original coordinates (include corrections, ggaps and Ns)
157 TInDels m_reversed_corrections; // corrections from edited genome back to original (without gggaps or ns)
158 TIntMap m_confirmed_bases_len; // include all "confirmed" or "corrected" positions in corrected coordinates
159 TIntMap m_confirmed_bases_orig_len; // include all "confirmed" or "corrected" positions in original coordinates
160 map<int,char> m_replacements; // in original coordinates
161 map<int,char> m_replaced_bases; // in original coordinates; just in case
162 TGgapInfo m_inserted_seqs; // edited left coord to indelinfo for ggaps
163 TIntMap m_notbridgeable_gaps_len; // don't allow introns to cross this
164 TSignedSeqRange m_limits; // limits on contig
165 string m_contig_acc;
166 };
167
168 ////////////////////////////////////////////////////////////////////////
169 class CChainerArgUtil;
170
171 class NCBI_XALGOGNOMON_EXPORT CChainer : public CGnomonAnnotator_Base {
172 public:
173 CChainer();
174 ~CChainer();
175
176 void SetIntersectLimit(int value);
177 void SetTrim(int trim);
178 void SetMinPolyA(int minpolya);
179 SMinScor& SetMinScor();
180 void SetMinInframeFrac(double mininframefrac);
181 map<string, pair<bool,bool> >& SetProtComplet();
182 map<string,TSignedSeqRange>& SetMrnaCDS();
183 void SetGenomicRange(const TAlignModelList& alignments);
184 void SetNumbering(int idnext, int idinc);
185 void SetOnlyBestFs(bool onlybestfs);
186
187 TransformFunction* TrimAlignment();
188 TransformFunction* DoNotBelieveShortPolyATail();
189 Predicate* OverlapsSameAccessionAlignment(TAlignModelList& alignments);
190 Predicate* ConnectsParalogs(TAlignModelList& alignments);
191 TransformFunction* ProjectCDS(objects::CScope& scope);
192 TransformFunction* DoNotBelieveFrameShiftsWithoutCdsEvidence();
193 void SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments);
194 void DropAlignmentInfo(TAlignModelList& alignments, TGeneModelList& models);
195 void FilterOutChimeras(TGeneModelList& clust);
196 void ScoreCDSes_FilterOutPoorAlignments(TGeneModelList& clust);
197 void FindSelenoproteinsClipProteinsToStartStop(TGeneModelList& clust);
198 void CutParts(TGeneModelList& models);
199 TGeneModelList MakeChains(TGeneModelList& models, bool coding_estimates_only = false);
200
201 /*
202 // dummy functions for compatibilty with 'pre-correction' worker node
203 NCBI_DEPRECATED
204 void FilterOutInferiorProtAlignmentsWithIncompatibleFShifts(TGeneModelList& ) { return; }
205 NCBI_DEPRECATED
206 void ReplicateFrameShifts(TGeneModelList& ) { return; }
207 */
208
209 private:
210 // Prohibit copy constructor and assignment operator
211 CChainer(const CChainer& value);
212 CChainer& operator= (const CChainer& value);
213
214 class CChainerImpl;
215 unique_ptr<CChainerImpl> m_data;
216
217 friend class CChainerArgUtil;
218 };
219
220 struct MarkupCappedEst : public TransformFunction {
221 MarkupCappedEst(const set<string>& _caps, int _capgap);
222
223 const set<string>& caps;
224 int capgap;
225
226 virtual void transform_align(CAlignModel& align);
227 };
228
229 struct MarkupTrustedGenes : public TransformFunction {
230 MarkupTrustedGenes(const set<string>& _trusted_genes);
231 const set<string>& trusted_genes;
232
233 virtual void transform_align(CAlignModel& align);
234 };
235
236 struct ProteinWithBigHole : public Predicate {
237 ProteinWithBigHole(double hthresh, double hmaxlen, CGnomonEngine& gnomon);
238 double hthresh, hmaxlen;
239 CGnomonEngine& gnomon;
240
241 virtual bool model_predicate(CGeneModel& align);
242 };
243
244 struct CdnaWithHole : public Predicate {
245 virtual bool model_predicate(CGeneModel& align);
246 };
247
248 struct HasShortIntron : public Predicate {
249 HasShortIntron(CGnomonEngine& gnomon);
250 CGnomonEngine& gnomon;
251 virtual bool model_predicate(CGeneModel& align);
252 };
253
254 struct HasLongIntron : public Predicate {
255 HasLongIntron(CGnomonEngine& gnomon);
256 CGnomonEngine& gnomon;
257 virtual bool model_predicate(CGeneModel& align);
258 };
259
260 struct CutShortPartialExons : public TransformFunction {
261 CutShortPartialExons(int minex);
262 int minex;
263
264 virtual void transform_align(CAlignModel& align);
265 };
266
267 struct HasNoExons : public Predicate {
268 virtual bool model_predicate(CGeneModel& align);
269 };
270
271 struct SingleExon_AllEst : public Predicate {
272 virtual bool model_predicate(CGeneModel& align);
273 };
274
275 struct SingleExon_Noncoding : public Predicate {
276 virtual bool model_predicate(CGeneModel& align);
277 };
278
279 struct LowSupport_Noncoding : public Predicate {
280 LowSupport_Noncoding(int _minsupport);
281 int minsupport;
282 virtual bool model_predicate(CGeneModel& align);
283 };
284
285 class NCBI_XALGOGNOMON_EXPORT CChainerArgUtil {
286 public:
287 static void SetupArgDescriptions(CArgDescriptions* arg_desc);
288 static void ArgsToChainer(CChainer* chainer, const CArgs& args, objects::CScope& scope);
289 };
290
291 END_SCOPE(gnomon)
292 END_SCOPE(ncbi)
293
294
295 #endif // ALGO_GNOMON___CHAINER__HPP
296