1 #ifndef ALGO_GNOMON___CHAINER__HPP
2 #define ALGO_GNOMON___CHAINER__HPP
3 
4 /*  $Id: chainer.hpp 626285 2021-02-25 14:08:44Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors:  Alexandre Souvorov
30  *
31  * File Description:
32  *
33  */
34 
35 #include <algo/gnomon/gnomon_model.hpp>
36 #include <algo/gnomon/aligncollapser.hpp>
37 
38 BEGIN_SCOPE(ncbi)
39 
40 class CArgDescriptions;
41 class CArgs;
42 
43 BEGIN_SCOPE(gnomon)
44 
45 class CHMMParameters;
46 
47 struct SMinScor {
48     double m_min;
49     double m_i5p_penalty;
50     double m_i3p_penalty;
51     double m_cds_bonus;
52     double m_length_penalty;
53     double m_minprotfrac;
54     double m_endprotfrac;
55     int m_prot_cds_len;
56     int m_cds_len;
57     double m_utr_clip_threshold;
58     int m_minsupport;
59     int m_minsupport_mrna;
60     int m_minsupport_rnaseq;
61     int m_minlen;
62 };
63 
64 
65 // redefine STL algorithms to take Function Object pointer to allow for inheritance
66 
67 template <class Container, class Predicate>
remove_if(Container & c,Predicate * __pred)68 void remove_if(Container& c, Predicate* __pred)
69 {
70     typedef typename Container::iterator Iterator;
71     Iterator __first = c.begin();
72     Iterator __last = c.end();
73     while (__first != __last) {
74         if ((*__pred)(*__first)) {
75             (__first)->Status() |= CGeneModel::eSkipped;
76             (__first)->AddComment(__pred->GetComment());
77             __first = c.erase(__first);
78         } else
79             ++__first;
80     }
81     delete __pred;
82 }
83 
84 template <class Container, class UnaryFunction>
transform(Container & c,UnaryFunction * op)85 void transform(Container& c,  UnaryFunction* op)
86 {
87     typedef typename Container::iterator Iterator;
88     Iterator __first = c.begin();
89     Iterator __last = c.end();
90     for (;__first != __last;++__first)
91 	(*op)(*__first);
92     delete op;
93 }
94 
95 struct TransformFunction {
~TransformFunctionTransformFunction96     virtual ~TransformFunction() {}
operator ()TransformFunction97     void operator()(CGeneModel& a) { transform_model(a); }
operator ()TransformFunction98     void operator()(CAlignModel& a) { transform_align(a); }
99 
transform_modelTransformFunction100     virtual void transform_model(CGeneModel& /*a*/) {}
transform_alignTransformFunction101     virtual void transform_align(CAlignModel& a) { transform_model(a); }
102 };
103 struct Predicate {
~PredicatePredicate104     virtual ~Predicate() {}
GetCommentPredicate105     virtual string GetComment() { return "reason not given"; }
operator ()Predicate106     bool operator()(CGeneModel& a) { return  model_predicate(a); }
operator ()Predicate107     bool operator()(CAlignModel& a) { return align_predicate(a); }
108 
model_predicatePredicate109     virtual bool model_predicate(CGeneModel& /*a*/) { return false; }
align_predicatePredicate110     virtual bool align_predicate(CAlignModel& a) { return model_predicate(a); }
111 };
112 
113 class NCBI_XALGOGNOMON_EXPORT CGnomonAnnotator_Base {
114 public:
115     CGnomonAnnotator_Base();
116     virtual ~CGnomonAnnotator_Base();
117 
118     void SetHMMParameters(CHMMParameters* params);
119     void EnableSeqMasking();
120     void SetGenomic(const CResidueVec& seq);
121     void SetGenomic(const CSeq_id& seqid, objects::CScope& scope, const string& mask_annots = kEmptyStr, const TGeneModelList* models = 0);
122     void SetGenomic(const CSeq_id& seqid, objects::CScope& scope, const SCorrectionData& correction_data, TSignedSeqRange range = TSignedSeqRange::GetWhole(), const string& mask_annots = kEmptyStr);
123 
124     /*
125     //for compatibilty with 'pre-correction' worker node
126     NCBI_DEPRECATED
127     static TInDels GetGenomicGaps(const TGeneModelList& ) { return TInDels(); }
128     NCBI_DEPRECATED
129     void SetGenomic(const CSeq_id& seqid, CScope& scope, const string& mask_annots, const TInDels* contig_fix_indels, const TGeneModelList* models = 0) {
130         if(models != 0 || contig_fix_indels == 0) {
131             SetGenomic(seqid, scope, mask_annots, models);
132         } else {
133             SCorrectionData correction_data;
134             correction_data.m_correction_indels = *contig_fix_indels;
135             SetGenomic(seqid, scope, correction_data, TSignedSeqRange::GetWhole(), mask_annots);
136         }
137     }
138     */
139 
140     CGnomonEngine& GetGnomon();
141     void MapAlignmentsToEditedContig(TAlignModelList& alignments) const;
142     void MapModelsToEditedContig(TGeneModelList& models) const;
143     void MapModelsToOrigContig(TGeneModelList& models) const;
144 
145     typedef map<int,TInDels::const_iterator> TGgapInfo;
146     typedef map<int,int> TIntMap;
147 
148 protected:
149     CAlignModel MapOneModelToEditedContig(const CGeneModel& align) const;
150     CGeneModel MapOneModelToOrigContig(const CGeneModel& srcmodel) const;
151 
152     bool m_masking;
153     CRef<CHMMParameters> m_hmm_params;
154     unique_ptr<CGnomonEngine> m_gnomon;
155     CAlignMap m_edited_contig_map;
156     TInDels m_editing_indels;           // in original coordinates (include corrections, ggaps and Ns)
157     TInDels m_reversed_corrections;     // corrections from edited genome back to original (without gggaps or ns)
158     TIntMap m_confirmed_bases_len;      // include all "confirmed" or "corrected" positions in corrected coordinates
159     TIntMap m_confirmed_bases_orig_len; // include all "confirmed" or "corrected" positions in original coordinates
160     map<int,char> m_replacements;       // in original coordinates
161     map<int,char> m_replaced_bases;     // in original coordinates; just in case
162     TGgapInfo m_inserted_seqs;          // edited left coord to indelinfo for ggaps
163     TIntMap m_notbridgeable_gaps_len;   // don't allow introns to cross this
164     TSignedSeqRange m_limits;           // limits on contig
165     string m_contig_acc;
166 };
167 
168 ////////////////////////////////////////////////////////////////////////
169 class CChainerArgUtil;
170 
171 class NCBI_XALGOGNOMON_EXPORT CChainer : public CGnomonAnnotator_Base {
172 public:
173     CChainer();
174     ~CChainer();
175 
176     void SetIntersectLimit(int value);
177     void SetTrim(int trim);
178     void SetMinPolyA(int minpolya);
179     SMinScor& SetMinScor();
180     void SetMinInframeFrac(double mininframefrac);
181     map<string, pair<bool,bool> >& SetProtComplet();
182     map<string,TSignedSeqRange>& SetMrnaCDS();
183     void SetGenomicRange(const TAlignModelList& alignments);
184     void SetNumbering(int idnext, int idinc);
185     void SetOnlyBestFs(bool onlybestfs);
186 
187     TransformFunction* TrimAlignment();
188     TransformFunction* DoNotBelieveShortPolyATail();
189     Predicate* OverlapsSameAccessionAlignment(TAlignModelList& alignments);
190     Predicate* ConnectsParalogs(TAlignModelList& alignments);
191     TransformFunction* ProjectCDS(objects::CScope& scope);
192     TransformFunction* DoNotBelieveFrameShiftsWithoutCdsEvidence();
193     void SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments);
194     void DropAlignmentInfo(TAlignModelList& alignments, TGeneModelList& models);
195     void FilterOutChimeras(TGeneModelList& clust);
196     void ScoreCDSes_FilterOutPoorAlignments(TGeneModelList& clust);
197     void FindSelenoproteinsClipProteinsToStartStop(TGeneModelList& clust);
198     void CutParts(TGeneModelList& models);
199     TGeneModelList MakeChains(TGeneModelList& models, bool coding_estimates_only = false);
200 
201     /*
202     // dummy functions for compatibilty with 'pre-correction' worker node
203     NCBI_DEPRECATED
204     void FilterOutInferiorProtAlignmentsWithIncompatibleFShifts(TGeneModelList& ) { return; }
205     NCBI_DEPRECATED
206     void ReplicateFrameShifts(TGeneModelList& ) { return; }
207     */
208 
209 private:
210     // Prohibit copy constructor and assignment operator
211     CChainer(const CChainer& value);
212     CChainer& operator= (const CChainer& value);
213 
214     class CChainerImpl;
215     unique_ptr<CChainerImpl> m_data;
216 
217     friend class CChainerArgUtil;
218 };
219 
220 struct MarkupCappedEst : public TransformFunction {
221     MarkupCappedEst(const set<string>& _caps, int _capgap);
222 
223     const set<string>& caps;
224     int capgap;
225 
226     virtual void transform_align(CAlignModel& align);
227 };
228 
229 struct MarkupTrustedGenes : public TransformFunction {
230     MarkupTrustedGenes(const set<string>& _trusted_genes);
231     const set<string>& trusted_genes;
232 
233     virtual void transform_align(CAlignModel& align);
234 };
235 
236 struct ProteinWithBigHole : public Predicate {
237     ProteinWithBigHole(double hthresh, double hmaxlen, CGnomonEngine& gnomon);
238     double hthresh, hmaxlen;
239     CGnomonEngine& gnomon;
240 
241     virtual bool model_predicate(CGeneModel& align);
242 };
243 
244 struct CdnaWithHole : public Predicate {
245     virtual bool model_predicate(CGeneModel& align);
246 };
247 
248 struct HasShortIntron : public Predicate {
249     HasShortIntron(CGnomonEngine& gnomon);
250     CGnomonEngine& gnomon;
251     virtual bool model_predicate(CGeneModel& align);
252 };
253 
254 struct HasLongIntron : public Predicate {
255     HasLongIntron(CGnomonEngine& gnomon);
256     CGnomonEngine& gnomon;
257     virtual bool model_predicate(CGeneModel& align);
258 };
259 
260 struct CutShortPartialExons : public TransformFunction {
261     CutShortPartialExons(int minex);
262     int minex;
263 
264     virtual void transform_align(CAlignModel& align);
265 };
266 
267 struct HasNoExons : public Predicate {
268     virtual bool model_predicate(CGeneModel& align);
269 };
270 
271 struct SingleExon_AllEst : public Predicate {
272     virtual bool model_predicate(CGeneModel& align);
273 };
274 
275 struct SingleExon_Noncoding : public Predicate {
276     virtual bool model_predicate(CGeneModel& align);
277 };
278 
279 struct LowSupport_Noncoding : public Predicate {
280     LowSupport_Noncoding(int _minsupport);
281     int minsupport;
282     virtual bool model_predicate(CGeneModel& align);
283 };
284 
285 class NCBI_XALGOGNOMON_EXPORT CChainerArgUtil {
286 public:
287     static void SetupArgDescriptions(CArgDescriptions* arg_desc);
288     static void ArgsToChainer(CChainer* chainer, const CArgs& args, objects::CScope& scope);
289 };
290 
291 END_SCOPE(gnomon)
292 END_SCOPE(ncbi)
293 
294 
295 #endif  // ALGO_GNOMON___CHAINER__HPP
296