1 #ifndef ALGO_SEQUENCE___GENE_MODEL__HPP
2 #define ALGO_SEQUENCE___GENE_MODEL__HPP
3 
4 /*  $Id: gene_model.hpp 633919 2021-06-29 12:10:49Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors:  Mike DiCuccio
30  *
31  * File Description:
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbiobj.hpp>
37 #include <corelib/ncbiexpt.hpp>
38 #include <util/range.hpp>
39 #include <objects/seqfeat/Cdregion.hpp>
40 
41 BEGIN_NCBI_SCOPE
42 BEGIN_SCOPE(objects)
43     class CScope;
44     class CSeq_id;
45     class CSeq_loc;
46     class CSeq_feat;
47     class CSeq_align;
48     class CSeq_annot;
49     class CBioseq_set;
50     class CBioseq_Handle;
51     struct SAnnotSelector;
52 END_SCOPE(objects)
53 
54 class CAlgoFeatureGeneratorException : public CException
55  {
56  public:
57      enum EErrCode {
58          eUnknown,
59          eMicroIntrons,
60      };
GetErrCodeString(void) const61      virtual const char* GetErrCodeString(void) const override {
62          switch ( GetErrCode() ) {
63          case eUnknown:
64              return "Unknown error";
65          case eMicroIntrons:
66              return "MicroIntron generation failure";
67          default:
68              return CException::GetErrCodeString();
69          }
70      }
71      NCBI_EXCEPTION_DEFAULT(CAlgoFeatureGeneratorException, CException);
72  };
73 
74 
75 class NCBI_XALGOSEQ_EXPORT CFeatureGenerator
76 {
77 public:
78     NCBI_DEPRECATED
79     CFeatureGenerator(CRef<objects::CScope> scope);
80 
81     CFeatureGenerator(objects::CScope& scope);
82     ~CFeatureGenerator();
83 
84     enum EGeneModelCreateFlags {
85 
86         // CleanAlignment flags
87         fTrimEnds            = 0x1000, // trim ends to codon boundaries (protein or mrna with CDS partially aligned)
88         fMaximizeTranslation = 0x2000, // leave only 1-2 base indels:
89                                        // minimize product-ins modulo 3,
90                                        // replace complete genomic-ins triplets with diags
91                                        // recalculate query positions.
92                                        // Need to be careful with transcript queries -
93                                        // cdregion passed to convert should correspond to the modified
94                                        // query positions
95 
96         // Convert flags
97         fCreateGene          = 0x001,
98         fCreateMrna          = 0x002,
99         fCreateCdregion      = 0x004,
100         fPromoteAllFeatures  = 0x008,
101         fPropagateOnly       = 0x010,
102         fForceTranslateCds   = 0x020,
103         fForceTranscribeMrna = 0x040,
104         fDensegAsExon        = 0x080,
105         fGenerateLocalIds    = 0x100,     // uses current date
106         fGenerateStableLocalIds = 0x200,  // reproducible ids
107         fPropagateNcrnaFeats = 0x400,
108         fTrustProteinSeq     = 0x800,
109         // already-used:     = 0x1000,
110         // already-used:     = 0x2000,
111         fDeNovoProducts      = 0x4000,
112         fAddTranslatedCDSAssembly = 0x8000, // add translated_cds_bioseq->SetInst().SetHist().SetAssembly().push_back(align)
113         fDropManeMarkup       = 0x00010000,
114         fSkipLocationCheck    = 0x00020000,
115 
116         fDefaults = fCreateGene | fCreateMrna | fCreateCdregion |
117                     fGenerateLocalIds | fPropagateNcrnaFeats
118     };
119     typedef int TFeatureGeneratorFlags;
120     static const TSeqPos kDefaultMinIntron = 200;
121     static const TSeqPos kDefaultAllowedUnaligned = 10;
122 
123     void SetFlags(TFeatureGeneratorFlags);
124     TFeatureGeneratorFlags GetFlags() const;
125     void SetMinIntron(TSeqPos);
126     void SetAllowedUnaligned(TSeqPos);
127 
128     /// Clean an alignment according to our best guess of its biological
129     /// representation.  Cleaning involves adjusting segments to satisfy our
130     /// expectations of partial exonic alignments and account for unaligned
131     /// parts. Eg. stitching small gaps (less than min_intron), trimming to codon boundaries.
132     /// May shift product positions.
133     CConstRef<objects::CSeq_align>
134     CleanAlignment(const objects::CSeq_align& align);
135 
136     /// Adjust alignment to the specified range
137     /// (cross-the-origin range on circular chromosome is indicated by range.from > range.to)
138     /// Will add necessary 'diags' at ends.
139     /// Throws an exception on attempt to shink past an indel in CDS
140     /// Works on Spliced-seg alignments only.
141     /// Note: for a protein alignment do not expand it to include stop codon.
142 
143     enum EProductPositionsMode {
144         eForceProductFrom0,
145         eTryToPreserveProductPositions
146     };
147     CConstRef<objects::CSeq_align>
148     AdjustAlignment(const objects::CSeq_align& align, TSeqRange range, EProductPositionsMode mode = eForceProductFrom0);
149 
150 
151     /// Convert an alignment to an annotation.
152     /// This will optionally promote all features through the alignment
153     /// and create product sequences
154     /// Returns mRNA feature
155     CRef<objects::CSeq_feat> ConvertAlignToAnnot(const objects::CSeq_align& align,
156                              objects::CSeq_annot& annot,
157                              objects::CBioseq_set& seqs,
158                              Int8 gene_id = 0,
159                              const objects::CSeq_feat* cdregion_on_mrna = NULL);
160 
161     void ConvertAlignToAnnot(const list< CRef<objects::CSeq_align> > &aligns,
162                              objects::CSeq_annot &annot,
163                              objects::CBioseq_set &seqs);
164 
165     /// Convert genomic location to an annotation. Populates seqs with mRna
166     /// and protein sequences, and populates annot with gene, mRna
167     /// and cdretgion features
168     void ConvertLocToAnnot(
169         const objects::CSeq_loc &loc,
170         objects::CSeq_annot& annot,
171         objects::CBioseq_set& seqs,
172         objects::CCdregion::EFrame frame = objects::CCdregion::eFrame_one,
173         CRef<objects::CSeq_id> prot_id = CRef<objects::CSeq_id>(),
174         CRef<objects::CSeq_id> rna_id = CRef<objects::CSeq_id>());
175 
176     /// Correctly mark exceptions on a feature
177     ///
178     void SetFeatureExceptions(objects::CSeq_feat& feat,
179                               const objects::CSeq_align* align = NULL);
180 
181     /// Mark the correct partial states for a set of features
182     ///
183     void SetPartialFlags(CRef<objects::CSeq_feat> gene_feat,
184                          CRef<objects::CSeq_feat> mrna_feat,
185                          CRef<objects::CSeq_feat> cds_feat);
186 
187     /// Recompute the correct partial states for all features in this annotation
188     void RecomputePartialFlags(objects::CSeq_annot& annot);
189 
190 
191 
192 
193     /// Project RNA, preserving discontinuities in the CDS.
194     ///
195     /// Postcondition: Output is a mix of packed-ints, where each sub-loc in the mix
196     /// is an exon, and each subloc in the exon packed-int is an exon chunk. The chunks may
197     /// have gaps between them or overlap as to preserve the translation frame of the CDS.
198     ///
199     /// The discontinuities (gaps and overlaps of chunks) that are outside of the CDS are collapsed.
200     ///
201     /// Singleton container locs (comprised of single element) are canonicalized:
202     ///   unbroken exons are represented as a single interval
203     ///   single-exon locs are represented as a single packed-int (or int, as per above)
204     static CRef<objects::CSeq_loc> s_ProjectRNA(const objects::CSeq_align& spliced_aln,
205                                                 CConstRef<objects::CSeq_loc> product_cds_loc = CConstRef<objects::CSeq_loc>(NULL),
206                                                 size_t unaligned_ends_partialness_thr = kDefaultAllowedUnaligned);
207     /// Similar to s_ProjectRNA(...)
208     /// Postcondition: seq-vector of the returned loc is of exact same length and has no indels
209     ///                relative to the seq-vector of the product_cds_loc truncated to the alignment boundaries.
210     /// 1-2 bp overlaps converted to gaps preserving frame if convert_overlaps = true
211     static CRef<objects::CSeq_loc> s_ProjectCDS(const objects::CSeq_align& spliced_aln,
212                                                 const objects::CSeq_loc& product_cds_loc,
213                                                 bool convert_overlaps = true);
214     // when specified, annot_name creates introns for features from a given annot_name
215     // non-NULL range limits processing to a specific range
216     static void CreateMicroIntrons(
217                                                 objects::CScope& scope,
218                                                 objects::CBioseq_Handle bsh,
219                                                 const string& annot_name = "",
220                                                 TSeqRange* range = NULL,
221                                                 bool ignore_errors = false);
222 
223 private:
224     struct SImplementation;
225     auto_ptr<SImplementation> m_impl;
226 
227     // adjust the selector to use a given annotation if not empty
228     static void x_SetAnnotName(objects::SAnnotSelector& sel, const string& annot_name);
229 };
230 
231 
232 class NCBI_XALGOSEQ_EXPORT CGeneModel
233 {
234 public:
235     enum EGeneModelCreateFlags {
236         fCreateGene          = CFeatureGenerator::fCreateGene,
237         fCreateMrna          = CFeatureGenerator::fCreateMrna,
238         fCreateCdregion      = CFeatureGenerator::fCreateCdregion,
239         fForceTranslateCds   = CFeatureGenerator::fForceTranslateCds,
240         fForceTranscribeMrna = CFeatureGenerator::fForceTranscribeMrna,
241 
242         fDefaults = fCreateGene | fCreateMrna | fCreateCdregion
243     };
244     typedef int TGeneModelCreateFlags;
245 
246     /// Create a gene model from an alignment
247     /// this will optionally promote all features through the alignment
248     NCBI_DEPRECATED
249     static void CreateGeneModelFromAlign(const objects::CSeq_align& align,
250                                          objects::CScope& scope,
251                                          objects::CSeq_annot& annot,
252                                          objects::CBioseq_set& seqs,
253                                          TGeneModelCreateFlags flags = fDefaults,
254                                          TSeqPos allowed_unaligned = 10);
255 
256     NCBI_DEPRECATED
257     static void CreateGeneModelsFromAligns(const list< CRef<objects::CSeq_align> > &aligns,
258                                            objects::CScope& scope,
259                                            objects::CSeq_annot& annot,
260                                            objects::CBioseq_set& seqs,
261                                            TGeneModelCreateFlags flags = fDefaults,
262                                            TSeqPos allowed_unaligned = 10);
263 
264     /// Correctly mark exceptions on a feature
265     ///
266     NCBI_DEPRECATED
267     static void SetFeatureExceptions(objects::CSeq_feat& feat,
268                                      objects::CScope& scope,
269                                      const objects::CSeq_align* align = NULL);
270 
271     NCBI_DEPRECATED
272     static void SetPartialFlags(objects::CScope& scope,
273                                 CRef<objects::CSeq_feat> gene_feat,
274                                 CRef<objects::CSeq_feat> mrna_feat,
275                                 CRef<objects::CSeq_feat> cds_feat);
276 
277     NCBI_DEPRECATED
278     static void RecomputePartialFlags(objects::CScope& scope,
279                                       objects::CSeq_annot& annot);
280 };
281 
282 END_NCBI_SCOPE
283 
284 #endif  // ALGO_SEQUENCE___GENE_MODEL__HPP
285