1 #ifndef ALGO_SEQUENCE___GENE_MODEL__HPP 2 #define ALGO_SEQUENCE___GENE_MODEL__HPP 3 4 /* $Id: gene_model.hpp 633919 2021-06-29 12:10:49Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Authors: Mike DiCuccio 30 * 31 * File Description: 32 * 33 */ 34 35 #include <corelib/ncbistd.hpp> 36 #include <corelib/ncbiobj.hpp> 37 #include <corelib/ncbiexpt.hpp> 38 #include <util/range.hpp> 39 #include <objects/seqfeat/Cdregion.hpp> 40 41 BEGIN_NCBI_SCOPE 42 BEGIN_SCOPE(objects) 43 class CScope; 44 class CSeq_id; 45 class CSeq_loc; 46 class CSeq_feat; 47 class CSeq_align; 48 class CSeq_annot; 49 class CBioseq_set; 50 class CBioseq_Handle; 51 struct SAnnotSelector; 52 END_SCOPE(objects) 53 54 class CAlgoFeatureGeneratorException : public CException 55 { 56 public: 57 enum EErrCode { 58 eUnknown, 59 eMicroIntrons, 60 }; GetErrCodeString(void) const61 virtual const char* GetErrCodeString(void) const override { 62 switch ( GetErrCode() ) { 63 case eUnknown: 64 return "Unknown error"; 65 case eMicroIntrons: 66 return "MicroIntron generation failure"; 67 default: 68 return CException::GetErrCodeString(); 69 } 70 } 71 NCBI_EXCEPTION_DEFAULT(CAlgoFeatureGeneratorException, CException); 72 }; 73 74 75 class NCBI_XALGOSEQ_EXPORT CFeatureGenerator 76 { 77 public: 78 NCBI_DEPRECATED 79 CFeatureGenerator(CRef<objects::CScope> scope); 80 81 CFeatureGenerator(objects::CScope& scope); 82 ~CFeatureGenerator(); 83 84 enum EGeneModelCreateFlags { 85 86 // CleanAlignment flags 87 fTrimEnds = 0x1000, // trim ends to codon boundaries (protein or mrna with CDS partially aligned) 88 fMaximizeTranslation = 0x2000, // leave only 1-2 base indels: 89 // minimize product-ins modulo 3, 90 // replace complete genomic-ins triplets with diags 91 // recalculate query positions. 92 // Need to be careful with transcript queries - 93 // cdregion passed to convert should correspond to the modified 94 // query positions 95 96 // Convert flags 97 fCreateGene = 0x001, 98 fCreateMrna = 0x002, 99 fCreateCdregion = 0x004, 100 fPromoteAllFeatures = 0x008, 101 fPropagateOnly = 0x010, 102 fForceTranslateCds = 0x020, 103 fForceTranscribeMrna = 0x040, 104 fDensegAsExon = 0x080, 105 fGenerateLocalIds = 0x100, // uses current date 106 fGenerateStableLocalIds = 0x200, // reproducible ids 107 fPropagateNcrnaFeats = 0x400, 108 fTrustProteinSeq = 0x800, 109 // already-used: = 0x1000, 110 // already-used: = 0x2000, 111 fDeNovoProducts = 0x4000, 112 fAddTranslatedCDSAssembly = 0x8000, // add translated_cds_bioseq->SetInst().SetHist().SetAssembly().push_back(align) 113 fDropManeMarkup = 0x00010000, 114 fSkipLocationCheck = 0x00020000, 115 116 fDefaults = fCreateGene | fCreateMrna | fCreateCdregion | 117 fGenerateLocalIds | fPropagateNcrnaFeats 118 }; 119 typedef int TFeatureGeneratorFlags; 120 static const TSeqPos kDefaultMinIntron = 200; 121 static const TSeqPos kDefaultAllowedUnaligned = 10; 122 123 void SetFlags(TFeatureGeneratorFlags); 124 TFeatureGeneratorFlags GetFlags() const; 125 void SetMinIntron(TSeqPos); 126 void SetAllowedUnaligned(TSeqPos); 127 128 /// Clean an alignment according to our best guess of its biological 129 /// representation. Cleaning involves adjusting segments to satisfy our 130 /// expectations of partial exonic alignments and account for unaligned 131 /// parts. Eg. stitching small gaps (less than min_intron), trimming to codon boundaries. 132 /// May shift product positions. 133 CConstRef<objects::CSeq_align> 134 CleanAlignment(const objects::CSeq_align& align); 135 136 /// Adjust alignment to the specified range 137 /// (cross-the-origin range on circular chromosome is indicated by range.from > range.to) 138 /// Will add necessary 'diags' at ends. 139 /// Throws an exception on attempt to shink past an indel in CDS 140 /// Works on Spliced-seg alignments only. 141 /// Note: for a protein alignment do not expand it to include stop codon. 142 143 enum EProductPositionsMode { 144 eForceProductFrom0, 145 eTryToPreserveProductPositions 146 }; 147 CConstRef<objects::CSeq_align> 148 AdjustAlignment(const objects::CSeq_align& align, TSeqRange range, EProductPositionsMode mode = eForceProductFrom0); 149 150 151 /// Convert an alignment to an annotation. 152 /// This will optionally promote all features through the alignment 153 /// and create product sequences 154 /// Returns mRNA feature 155 CRef<objects::CSeq_feat> ConvertAlignToAnnot(const objects::CSeq_align& align, 156 objects::CSeq_annot& annot, 157 objects::CBioseq_set& seqs, 158 Int8 gene_id = 0, 159 const objects::CSeq_feat* cdregion_on_mrna = NULL); 160 161 void ConvertAlignToAnnot(const list< CRef<objects::CSeq_align> > &aligns, 162 objects::CSeq_annot &annot, 163 objects::CBioseq_set &seqs); 164 165 /// Convert genomic location to an annotation. Populates seqs with mRna 166 /// and protein sequences, and populates annot with gene, mRna 167 /// and cdretgion features 168 void ConvertLocToAnnot( 169 const objects::CSeq_loc &loc, 170 objects::CSeq_annot& annot, 171 objects::CBioseq_set& seqs, 172 objects::CCdregion::EFrame frame = objects::CCdregion::eFrame_one, 173 CRef<objects::CSeq_id> prot_id = CRef<objects::CSeq_id>(), 174 CRef<objects::CSeq_id> rna_id = CRef<objects::CSeq_id>()); 175 176 /// Correctly mark exceptions on a feature 177 /// 178 void SetFeatureExceptions(objects::CSeq_feat& feat, 179 const objects::CSeq_align* align = NULL); 180 181 /// Mark the correct partial states for a set of features 182 /// 183 void SetPartialFlags(CRef<objects::CSeq_feat> gene_feat, 184 CRef<objects::CSeq_feat> mrna_feat, 185 CRef<objects::CSeq_feat> cds_feat); 186 187 /// Recompute the correct partial states for all features in this annotation 188 void RecomputePartialFlags(objects::CSeq_annot& annot); 189 190 191 192 193 /// Project RNA, preserving discontinuities in the CDS. 194 /// 195 /// Postcondition: Output is a mix of packed-ints, where each sub-loc in the mix 196 /// is an exon, and each subloc in the exon packed-int is an exon chunk. The chunks may 197 /// have gaps between them or overlap as to preserve the translation frame of the CDS. 198 /// 199 /// The discontinuities (gaps and overlaps of chunks) that are outside of the CDS are collapsed. 200 /// 201 /// Singleton container locs (comprised of single element) are canonicalized: 202 /// unbroken exons are represented as a single interval 203 /// single-exon locs are represented as a single packed-int (or int, as per above) 204 static CRef<objects::CSeq_loc> s_ProjectRNA(const objects::CSeq_align& spliced_aln, 205 CConstRef<objects::CSeq_loc> product_cds_loc = CConstRef<objects::CSeq_loc>(NULL), 206 size_t unaligned_ends_partialness_thr = kDefaultAllowedUnaligned); 207 /// Similar to s_ProjectRNA(...) 208 /// Postcondition: seq-vector of the returned loc is of exact same length and has no indels 209 /// relative to the seq-vector of the product_cds_loc truncated to the alignment boundaries. 210 /// 1-2 bp overlaps converted to gaps preserving frame if convert_overlaps = true 211 static CRef<objects::CSeq_loc> s_ProjectCDS(const objects::CSeq_align& spliced_aln, 212 const objects::CSeq_loc& product_cds_loc, 213 bool convert_overlaps = true); 214 // when specified, annot_name creates introns for features from a given annot_name 215 // non-NULL range limits processing to a specific range 216 static void CreateMicroIntrons( 217 objects::CScope& scope, 218 objects::CBioseq_Handle bsh, 219 const string& annot_name = "", 220 TSeqRange* range = NULL, 221 bool ignore_errors = false); 222 223 private: 224 struct SImplementation; 225 auto_ptr<SImplementation> m_impl; 226 227 // adjust the selector to use a given annotation if not empty 228 static void x_SetAnnotName(objects::SAnnotSelector& sel, const string& annot_name); 229 }; 230 231 232 class NCBI_XALGOSEQ_EXPORT CGeneModel 233 { 234 public: 235 enum EGeneModelCreateFlags { 236 fCreateGene = CFeatureGenerator::fCreateGene, 237 fCreateMrna = CFeatureGenerator::fCreateMrna, 238 fCreateCdregion = CFeatureGenerator::fCreateCdregion, 239 fForceTranslateCds = CFeatureGenerator::fForceTranslateCds, 240 fForceTranscribeMrna = CFeatureGenerator::fForceTranscribeMrna, 241 242 fDefaults = fCreateGene | fCreateMrna | fCreateCdregion 243 }; 244 typedef int TGeneModelCreateFlags; 245 246 /// Create a gene model from an alignment 247 /// this will optionally promote all features through the alignment 248 NCBI_DEPRECATED 249 static void CreateGeneModelFromAlign(const objects::CSeq_align& align, 250 objects::CScope& scope, 251 objects::CSeq_annot& annot, 252 objects::CBioseq_set& seqs, 253 TGeneModelCreateFlags flags = fDefaults, 254 TSeqPos allowed_unaligned = 10); 255 256 NCBI_DEPRECATED 257 static void CreateGeneModelsFromAligns(const list< CRef<objects::CSeq_align> > &aligns, 258 objects::CScope& scope, 259 objects::CSeq_annot& annot, 260 objects::CBioseq_set& seqs, 261 TGeneModelCreateFlags flags = fDefaults, 262 TSeqPos allowed_unaligned = 10); 263 264 /// Correctly mark exceptions on a feature 265 /// 266 NCBI_DEPRECATED 267 static void SetFeatureExceptions(objects::CSeq_feat& feat, 268 objects::CScope& scope, 269 const objects::CSeq_align* align = NULL); 270 271 NCBI_DEPRECATED 272 static void SetPartialFlags(objects::CScope& scope, 273 CRef<objects::CSeq_feat> gene_feat, 274 CRef<objects::CSeq_feat> mrna_feat, 275 CRef<objects::CSeq_feat> cds_feat); 276 277 NCBI_DEPRECATED 278 static void RecomputePartialFlags(objects::CScope& scope, 279 objects::CSeq_annot& annot); 280 }; 281 282 END_NCBI_SCOPE 283 284 #endif // ALGO_SEQUENCE___GENE_MODEL__HPP 285