1 #ifndef ALGO_SEQUENCE___FEAT_GEN__HPP 2 #define ALGO_SEQUENCE___FEAT_GEN__HPP 3 4 /* $Id: feature_generator.hpp 624079 2021-01-25 20:00:40Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Authors: Vyacheslav Chetvernin 30 * 31 * File Description: 32 * 33 */ 34 35 #include <corelib/ncbistd.hpp> 36 #include <corelib/ncbiobj.hpp> 37 38 #include <algo/sequence/gene_model.hpp> 39 #include <objects/seq/MolInfo.hpp> 40 #include <objects/seqalign/Spliced_seg.hpp> 41 #include <objects/seqloc/Na_strand.hpp> 42 #include <objmgr/seq_loc_mapper.hpp> 43 #include <objmgr/mapped_feat.hpp> 44 45 #include <util/range_coll.hpp> 46 47 BEGIN_NCBI_SCOPE 48 49 USING_SCOPE(objects); 50 51 struct CFeatureGenerator::SImplementation { 52 SImplementation(objects::CScope& scope); 53 ~SImplementation(); 54 55 struct SExon { 56 TSignedSeqPos prod_from; 57 TSignedSeqPos prod_to; 58 TSignedSeqPos genomic_from; 59 TSignedSeqPos genomic_to; 60 operator ==CFeatureGenerator::SImplementation::SExon61 bool operator==(const SExon& b) const 62 { 63 return 64 prod_from == b.prod_from && 65 prod_to == b.prod_to && 66 genomic_from == b.genomic_from && 67 genomic_to == b.genomic_to; 68 } 69 }; 70 71 CRef<objects::CScope> m_scope; 72 TFeatureGeneratorFlags m_flags; 73 TSeqPos m_min_intron; 74 TSeqPos m_allowed_unaligned; 75 bool m_is_gnomon; 76 bool m_is_best_refseq; 77 78 typedef map<Int8,CRef<CSeq_feat> > TGeneMap; 79 TGeneMap genes; 80 81 void StitchSmallHoles(objects::CSeq_align& align); 82 void TrimHolesToCodons(objects::CSeq_align& align); 83 void TrimEnds(objects::CSeq_align& align); 84 void MaximizeTranslation(objects::CSeq_align& align); 85 86 CConstRef<objects::CSeq_align> CleanAlignment(const objects::CSeq_align& align_in); 87 CConstRef<objects::CSeq_align> 88 AdjustAlignment(const objects::CSeq_align& align, TSeqRange range, EProductPositionsMode mode); 89 CRef<CSeq_feat> ConvertAlignToAnnot(const objects::CSeq_align& align, 90 objects::CSeq_annot& annot, 91 objects::CBioseq_set& seqs, 92 Int8 gene_id, const objects::CSeq_feat* cdregion, 93 bool call_on_align_list); 94 void ConvertAlignToAnnot(const list< CRef<objects::CSeq_align> > &aligns, 95 objects::CSeq_annot &annot, 96 objects::CBioseq_set &seqs); 97 void SetFeatureExceptions(objects::CSeq_feat& feat, 98 const objects::CSeq_align* align, 99 objects::CSeq_feat* cds_feat = NULL, 100 const objects::CSeq_feat* cds_feat_on_query_mrna = NULL, 101 const objects::CSeq_feat* cds_feat_on_transcribed_mrna = NULL, 102 list<CRef<CSeq_loc> >* transcribed_mrna_seqloc_refs = NULL, 103 TSeqPos *clean_match_count = NULL); 104 void SetPartialFlags(CRef<CSeq_feat> gene_feat, 105 CRef<CSeq_feat> mrna_feat, 106 CRef<CSeq_feat> cds_feat); 107 108 void RecomputePartialFlags(objects::CSeq_annot& annot); 109 110 TSignedSeqRange GetCds(const objects::CSeq_id& seqid); 111 112 enum ETrimSide { 113 eTrimProduct, 114 eTrimGenomic 115 }; 116 117 // left and right are relative to a hole, so TrimLeftExon trims its right edge. 118 // if side=eTrimProduct - the intention is to trim partial codons, 119 // so trim_amount should be < 2 and <= exon length 120 // if side=eTrimGenomic - just cuts off overlap with genomic interval [edge, edge + trim_amount) 121 // the remaining alignments can shrink more from edge + trim_amount 122 // if it falls into intron or hole 123 static void TrimLeftExon(int trim_amount, ETrimSide side, 124 vector<SExon>::reverse_iterator left_edge, 125 vector<SExon>::reverse_iterator& exon_it, 126 objects::CSpliced_seg::TExons::reverse_iterator& spl_exon_it, 127 objects::ENa_strand product_strand, 128 objects::ENa_strand genomic_strand); 129 static void TrimRightExon(int trim_amount, ETrimSide side, 130 vector<SExon>::iterator& exon_it, 131 vector<SExon>::iterator right_edge, 132 objects::CSpliced_seg::TExons::iterator& spl_exon_it, 133 objects::ENa_strand product_strand, 134 objects::ENa_strand genomic_strand); 135 private: 136 137 struct SMapper 138 { 139 public: 140 141 SMapper(const CSeq_align& aln, CScope& scope, 142 TSeqPos allowed_unaligned = 10, 143 CSeq_loc_Mapper::TMapOptions opts = 0); 144 145 const CSeq_loc& GetRnaLoc(); 146 CSeq_align::TDim GetGenomicRow() const; 147 CSeq_align::TDim GetRnaRow() const; 148 CRef<CSeq_loc> Map(const CSeq_loc& loc); 149 void IncludeSourceLocs(bool b = true); 150 void SetMergeNone(); 151 152 private: 153 154 /// This has special logic to set partialness based on alignment 155 /// properties 156 /// In addition, we need to interpret partial exons correctly 157 CRef<CSeq_loc> x_GetLocFromSplicedExons(const CSeq_align& aln) const; 158 CRef<CSeq_loc_Mapper> x_Mapper(); 159 160 const CSeq_align& m_aln; 161 CScope& m_scope; 162 CRef<CSeq_loc_Mapper> m_mapper; 163 CSeq_align::TDim m_genomic_row; 164 CRef<CSeq_loc> rna_loc; 165 TSeqPos m_allowed_unaligned; 166 CSeq_loc_Mapper::TMapOptions m_opts; 167 }; 168 169 enum e_MatchType {eNone, eOverlap, eExact}; 170 171 void TransformProteinAlignToTranscript(CConstRef<CSeq_align>& align, 172 CRef<CSeq_feat>& cd_feat); 173 void x_CollectMrnaSequence(CSeq_inst& inst, 174 const CSeq_align& align, 175 const CSeq_loc& loc, 176 bool add_unaligned_parts = true, 177 bool mark_transcript_deletions = true, 178 bool* has_gap = NULL, 179 bool* has_indel = NULL); 180 CRef<CSeq_id> x_CreateMrnaBioseq(const CSeq_align& align, 181 CConstRef<CSeq_loc> loc, 182 const CTime& time, 183 size_t model_num, 184 CBioseq_set& seqs, 185 CConstRef<CSeq_feat> cds_feat_on_query_mrna, 186 CRef<CSeq_feat>& cds_feat_on_transcribed_mrna); 187 const CBioseq& x_CreateProteinBioseq(CSeq_loc* cds_loc, 188 CRef<CSeq_feat> cds_feat_on_transcribed_mrna, 189 list<CRef<CSeq_loc> >& transcribed_mrna_seqloc_refs, 190 const CTime& time, 191 size_t model_num, 192 CBioseq_set& seqs); 193 CRef<CSeq_feat> x_CreateMrnaFeature(CRef<CSeq_loc> loc, 194 const CSeq_id& query_rna_id, 195 CSeq_id& transcribed_rna_id, 196 CConstRef<CSeq_feat> cds_feat_on_query_mrna); 197 void x_CreateGeneFeature(CRef<CSeq_feat> &gene_feat, 198 const CBioseq_Handle& handle, 199 SMapper& mapper, 200 CRef<CSeq_loc> loc, 201 const CSeq_id& genomic_id, 202 Int8 gene_id = 0); 203 CRef<CSeq_feat> x_CreateCdsFeature(CConstRef<CSeq_feat> cds_feat_on_query_mrna, 204 CRef<objects::CSeq_feat> cds_feat_on_transcribed_mrna, 205 list<CRef<CSeq_loc> >& transcribed_mrna_seqloc_refs, 206 const CSeq_align& align, 207 CRef<CSeq_loc> loc, 208 const CTime& time, 209 size_t model_num, 210 CBioseq_set& seqs, 211 CSeq_loc_Mapper::TMapOptions opts); 212 CRef<CSeq_feat> x_CreateNcRnaFeature(const objects::CSeq_feat* ncrnafeature_on_mrna, 213 const CSeq_align& align, 214 CConstRef<CSeq_loc> loc, 215 CSeq_loc_Mapper::TMapOptions opts); 216 CRef<CSeq_feat> x_MapFeature(const objects::CSeq_feat* feature_on_mrna, 217 const CSeq_align& align, 218 CRef<CSeq_loc> loc, 219 CSeq_loc_Mapper::TMapOptions opts, 220 TSeqPos &offset); 221 void x_CheckInconsistentDbxrefs(CConstRef<CSeq_feat> gene_feat, 222 CConstRef<CSeq_feat> cds_feat); 223 void x_CopyAdditionalFeatures(const CBioseq_Handle& handle, 224 SMapper& mapper, 225 CSeq_annot& annot); 226 void x_HandleRnaExceptions(CSeq_feat& feat, 227 const CSeq_align* align, 228 CSeq_feat* cds_feat, 229 const CSeq_feat* cds_feat_on_mrna); 230 void x_HandleCdsExceptions(CSeq_feat& feat, 231 const CSeq_align* align, 232 const CSeq_feat* cds_feat_on_query_mrna, 233 const CSeq_feat* cds_feat_on_transcribed_mrna, 234 list<CRef<CSeq_loc> >* transcribed_mrna_seqloc_refs, 235 TSeqPos *clean_match_count); 236 void x_SetExceptText(CSeq_feat& feat, 237 const string &except_text); 238 void x_SetComment(CSeq_feat &rna_feat, 239 CSeq_feat *cds_feat, 240 const CSeq_feat *cds_feat_on_mrna, 241 const CSeq_align *align, 242 const CRangeCollection<TSeqPos> &mismatch_locs, 243 const CRangeCollection<TSeqPos> &insert_locs, 244 const CRangeCollection<TSeqPos> &delete_locs, 245 map<TSeqPos,TSeqPos> &delete_sizes, 246 bool partial_unaligned_edge); 247 248 void x_SetCommentForGapFilledModel(CSeq_feat& feat, TSeqPos insert_length); 249 void x_SetQualForGapFilledModel(CSeq_feat& feat, CSeq_id_Handle id); 250 void x_AddSelectMarkup(const CSeq_align &align, 251 const CBioseq_Handle& rna_handle, 252 const CSeq_id &genomic_acc, 253 CSeq_feat& rna_feat, CSeq_feat* cds_feat); 254 e_MatchType x_CheckMatch(const CSeq_align &align, 255 const CSeq_id &genomic_acc, 256 const CUser_field &loc_field); 257 void x_AddKeywordQuals(CSeq_feat &feat, const vector<string> &keywords); 258 259 260 string x_ConstructRnaName(const CBioseq_Handle& handle); 261 262 // merge into single interval or, if cross the origin, into two intervals abutting at the origin 263 CRef<CSeq_loc> MergeSeq_locs(const CSeq_loc* loc1, const CSeq_loc* loc2 = NULL); 264 265 CRef<CSeq_loc> FixOrderOfCrossTheOriginSeqloc(const CSeq_loc& loc, 266 TSeqPos outside_point, 267 CSeq_loc::TOpFlags flags = CSeq_loc::fSort); 268 269 bool HasMixedGenomicIds(const CSeq_align& input_align); 270 CRef<CSeq_feat>ConvertMixedAlignToAnnot(const CSeq_align& input_align, 271 CSeq_annot& annot, 272 CBioseq_set& seqs, 273 Int8 gene_id, 274 const CSeq_feat* cds_feat_on_query_mrna_ptr, 275 bool call_on_align_list); 276 277 void RecalculateScores(CSeq_align &align); 278 void RecalculateExonIdty(CSpliced_exon &exon); 279 void ClearScores(CSeq_align &align); 280 vector<SExon> GetExons(const CSeq_align &align); 281 void GetExonStructure(const CSpliced_seg& spliced_seg, vector<SExon>& exons, CScope* scope); 282 }; 283 284 CMappedFeat GetCdsOnMrna(const objects::CSeq_id& rna_id, CScope& scope); 285 286 namespace fg { 287 int GetGeneticCode(const CBioseq_Handle& bsh); 288 } 289 290 291 END_NCBI_SCOPE 292 293 #endif // ALGO_SEQUENCE___FEAT_GEN__HPP 294