1 #ifndef ALGO_SEQUENCE___FEAT_GEN__HPP
2 #define ALGO_SEQUENCE___FEAT_GEN__HPP
3 
4 /*  $Id: feature_generator.hpp 624079 2021-01-25 20:00:40Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors:  Vyacheslav Chetvernin
30  *
31  * File Description:
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbiobj.hpp>
37 
38 #include <algo/sequence/gene_model.hpp>
39 #include <objects/seq/MolInfo.hpp>
40 #include <objects/seqalign/Spliced_seg.hpp>
41 #include <objects/seqloc/Na_strand.hpp>
42 #include <objmgr/seq_loc_mapper.hpp>
43 #include <objmgr/mapped_feat.hpp>
44 
45 #include <util/range_coll.hpp>
46 
47 BEGIN_NCBI_SCOPE
48 
49 USING_SCOPE(objects);
50 
51 struct CFeatureGenerator::SImplementation {
52     SImplementation(objects::CScope& scope);
53     ~SImplementation();
54 
55     struct SExon {
56         TSignedSeqPos prod_from;
57         TSignedSeqPos prod_to;
58         TSignedSeqPos genomic_from;
59         TSignedSeqPos genomic_to;
60 
operator ==CFeatureGenerator::SImplementation::SExon61         bool operator==(const SExon& b) const
62         {
63             return
64             prod_from == b.prod_from &&
65             prod_to == b.prod_to &&
66             genomic_from == b.genomic_from &&
67             genomic_to == b.genomic_to;
68         }
69     };
70 
71     CRef<objects::CScope> m_scope;
72     TFeatureGeneratorFlags m_flags;
73     TSeqPos m_min_intron;
74     TSeqPos m_allowed_unaligned;
75     bool m_is_gnomon;
76     bool m_is_best_refseq;
77 
78     typedef map<Int8,CRef<CSeq_feat> > TGeneMap;
79     TGeneMap genes;
80 
81     void StitchSmallHoles(objects::CSeq_align& align);
82     void TrimHolesToCodons(objects::CSeq_align& align);
83     void TrimEnds(objects::CSeq_align& align);
84     void MaximizeTranslation(objects::CSeq_align& align);
85 
86     CConstRef<objects::CSeq_align> CleanAlignment(const objects::CSeq_align& align_in);
87     CConstRef<objects::CSeq_align>
88     AdjustAlignment(const objects::CSeq_align& align, TSeqRange range, EProductPositionsMode mode);
89     CRef<CSeq_feat> ConvertAlignToAnnot(const objects::CSeq_align& align,
90                              objects::CSeq_annot& annot,
91                              objects::CBioseq_set& seqs,
92                              Int8 gene_id, const objects::CSeq_feat* cdregion,
93                              bool call_on_align_list);
94     void ConvertAlignToAnnot(const list< CRef<objects::CSeq_align> > &aligns,
95                              objects::CSeq_annot &annot,
96                              objects::CBioseq_set &seqs);
97     void SetFeatureExceptions(objects::CSeq_feat& feat,
98                             const objects::CSeq_align* align,
99                             objects::CSeq_feat* cds_feat = NULL,
100                             const objects::CSeq_feat* cds_feat_on_query_mrna = NULL,
101                             const objects::CSeq_feat* cds_feat_on_transcribed_mrna = NULL,
102                             list<CRef<CSeq_loc> >* transcribed_mrna_seqloc_refs = NULL,
103                             TSeqPos *clean_match_count = NULL);
104     void SetPartialFlags(CRef<CSeq_feat> gene_feat,
105                          CRef<CSeq_feat> mrna_feat,
106                          CRef<CSeq_feat> cds_feat);
107 
108     void RecomputePartialFlags(objects::CSeq_annot& annot);
109 
110     TSignedSeqRange GetCds(const objects::CSeq_id& seqid);
111 
112     enum ETrimSide {
113         eTrimProduct,
114         eTrimGenomic
115     };
116 
117     // left and right are relative to a hole, so TrimLeftExon trims its right edge.
118     // if side=eTrimProduct - the intention is to trim partial codons,
119     //                        so trim_amount should be < 2 and <= exon length
120     // if side=eTrimGenomic - just cuts off overlap with genomic interval [edge, edge + trim_amount)
121     //                        the remaining alignments can shrink more from edge + trim_amount
122     //                        if it falls into intron or hole
123     static void TrimLeftExon(int trim_amount, ETrimSide side,
124                       vector<SExon>::reverse_iterator left_edge,
125                       vector<SExon>::reverse_iterator& exon_it,
126                       objects::CSpliced_seg::TExons::reverse_iterator& spl_exon_it,
127                       objects::ENa_strand product_strand,
128                       objects::ENa_strand genomic_strand);
129     static void TrimRightExon(int trim_amount, ETrimSide side,
130                        vector<SExon>::iterator& exon_it,
131                        vector<SExon>::iterator right_edge,
132                        objects::CSpliced_seg::TExons::iterator& spl_exon_it,
133                        objects::ENa_strand product_strand,
134                        objects::ENa_strand genomic_strand);
135 private:
136 
137     struct SMapper
138     {
139     public:
140 
141         SMapper(const CSeq_align& aln, CScope& scope,
142                 TSeqPos allowed_unaligned = 10,
143                 CSeq_loc_Mapper::TMapOptions opts = 0);
144 
145         const CSeq_loc& GetRnaLoc();
146         CSeq_align::TDim GetGenomicRow() const;
147         CSeq_align::TDim GetRnaRow() const;
148         CRef<CSeq_loc> Map(const CSeq_loc& loc);
149         void IncludeSourceLocs(bool b = true);
150         void SetMergeNone();
151 
152     private:
153 
154         /// This has special logic to set partialness based on alignment
155         /// properties
156         /// In addition, we need to interpret partial exons correctly
157         CRef<CSeq_loc> x_GetLocFromSplicedExons(const CSeq_align& aln) const;
158         CRef<CSeq_loc_Mapper> x_Mapper();
159 
160         const CSeq_align& m_aln;
161         CScope& m_scope;
162         CRef<CSeq_loc_Mapper> m_mapper;
163         CSeq_align::TDim m_genomic_row;
164         CRef<CSeq_loc> rna_loc;
165         TSeqPos m_allowed_unaligned;
166         CSeq_loc_Mapper::TMapOptions m_opts;
167     };
168 
169     enum e_MatchType {eNone, eOverlap, eExact};
170 
171     void TransformProteinAlignToTranscript(CConstRef<CSeq_align>& align,
172                                            CRef<CSeq_feat>& cd_feat);
173     void x_CollectMrnaSequence(CSeq_inst& inst,
174                                const CSeq_align& align,
175                                const CSeq_loc& loc,
176                                bool add_unaligned_parts = true,
177                                bool mark_transcript_deletions = true,
178                                bool* has_gap = NULL,
179                                bool* has_indel = NULL);
180     CRef<CSeq_id> x_CreateMrnaBioseq(const CSeq_align& align,
181                                CConstRef<CSeq_loc> loc,
182                                const CTime& time,
183                                size_t model_num,
184                                CBioseq_set& seqs,
185                                CConstRef<CSeq_feat> cds_feat_on_query_mrna,
186                                CRef<CSeq_feat>& cds_feat_on_transcribed_mrna);
187     const CBioseq& x_CreateProteinBioseq(CSeq_loc* cds_loc,
188                                CRef<CSeq_feat> cds_feat_on_transcribed_mrna,
189                                list<CRef<CSeq_loc> >& transcribed_mrna_seqloc_refs,
190                                const CTime& time,
191                                size_t model_num,
192                                CBioseq_set& seqs);
193     CRef<CSeq_feat> x_CreateMrnaFeature(CRef<CSeq_loc> loc,
194                                         const CSeq_id& query_rna_id,
195                                         CSeq_id& transcribed_rna_id,
196                                         CConstRef<CSeq_feat> cds_feat_on_query_mrna);
197     void x_CreateGeneFeature(CRef<CSeq_feat> &gene_feat,
198                              const CBioseq_Handle& handle,
199                              SMapper& mapper,
200                              CRef<CSeq_loc> loc,
201                              const CSeq_id& genomic_id,
202                              Int8 gene_id = 0);
203     CRef<CSeq_feat> x_CreateCdsFeature(CConstRef<CSeq_feat> cds_feat_on_query_mrna,
204                                        CRef<objects::CSeq_feat> cds_feat_on_transcribed_mrna,
205                                        list<CRef<CSeq_loc> >& transcribed_mrna_seqloc_refs,
206                                        const CSeq_align& align,
207                                        CRef<CSeq_loc> loc,
208                                        const CTime& time,
209                                        size_t model_num,
210                                        CBioseq_set& seqs,
211                                        CSeq_loc_Mapper::TMapOptions opts);
212     CRef<CSeq_feat> x_CreateNcRnaFeature(const objects::CSeq_feat* ncrnafeature_on_mrna,
213                                          const CSeq_align& align,
214                                          CConstRef<CSeq_loc> loc,
215                                          CSeq_loc_Mapper::TMapOptions opts);
216     CRef<CSeq_feat> x_MapFeature(const objects::CSeq_feat* feature_on_mrna,
217                                 const CSeq_align& align,
218                                 CRef<CSeq_loc> loc,
219                                 CSeq_loc_Mapper::TMapOptions opts,
220                                 TSeqPos &offset);
221     void x_CheckInconsistentDbxrefs(CConstRef<CSeq_feat> gene_feat,
222                                     CConstRef<CSeq_feat> cds_feat);
223     void x_CopyAdditionalFeatures(const CBioseq_Handle& handle,
224                                   SMapper& mapper,
225                                   CSeq_annot& annot);
226     void x_HandleRnaExceptions(CSeq_feat& feat,
227                                const CSeq_align* align,
228                                CSeq_feat* cds_feat,
229                                const CSeq_feat* cds_feat_on_mrna);
230     void x_HandleCdsExceptions(CSeq_feat& feat,
231                                const CSeq_align* align,
232                                const CSeq_feat* cds_feat_on_query_mrna,
233                                const CSeq_feat* cds_feat_on_transcribed_mrna,
234                                list<CRef<CSeq_loc> >* transcribed_mrna_seqloc_refs,
235                                TSeqPos *clean_match_count);
236     void x_SetExceptText(CSeq_feat& feat,
237                          const string &except_text);
238     void x_SetComment(CSeq_feat &rna_feat,
239                       CSeq_feat *cds_feat,
240                       const CSeq_feat *cds_feat_on_mrna,
241                       const CSeq_align *align,
242                       const CRangeCollection<TSeqPos> &mismatch_locs,
243                       const CRangeCollection<TSeqPos> &insert_locs,
244                       const CRangeCollection<TSeqPos> &delete_locs,
245                       map<TSeqPos,TSeqPos> &delete_sizes,
246                       bool partial_unaligned_edge);
247 
248     void x_SetCommentForGapFilledModel(CSeq_feat& feat, TSeqPos insert_length);
249     void x_SetQualForGapFilledModel(CSeq_feat& feat, CSeq_id_Handle id);
250     void x_AddSelectMarkup(const CSeq_align &align,
251                            const CBioseq_Handle& rna_handle,
252                            const CSeq_id &genomic_acc,
253                            CSeq_feat& rna_feat, CSeq_feat* cds_feat);
254     e_MatchType x_CheckMatch(const CSeq_align &align,
255                              const CSeq_id &genomic_acc,
256                              const CUser_field &loc_field);
257     void x_AddKeywordQuals(CSeq_feat &feat, const vector<string> &keywords);
258 
259 
260     string x_ConstructRnaName(const CBioseq_Handle& handle);
261 
262     // merge into single interval or, if cross the origin, into two intervals abutting at the origin
263     CRef<CSeq_loc> MergeSeq_locs(const CSeq_loc* loc1, const CSeq_loc* loc2 = NULL);
264 
265     CRef<CSeq_loc> FixOrderOfCrossTheOriginSeqloc(const CSeq_loc& loc,
266                                                   TSeqPos outside_point,
267                                                   CSeq_loc::TOpFlags flags = CSeq_loc::fSort);
268 
269     bool HasMixedGenomicIds(const CSeq_align& input_align);
270     CRef<CSeq_feat>ConvertMixedAlignToAnnot(const CSeq_align& input_align,
271                                             CSeq_annot& annot,
272                                             CBioseq_set& seqs,
273                                             Int8 gene_id,
274                                             const CSeq_feat* cds_feat_on_query_mrna_ptr,
275                                             bool call_on_align_list);
276 
277     void RecalculateScores(CSeq_align &align);
278     void RecalculateExonIdty(CSpliced_exon &exon);
279     void ClearScores(CSeq_align &align);
280     vector<SExon> GetExons(const CSeq_align &align);
281     void GetExonStructure(const CSpliced_seg& spliced_seg, vector<SExon>& exons, CScope* scope);
282 };
283 
284 CMappedFeat GetCdsOnMrna(const objects::CSeq_id& rna_id, CScope& scope);
285 
286 namespace fg {
287 int GetGeneticCode(const CBioseq_Handle& bsh);
288 }
289 
290 
291 END_NCBI_SCOPE
292 
293 #endif  // ALGO_SEQUENCE___FEAT_GEN__HPP
294