/*  $Id: splice_problems.hpp 566666 2018-07-05 10:48:05Z bollin $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Colleen Bollin
 *
 * File Description:
 *   For validating splice sites
 *   .......
 *
 */
33 
34 #ifndef VALIDATOR___SPLICE_PROBLEMS__HPP
35 #define VALIDATOR___SPLICE_PROBLEMS__HPP
36 
37 #include <corelib/ncbistd.hpp>
38 #include <corelib/ncbi_autoinit.hpp>
39 
40 #include <objmgr/scope.hpp>
41 #include <objects/seqfeat/Seq_feat.hpp>
42 #include <objects/seqfeat/SeqFeatData.hpp>
43 
44 #include <objmgr/util/feature.hpp>
45 
46 BEGIN_NCBI_SCOPE
47 BEGIN_SCOPE(objects)
48 
// Forward declarations of types in the objects namespace. They let this
// header name these classes by pointer or reference without pulling in each
// class's full definition.
class CSeq_entry;
class CCit_sub;
class CCit_art;
class CCit_gen;
class CSeq_feat;
class CBioseq;
class CSeqdesc;
class CSeq_annot;
class CTrna_ext;
class CProt_ref;
class CSeq_loc;
class CFeat_CI;
class CPub_set;
class CAuth_list;
class CTitle;
class CMolInfo;
class CUser_object;
class CSeqdesc_CI;
class CDense_diag;
class CDense_seg;
class CSeq_align_set;
class CPubdesc;
class CBioSource;
class COrg_ref;
class CDelta_seq;
class CGene_ref;
class CCdregion;
class CRNA_ref;
class CImp_feat;
class CSeq_literal;
class CBioseq_Handle;
class CSeq_feat_Handle;
class CCountries;
class CInferencePrefixList;
class CComment_set;
class CTaxon3_reply;
class ITaxon3;
class CT3Error;
87 
88 BEGIN_SCOPE(validator)
89 
// Forward declarations of validator-namespace classes; only their names are
// needed here, not their definitions.
class CValidError_imp;
class CTaxValidationAndCleanup;
class CGeneCache;
class CValidError_base;
94 
95 typedef Char(&TSpliceSite)[2];
96 
97 // =============================  Validate SeqFeat  ============================
98 
99 
100 
101 class NCBI_VALIDATOR_EXPORT CSpliceProblems {
102 public:
CSpliceProblems()103     CSpliceProblems() : m_ExceptionUnnecessary(false), m_ErrorsNotExpected(false) {};
~CSpliceProblems()104     ~CSpliceProblems() {};
105 
106     void CalculateSpliceProblems(const CSeq_feat& feat, bool check_all, bool pseudo, CBioseq_Handle loc_handle);
107 
108     // first is problem flags, second is position
109     typedef pair<size_t, TSeqPos> TSpliceProblem;
110     typedef vector<TSpliceProblem> TSpliceProblemList;
111 
112     typedef enum {
113         eSpliceSiteRead_OK = 0,
114         eSpliceSiteRead_BadSeq,
115         eSpliceSiteRead_Gap,
116         eSpliceSiteRead_OutOfRange,
117         eSpliceSiteRead_WrongNT
118     } ESpliceSiteRead;
119 
120     bool SpliceSitesHaveErrors();
IsExceptionUnnecessary() const121     bool IsExceptionUnnecessary() const { return m_ExceptionUnnecessary; }
AreErrorsUnexpected() const122     bool AreErrorsUnexpected() const { return m_ErrorsNotExpected; }
GetDonorProblems() const123     const TSpliceProblemList& GetDonorProblems() const { return m_DonorProblems; }
GetAcceptorProblems() const124     const TSpliceProblemList& GetAcceptorProblems() const { return m_AcceptorProblems; }
125 
126 private:
127     TSpliceProblemList m_DonorProblems;
128     TSpliceProblemList m_AcceptorProblems;
129     bool m_ExceptionUnnecessary;
130     bool m_ErrorsNotExpected;
131 
132     void ValidateSpliceCdregion(const CSeq_feat& feat, const CBioseq_Handle& bsh, ENa_strand strand);
133     void ValidateSpliceMrna(const CSeq_feat& feat, const CBioseq_Handle& bsh, ENa_strand strand);
134     void ValidateSpliceExon(const CSeq_feat& feat, const CBioseq_Handle& bsh, ENa_strand strand);
135     void ValidateDonorAcceptorPair(ENa_strand strand, TSeqPos stop, const CSeqVector& vec_donor, TSeqPos seq_len_donor,
136         TSeqPos start, const CSeqVector& vec_acceptor, TSeqPos seq_len_acceptor);
137 
138 
139     ESpliceSiteRead ReadDonorSpliceSite(ENa_strand strand, TSeqPos stop, const CSeqVector& vec, TSeqPos seq_len, TSpliceSite& site);
140     ESpliceSiteRead ReadDonorSpliceSite(ENa_strand strand, TSeqPos stop, const CSeqVector& vec, TSeqPos seq_len);
141     ESpliceSiteRead ReadAcceptorSpliceSite(ENa_strand strand, TSeqPos start, const CSeqVector& vec, TSeqPos seq_len, TSpliceSite& site);
142     ESpliceSiteRead ReadAcceptorSpliceSite(ENa_strand strand, TSeqPos start, const CSeqVector& vec, TSeqPos seq_len);
143 };
144 
145 
// Splice-site signature strings: three donor-acceptor pair forms and three
// single-site dinucleotides.
// NOTE(review): presumably these are the values passed as the `signature`
// argument of CheckSpliceSite / CheckAdjacentSpliceSites - confirm at the
// call sites.
// Namespace-scope const objects have internal linkage, so every translation
// unit that includes this header gets its own copy of each string.
const string kSpliceSiteGTAG = "GT-AG";
const string kSpliceSiteGCAG = "GC-AG";
const string kSpliceSiteATAC = "AT-AC";
const string kSpliceSiteGT = "GT";
const string kSpliceSiteGC = "GC";
const string kSpliceSiteAG = "AG";
152 
153 typedef Char const (&TConstSpliceSite)[2];
154 
155 bool CheckAdjacentSpliceSites(const string& signature, ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor);
156 bool CheckSpliceSite(const string& signature, ENa_strand strand, TConstSpliceSite site);
157 bool CheckIntronSpliceSites(ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor);
158 bool CheckIntronDonor(ENa_strand strand, TConstSpliceSite donor);
159 bool CheckIntronAcceptor(ENa_strand strand, TConstSpliceSite acceptor);
160 
161 
162 
163 
164 END_SCOPE(validator)
165 END_SCOPE(objects)
166 END_NCBI_SCOPE
167 
168 #endif  /* VALIDATOR___SPLICE_PROBLEMS__HPP */
169