1 #ifndef VALIDATOR___VALIDATOR__HPP
2 #define VALIDATOR___VALIDATOR__HPP
3 
4 /*  $Id: validator.hpp 632625 2021-06-03 17:38:33Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko......
30  *
31  * File Description:
32  *   Validates CSeq_entries and CSeq_submits
33  *   .......
34  *
35  */
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbidiag.hpp>
38 #include <serial/objectinfo.hpp>
39 #include <serial/serialbase.hpp>
40 #include <objects/general/Name_std.hpp>
41 #include <objects/biblio/Author.hpp>
42 #include <objects/valerr/ValidErrItem.hpp>
43 #include <objects/valerr/ValidError.hpp>
44 #include <objects/taxon3/itaxon3.hpp>
45 #include <objmgr/scope.hpp>
46 
47 #include <map>
48 
49 
50 BEGIN_NCBI_SCOPE
51 BEGIN_SCOPE(objects)
52 
// Forward declarations of the object-model types named by the
// declarations in this header.
class CSeq_entry;
class CSeq_entry_Handle;
class CSeq_submit;
class CSeq_annot;
class CSeq_annot_Handle;
class CSeq_feat;
// CSeq_loc is used by the static Seq-loc checks of CValidator; declare it
// here so the header does not depend on a transitive include for it.
class CSeq_loc;
class CBioSource;
class CPubdesc;
class CBioseq;
class CSeqdesc;
class CObjectManager;
class CScope;
class CDbtag;
66 
67 BEGIN_SCOPE(validator)
68 
69 
70 class NCBI_VALIDATOR_EXPORT CValidator : public CObject
71 {
72 public:
73 
74     enum EValidOptions {
75         eVal_non_ascii               = 0x1,
76         eVal_no_context              = 0x2,
77         eVal_val_align               = 0x4,
78         eVal_val_exons               = 0x8,
79         eVal_ovl_pep_err             = 0x10,
80         eVal_seqsubmit_parent        = 0x20,
81         eVal_need_isojta             = 0x40,
82         eVal_validate_id_set         = 0x80,
83         eVal_remote_fetch            = 0x100,
84         eVal_far_fetch_mrna_products = 0x200,
85         eVal_far_fetch_cds_products  = 0x400,
86         eVal_locus_tag_general_match = 0x800,
87         eVal_do_rubisco_test         = 0x1000,
88         eVal_indexer_version         = 0x2000,
89         eVal_use_entrez              = 0x4000,
90         eVal_inference_accns         = 0x8000,
91         eVal_ignore_exceptions       = 0x10000,
92         eVal_report_splice_as_error  = 0x20000,
93         eVal_latlon_check_state      = 0x40000,
94         eVal_latlon_ignore_water     = 0x80000,
95         eVal_genome_submission       = 0x100000,
96         eVal_do_tax_lookup           = 0x200000,
97         eVal_do_barcode_tests        = 0x400000,
98         eVal_refseq_conventions      = 0x800000,
99         eVal_collect_locus_tags      = 0x1000000,
100         eVal_generate_golden_file    = 0x2000000,
101         eVal_compare_vdjc_to_cds     = 0x4000000,
102     };
103 
104     // Constructor / Destructor
105     // If no taxon service is provided, a CTAxon3 client will
106     // be created.
107     CValidator(CObjectManager& objmgr, AutoPtr<ITaxon3> taxon = NULL);
108     ~CValidator(void);
109 
110     // If many validations are being done without changing the underlying
111     // objects, a cache can be given to speed up the process.
112     //
113     // Only functions that have some use for pCache or that directly
114     // or indirectly call some function that uses pCache should
115     // take pCache as an argument.
116     //
117     // (note PIMPL idiom to be as opaque as possible)
118     class CCacheImpl;
119     class NCBI_VALIDATOR_EXPORT CCache : public CObject {
120     public:
121         CCache(void);
122 
123         // the containing CCache object owns the m_impl
124         auto_ptr<CCacheImpl> m_impl;
125     };
126     static CRef<CCache> MakeEmptyCache(void);
127 
128     // Validation methods:
129     // It is possible to validate objects of types CSeq_entry, CSeq_submit
130     // or CSeq_annot. In addition to the object to validate the user must
131     // provide the scope which contain that object, and validation options
132     // that are created by OR'ing EValidOptions (as specified above)
133 
134     // Validate Seq-entry.
135     // If provding a scope the Seq-entry must be a
136     // top-level Seq-entry in that scope.
137     CConstRef<CValidError> Validate(const CSeq_entry& se, CScope* scope = 0,
138         Uint4 options = 0);
139     CConstRef<CValidError> Validate(const CSeq_entry_Handle& se,
140         Uint4 options = 0);
141     // Validate Seq-submit.
142     // Validates each of the Seq-entry contained in the submission.
143     CConstRef<CValidError> Validate(const CSeq_submit& ss, CScope* scope = 0,
144         Uint4 options = 0);
145     // Validate Seq-annot
146     // Validates stand alone Seq-annot objects. This will supress any
147     // check on the context of the annotaions.
148     CConstRef<CValidError> Validate(const CSeq_annot_Handle& sa,
149         Uint4 options = 0);
150 
151     // Validate Seq-feat
152     CConstRef<CValidError> Validate(const CSeq_feat& feat,
153         CScope *scope = 0,
154         Uint4 options = 0);
155     // old call
156     NCBI_DEPRECATED
157     CConstRef<CValidError> Validate(const CSeq_feat& feat,
158         Uint4 options = 0);
159 
160     // Validate BioSource
161     CConstRef<CValidError> Validate(const CBioSource& src,
162         CScope *scope = 0,
163         Uint4 options = 0);
164     // old call
165     NCBI_DEPRECATED
166     CConstRef<CValidError> Validate(const CBioSource& src,
167         Uint4 options = 0);
168 
169     // Validate Pubdesc
170     CConstRef<CValidError> Validate(const CPubdesc& pubdesc,
171         CScope *scope = 0,
172         Uint4 options = 0);
173     // old call
174     NCBI_DEPRECATED
175     CConstRef<CValidError> Validate(const CPubdesc& pubdesc,
176         Uint4 options = 0);
177 
178     // Validate Seqdesc
179     CConstRef<CValidError> Validate(const CSeqdesc& desc,
180         const CSeq_entry& ctx,
181         Uint4 options = 0);
182 
183     // externally callable tests
184     CConstRef<CValidError> GetTSANStretchErrors(const CSeq_entry_Handle& se);
185     CConstRef<CValidError> GetTSACDSOnMinusStrandErrors (const CSeq_entry_Handle& se);
186     CConstRef<CValidError> GetTSAConflictingBiomolTechErrors (const CSeq_entry_Handle& se);
187     CConstRef<CValidError> GetTSANStretchErrors(const CBioseq& seq);
188     CConstRef<CValidError> GetTSACDSOnMinusStrandErrors (const CSeq_feat& f, const CBioseq& seq);
189     CConstRef<CValidError> GetTSAConflictingBiomolTechErrors (const CBioseq& seq);
190 
191     static bool BadCharsInAuthorName(const string& str, bool allowcomma, bool allowperiod, bool last);
192     static bool BadCharsInAuthorLastName(const string& str);
193     static bool BadCharsInAuthorFirstName(const string& str);
194     static bool BadCharsInAuthorInitials(const string& str);
195     static bool BadCharsInAuthorSuffix(const string& str);
196     static string BadCharsInAuthor(const CName_std& author, bool& last_is_bad);
197     static string BadCharsInAuthor(const CAuthor& author, bool& last_is_bad);
198 
199     static bool IsSeqLocCorrectlyOrdered(const CSeq_loc& loc, CScope& scope);
200     static bool DoesSeqLocContainAdjacentIntervals(const CSeq_loc& loc, CScope& scope);
201     static bool DoesSeqLocContainDuplicateIntervals(const CSeq_loc& loc, CScope& scope);
202 
203     // progress reporting
204     class NCBI_VALIDATOR_EXPORT CProgressInfo
205     {
206     public:
207         enum EState {
208             eState_not_set,
209             eState_Initializing,
210             eState_Align,
211             eState_Annot,
212             eState_Bioseq,
213             eState_Bioseq_set,
214             eState_Desc,
215             eState_Descr,
216             eState_Feat,
217             eState_Graph
218         };
219 
CProgressInfo(void)220         CProgressInfo(void): m_State(eState_not_set),
221             m_Total(0), m_TotalDone(0),
222             m_Current(0), m_CurrentDone(0),
223             m_UserData(0)
224         {}
GetState(void) const225         EState GetState(void)       const { return m_State;       }
GetTotal(void) const226         size_t GetTotal(void)       const { return m_Total;       }
GetTotalDone(void) const227         size_t GetTotalDone(void)   const { return m_TotalDone;   }
GetCurrent(void) const228         size_t GetCurrent(void)     const { return m_Current;     }
GetCurrentDone(void) const229         size_t GetCurrentDone(void) const { return m_CurrentDone; }
GetUserData(void) const230         void*  GetUserData(void)    const { return m_UserData;    }
231 
232     private:
233         friend class CValidError_imp;
234 
235         EState m_State;
236         size_t m_Total;
237         size_t m_TotalDone;
238         size_t m_Current;
239         size_t m_CurrentDone;
240         void*  m_UserData;
241     };
242 
243     typedef bool (*TProgressCallback)(CProgressInfo*);
244     void SetProgressCallback(TProgressCallback callback, void* user_data = 0);
245 
246     static EErrType ConvertCode(CSubSource::ELatLonCountryErr errcode);
247 
248     enum EDbxrefValid {
249         eValid = 0,
250         eDbHasSgml = 1,
251         eTagHasSgml = 2,
252         eContainsSpace = 4,
253         eNotForSource = 8,
254         eOnlyForSource = 16,
255         eOnlyForRefSeq = 32,
256         eRefSeqNotForSource = 64,
257         eBadCapitalization = 128,
258         eUnrecognized = 256
259     };
260     typedef int TDbxrefValidFlags;
261     static TDbxrefValidFlags IsValidDbxref(const CDbtag& xref, bool is_biosource, bool is_refseq_or_gps);
262 
263 private:
264     // Prohibit copy constructor & assignment operator
265     CValidator(const CValidator&);
266     CValidator& operator= (const CValidator&);
267 
268     // Services belong here, in the outside class
269     // and are passed into the implementation.
270     CRef<CObjectManager>    m_ObjMgr;
271     AutoPtr<ITaxon3>        m_Taxon;
272 
273     TProgressCallback       m_PrgCallback;
274     void*                   m_UserData;
275 };
276 
277 
278 // Inline Functions:
279 
280 
281 END_SCOPE(validator)
282 END_SCOPE(objects)
283 END_NCBI_SCOPE
284 
285 #endif  /* VALIDATOR___VALIDATOR__HPP */
286