1 /*  $Id: validerror_imp.hpp 632625 2021-06-03 17:38:33Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
 *
26  * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
 *   Private classes and definitions for the validator
30  *   .......
31  *
32  */
33 
34 #ifndef VALIDATOR___VALIDERROR_IMP__HPP
35 #define VALIDATOR___VALIDERROR_IMP__HPP
36 
37 #include <corelib/ncbistd.hpp>
38 #include <corelib/ncbi_autoinit.hpp>
39 
40 #include <objmgr/scope.hpp>
41 #include <objmgr/feat_ci.hpp>  // for CMappedFeat
42 #include <objmgr/util/seq_loc_util.hpp>
43 #include <objects/seqset/Bioseq_set.hpp>
44 #include <objects/seq/GIBB_mol.hpp>
45 #include <util/strsearch.hpp>
46 #include <objects/misc/sequence_macros.hpp>
47 #include <objects/seqfeat/Seq_feat.hpp>
48 #include <objects/seqfeat/SeqFeatData.hpp>
49 #include <objects/seqalign/Seq_align.hpp>
50 #include <objects/seqalign/Std_seg.hpp>
51 #include <objects/seqalign/Packed_seg.hpp>
52 #include <objects/valid/Comment_set.hpp>
53 #include <objects/valid/Comment_rule.hpp>
54 #include <objects/taxon3/taxon3.hpp>
55 
56 #include <objtools/validator/tax_validation_and_cleanup.hpp>
57 #include <objtools/validator/utilities.hpp>
58 #include <objtools/validator/feature_match.hpp>
59 #include <objtools/validator/cache_impl.hpp>
60 #include <objtools/validator/gene_cache.hpp>
61 
62 #include <objtools/alnmgr/sparse_aln.hpp>
63 
64 #include <objmgr/util/create_defline.hpp>
65 
66 #include <objmgr/util/feature.hpp>
67 
68 BEGIN_NCBI_SCOPE
69 BEGIN_SCOPE(objects)
70 
// Forward declarations of types from the enclosing scope that are referred
// to by pointer or reference in the validator interface declared below.
class CSeq_entry;
class CCit_sub;
class CCit_art;
class CCit_gen;
class CSeq_feat;
class CBioseq;
class CSeqdesc;
class CSeq_annot;
class CTrna_ext;
class CProt_ref;
class CSeq_loc;
class CFeat_CI;
class CPub_set;
class CAuth_list;
class CTitle;
class CMolInfo;
class CUser_object;
class CSeqdesc_CI;
class CSeq_graph;
class CMappedGraph;
class CDense_diag;
class CDense_seg;
class CSeq_align_set;
class CPubdesc;
class CBioSource;
class COrg_ref;
class CByte_graph;
class CDelta_seq;
class CGene_ref;
class CCdregion;
class CRNA_ref;
class CImp_feat;
class CSeq_literal;
class CBioseq_Handle;
class CSeq_feat_Handle;
class CCountries;
class CInferencePrefixList;
class CComment_set;
class CTaxon3_reply;
class ITaxon3;
class CT3Error;
112 
113 BEGIN_SCOPE(validator)
114 
// Forward declarations of validator classes defined elsewhere.
class CValidError_desc;
class CValidError_descr;
117 
118 
119 // ===========================  Central Validation  ==========================
120 
121 // CValidError_imp provides the entry point to the validation process.
122 // It calls upon the various validation classes to perform validation of
123 // each part.
124 // The class holds all the data for the validation process.
125 class NCBI_VALIDATOR_EXPORT CValidError_imp
126 {
127 public:
128     typedef map<int, int> TCount;
129 
130     // Interface to be used by the CValidError class
131 
132     CValidError_imp(CObjectManager& objmgr, CValidError* errors,
133         Uint4 options = 0);
134 
135     // Constructor allowing over-ride of Services
136     // Namely, the taxonomy service.
137     // NB: ITaxon is owned by CValidator.
138     CValidError_imp(CObjectManager& objmgr, CValidError* errors,
139         ITaxon3* taxon, Uint4 options = 0);
140 
141     // Destructor
142     virtual ~CValidError_imp(void);
143 
144     void SetOptions (Uint4 options);
145     void SetErrorRepository (CValidError* errors);
146     void Reset(void);
147 
148     // Validation methods
149     bool Validate(const CSeq_entry& se, const CCit_sub* cs = 0,
150                   CScope* scope = 0);
151     bool Validate(
152         const CSeq_entry_Handle& seh, const CCit_sub* cs = 0);
153     void Validate(
154         const CSeq_submit& ss, CScope* scope = 0);
155     void Validate(const CSeq_annot_Handle& sa);
156 
157     void Validate(const CSeq_feat& feat, CScope* scope = 0);
158     void Validate(const CBioSource& src, CScope* scope = 0);
159     void Validate(const CPubdesc& pubdesc, CScope* scope = 0);
160     void Validate(const CSeqdesc& desc, const CSeq_entry& ctx);
161     void ValidateSubAffil(const CAffil::TStd& std, const CSerialObject& obj, const CSeq_entry *ctx);
162     void ValidateAffil(const CAffil::TStd& std, const CSerialObject& obj, const CSeq_entry *ctx);
163 
164     bool GetTSANStretchErrors(const CSeq_entry_Handle& se);
165     bool GetTSACDSOnMinusStrandErrors (const CSeq_entry_Handle& se);
166     bool GetTSAConflictingBiomolTechErrors (const CSeq_entry_Handle& se);
167     bool GetTSANStretchErrors(const CBioseq& seq);
168     bool GetTSACDSOnMinusStrandErrors (const CSeq_feat& f, const CBioseq& seq);
169     bool GetTSAConflictingBiomolTechErrors (const CBioseq& seq);
170 
171 
172     void SetProgressCallback(CValidator::TProgressCallback callback,
173         void* user_data);
174 
175     void SetTSE(const CSeq_entry_Handle& seh);
176 
ShouldSubdivide() const177     bool ShouldSubdivide() const { if (m_NumTopSetSiblings > 1000) return true; else return false; }
178 
179 public:
180     // interface to be used by the various validation classes
181 
182     // typedefs:
183     typedef const CSeq_feat& TFeat;
184     typedef const CBioseq& TBioseq;
185     typedef const CBioseq_set& TSet;
186     typedef const CSeqdesc& TDesc;
187     typedef const CSeq_annot& TAnnot;
188     typedef const CSeq_graph& TGraph;
189     typedef const CSeq_align& TAlign;
190     typedef const CSeq_entry& TEntry;
191     typedef map < const CSeq_feat*, const CSeq_annot* >& TFeatAnnotMap;
192 
193     // Posts errors.
194     void PostErr(EDiagSev sv, EErrType et, const string& msg,
195         const CSerialObject& obj);
196     void PostErr(EDiagSev sv, EErrType et, const string& msg, TDesc ds);
197     void PostErr(EDiagSev sv, EErrType et, const string& msg, TFeat ft);
198     void PostErr(EDiagSev sv, EErrType et, const string& msg, TBioseq sq);
199     void PostErr(EDiagSev sv, EErrType et, const string& msg, TEntry ctx,
200         TDesc ds);
201     void PostErr(EDiagSev sv, EErrType et, const string& msg, TSet set);
202     void PostErr(EDiagSev sv, EErrType et, const string& msg, TAnnot annot);
203     void PostErr(EDiagSev sv, EErrType et, const string& msg, TGraph graph);
204     void PostErr(EDiagSev sv, EErrType et, const string& msg, TBioseq sq,
205         TGraph graph);
206     void PostErr(EDiagSev sv, EErrType et, const string& msg, TAlign align);
207     void PostErr(EDiagSev sv, EErrType et, const string& msg, TEntry entry);
208     void PostErr(EDiagSev sv, EErrType et, const string& msg, const CBioSource& src);
209     void PostErr(EDiagSev sv, EErrType et, const string& msg, const COrg_ref& org);
210     void PostErr(EDiagSev sv, EErrType et, const string& msg, const CPubdesc& src);
211     void PostErr(EDiagSev sv, EErrType et, const string& msg, const CSeq_submit& ss);
212     void PostObjErr (EDiagSev sv, EErrType et, const string& msg, const CSerialObject& obj, const CSeq_entry *ctx = 0);
213     void PostBadDateError (EDiagSev sv, const string& msg, int flags, const CSerialObject& obj, const CSeq_entry *ctx = 0);
214 
215     void HandleTaxonomyError(const CT3Error& error, const string& host, const COrg_ref& orf);
216     void HandleTaxonomyError(const CT3Error& error, const EErrType type, const CSeq_feat& feat);
217     void HandleTaxonomyError(const CT3Error& error, const EErrType type, const CSeqdesc& desc, const CSeq_entry* entry);
218 
219     bool RaiseGenomeSeverity(EErrType et);
220 
221     // General use validation methods
222     void ValidatePubdesc(const CPubdesc& pub, const CSerialObject& obj, const CSeq_entry *ctx = 0);
223     void ValidateBioSource(const CBioSource& bsrc, const CSerialObject& obj, const CSeq_entry *ctx = 0);
224     void ValidatePCRReactionSet(const CPCRReactionSet& pcrset, const CSerialObject& obj, const CSeq_entry *ctx = 0);
225     void ValidateSubSource(const CSubSource& subsrc, const CSerialObject& obj, const CSeq_entry *ctx = 0, const bool isViral = false);
226     void ValidateOrgRef(const COrg_ref& orgref, const CSerialObject& obj, const CSeq_entry *ctx);
227     void ValidateTaxNameOrgname(const string& taxname, const COrgName& orgname, const CSerialObject& obj, const CSeq_entry *ctx);
228     void ValidateOrgName(const COrgName& orgname, const bool has_taxon, const CSerialObject& obj, const CSeq_entry *ctx);
229     void ValidateOrgModVoucher(const COrgMod& orgmod, const CSerialObject& obj, const CSeq_entry *ctx);
230     void ValidateBioSourceForSeq(const CBioSource& bsrc, const CSerialObject& obj, const CSeq_entry *ctx, const CBioseq_Handle& bsh);
231 
232     void ValidateLatLonCountry(string countryname, string lat_lon, const CSerialObject& obj, const CSeq_entry *ctx);
233 
234     static bool IsSyntheticConstruct (const CBioSource& src);
235     bool IsArtificial (const CBioSource& src);
236     bool IsOtherDNA(const CBioseq_Handle& bsh) const;
237     void ValidateSeqLoc(const CSeq_loc& loc, const CBioseq_Handle& seq, bool report_abutting,
238                         const string& prefix, const CSerialObject& obj, bool lowerSev = false);
239 
240     void ValidateSeqLocIds(const CSeq_loc& loc, const CSerialObject& obj);
241     static bool IsInOrganelleSmallGenomeSet(const CSeq_id& id, CScope& scope);
242     static bool BadMultipleSequenceLocation(const CSeq_loc& loc, CScope& scope);
243     void CheckMultipleIds(const CSeq_loc& loc, const CSerialObject& obj);
244     void ValidateDbxref(const CDbtag& xref, const CSerialObject& obj,
245     bool biosource = false, const CSeq_entry *ctx = 0);
246     void ValidateDbxref(TDbtags& xref_list, const CSerialObject& obj,
247     bool biosource = false, const CSeq_entry *ctx = 0);
248     void ValidateCitSub(const CCit_sub& cs, const CSerialObject& obj, const CSeq_entry *ctx = 0);
249     void ValidateTaxonomy(const CSeq_entry& se);
250     void ValidateOrgRefs(CTaxValidationAndCleanup& tval);
251     void ValidateSpecificHost(CTaxValidationAndCleanup& tval);
252     void ValidateStrain(CTaxValidationAndCleanup& tval);
253     void ValidateSpecificHost (const CSeq_entry& se);
254     void ValidateTentativeName(const CSeq_entry& se);
255     void ValidateTaxonomy(const COrg_ref& org, int genome = CBioSource::eGenome_unknown);
256     void ValidateMultipleTaxIds(const CSeq_entry_Handle& seh);
257     void ValidateCitations (const CSeq_entry_Handle& seh);
258     bool x_IsFarFetchFailure (const CSeq_loc& loc);
259 
260     // getters
GetScope(void)261     inline CScope* GetScope(void) { return m_Scope; }
GetCache(void)262     inline CCacheImpl & GetCache(void) { return m_cache; }
263 
GetCachedGene(const CSeq_feat * f)264     inline CConstRef<CSeq_feat> GetCachedGene(const CSeq_feat* f) { return m_GeneCache.GetGeneFromCache(f, *m_Scope); }
GetGeneCache()265     inline CGeneCache& GetGeneCache() { return m_GeneCache; }
266 
267     // flags derived from options parameter
IsNonASCII(void) const268     bool IsNonASCII(void)             const { return m_NonASCII; }
IsSuppressContext(void) const269     bool IsSuppressContext(void)      const { return m_SuppressContext; }
IsValidateAlignments(void) const270     bool IsValidateAlignments(void)   const { return m_ValidateAlignments; }
IsValidateExons(void) const271     bool IsValidateExons(void)        const { return m_ValidateExons; }
IsOvlPepErr(void) const272     bool IsOvlPepErr(void)            const { return m_OvlPepErr; }
IsRequireTaxonID(void) const273     bool IsRequireTaxonID(void)       const { return !m_SeqSubmitParent; }
IsSeqSubmitParent(void) const274     bool IsSeqSubmitParent(void)      const { return m_SeqSubmitParent; }
IsRequireISOJTA(void) const275     bool IsRequireISOJTA(void)        const { return m_RequireISOJTA; }
IsValidateIdSet(void) const276     bool IsValidateIdSet(void)        const { return m_ValidateIdSet; }
IsRemoteFetch(void) const277     bool IsRemoteFetch(void)          const { return m_RemoteFetch; }
IsFarFetchMRNAproducts(void) const278     bool IsFarFetchMRNAproducts(void) const { return m_FarFetchMRNAproducts; }
IsFarFetchCDSproducts(void) const279     bool IsFarFetchCDSproducts(void)  const { return m_FarFetchCDSproducts; }
IsLocusTagGeneralMatch(void) const280     bool IsLocusTagGeneralMatch(void) const { return m_LocusTagGeneralMatch; }
DoRubiscoTest(void) const281     bool DoRubiscoTest(void)          const { return m_DoRubiscoText; }
IsIndexerVersion(void) const282     bool IsIndexerVersion(void)       const { return m_IndexerVersion; }
IsGenomeSubmission(void) const283     bool IsGenomeSubmission(void)     const { return m_genomeSubmission; }
UseEntrez(void) const284     bool UseEntrez(void)              const { return m_UseEntrez; }
DoTaxLookup(void) const285     bool DoTaxLookup(void)            const { return m_DoTaxLookup; }
ValidateInferenceAccessions(void) const286     bool ValidateInferenceAccessions(void) const { return m_ValidateInferenceAccessions; }
IgnoreExceptions(void) const287     bool IgnoreExceptions(void) const { return m_IgnoreExceptions; }
ReportSpliceAsError(void) const288     bool ReportSpliceAsError(void) const { return m_ReportSpliceAsError; }
IsLatLonCheckState(void) const289     bool IsLatLonCheckState(void)     const { return m_LatLonCheckState; }
IsLatLonIgnoreWater(void) const290     bool IsLatLonIgnoreWater(void)    const { return m_LatLonIgnoreWater; }
IsRefSeqConventions(void) const291     bool IsRefSeqConventions(void)    const { return m_RefSeqConventions; }
GenerateGoldenFile(void) const292     bool GenerateGoldenFile(void)    const { return m_GenerateGoldenFile; }
DoCompareVDJCtoCDS(void) const293     bool DoCompareVDJCtoCDS(void)    const { return m_CompareVDJCtoCDS; }
294 
295 
296     // flags calculated by examining data in record
IsStandaloneAnnot(void) const297     inline bool IsStandaloneAnnot(void) const { return m_IsStandaloneAnnot; }
IsNoPubs(void) const298     inline bool IsNoPubs(void) const { return m_NoPubs; }
IsNoCitSubPubs(void) const299     inline bool IsNoCitSubPubs(void) const { return m_NoCitSubPubs; }
IsNoBioSource(void) const300     inline bool IsNoBioSource(void) const { return m_NoBioSource; }
IsGPS(void) const301     inline bool IsGPS(void) const { return m_IsGPS; }
IsGED(void) const302     inline bool IsGED(void) const { return m_IsGED; }
IsPDB(void) const303     inline bool IsPDB(void) const { return m_IsPDB; }
IsPatent(void) const304     inline bool IsPatent(void) const { return m_IsPatent; }
IsRefSeq(void) const305     inline bool IsRefSeq(void) const { return m_IsRefSeq || m_RefSeqConventions; }
IsEmbl(void) const306     inline bool IsEmbl(void) const { return m_IsEmbl; }
IsDdbj(void) const307     inline bool IsDdbj(void) const { return m_IsDdbj; }
IsTPE(void) const308     inline bool IsTPE(void) const { return m_IsTPE; }
IsNC(void) const309     inline bool IsNC(void) const { return m_IsNC; }
IsNG(void) const310     inline bool IsNG(void) const { return m_IsNG; }
IsNM(void) const311     inline bool IsNM(void) const { return m_IsNM; }
IsNP(void) const312     inline bool IsNP(void) const { return m_IsNP; }
IsNR(void) const313     inline bool IsNR(void) const { return m_IsNR; }
IsNZ(void) const314     inline bool IsNZ(void) const { return m_IsNZ; }
IsNS(void) const315     inline bool IsNS(void) const { return m_IsNS; }
IsNT(void) const316     inline bool IsNT(void) const { return m_IsNT; }
IsNW(void) const317     inline bool IsNW(void) const { return m_IsNW; }
IsWP(void) const318     inline bool IsWP(void) const { return m_IsWP; }
IsXR(void) const319     inline bool IsXR(void) const { return m_IsXR; }
IsGI(void) const320     inline bool IsGI(void) const { return m_IsGI; }
IsGpipe(void) const321     inline bool IsGpipe(void) const { return m_IsGpipe; }
322     bool IsHtg(void) const;
IsLocalGeneralOnly(void) const323     inline bool IsLocalGeneralOnly(void) const { return m_IsLocalGeneralOnly; }
HasGiOrAccnVer(void) const324     inline bool HasGiOrAccnVer(void) const { return m_HasGiOrAccnVer; }
IsGenomic(void) const325     inline bool IsGenomic(void) const { return m_IsGenomic; }
IsSeqSubmit(void) const326     inline bool IsSeqSubmit(void) const { return m_IsSeqSubmit; }
IsSmallGenomeSet(void) const327     inline bool IsSmallGenomeSet(void) const { return m_IsSmallGenomeSet; }
328     bool IsNoncuratedRefSeq(const CBioseq& seq, EDiagSev& sev);
IsGenbank(void) const329     inline bool IsGenbank(void) const { return m_IsGB; }
DoesAnyFeatLocHaveGI(void) const330     inline bool DoesAnyFeatLocHaveGI(void) const { return m_FeatLocHasGI; }
DoesAnyProductLocHaveGI(void) const331     inline bool DoesAnyProductLocHaveGI(void) const { return m_ProductLocHasGI; }
DoesAnyGeneHaveLocusTag(void) const332     inline bool DoesAnyGeneHaveLocusTag(void) const { return m_GeneHasLocusTag; }
DoesAnyProteinHaveGeneralID(void) const333     inline bool DoesAnyProteinHaveGeneralID(void) const { return m_ProteinHasGeneralID; }
IsINSDInSep(void) const334     inline bool IsINSDInSep(void) const { return m_IsINSDInSep; }
IsGeneious(void) const335     inline bool IsGeneious(void) const { return m_IsGeneious; }
BioSourceKind() const336     inline const CBioSourceKind& BioSourceKind() const { return m_biosource_kind; }
337 
338     // counting number of misplaced features
ResetMisplacedFeatureCount(void)339     inline void ResetMisplacedFeatureCount (void) { m_NumMisplacedFeatures = 0; }
IncrementMisplacedFeatureCount(void)340     inline void IncrementMisplacedFeatureCount (void) { m_NumMisplacedFeatures++; }
AddToMisplacedFeatureCount(SIZE_TYPE num)341     inline void AddToMisplacedFeatureCount (SIZE_TYPE num) { m_NumMisplacedFeatures += num; }
342 
343     // counting number of small genome set misplaced features
ResetSmallGenomeSetMisplacedCount(void)344     inline void ResetSmallGenomeSetMisplacedCount (void) { m_NumSmallGenomeSetMisplaced = 0; }
IncrementSmallGenomeSetMisplacedCount(void)345     inline void IncrementSmallGenomeSetMisplacedCount (void) { m_NumSmallGenomeSetMisplaced++; }
AddToSmallGenomeSetMisplacedCount(SIZE_TYPE num)346     inline void AddToSmallGenomeSetMisplacedCount (SIZE_TYPE num) { m_NumSmallGenomeSetMisplaced += num; }
347 
348     // counting number of misplaced graphs
ResetMisplacedGraphCount(void)349     inline void ResetMisplacedGraphCount (void) { m_NumMisplacedGraphs = 0; }
IncrementMisplacedGraphCount(void)350     inline void IncrementMisplacedGraphCount (void) { m_NumMisplacedGraphs++; }
AddToMisplacedGraphCount(SIZE_TYPE num)351     inline void AddToMisplacedGraphCount (SIZE_TYPE num) { m_NumMisplacedGraphs += num; }
352 
353     // counting number of genes and gene xrefs
ResetGeneCount(void)354     inline void ResetGeneCount (void) { m_NumGenes = 0; }
IncrementGeneCount(void)355     inline void IncrementGeneCount (void) { m_NumGenes++; }
AddToGeneCount(SIZE_TYPE num)356     inline void AddToGeneCount (SIZE_TYPE num) { m_NumGenes += num; }
ResetGeneXrefCount(void)357     inline void ResetGeneXrefCount (void) { m_NumGeneXrefs = 0; }
IncrementGeneXrefCount(void)358     inline void IncrementGeneXrefCount (void) { m_NumGeneXrefs++; }
AddToGeneXrefCount(SIZE_TYPE num)359     inline void AddToGeneXrefCount (SIZE_TYPE num) { m_NumGeneXrefs += num; }
360 
361     // counting sequences with and without TPA history
ResetTpaWithHistoryCount(void)362     inline void ResetTpaWithHistoryCount (void) { m_NumTpaWithHistory = 0; }
IncrementTpaWithHistoryCount(void)363     inline void IncrementTpaWithHistoryCount (void) { m_NumTpaWithHistory++; }
AddToTpaWithHistoryCount(SIZE_TYPE num)364     inline void AddToTpaWithHistoryCount (SIZE_TYPE num) { m_NumTpaWithHistory += num; }
ResetTpaWithoutHistoryCount(void)365     inline void ResetTpaWithoutHistoryCount (void) { m_NumTpaWithoutHistory = 0; }
IncrementTpaWithoutHistoryCount(void)366     inline void IncrementTpaWithoutHistoryCount (void) { m_NumTpaWithoutHistory++; }
AddToTpaWithoutHistoryCount(SIZE_TYPE num)367     inline void AddToTpaWithoutHistoryCount (SIZE_TYPE num) { m_NumTpaWithoutHistory += num; }
368 
369     // counting number of Pseudos and Pseudogenes
ResetPseudoCount(void)370     inline void ResetPseudoCount (void) { m_NumPseudo = 0; }
IncrementPseudoCount(void)371     inline void IncrementPseudoCount (void) { m_NumPseudo++; }
AddToPseudoCount(SIZE_TYPE num)372     inline void AddToPseudoCount (SIZE_TYPE num) { m_NumPseudo += num; }
ResetPseudogeneCount(void)373     inline void ResetPseudogeneCount (void) { m_NumPseudogene = 0; }
IncrementPseudogeneCount(void)374     inline void IncrementPseudogeneCount (void) { m_NumPseudogene++; }
AddToPseudogeneCount(SIZE_TYPE num)375     inline void AddToPseudogeneCount (SIZE_TYPE num) { m_NumPseudogene += num; }
376 
377     // set flag for farfetchfailure
SetFarFetchFailure(void)378     inline void SetFarFetchFailure (void) { m_FarFetchFailure = true; }
379 
GetTSE(void) const380     const CSeq_entry& GetTSE(void) const { return *m_TSE; };
GetTSEH(void)381     const CSeq_entry_Handle & GetTSEH(void) { return m_TSEH; }
GetTSE_Handle(void)382     const CTSE_Handle & GetTSE_Handle(void) { return
383             (m_TSEH ? m_TSEH.GetTSE_Handle() : CCacheImpl::kEmptyTSEHandle); }
GetSeqAnnot(void)384     const CConstRef<CSeq_annot>& GetSeqAnnot(void) { return m_SeqAnnot; }
385 
386     void AddBioseqWithNoPub(const CBioseq& seq);
387     void AddBioseqWithNoBiosource(const CBioseq& seq);
388     void AddProtWithoutFullRef(const CBioseq_Handle& seq);
389     static bool IsWGSIntermediate(const CBioseq& seq);
390     static bool IsTSAIntermediate(const CBioseq& seq);
391     void ReportMissingPubs(const CSeq_entry& se, const CCit_sub* cs);
392     void ReportMissingBiosource(const CSeq_entry& se);
393 
394     CConstRef<CSeq_feat> GetCDSGivenProduct(const CBioseq& seq);
395     CConstRef<CSeq_feat> GetmRNAGivenProduct(const CBioseq& seq);
396     const CSeq_entry* GetAncestor(const CBioseq& seq, CBioseq_set::EClass clss);
397     bool IsSerialNumberInComment(const string& comment);
398 
399     bool IsTransgenic(const CBioSource& bsrc);
400 
401     bool RequireLocalProduct(const CSeq_id* sid) const;
402 
403 private:
404 
405     // Setup common options during consturction;
406     void x_Init(Uint4 options);
407 
408     // This is so we can temporarily set m_Scope in a function
409     // and be sure that it will be set to its old value when we're done
410     class CScopeRestorer {
411     public:
CScopeRestorer(CRef<CScope> & scope)412         CScopeRestorer( CRef<CScope> &scope ) :
413           m_scopeToRestore(scope), m_scopeOriginalValue(scope) { }
414 
~CScopeRestorer(void)415         ~CScopeRestorer(void) { m_scopeToRestore = m_scopeOriginalValue; }
416     private:
417         CRef<CScope> &m_scopeToRestore;
418         CRef<CScope> m_scopeOriginalValue;
419     };
420 
421     // Prohibit copy constructor & assignment operator
422     CValidError_imp(const CValidError_imp&);
423     CValidError_imp& operator= (const CValidError_imp&);
424 
425     void Setup(const CSeq_entry_Handle& seh);
426     void Setup(const CSeq_annot_Handle& sa);
427     CSeq_entry_Handle Setup(const CBioseq& seq);
428     void SetScope(const CSeq_entry& se);
429 
430     void ValidateSubmitBlock(const CSubmit_block& block, const CSeq_submit& ss);
431 
432     void InitializeSourceQualTags();
433     void ValidateSourceQualTags(const string& str, const CSerialObject& obj, const CSeq_entry *ctx = 0);
434 
435     bool IsMixedStrands(const CSeq_loc& loc);
436 
437     void ValidatePubGen(const CCit_gen& gen, const CSerialObject& obj, const CSeq_entry *ctx = 0);
438     void ValidatePubArticle(const CCit_art& art, TEntrezId uid, const CSerialObject& obj, const CSeq_entry *ctx = 0);
439     void ValidatePubArticleNoPMID(const CCit_art& art, const CSerialObject& obj, const CSeq_entry *ctx = 0);
440     void x_ValidatePages(const string& pages, const CSerialObject& obj, const CSeq_entry *ctx = 0);
441     void ValidateAuthorList(const CAuth_list::C_Names& names, const CSerialObject& obj, const CSeq_entry *ctx = 0);
442     void ValidateAuthorsInPubequiv (const CPub_equiv& pe, const CSerialObject& obj, const CSeq_entry *ctx = 0);
443     void ValidatePubHasAuthor(const CPubdesc& pubdesc, const CSerialObject& obj, const CSeq_entry *ctx = 0);
444 
445     bool HasName(const CAuth_list& authors);
446     bool HasTitle(const CTitle& title);
447     bool HasIsoJTA(const CTitle& title);
448 
449     void FindEmbeddedScript(const CSerialObject& obj);
450     void FindNonAsciiText (const CSerialObject& obj);
451     void FindCollidingSerialNumbers (const CSerialObject& obj);
452 
453 
454     void GatherTentativeName (const CSeq_entry& se, vector<CConstRef<CSeqdesc> >& usr_descs, vector<CConstRef<CSeq_entry> >& desc_ctxs, vector<CConstRef<CSeq_feat> >& usr_feats);
455 
456     static bool s_IsSalmonellaGenus(const string& taxname);
457     EDiagSev x_SalmonellaErrorLevel();
458 
459     typedef struct tagSLocCheck {
460         bool chk;
461         bool unmarked_strand;
462         bool mixed_strand;
463         bool has_other;
464         bool has_not_other;
465         CConstRef<CSeq_id> id_cur;
466         CConstRef<CSeq_id> id_prv;
467         const CSeq_interval *int_cur = 0;
468         const CSeq_interval *int_prv = 0;
469         ENa_strand strand_cur;
470         ENa_strand strand_prv;
471         string prefix;
472     } SLocCheck;
473 
474     void x_InitLocCheck(SLocCheck& lc, const string& prefix);
475     void x_CheckForStrandChange(SLocCheck& lc);
476     void x_CheckLoc(const CSeq_loc& loc, const CSerialObject& obj, SLocCheck& lc, bool lowerSev = false);
477     void x_CheckPackedInt(const CPacked_seqint& packed_int,
478                           SLocCheck& lc,
479                           const CSerialObject& obj);
480     bool x_CheckSeqInt(CConstRef<CSeq_id>& id_cur,
481                        const CSeq_interval * int_cur,
482                        ENa_strand& strand_cur,
483                        const CSerialObject& obj);
484     void x_ReportInvalidFuzz(const CPacked_seqint& packed_int, const CSerialObject& obj);
485     void x_ReportInvalidFuzz(const CSeq_interval& interval, const CSerialObject& obj);
486     void x_ReportInvalidFuzz(const CSeq_point& point, const CSerialObject& obj);
487     void x_ReportInvalidFuzz(const CSeq_loc& loc, const CSerialObject& obj);
488     void x_ReportPCRSeqProblem(const string& primer_kind,
489                                char badch,
490                                const CSerialObject& obj,
491                                const CSeq_entry *ctx);
492     void x_CheckPCRPrimer(const CPCRPrimer& primer,
493                           const string& primer_kind,
494                           const CSerialObject& obj,
495                           const CSeq_entry *ctx);
496 
497     void x_DoBarcodeTests(CSeq_entry_Handle seh);
498 
499     bool x_DowngradeForMissingAffil(const CCit_sub& cs);
500 
501     CRef<CObjectManager>    m_ObjMgr;
502     CRef<CScope>            m_Scope;
503     CConstRef<CSeq_entry>   m_TSE;
504     CSeq_entry_Handle       m_TSEH;
505     CConstRef<CSeq_annot>   m_SeqAnnot;
506 
507     CCacheImpl              m_cache;
508     CGeneCache              m_GeneCache;
509 
510     // error repoitory
511     CValidError*       m_ErrRepository;
512 
513     // flags derived from options parameter
514     bool m_NonASCII;             // User sets if Non ASCII char found
515     bool m_SuppressContext;      // Include context in errors if true
516     bool m_ValidateAlignments;   // Validate Alignments if true
517     bool m_ValidateExons;        // Check exon feature splice sites
518     bool m_OvlPepErr;            // Peptide overlap error if true, else warn
519     bool m_RequireISOJTA;        // Journal requires ISO JTA
520     bool m_ValidateIdSet;        // validate update against ID set in database
521     bool m_RemoteFetch;          // Remote fetch enabled?
522     bool m_FarFetchMRNAproducts; // Remote fetch mRNA products
523     bool m_FarFetchCDSproducts;  // Remote fetch proteins
524     bool m_LatLonCheckState;
525     bool m_LatLonIgnoreWater;
526     bool m_LocusTagGeneralMatch;
527     bool m_DoRubiscoText;
528     bool m_IndexerVersion;
529     bool m_genomeSubmission;
530     bool m_UseEntrez;
531     bool m_IgnoreExceptions;             // ignore exceptions when validating translation
532     bool m_ValidateInferenceAccessions;  // check that accessions in inferences are valid
533     bool m_ReportSpliceAsError;
534     bool m_DoTaxLookup;
535     bool m_DoBarcodeTests;
536     bool m_RefSeqConventions;
537     bool m_CollectLocusTags; // collect locus tags for use in special formatted reports
538     bool m_SeqSubmitParent; // some errors are suppressed if this is run on a newly created submission
539     bool m_GenerateGoldenFile;
540     bool m_CompareVDJCtoCDS;
541 
542     // flags calculated by examining data in record
543     bool m_IsStandaloneAnnot;
544     bool m_NoPubs;                  // Suppress no pub error if true
545     bool m_NoCitSubPubs;            // Suppress no cit-sub pub error if true
546     bool m_NoBioSource;             // Suppress no organism error if true
547     bool m_IsGPS;
548     bool m_IsGED;
549     bool m_IsPDB;
550     bool m_IsPatent;
551     bool m_IsRefSeq;
552     bool m_IsEmbl;
553     bool m_IsDdbj;
554     bool m_IsTPE;
555     bool m_IsNC;
556     bool m_IsNG;
557     bool m_IsNM;
558     bool m_IsNP;
559     bool m_IsNR;
560     bool m_IsNZ;
561     bool m_IsNS;
562     bool m_IsNT;
563     bool m_IsNW;
564     bool m_IsWP;
565     bool m_IsXR;
566     bool m_IsGI;
567     bool m_IsGB;
568     bool m_IsGpipe;
569     bool m_IsLocalGeneralOnly;
570     bool m_HasGiOrAccnVer;
571     bool m_IsGenomic;
572     bool m_IsSeqSubmit;
573     bool m_IsSmallGenomeSet;
574     bool m_FeatLocHasGI;
575     bool m_ProductLocHasGI;
576     bool m_GeneHasLocusTag;
577     bool m_ProteinHasGeneralID;
578     bool m_IsINSDInSep;
579     bool m_FarFetchFailure;
580     bool m_IsGeneious;
581 
582     CBioSourceKind m_biosource_kind;
583 
584     bool m_IsTbl2Asn;
585 
586     // seq ids contained within the orignal seq entry.
587     // (used to check for far location)
588     vector< CConstRef<CSeq_id> >    m_InitialSeqIds;
589     // Bioseqs without source (should be considered only if m_NoSource is false)
590     vector< CConstRef<CBioseq> >    m_BioseqWithNoSource;
591 
592     // list of publication serial numbers
593     vector< int > m_PubSerialNumbers;
594 
595     // legal dbxref database strings
596     static const string legalDbXrefs[];
597     static const string legalRefSeqDbXrefs[];
598 
599     // source qulalifiers prefixes
600     static const string sm_SourceQualPrefixes[];
601     static auto_ptr<CTextFsa> m_SourceQualTags;
602 
603     CValidator::TProgressCallback m_PrgCallback;
604     CValidator::CProgressInfo     m_PrgInfo;
605     SIZE_TYPE   m_NumAlign;
606     SIZE_TYPE   m_NumAnnot;
607     SIZE_TYPE   m_NumBioseq;
608     SIZE_TYPE   m_NumBioseq_set;
609     SIZE_TYPE   m_NumDesc;
610     SIZE_TYPE   m_NumDescr;
611     SIZE_TYPE   m_NumFeat;
612     SIZE_TYPE   m_NumGraph;
613 
614     SIZE_TYPE   m_NumMisplacedFeatures;
615     SIZE_TYPE   m_NumSmallGenomeSetMisplaced;
616     SIZE_TYPE   m_NumMisplacedGraphs;
617     SIZE_TYPE   m_NumGenes;
618     SIZE_TYPE   m_NumGeneXrefs;
619 
620     SIZE_TYPE   m_NumTpaWithHistory;
621     SIZE_TYPE   m_NumTpaWithoutHistory;
622 
623     SIZE_TYPE   m_NumPseudo;
624     SIZE_TYPE   m_NumPseudogene;
625 
626     size_t      m_NumTopSetSiblings;
627 
628     // Taxonomy service interface.
629     ITaxon3* m_taxon;
630     ITaxon3* x_GetTaxonService();
631 
632 };
633 
634 
635 END_SCOPE(validator)
636 END_SCOPE(objects)
637 END_NCBI_SCOPE
638 
639 #endif  /* VALIDATOR___VALIDERROR_IMP__HPP */
640