/*  $Id: validerror_imp.hpp 632625 2021-06-03 17:38:33Z ivanov $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko......
 *
 * File Description:
 *   Private classes and definitions for the validator
 *   .......
31 * 32 */ 33 34 #ifndef VALIDATOR___VALIDERROR_IMP__HPP 35 #define VALIDATOR___VALIDERROR_IMP__HPP 36 37 #include <corelib/ncbistd.hpp> 38 #include <corelib/ncbi_autoinit.hpp> 39 40 #include <objmgr/scope.hpp> 41 #include <objmgr/feat_ci.hpp> // for CMappedFeat 42 #include <objmgr/util/seq_loc_util.hpp> 43 #include <objects/seqset/Bioseq_set.hpp> 44 #include <objects/seq/GIBB_mol.hpp> 45 #include <util/strsearch.hpp> 46 #include <objects/misc/sequence_macros.hpp> 47 #include <objects/seqfeat/Seq_feat.hpp> 48 #include <objects/seqfeat/SeqFeatData.hpp> 49 #include <objects/seqalign/Seq_align.hpp> 50 #include <objects/seqalign/Std_seg.hpp> 51 #include <objects/seqalign/Packed_seg.hpp> 52 #include <objects/valid/Comment_set.hpp> 53 #include <objects/valid/Comment_rule.hpp> 54 #include <objects/taxon3/taxon3.hpp> 55 56 #include <objtools/validator/tax_validation_and_cleanup.hpp> 57 #include <objtools/validator/utilities.hpp> 58 #include <objtools/validator/feature_match.hpp> 59 #include <objtools/validator/cache_impl.hpp> 60 #include <objtools/validator/gene_cache.hpp> 61 62 #include <objtools/alnmgr/sparse_aln.hpp> 63 64 #include <objmgr/util/create_defline.hpp> 65 66 #include <objmgr/util/feature.hpp> 67 68 BEGIN_NCBI_SCOPE 69 BEGIN_SCOPE(objects) 70 71 class CSeq_entry; 72 class CCit_sub; 73 class CCit_art; 74 class CCit_gen; 75 class CSeq_feat; 76 class CBioseq; 77 class CSeqdesc; 78 class CSeq_annot; 79 class CTrna_ext; 80 class CProt_ref; 81 class CSeq_loc; 82 class CFeat_CI; 83 class CPub_set; 84 class CAuth_list; 85 class CTitle; 86 class CMolInfo; 87 class CUser_object; 88 class CSeqdesc_CI; 89 class CSeq_graph; 90 class CMappedGraph; 91 class CDense_diag; 92 class CDense_seg; 93 class CSeq_align_set; 94 class CPubdesc; 95 class CBioSource; 96 class COrg_ref; 97 class CByte_graph; 98 class CDelta_seq; 99 class CGene_ref; 100 class CCdregion; 101 class CRNA_ref; 102 class CImp_feat; 103 class CSeq_literal; 104 class CBioseq_Handle; 105 class CSeq_feat_Handle; 
106 class CCountries; 107 class CInferencePrefixList; 108 class CComment_set; 109 class CTaxon3_reply; 110 class ITaxon3; 111 class CT3Error; 112 113 BEGIN_SCOPE(validator) 114 115 class CValidError_desc; 116 class CValidError_descr; 117 118 119 // =========================== Central Validation ========================== 120 121 // CValidError_imp provides the entry point to the validation process. 122 // It calls upon the various validation classes to perform validation of 123 // each part. 124 // The class holds all the data for the validation process. 125 class NCBI_VALIDATOR_EXPORT CValidError_imp 126 { 127 public: 128 typedef map<int, int> TCount; 129 130 // Interface to be used by the CValidError class 131 132 CValidError_imp(CObjectManager& objmgr, CValidError* errors, 133 Uint4 options = 0); 134 135 // Constructor allowing over-ride of Services 136 // Namely, the taxonomy service. 137 // NB: ITaxon is owned by CValidator. 138 CValidError_imp(CObjectManager& objmgr, CValidError* errors, 139 ITaxon3* taxon, Uint4 options = 0); 140 141 // Destructor 142 virtual ~CValidError_imp(void); 143 144 void SetOptions (Uint4 options); 145 void SetErrorRepository (CValidError* errors); 146 void Reset(void); 147 148 // Validation methods 149 bool Validate(const CSeq_entry& se, const CCit_sub* cs = 0, 150 CScope* scope = 0); 151 bool Validate( 152 const CSeq_entry_Handle& seh, const CCit_sub* cs = 0); 153 void Validate( 154 const CSeq_submit& ss, CScope* scope = 0); 155 void Validate(const CSeq_annot_Handle& sa); 156 157 void Validate(const CSeq_feat& feat, CScope* scope = 0); 158 void Validate(const CBioSource& src, CScope* scope = 0); 159 void Validate(const CPubdesc& pubdesc, CScope* scope = 0); 160 void Validate(const CSeqdesc& desc, const CSeq_entry& ctx); 161 void ValidateSubAffil(const CAffil::TStd& std, const CSerialObject& obj, const CSeq_entry *ctx); 162 void ValidateAffil(const CAffil::TStd& std, const CSerialObject& obj, const CSeq_entry *ctx); 163 164 bool 
GetTSANStretchErrors(const CSeq_entry_Handle& se); 165 bool GetTSACDSOnMinusStrandErrors (const CSeq_entry_Handle& se); 166 bool GetTSAConflictingBiomolTechErrors (const CSeq_entry_Handle& se); 167 bool GetTSANStretchErrors(const CBioseq& seq); 168 bool GetTSACDSOnMinusStrandErrors (const CSeq_feat& f, const CBioseq& seq); 169 bool GetTSAConflictingBiomolTechErrors (const CBioseq& seq); 170 171 172 void SetProgressCallback(CValidator::TProgressCallback callback, 173 void* user_data); 174 175 void SetTSE(const CSeq_entry_Handle& seh); 176 ShouldSubdivide() const177 bool ShouldSubdivide() const { if (m_NumTopSetSiblings > 1000) return true; else return false; } 178 179 public: 180 // interface to be used by the various validation classes 181 182 // typedefs: 183 typedef const CSeq_feat& TFeat; 184 typedef const CBioseq& TBioseq; 185 typedef const CBioseq_set& TSet; 186 typedef const CSeqdesc& TDesc; 187 typedef const CSeq_annot& TAnnot; 188 typedef const CSeq_graph& TGraph; 189 typedef const CSeq_align& TAlign; 190 typedef const CSeq_entry& TEntry; 191 typedef map < const CSeq_feat*, const CSeq_annot* >& TFeatAnnotMap; 192 193 // Posts errors. 
194 void PostErr(EDiagSev sv, EErrType et, const string& msg, 195 const CSerialObject& obj); 196 void PostErr(EDiagSev sv, EErrType et, const string& msg, TDesc ds); 197 void PostErr(EDiagSev sv, EErrType et, const string& msg, TFeat ft); 198 void PostErr(EDiagSev sv, EErrType et, const string& msg, TBioseq sq); 199 void PostErr(EDiagSev sv, EErrType et, const string& msg, TEntry ctx, 200 TDesc ds); 201 void PostErr(EDiagSev sv, EErrType et, const string& msg, TSet set); 202 void PostErr(EDiagSev sv, EErrType et, const string& msg, TAnnot annot); 203 void PostErr(EDiagSev sv, EErrType et, const string& msg, TGraph graph); 204 void PostErr(EDiagSev sv, EErrType et, const string& msg, TBioseq sq, 205 TGraph graph); 206 void PostErr(EDiagSev sv, EErrType et, const string& msg, TAlign align); 207 void PostErr(EDiagSev sv, EErrType et, const string& msg, TEntry entry); 208 void PostErr(EDiagSev sv, EErrType et, const string& msg, const CBioSource& src); 209 void PostErr(EDiagSev sv, EErrType et, const string& msg, const COrg_ref& org); 210 void PostErr(EDiagSev sv, EErrType et, const string& msg, const CPubdesc& src); 211 void PostErr(EDiagSev sv, EErrType et, const string& msg, const CSeq_submit& ss); 212 void PostObjErr (EDiagSev sv, EErrType et, const string& msg, const CSerialObject& obj, const CSeq_entry *ctx = 0); 213 void PostBadDateError (EDiagSev sv, const string& msg, int flags, const CSerialObject& obj, const CSeq_entry *ctx = 0); 214 215 void HandleTaxonomyError(const CT3Error& error, const string& host, const COrg_ref& orf); 216 void HandleTaxonomyError(const CT3Error& error, const EErrType type, const CSeq_feat& feat); 217 void HandleTaxonomyError(const CT3Error& error, const EErrType type, const CSeqdesc& desc, const CSeq_entry* entry); 218 219 bool RaiseGenomeSeverity(EErrType et); 220 221 // General use validation methods 222 void ValidatePubdesc(const CPubdesc& pub, const CSerialObject& obj, const CSeq_entry *ctx = 0); 223 void ValidateBioSource(const 
CBioSource& bsrc, const CSerialObject& obj, const CSeq_entry *ctx = 0); 224 void ValidatePCRReactionSet(const CPCRReactionSet& pcrset, const CSerialObject& obj, const CSeq_entry *ctx = 0); 225 void ValidateSubSource(const CSubSource& subsrc, const CSerialObject& obj, const CSeq_entry *ctx = 0, const bool isViral = false); 226 void ValidateOrgRef(const COrg_ref& orgref, const CSerialObject& obj, const CSeq_entry *ctx); 227 void ValidateTaxNameOrgname(const string& taxname, const COrgName& orgname, const CSerialObject& obj, const CSeq_entry *ctx); 228 void ValidateOrgName(const COrgName& orgname, const bool has_taxon, const CSerialObject& obj, const CSeq_entry *ctx); 229 void ValidateOrgModVoucher(const COrgMod& orgmod, const CSerialObject& obj, const CSeq_entry *ctx); 230 void ValidateBioSourceForSeq(const CBioSource& bsrc, const CSerialObject& obj, const CSeq_entry *ctx, const CBioseq_Handle& bsh); 231 232 void ValidateLatLonCountry(string countryname, string lat_lon, const CSerialObject& obj, const CSeq_entry *ctx); 233 234 static bool IsSyntheticConstruct (const CBioSource& src); 235 bool IsArtificial (const CBioSource& src); 236 bool IsOtherDNA(const CBioseq_Handle& bsh) const; 237 void ValidateSeqLoc(const CSeq_loc& loc, const CBioseq_Handle& seq, bool report_abutting, 238 const string& prefix, const CSerialObject& obj, bool lowerSev = false); 239 240 void ValidateSeqLocIds(const CSeq_loc& loc, const CSerialObject& obj); 241 static bool IsInOrganelleSmallGenomeSet(const CSeq_id& id, CScope& scope); 242 static bool BadMultipleSequenceLocation(const CSeq_loc& loc, CScope& scope); 243 void CheckMultipleIds(const CSeq_loc& loc, const CSerialObject& obj); 244 void ValidateDbxref(const CDbtag& xref, const CSerialObject& obj, 245 bool biosource = false, const CSeq_entry *ctx = 0); 246 void ValidateDbxref(TDbtags& xref_list, const CSerialObject& obj, 247 bool biosource = false, const CSeq_entry *ctx = 0); 248 void ValidateCitSub(const CCit_sub& cs, const CSerialObject& 
obj, const CSeq_entry *ctx = 0); 249 void ValidateTaxonomy(const CSeq_entry& se); 250 void ValidateOrgRefs(CTaxValidationAndCleanup& tval); 251 void ValidateSpecificHost(CTaxValidationAndCleanup& tval); 252 void ValidateStrain(CTaxValidationAndCleanup& tval); 253 void ValidateSpecificHost (const CSeq_entry& se); 254 void ValidateTentativeName(const CSeq_entry& se); 255 void ValidateTaxonomy(const COrg_ref& org, int genome = CBioSource::eGenome_unknown); 256 void ValidateMultipleTaxIds(const CSeq_entry_Handle& seh); 257 void ValidateCitations (const CSeq_entry_Handle& seh); 258 bool x_IsFarFetchFailure (const CSeq_loc& loc); 259 260 // getters GetScope(void)261 inline CScope* GetScope(void) { return m_Scope; } GetCache(void)262 inline CCacheImpl & GetCache(void) { return m_cache; } 263 GetCachedGene(const CSeq_feat * f)264 inline CConstRef<CSeq_feat> GetCachedGene(const CSeq_feat* f) { return m_GeneCache.GetGeneFromCache(f, *m_Scope); } GetGeneCache()265 inline CGeneCache& GetGeneCache() { return m_GeneCache; } 266 267 // flags derived from options parameter IsNonASCII(void) const268 bool IsNonASCII(void) const { return m_NonASCII; } IsSuppressContext(void) const269 bool IsSuppressContext(void) const { return m_SuppressContext; } IsValidateAlignments(void) const270 bool IsValidateAlignments(void) const { return m_ValidateAlignments; } IsValidateExons(void) const271 bool IsValidateExons(void) const { return m_ValidateExons; } IsOvlPepErr(void) const272 bool IsOvlPepErr(void) const { return m_OvlPepErr; } IsRequireTaxonID(void) const273 bool IsRequireTaxonID(void) const { return !m_SeqSubmitParent; } IsSeqSubmitParent(void) const274 bool IsSeqSubmitParent(void) const { return m_SeqSubmitParent; } IsRequireISOJTA(void) const275 bool IsRequireISOJTA(void) const { return m_RequireISOJTA; } IsValidateIdSet(void) const276 bool IsValidateIdSet(void) const { return m_ValidateIdSet; } IsRemoteFetch(void) const277 bool IsRemoteFetch(void) const { return m_RemoteFetch; } 
IsFarFetchMRNAproducts(void) const278 bool IsFarFetchMRNAproducts(void) const { return m_FarFetchMRNAproducts; } IsFarFetchCDSproducts(void) const279 bool IsFarFetchCDSproducts(void) const { return m_FarFetchCDSproducts; } IsLocusTagGeneralMatch(void) const280 bool IsLocusTagGeneralMatch(void) const { return m_LocusTagGeneralMatch; } DoRubiscoTest(void) const281 bool DoRubiscoTest(void) const { return m_DoRubiscoText; } IsIndexerVersion(void) const282 bool IsIndexerVersion(void) const { return m_IndexerVersion; } IsGenomeSubmission(void) const283 bool IsGenomeSubmission(void) const { return m_genomeSubmission; } UseEntrez(void) const284 bool UseEntrez(void) const { return m_UseEntrez; } DoTaxLookup(void) const285 bool DoTaxLookup(void) const { return m_DoTaxLookup; } ValidateInferenceAccessions(void) const286 bool ValidateInferenceAccessions(void) const { return m_ValidateInferenceAccessions; } IgnoreExceptions(void) const287 bool IgnoreExceptions(void) const { return m_IgnoreExceptions; } ReportSpliceAsError(void) const288 bool ReportSpliceAsError(void) const { return m_ReportSpliceAsError; } IsLatLonCheckState(void) const289 bool IsLatLonCheckState(void) const { return m_LatLonCheckState; } IsLatLonIgnoreWater(void) const290 bool IsLatLonIgnoreWater(void) const { return m_LatLonIgnoreWater; } IsRefSeqConventions(void) const291 bool IsRefSeqConventions(void) const { return m_RefSeqConventions; } GenerateGoldenFile(void) const292 bool GenerateGoldenFile(void) const { return m_GenerateGoldenFile; } DoCompareVDJCtoCDS(void) const293 bool DoCompareVDJCtoCDS(void) const { return m_CompareVDJCtoCDS; } 294 295 296 // flags calculated by examining data in record IsStandaloneAnnot(void) const297 inline bool IsStandaloneAnnot(void) const { return m_IsStandaloneAnnot; } IsNoPubs(void) const298 inline bool IsNoPubs(void) const { return m_NoPubs; } IsNoCitSubPubs(void) const299 inline bool IsNoCitSubPubs(void) const { return m_NoCitSubPubs; } IsNoBioSource(void) const300 
inline bool IsNoBioSource(void) const { return m_NoBioSource; } IsGPS(void) const301 inline bool IsGPS(void) const { return m_IsGPS; } IsGED(void) const302 inline bool IsGED(void) const { return m_IsGED; } IsPDB(void) const303 inline bool IsPDB(void) const { return m_IsPDB; } IsPatent(void) const304 inline bool IsPatent(void) const { return m_IsPatent; } IsRefSeq(void) const305 inline bool IsRefSeq(void) const { return m_IsRefSeq || m_RefSeqConventions; } IsEmbl(void) const306 inline bool IsEmbl(void) const { return m_IsEmbl; } IsDdbj(void) const307 inline bool IsDdbj(void) const { return m_IsDdbj; } IsTPE(void) const308 inline bool IsTPE(void) const { return m_IsTPE; } IsNC(void) const309 inline bool IsNC(void) const { return m_IsNC; } IsNG(void) const310 inline bool IsNG(void) const { return m_IsNG; } IsNM(void) const311 inline bool IsNM(void) const { return m_IsNM; } IsNP(void) const312 inline bool IsNP(void) const { return m_IsNP; } IsNR(void) const313 inline bool IsNR(void) const { return m_IsNR; } IsNZ(void) const314 inline bool IsNZ(void) const { return m_IsNZ; } IsNS(void) const315 inline bool IsNS(void) const { return m_IsNS; } IsNT(void) const316 inline bool IsNT(void) const { return m_IsNT; } IsNW(void) const317 inline bool IsNW(void) const { return m_IsNW; } IsWP(void) const318 inline bool IsWP(void) const { return m_IsWP; } IsXR(void) const319 inline bool IsXR(void) const { return m_IsXR; } IsGI(void) const320 inline bool IsGI(void) const { return m_IsGI; } IsGpipe(void) const321 inline bool IsGpipe(void) const { return m_IsGpipe; } 322 bool IsHtg(void) const; IsLocalGeneralOnly(void) const323 inline bool IsLocalGeneralOnly(void) const { return m_IsLocalGeneralOnly; } HasGiOrAccnVer(void) const324 inline bool HasGiOrAccnVer(void) const { return m_HasGiOrAccnVer; } IsGenomic(void) const325 inline bool IsGenomic(void) const { return m_IsGenomic; } IsSeqSubmit(void) const326 inline bool IsSeqSubmit(void) const { return m_IsSeqSubmit; } 
IsSmallGenomeSet(void) const327 inline bool IsSmallGenomeSet(void) const { return m_IsSmallGenomeSet; } 328 bool IsNoncuratedRefSeq(const CBioseq& seq, EDiagSev& sev); IsGenbank(void) const329 inline bool IsGenbank(void) const { return m_IsGB; } DoesAnyFeatLocHaveGI(void) const330 inline bool DoesAnyFeatLocHaveGI(void) const { return m_FeatLocHasGI; } DoesAnyProductLocHaveGI(void) const331 inline bool DoesAnyProductLocHaveGI(void) const { return m_ProductLocHasGI; } DoesAnyGeneHaveLocusTag(void) const332 inline bool DoesAnyGeneHaveLocusTag(void) const { return m_GeneHasLocusTag; } DoesAnyProteinHaveGeneralID(void) const333 inline bool DoesAnyProteinHaveGeneralID(void) const { return m_ProteinHasGeneralID; } IsINSDInSep(void) const334 inline bool IsINSDInSep(void) const { return m_IsINSDInSep; } IsGeneious(void) const335 inline bool IsGeneious(void) const { return m_IsGeneious; } BioSourceKind() const336 inline const CBioSourceKind& BioSourceKind() const { return m_biosource_kind; } 337 338 // counting number of misplaced features ResetMisplacedFeatureCount(void)339 inline void ResetMisplacedFeatureCount (void) { m_NumMisplacedFeatures = 0; } IncrementMisplacedFeatureCount(void)340 inline void IncrementMisplacedFeatureCount (void) { m_NumMisplacedFeatures++; } AddToMisplacedFeatureCount(SIZE_TYPE num)341 inline void AddToMisplacedFeatureCount (SIZE_TYPE num) { m_NumMisplacedFeatures += num; } 342 343 // counting number of small genome set misplaced features ResetSmallGenomeSetMisplacedCount(void)344 inline void ResetSmallGenomeSetMisplacedCount (void) { m_NumSmallGenomeSetMisplaced = 0; } IncrementSmallGenomeSetMisplacedCount(void)345 inline void IncrementSmallGenomeSetMisplacedCount (void) { m_NumSmallGenomeSetMisplaced++; } AddToSmallGenomeSetMisplacedCount(SIZE_TYPE num)346 inline void AddToSmallGenomeSetMisplacedCount (SIZE_TYPE num) { m_NumSmallGenomeSetMisplaced += num; } 347 348 // counting number of misplaced graphs ResetMisplacedGraphCount(void)349 inline 
void ResetMisplacedGraphCount (void) { m_NumMisplacedGraphs = 0; } IncrementMisplacedGraphCount(void)350 inline void IncrementMisplacedGraphCount (void) { m_NumMisplacedGraphs++; } AddToMisplacedGraphCount(SIZE_TYPE num)351 inline void AddToMisplacedGraphCount (SIZE_TYPE num) { m_NumMisplacedGraphs += num; } 352 353 // counting number of genes and gene xrefs ResetGeneCount(void)354 inline void ResetGeneCount (void) { m_NumGenes = 0; } IncrementGeneCount(void)355 inline void IncrementGeneCount (void) { m_NumGenes++; } AddToGeneCount(SIZE_TYPE num)356 inline void AddToGeneCount (SIZE_TYPE num) { m_NumGenes += num; } ResetGeneXrefCount(void)357 inline void ResetGeneXrefCount (void) { m_NumGeneXrefs = 0; } IncrementGeneXrefCount(void)358 inline void IncrementGeneXrefCount (void) { m_NumGeneXrefs++; } AddToGeneXrefCount(SIZE_TYPE num)359 inline void AddToGeneXrefCount (SIZE_TYPE num) { m_NumGeneXrefs += num; } 360 361 // counting sequences with and without TPA history ResetTpaWithHistoryCount(void)362 inline void ResetTpaWithHistoryCount (void) { m_NumTpaWithHistory = 0; } IncrementTpaWithHistoryCount(void)363 inline void IncrementTpaWithHistoryCount (void) { m_NumTpaWithHistory++; } AddToTpaWithHistoryCount(SIZE_TYPE num)364 inline void AddToTpaWithHistoryCount (SIZE_TYPE num) { m_NumTpaWithHistory += num; } ResetTpaWithoutHistoryCount(void)365 inline void ResetTpaWithoutHistoryCount (void) { m_NumTpaWithoutHistory = 0; } IncrementTpaWithoutHistoryCount(void)366 inline void IncrementTpaWithoutHistoryCount (void) { m_NumTpaWithoutHistory++; } AddToTpaWithoutHistoryCount(SIZE_TYPE num)367 inline void AddToTpaWithoutHistoryCount (SIZE_TYPE num) { m_NumTpaWithoutHistory += num; } 368 369 // counting number of Pseudos and Pseudogenes ResetPseudoCount(void)370 inline void ResetPseudoCount (void) { m_NumPseudo = 0; } IncrementPseudoCount(void)371 inline void IncrementPseudoCount (void) { m_NumPseudo++; } AddToPseudoCount(SIZE_TYPE num)372 inline void AddToPseudoCount 
(SIZE_TYPE num) { m_NumPseudo += num; } ResetPseudogeneCount(void)373 inline void ResetPseudogeneCount (void) { m_NumPseudogene = 0; } IncrementPseudogeneCount(void)374 inline void IncrementPseudogeneCount (void) { m_NumPseudogene++; } AddToPseudogeneCount(SIZE_TYPE num)375 inline void AddToPseudogeneCount (SIZE_TYPE num) { m_NumPseudogene += num; } 376 377 // set flag for farfetchfailure SetFarFetchFailure(void)378 inline void SetFarFetchFailure (void) { m_FarFetchFailure = true; } 379 GetTSE(void) const380 const CSeq_entry& GetTSE(void) const { return *m_TSE; }; GetTSEH(void)381 const CSeq_entry_Handle & GetTSEH(void) { return m_TSEH; } GetTSE_Handle(void)382 const CTSE_Handle & GetTSE_Handle(void) { return 383 (m_TSEH ? m_TSEH.GetTSE_Handle() : CCacheImpl::kEmptyTSEHandle); } GetSeqAnnot(void)384 const CConstRef<CSeq_annot>& GetSeqAnnot(void) { return m_SeqAnnot; } 385 386 void AddBioseqWithNoPub(const CBioseq& seq); 387 void AddBioseqWithNoBiosource(const CBioseq& seq); 388 void AddProtWithoutFullRef(const CBioseq_Handle& seq); 389 static bool IsWGSIntermediate(const CBioseq& seq); 390 static bool IsTSAIntermediate(const CBioseq& seq); 391 void ReportMissingPubs(const CSeq_entry& se, const CCit_sub* cs); 392 void ReportMissingBiosource(const CSeq_entry& se); 393 394 CConstRef<CSeq_feat> GetCDSGivenProduct(const CBioseq& seq); 395 CConstRef<CSeq_feat> GetmRNAGivenProduct(const CBioseq& seq); 396 const CSeq_entry* GetAncestor(const CBioseq& seq, CBioseq_set::EClass clss); 397 bool IsSerialNumberInComment(const string& comment); 398 399 bool IsTransgenic(const CBioSource& bsrc); 400 401 bool RequireLocalProduct(const CSeq_id* sid) const; 402 403 private: 404 405 // Setup common options during consturction; 406 void x_Init(Uint4 options); 407 408 // This is so we can temporarily set m_Scope in a function 409 // and be sure that it will be set to its old value when we're done 410 class CScopeRestorer { 411 public: CScopeRestorer(CRef<CScope> & scope)412 
CScopeRestorer( CRef<CScope> &scope ) : 413 m_scopeToRestore(scope), m_scopeOriginalValue(scope) { } 414 ~CScopeRestorer(void)415 ~CScopeRestorer(void) { m_scopeToRestore = m_scopeOriginalValue; } 416 private: 417 CRef<CScope> &m_scopeToRestore; 418 CRef<CScope> m_scopeOriginalValue; 419 }; 420 421 // Prohibit copy constructor & assignment operator 422 CValidError_imp(const CValidError_imp&); 423 CValidError_imp& operator= (const CValidError_imp&); 424 425 void Setup(const CSeq_entry_Handle& seh); 426 void Setup(const CSeq_annot_Handle& sa); 427 CSeq_entry_Handle Setup(const CBioseq& seq); 428 void SetScope(const CSeq_entry& se); 429 430 void ValidateSubmitBlock(const CSubmit_block& block, const CSeq_submit& ss); 431 432 void InitializeSourceQualTags(); 433 void ValidateSourceQualTags(const string& str, const CSerialObject& obj, const CSeq_entry *ctx = 0); 434 435 bool IsMixedStrands(const CSeq_loc& loc); 436 437 void ValidatePubGen(const CCit_gen& gen, const CSerialObject& obj, const CSeq_entry *ctx = 0); 438 void ValidatePubArticle(const CCit_art& art, TEntrezId uid, const CSerialObject& obj, const CSeq_entry *ctx = 0); 439 void ValidatePubArticleNoPMID(const CCit_art& art, const CSerialObject& obj, const CSeq_entry *ctx = 0); 440 void x_ValidatePages(const string& pages, const CSerialObject& obj, const CSeq_entry *ctx = 0); 441 void ValidateAuthorList(const CAuth_list::C_Names& names, const CSerialObject& obj, const CSeq_entry *ctx = 0); 442 void ValidateAuthorsInPubequiv (const CPub_equiv& pe, const CSerialObject& obj, const CSeq_entry *ctx = 0); 443 void ValidatePubHasAuthor(const CPubdesc& pubdesc, const CSerialObject& obj, const CSeq_entry *ctx = 0); 444 445 bool HasName(const CAuth_list& authors); 446 bool HasTitle(const CTitle& title); 447 bool HasIsoJTA(const CTitle& title); 448 449 void FindEmbeddedScript(const CSerialObject& obj); 450 void FindNonAsciiText (const CSerialObject& obj); 451 void FindCollidingSerialNumbers (const CSerialObject& obj); 452 
453 454 void GatherTentativeName (const CSeq_entry& se, vector<CConstRef<CSeqdesc> >& usr_descs, vector<CConstRef<CSeq_entry> >& desc_ctxs, vector<CConstRef<CSeq_feat> >& usr_feats); 455 456 static bool s_IsSalmonellaGenus(const string& taxname); 457 EDiagSev x_SalmonellaErrorLevel(); 458 459 typedef struct tagSLocCheck { 460 bool chk; 461 bool unmarked_strand; 462 bool mixed_strand; 463 bool has_other; 464 bool has_not_other; 465 CConstRef<CSeq_id> id_cur; 466 CConstRef<CSeq_id> id_prv; 467 const CSeq_interval *int_cur = 0; 468 const CSeq_interval *int_prv = 0; 469 ENa_strand strand_cur; 470 ENa_strand strand_prv; 471 string prefix; 472 } SLocCheck; 473 474 void x_InitLocCheck(SLocCheck& lc, const string& prefix); 475 void x_CheckForStrandChange(SLocCheck& lc); 476 void x_CheckLoc(const CSeq_loc& loc, const CSerialObject& obj, SLocCheck& lc, bool lowerSev = false); 477 void x_CheckPackedInt(const CPacked_seqint& packed_int, 478 SLocCheck& lc, 479 const CSerialObject& obj); 480 bool x_CheckSeqInt(CConstRef<CSeq_id>& id_cur, 481 const CSeq_interval * int_cur, 482 ENa_strand& strand_cur, 483 const CSerialObject& obj); 484 void x_ReportInvalidFuzz(const CPacked_seqint& packed_int, const CSerialObject& obj); 485 void x_ReportInvalidFuzz(const CSeq_interval& interval, const CSerialObject& obj); 486 void x_ReportInvalidFuzz(const CSeq_point& point, const CSerialObject& obj); 487 void x_ReportInvalidFuzz(const CSeq_loc& loc, const CSerialObject& obj); 488 void x_ReportPCRSeqProblem(const string& primer_kind, 489 char badch, 490 const CSerialObject& obj, 491 const CSeq_entry *ctx); 492 void x_CheckPCRPrimer(const CPCRPrimer& primer, 493 const string& primer_kind, 494 const CSerialObject& obj, 495 const CSeq_entry *ctx); 496 497 void x_DoBarcodeTests(CSeq_entry_Handle seh); 498 499 bool x_DowngradeForMissingAffil(const CCit_sub& cs); 500 501 CRef<CObjectManager> m_ObjMgr; 502 CRef<CScope> m_Scope; 503 CConstRef<CSeq_entry> m_TSE; 504 CSeq_entry_Handle m_TSEH; 505 
CConstRef<CSeq_annot> m_SeqAnnot; 506 507 CCacheImpl m_cache; 508 CGeneCache m_GeneCache; 509 510 // error repoitory 511 CValidError* m_ErrRepository; 512 513 // flags derived from options parameter 514 bool m_NonASCII; // User sets if Non ASCII char found 515 bool m_SuppressContext; // Include context in errors if true 516 bool m_ValidateAlignments; // Validate Alignments if true 517 bool m_ValidateExons; // Check exon feature splice sites 518 bool m_OvlPepErr; // Peptide overlap error if true, else warn 519 bool m_RequireISOJTA; // Journal requires ISO JTA 520 bool m_ValidateIdSet; // validate update against ID set in database 521 bool m_RemoteFetch; // Remote fetch enabled? 522 bool m_FarFetchMRNAproducts; // Remote fetch mRNA products 523 bool m_FarFetchCDSproducts; // Remote fetch proteins 524 bool m_LatLonCheckState; 525 bool m_LatLonIgnoreWater; 526 bool m_LocusTagGeneralMatch; 527 bool m_DoRubiscoText; 528 bool m_IndexerVersion; 529 bool m_genomeSubmission; 530 bool m_UseEntrez; 531 bool m_IgnoreExceptions; // ignore exceptions when validating translation 532 bool m_ValidateInferenceAccessions; // check that accessions in inferences are valid 533 bool m_ReportSpliceAsError; 534 bool m_DoTaxLookup; 535 bool m_DoBarcodeTests; 536 bool m_RefSeqConventions; 537 bool m_CollectLocusTags; // collect locus tags for use in special formatted reports 538 bool m_SeqSubmitParent; // some errors are suppressed if this is run on a newly created submission 539 bool m_GenerateGoldenFile; 540 bool m_CompareVDJCtoCDS; 541 542 // flags calculated by examining data in record 543 bool m_IsStandaloneAnnot; 544 bool m_NoPubs; // Suppress no pub error if true 545 bool m_NoCitSubPubs; // Suppress no cit-sub pub error if true 546 bool m_NoBioSource; // Suppress no organism error if true 547 bool m_IsGPS; 548 bool m_IsGED; 549 bool m_IsPDB; 550 bool m_IsPatent; 551 bool m_IsRefSeq; 552 bool m_IsEmbl; 553 bool m_IsDdbj; 554 bool m_IsTPE; 555 bool m_IsNC; 556 bool m_IsNG; 557 bool 
m_IsNM; 558 bool m_IsNP; 559 bool m_IsNR; 560 bool m_IsNZ; 561 bool m_IsNS; 562 bool m_IsNT; 563 bool m_IsNW; 564 bool m_IsWP; 565 bool m_IsXR; 566 bool m_IsGI; 567 bool m_IsGB; 568 bool m_IsGpipe; 569 bool m_IsLocalGeneralOnly; 570 bool m_HasGiOrAccnVer; 571 bool m_IsGenomic; 572 bool m_IsSeqSubmit; 573 bool m_IsSmallGenomeSet; 574 bool m_FeatLocHasGI; 575 bool m_ProductLocHasGI; 576 bool m_GeneHasLocusTag; 577 bool m_ProteinHasGeneralID; 578 bool m_IsINSDInSep; 579 bool m_FarFetchFailure; 580 bool m_IsGeneious; 581 582 CBioSourceKind m_biosource_kind; 583 584 bool m_IsTbl2Asn; 585 586 // seq ids contained within the orignal seq entry. 587 // (used to check for far location) 588 vector< CConstRef<CSeq_id> > m_InitialSeqIds; 589 // Bioseqs without source (should be considered only if m_NoSource is false) 590 vector< CConstRef<CBioseq> > m_BioseqWithNoSource; 591 592 // list of publication serial numbers 593 vector< int > m_PubSerialNumbers; 594 595 // legal dbxref database strings 596 static const string legalDbXrefs[]; 597 static const string legalRefSeqDbXrefs[]; 598 599 // source qulalifiers prefixes 600 static const string sm_SourceQualPrefixes[]; 601 static auto_ptr<CTextFsa> m_SourceQualTags; 602 603 CValidator::TProgressCallback m_PrgCallback; 604 CValidator::CProgressInfo m_PrgInfo; 605 SIZE_TYPE m_NumAlign; 606 SIZE_TYPE m_NumAnnot; 607 SIZE_TYPE m_NumBioseq; 608 SIZE_TYPE m_NumBioseq_set; 609 SIZE_TYPE m_NumDesc; 610 SIZE_TYPE m_NumDescr; 611 SIZE_TYPE m_NumFeat; 612 SIZE_TYPE m_NumGraph; 613 614 SIZE_TYPE m_NumMisplacedFeatures; 615 SIZE_TYPE m_NumSmallGenomeSetMisplaced; 616 SIZE_TYPE m_NumMisplacedGraphs; 617 SIZE_TYPE m_NumGenes; 618 SIZE_TYPE m_NumGeneXrefs; 619 620 SIZE_TYPE m_NumTpaWithHistory; 621 SIZE_TYPE m_NumTpaWithoutHistory; 622 623 SIZE_TYPE m_NumPseudo; 624 SIZE_TYPE m_NumPseudogene; 625 626 size_t m_NumTopSetSiblings; 627 628 // Taxonomy service interface. 
629 ITaxon3* m_taxon; 630 ITaxon3* x_GetTaxonService(); 631 632 }; 633 634 635 END_SCOPE(validator) 636 END_SCOPE(objects) 637 END_NCBI_SCOPE 638 639 #endif /* VALIDATOR___VALIDERROR_IMP__HPP */ 640