1 #ifndef VALIDATOR___VALIDATOR__HPP 2 #define VALIDATOR___VALIDATOR__HPP 3 4 /* $Id: validator.hpp 632625 2021-06-03 17:38:33Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko...... 30 * 31 * File Description: 32 * Validates CSeq_entries and CSeq_submits 33 * ....... 34 * 35 */ 36 #include <corelib/ncbistd.hpp> 37 #include <corelib/ncbidiag.hpp> 38 #include <serial/objectinfo.hpp> 39 #include <serial/serialbase.hpp> 40 #include <objects/general/Name_std.hpp> 41 #include <objects/biblio/Author.hpp> 42 #include <objects/valerr/ValidErrItem.hpp> 43 #include <objects/valerr/ValidError.hpp> 44 #include <objects/taxon3/itaxon3.hpp> 45 #include <objmgr/scope.hpp> 46 47 #include <map> 48 49 50 BEGIN_NCBI_SCOPE 51 BEGIN_SCOPE(objects) 52 53 class CSeq_entry; 54 class CSeq_entry_Handle; 55 class CSeq_submit; 56 class CSeq_annot; 57 class CSeq_annot_Handle; 58 class CSeq_feat; 59 class CBioSource; 60 class CPubdesc; 61 class CBioseq; 62 class CSeqdesc; 63 class CObjectManager; 64 class CScope; 65 class CDbtag; 66 67 BEGIN_SCOPE(validator) 68 69 70 class NCBI_VALIDATOR_EXPORT CValidator : public CObject 71 { 72 public: 73 74 enum EValidOptions { 75 eVal_non_ascii = 0x1, 76 eVal_no_context = 0x2, 77 eVal_val_align = 0x4, 78 eVal_val_exons = 0x8, 79 eVal_ovl_pep_err = 0x10, 80 eVal_seqsubmit_parent = 0x20, 81 eVal_need_isojta = 0x40, 82 eVal_validate_id_set = 0x80, 83 eVal_remote_fetch = 0x100, 84 eVal_far_fetch_mrna_products = 0x200, 85 eVal_far_fetch_cds_products = 0x400, 86 eVal_locus_tag_general_match = 0x800, 87 eVal_do_rubisco_test = 0x1000, 88 eVal_indexer_version = 0x2000, 89 eVal_use_entrez = 0x4000, 90 eVal_inference_accns = 0x8000, 91 eVal_ignore_exceptions = 0x10000, 92 eVal_report_splice_as_error = 0x20000, 93 eVal_latlon_check_state = 0x40000, 94 eVal_latlon_ignore_water = 0x80000, 95 eVal_genome_submission = 0x100000, 96 eVal_do_tax_lookup = 0x200000, 97 eVal_do_barcode_tests = 0x400000, 98 eVal_refseq_conventions = 0x800000, 99 eVal_collect_locus_tags = 0x1000000, 100 eVal_generate_golden_file = 0x2000000, 101 eVal_compare_vdjc_to_cds = 0x4000000, 102 }; 103 104 // Constructor / Destructor 105 // If no taxon service is provided, a CTAxon3 client will 106 // be created. 107 CValidator(CObjectManager& objmgr, AutoPtr<ITaxon3> taxon = NULL); 108 ~CValidator(void); 109 110 // If many validations are being done without changing the underlying 111 // objects, a cache can be given to speed up the process. 112 // 113 // Only functions that have some use for pCache or that directly 114 // or indirectly call some function that uses pCache should 115 // take pCache as an argument. 116 // 117 // (note PIMPL idiom to be as opaque as possible) 118 class CCacheImpl; 119 class NCBI_VALIDATOR_EXPORT CCache : public CObject { 120 public: 121 CCache(void); 122 123 // the containing CCache object owns the m_impl 124 auto_ptr<CCacheImpl> m_impl; 125 }; 126 static CRef<CCache> MakeEmptyCache(void); 127 128 // Validation methods: 129 // It is possible to validate objects of types CSeq_entry, CSeq_submit 130 // or CSeq_annot. In addition to the object to validate the user must 131 // provide the scope which contain that object, and validation options 132 // that are created by OR'ing EValidOptions (as specified above) 133 134 // Validate Seq-entry. 135 // If provding a scope the Seq-entry must be a 136 // top-level Seq-entry in that scope. 137 CConstRef<CValidError> Validate(const CSeq_entry& se, CScope* scope = 0, 138 Uint4 options = 0); 139 CConstRef<CValidError> Validate(const CSeq_entry_Handle& se, 140 Uint4 options = 0); 141 // Validate Seq-submit. 142 // Validates each of the Seq-entry contained in the submission. 143 CConstRef<CValidError> Validate(const CSeq_submit& ss, CScope* scope = 0, 144 Uint4 options = 0); 145 // Validate Seq-annot 146 // Validates stand alone Seq-annot objects. This will supress any 147 // check on the context of the annotaions. 148 CConstRef<CValidError> Validate(const CSeq_annot_Handle& sa, 149 Uint4 options = 0); 150 151 // Validate Seq-feat 152 CConstRef<CValidError> Validate(const CSeq_feat& feat, 153 CScope *scope = 0, 154 Uint4 options = 0); 155 // old call 156 NCBI_DEPRECATED 157 CConstRef<CValidError> Validate(const CSeq_feat& feat, 158 Uint4 options = 0); 159 160 // Validate BioSource 161 CConstRef<CValidError> Validate(const CBioSource& src, 162 CScope *scope = 0, 163 Uint4 options = 0); 164 // old call 165 NCBI_DEPRECATED 166 CConstRef<CValidError> Validate(const CBioSource& src, 167 Uint4 options = 0); 168 169 // Validate Pubdesc 170 CConstRef<CValidError> Validate(const CPubdesc& pubdesc, 171 CScope *scope = 0, 172 Uint4 options = 0); 173 // old call 174 NCBI_DEPRECATED 175 CConstRef<CValidError> Validate(const CPubdesc& pubdesc, 176 Uint4 options = 0); 177 178 // Validate Seqdesc 179 CConstRef<CValidError> Validate(const CSeqdesc& desc, 180 const CSeq_entry& ctx, 181 Uint4 options = 0); 182 183 // externally callable tests 184 CConstRef<CValidError> GetTSANStretchErrors(const CSeq_entry_Handle& se); 185 CConstRef<CValidError> GetTSACDSOnMinusStrandErrors (const CSeq_entry_Handle& se); 186 CConstRef<CValidError> GetTSAConflictingBiomolTechErrors (const CSeq_entry_Handle& se); 187 CConstRef<CValidError> GetTSANStretchErrors(const CBioseq& seq); 188 CConstRef<CValidError> GetTSACDSOnMinusStrandErrors (const CSeq_feat& f, const CBioseq& seq); 189 CConstRef<CValidError> GetTSAConflictingBiomolTechErrors (const CBioseq& seq); 190 191 static bool BadCharsInAuthorName(const string& str, bool allowcomma, bool allowperiod, bool last); 192 static bool BadCharsInAuthorLastName(const string& str); 193 static bool BadCharsInAuthorFirstName(const string& str); 194 static bool BadCharsInAuthorInitials(const string& str); 195 static bool BadCharsInAuthorSuffix(const string& str); 196 static string BadCharsInAuthor(const CName_std& author, bool& last_is_bad); 197 static string BadCharsInAuthor(const CAuthor& author, bool& last_is_bad); 198 199 static bool IsSeqLocCorrectlyOrdered(const CSeq_loc& loc, CScope& scope); 200 static bool DoesSeqLocContainAdjacentIntervals(const CSeq_loc& loc, CScope& scope); 201 static bool DoesSeqLocContainDuplicateIntervals(const CSeq_loc& loc, CScope& scope); 202 203 // progress reporting 204 class NCBI_VALIDATOR_EXPORT CProgressInfo 205 { 206 public: 207 enum EState { 208 eState_not_set, 209 eState_Initializing, 210 eState_Align, 211 eState_Annot, 212 eState_Bioseq, 213 eState_Bioseq_set, 214 eState_Desc, 215 eState_Descr, 216 eState_Feat, 217 eState_Graph 218 }; 219 CProgressInfo(void)220 CProgressInfo(void): m_State(eState_not_set), 221 m_Total(0), m_TotalDone(0), 222 m_Current(0), m_CurrentDone(0), 223 m_UserData(0) 224 {} GetState(void) const225 EState GetState(void) const { return m_State; } GetTotal(void) const226 size_t GetTotal(void) const { return m_Total; } GetTotalDone(void) const227 size_t GetTotalDone(void) const { return m_TotalDone; } GetCurrent(void) const228 size_t GetCurrent(void) const { return m_Current; } GetCurrentDone(void) const229 size_t GetCurrentDone(void) const { return m_CurrentDone; } GetUserData(void) const230 void* GetUserData(void) const { return m_UserData; } 231 232 private: 233 friend class CValidError_imp; 234 235 EState m_State; 236 size_t m_Total; 237 size_t m_TotalDone; 238 size_t m_Current; 239 size_t m_CurrentDone; 240 void* m_UserData; 241 }; 242 243 typedef bool (*TProgressCallback)(CProgressInfo*); 244 void SetProgressCallback(TProgressCallback callback, void* user_data = 0); 245 246 static EErrType ConvertCode(CSubSource::ELatLonCountryErr errcode); 247 248 enum EDbxrefValid { 249 eValid = 0, 250 eDbHasSgml = 1, 251 eTagHasSgml = 2, 252 eContainsSpace = 4, 253 eNotForSource = 8, 254 eOnlyForSource = 16, 255 eOnlyForRefSeq = 32, 256 eRefSeqNotForSource = 64, 257 eBadCapitalization = 128, 258 eUnrecognized = 256 259 }; 260 typedef int TDbxrefValidFlags; 261 static TDbxrefValidFlags IsValidDbxref(const CDbtag& xref, bool is_biosource, bool is_refseq_or_gps); 262 263 private: 264 // Prohibit copy constructor & assignment operator 265 CValidator(const CValidator&); 266 CValidator& operator= (const CValidator&); 267 268 // Services belong here, in the outside class 269 // and are passed into the implementation. 270 CRef<CObjectManager> m_ObjMgr; 271 AutoPtr<ITaxon3> m_Taxon; 272 273 TProgressCallback m_PrgCallback; 274 void* m_UserData; 275 }; 276 277 278 // Inline Functions: 279 280 281 END_SCOPE(validator) 282 END_SCOPE(objects) 283 END_NCBI_SCOPE 284 285 #endif /* VALIDATOR___VALIDATOR__HPP */ 286