1 /* $Id: single_feat_validator.hpp 632625 2021-06-03 17:38:33Z ivanov $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 *` 26 * Author: Colleen Bollin, Jonathan Kans, Clifford Clausen, Aaron Ucko...... 27 * 28 * File Description: 29 * For validating individual features 30 * ....... 31 * 32 */ 33 34 #ifndef VALIDATOR___SINGLE_FEAT_VALIDATOR__HPP 35 #define VALIDATOR___SINGLE_FEAT_VALIDATOR__HPP 36 37 #include <corelib/ncbistd.hpp> 38 #include <corelib/ncbi_autoinit.hpp> 39 40 #include <objmgr/scope.hpp> 41 #include <objmgr/feat_ci.hpp> // for CMappedFeat 42 #include <objects/seqfeat/Seq_feat.hpp> 43 #include <objects/seqfeat/SeqFeatData.hpp> 44 45 #include <objtools/validator/validator.hpp> 46 #include <objtools/validator/feature_match.hpp> 47 #include <objtools/validator/validerror_base.hpp> 48 #include <objtools/validator/translation_problems.hpp> 49 #include <objtools/validator/splice_problems.hpp> 50 51 #include <objmgr/util/feature.hpp> 52 53 BEGIN_NCBI_SCOPE 54 BEGIN_SCOPE(objects) 55 56 class CSeq_entry; 57 class CCit_sub; 58 class CCit_art; 59 class CCit_gen; 60 class CSeq_feat; 61 class CBioseq; 62 class CSeqdesc; 63 class CSeq_annot; 64 class CTrna_ext; 65 class CProt_ref; 66 class CSeq_loc; 67 class CFeat_CI; 68 class CPub_set; 69 class CAuth_list; 70 class CTitle; 71 class CMolInfo; 72 class CUser_object; 73 class CSeqdesc_CI; 74 //class CDense_diag; 75 //class CDense_seg; 76 //class CSeq_align_set; 77 class CPubdesc; 78 class CBioSource; 79 class COrg_ref; 80 class CDelta_seq; 81 class CGene_ref; 82 class CCdregion; 83 class CRNA_ref; 84 class CImp_feat; 85 class CSeq_literal; 86 class CBioseq_Handle; 87 class CSeq_feat_Handle; 88 class CCountries; 89 class CInferencePrefixList; 90 class CComment_set; 91 92 BEGIN_SCOPE(validator) 93 94 class CValidError_imp; 95 class CValidError_base; 96 97 // ============================= Validate SeqFeat ============================ 98 99 100 class CSingleFeatValidator 101 { 102 public: 103 CSingleFeatValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp); ~CSingleFeatValidator()104 virtual ~CSingleFeatValidator() {}; 105 106 virtual void Validate(); 107 108 static bool x_HasSeqLocBond(const CSeq_feat& feat); 109 static bool s_IsPseudo(const CSeq_feat& feat); 110 static bool s_IsPseudo(const CGene_ref& ref); 111 static bool s_BioseqHasRefSeqThatStartsWithPrefix(CBioseq_Handle bsh, string prefix); 112 static bool s_GeneRefsAreEquivalent(const CGene_ref& g1, const CGene_ref& g2, string& label); 113 static void s_RemoveDuplicateGoTerms(CUser_object::TData& field_list); 114 static void s_RemoveDuplicateGoTerms(CSeq_feat& feat); 115 protected: 116 const CSeq_feat& m_Feat; 117 CScope& m_Scope; 118 CValidError_imp& m_Imp; 119 CBioseq_Handle m_LocationBioseq; 120 CBioseq_Handle m_ProductBioseq; 121 bool m_ProductIsFar; 122 123 void PostErr(EDiagSev sv, EErrType et, const string& msg); 124 125 CBioseq_Handle x_GetBioseqByLocation(const CSeq_loc& loc); 126 void x_ValidateSeqFeatProduct(); 127 void x_ValidateBothStrands(); 128 static void x_LocHasStrandBoth(const CSeq_loc& feat, bool& both, bool& both_rev); 129 void x_ValidateGeneId(); 130 void x_ValidateFeatCit(); 131 virtual void x_ValidateFeatComment(); 132 void x_ValidateGbQual(const CGb_qual& qual); 133 void x_ReportECNumFileStatus(); 134 135 void x_ValidateExtUserObject(); 136 137 bool x_HasNamedQual(const string& qual_name); 138 139 void x_ValidateFeatPartialness(); 140 virtual void x_ValidateSeqFeatLoc(); 141 bool x_AllowFeatureToMatchGapExactly(); 142 143 typedef enum { 144 eLocationGapNoProblems = 0, 145 eLocationGapFeatureMatchesGap = 1, 146 eLocationGapContainedInGap = 4, 147 eLocationGapContainedInGapOfNs = 8, 148 eLocationGapInternalIntervalEndpointInGap = 16, 149 eLocationGapCrossesUnknownGap = 32, 150 eLocationGapMostlyNs = 64 151 152 } ELocationGap; 153 154 static size_t x_CalculateLocationGaps(CBioseq_Handle bsh, const CSeq_loc& loc, vector<TSeqPos>& gap_starts); 155 static bool x_IsMostlyNs(const CSeq_loc& loc, CBioseq_Handle bsh); 156 static size_t x_FindStartOfGap(CBioseq_Handle bsh, int pos, CScope* scope); 157 158 void x_ValidateExcept(); 159 virtual void x_ValidateExceptText(const string& text); 160 161 void x_ValidateGbquals(); 162 virtual bool x_ReportOrigProteinId(); 163 void x_ValidateRptUnitVal(const string& val, const string& key); 164 void x_ValidateRptUnitSeqVal(const string& val, const string& key); 165 void x_ValidateRptUnitRangeVal(const string& val); 166 void x_ValidateLabelVal(const string& val); 167 void x_ValidateCompareVal(const string& val); 168 void x_ValidateReplaceQual(const string& key, const string& qual_str, const string& val); 169 170 CBioseq_Handle x_GetFeatureProduct(bool look_far, bool& is_far); 171 CBioseq_Handle x_GetFeatureProduct(bool& is_far); 172 173 void ValidateCharactersInField (string value, string field_name); 174 void ValidateSplice(bool gene_pseudo, bool check_all); 175 EDiagSev x_SeverityForConsensusSplice(void); 176 void x_ReportSpliceProblems(const CSpliceProblems& problems, const string& label); 177 void x_ReportDonorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem& problem, const string& label); 178 void x_ReportAcceptorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem& problem, const string& label); 179 180 static bool x_BioseqHasNmAccession (CBioseq_Handle bsh); 181 182 void x_ValidateNonImpFeat(); 183 void x_ValidateGeneXRef(); 184 void x_ValidateGeneFeaturePair(const CSeq_feat& gene); 185 void x_ValidateNonGene(); 186 void x_ValidateOldLocusTag(const string& old_locus_tag); 187 188 void x_ValidateImpFeatLoc(); 189 void x_ValidateImpFeatQuals(); 190 void x_ValidateSeqFeatDataType(); 191 192 void x_ReportPseudogeneConflict(CConstRef <CSeq_feat> gene); 193 void x_ValidateLocusTagGeneralMatch(CConstRef <CSeq_feat> gene); 194 195 void x_CheckForNonAsciiCharacters(); 196 }; 197 198 class CCdregionValidator : public CSingleFeatValidator 199 { 200 public: 201 CCdregionValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp); 202 203 virtual void Validate() override; 204 205 protected: 206 virtual void x_ValidateFeatComment() override; 207 virtual void x_ValidateExceptText(const string& text) override; 208 void x_ValidateQuals(); 209 virtual bool x_ReportOrigProteinId() override; 210 static bool IsPlastid(int genome); 211 void x_ValidateGeneticCode(); 212 void x_ValidateBadMRNAOverlap(); 213 bool x_HasGoodParent(); 214 virtual void x_ValidateSeqFeatLoc() override; 215 void x_ValidateFarProducts(); 216 void x_ValidateCDSPeptides(); 217 void x_ValidateCDSPartial(); 218 bool x_BypassCDSPartialTest() const; 219 bool x_CDS3primePartialTest() const; 220 bool x_CDS5primePartialTest() const; 221 222 bool x_IsProductMisplaced() const; 223 224 typedef pair<TSeqPos, TSeqPos> TShortIntron; 225 static vector<TShortIntron> x_GetShortIntrons(const CSeq_loc& loc, CScope* scope); 226 static void x_AddToIntronList(vector<TShortIntron>& shortlist, TSeqPos last_start, TSeqPos last_stop, TSeqPos this_start, TSeqPos this_stop); 227 static string x_FormatIntronInterval(const TShortIntron& interval); 228 void ReportShortIntrons(); 229 230 void x_ValidateTrans(); 231 void x_ValidateCodebreak(); 232 void x_ReportTranslationProblems(const CCDSTranslationProblems& problems); 233 void x_ReportTranslExceptProblems(const CCDSTranslationProblems::TTranslExceptProblems& problems, bool has_exception); 234 void x_ReportTranslationMismatches(const CCDSTranslationProblems::TTranslationMismatches& mismatches); 235 string MapToNTCoords(TSeqPos pos); 236 237 void x_ValidateProductId(); 238 void x_ValidateConflict(); 239 void x_ValidateCommonProduct(); 240 241 void x_ValidateProductPartials(); 242 void x_ValidateParentPartialness(const CSeq_loc& parent_loc, const string& parent_name); 243 void x_ValidateParentPartialness(); 244 bool x_CheckPosNOrGap(TSeqPos pos, const CSeqVector& vec); 245 246 CConstRef<CSeq_feat> m_Gene; 247 bool m_GeneIsPseudo; 248 }; 249 250 251 class CGeneValidator : public CSingleFeatValidator 252 { 253 public: 254 using CSingleFeatValidator::CSingleFeatValidator; 255 256 virtual void Validate() override; 257 258 protected: 259 virtual void x_ValidateExceptText(const string& text) override; 260 void x_ValidateOperon(); 261 void x_ValidateMultiIntervalGene(); 262 bool x_AllIntervalGapsAreMobileElements(); 263 }; 264 265 266 class CProtValidator : public CSingleFeatValidator 267 { 268 public: 269 using CSingleFeatValidator::CSingleFeatValidator; 270 271 virtual void Validate() override; 272 273 protected: 274 void x_CheckForEmpty(); 275 void x_ReportUninformativeNames(); 276 void x_ValidateECNumbers(); 277 void x_ValidateProteinName(const string& prot_name); 278 void x_ValidateMolinfoPartials(); 279 }; 280 281 282 class CRNAValidator : public CSingleFeatValidator 283 { 284 public: CRNAValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)285 CRNAValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp) 286 : CSingleFeatValidator(feat, scope, imp) {} 287 using CSingleFeatValidator::CSingleFeatValidator; 288 289 virtual void Validate() override; 290 291 protected: 292 void x_ValidateRnaProduct(bool feat_pseudo, bool pseudo); 293 void x_ValidateRnaProductType(); 294 void x_ReportRNATranslationProblems(size_t problems, size_t mismatches); 295 void x_ValidateRnaTrans(); 296 297 // for tRNAs 298 void x_ValidateAnticodon(const CSeq_loc& anticodon); 299 void x_ValidateTrnaCodons(); 300 void x_ValidateTrnaType(); 301 void x_ValidateTrnaData(); 302 void x_ValidateTrnaOverlap(); 303 304 }; 305 306 307 class CMRNAValidator : public CRNAValidator 308 { 309 public: 310 CMRNAValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp); 311 312 virtual void Validate() override; 313 314 protected: 315 // for mRNAs 316 void x_ValidateMrna(); 317 void x_ValidateCommonMRNAProduct(); 318 void x_ValidateMrnaGene(); 319 320 CConstRef<CSeq_feat> m_Gene; 321 bool m_GeneIsPseudo; 322 bool m_FeatIsPseudo; 323 }; 324 325 326 class CPubFeatValidator : public CSingleFeatValidator 327 { 328 public: 329 using CSingleFeatValidator::CSingleFeatValidator; 330 331 virtual void Validate() override; 332 333 protected: 334 }; 335 336 337 class CSrcFeatValidator : public CSingleFeatValidator 338 { 339 public: 340 using CSingleFeatValidator::CSingleFeatValidator; 341 342 virtual void Validate() override; 343 344 protected: 345 }; 346 347 348 class CPolyASiteValidator : public CSingleFeatValidator 349 { 350 public: 351 using CSingleFeatValidator::CSingleFeatValidator; 352 353 virtual void x_ValidateSeqFeatLoc() override; 354 355 protected: 356 }; 357 358 359 class CPolyASignalValidator : public CSingleFeatValidator 360 { 361 public: 362 using CSingleFeatValidator::CSingleFeatValidator; 363 364 virtual void x_ValidateSeqFeatLoc() override; 365 366 protected: 367 }; 368 369 370 class CPeptideValidator : public CSingleFeatValidator 371 { 372 public: 373 CPeptideValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp); 374 375 virtual void Validate() override; 376 377 protected: 378 void x_ValidatePeptideOnCodonBoundary(); 379 380 CConstRef<CSeq_feat> m_CDS; 381 }; 382 383 384 385 class CExonValidator : public CSingleFeatValidator 386 { 387 public: 388 using CSingleFeatValidator::CSingleFeatValidator; 389 390 virtual void Validate() override; 391 392 protected: 393 }; 394 395 396 class CIntronValidator : public CSingleFeatValidator 397 { 398 public: 399 using CSingleFeatValidator::CSingleFeatValidator; 400 401 virtual void Validate() override; 402 403 protected: 404 bool x_IsIntronShort(bool pseudo); 405 }; 406 407 408 class CMiscFeatValidator : public CSingleFeatValidator 409 { 410 public: 411 using CSingleFeatValidator::CSingleFeatValidator; 412 413 virtual void Validate() override; 414 415 protected: 416 }; 417 418 419 class CAssemblyGapValidator : public CSingleFeatValidator 420 { 421 public: 422 using CSingleFeatValidator::CSingleFeatValidator; 423 424 virtual void Validate() override; 425 426 protected: 427 }; 428 429 430 class CGapFeatValidator : public CSingleFeatValidator 431 { 432 public: 433 using CSingleFeatValidator::CSingleFeatValidator; 434 435 virtual void Validate() override; 436 437 protected: 438 }; 439 440 441 class CImpFeatValidator : public CSingleFeatValidator 442 { 443 public: 444 using CSingleFeatValidator::CSingleFeatValidator; 445 virtual void Validate() override; 446 protected: 447 }; 448 449 CSingleFeatValidator* FeatValidatorFactory(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp); 450 451 452 END_SCOPE(validator) 453 END_SCOPE(objects) 454 END_NCBI_SCOPE 455 456 #endif /* VALIDATOR___SINGLE_FEAT_VALIDATOR__HPP */ 457