1 /*  $Id: single_feat_validator.hpp 632625 2021-06-03 17:38:33Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
 *
26  * Author:  Colleen Bollin, Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  *   For validating individual features
30  *   .......
31  *
32  */
33 
34 #ifndef VALIDATOR___SINGLE_FEAT_VALIDATOR__HPP
35 #define VALIDATOR___SINGLE_FEAT_VALIDATOR__HPP
36 
37 #include <corelib/ncbistd.hpp>
38 #include <corelib/ncbi_autoinit.hpp>
39 
40 #include <objmgr/scope.hpp>
41 #include <objmgr/feat_ci.hpp>  // for CMappedFeat
42 #include <objects/seqfeat/Seq_feat.hpp>
43 #include <objects/seqfeat/SeqFeatData.hpp>
44 
45 #include <objtools/validator/validator.hpp>
46 #include <objtools/validator/feature_match.hpp>
47 #include <objtools/validator/validerror_base.hpp>
48 #include <objtools/validator/translation_problems.hpp>
49 #include <objtools/validator/splice_problems.hpp>
50 
51 #include <objmgr/util/feature.hpp>
52 
53 BEGIN_NCBI_SCOPE
54 BEGIN_SCOPE(objects)
55 
// Forward declarations of types from the enclosing objects scope, listed
// alphabetically.  Some of these may also be fully defined by the headers
// included above; the forward declarations are kept for files that rely
// on them.
class CAuth_list;
class CBioSource;
class CBioseq;
class CBioseq_Handle;
class CCdregion;
class CCit_art;
class CCit_gen;
class CCit_sub;
class CComment_set;
class CCountries;
class CDelta_seq;
class CFeat_CI;
class CGene_ref;
class CImp_feat;
class CInferencePrefixList;
class CMolInfo;
class COrg_ref;
class CProt_ref;
class CPub_set;
class CPubdesc;
class CRNA_ref;
class CSeq_annot;
class CSeq_entry;
class CSeq_feat;
class CSeq_feat_Handle;
class CSeq_literal;
class CSeq_loc;
class CSeqdesc;
class CSeqdesc_CI;
class CTitle;
class CTrna_ext;
class CUser_object;
// Forward declarations that were already disabled in this header:
//class CDense_diag;
//class CDense_seg;
//class CSeq_align_set;
92 BEGIN_SCOPE(validator)
93 
// Forward declarations of validator-scope types referenced below.
class CValidError_base;
class CValidError_imp;
96 
97 // =============================  Validate SeqFeat  ============================
98 
99 
100 class CSingleFeatValidator
101 {
102 public:
103     CSingleFeatValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp);
~CSingleFeatValidator()104     virtual ~CSingleFeatValidator() {};
105 
106     virtual void Validate();
107 
108     static bool x_HasSeqLocBond(const CSeq_feat& feat);
109     static bool s_IsPseudo(const CSeq_feat& feat);
110     static bool s_IsPseudo(const CGene_ref& ref);
111     static bool s_BioseqHasRefSeqThatStartsWithPrefix(CBioseq_Handle bsh, string prefix);
112     static bool s_GeneRefsAreEquivalent(const CGene_ref& g1, const CGene_ref& g2, string& label);
113     static void s_RemoveDuplicateGoTerms(CUser_object::TData& field_list);
114     static void s_RemoveDuplicateGoTerms(CSeq_feat& feat);
115 protected:
116     const CSeq_feat& m_Feat;
117     CScope& m_Scope;
118     CValidError_imp& m_Imp;
119     CBioseq_Handle m_LocationBioseq;
120     CBioseq_Handle m_ProductBioseq;
121     bool m_ProductIsFar;
122 
123     void PostErr(EDiagSev sv, EErrType et, const string& msg);
124 
125     CBioseq_Handle x_GetBioseqByLocation(const CSeq_loc& loc);
126     void x_ValidateSeqFeatProduct();
127     void x_ValidateBothStrands();
128     static void x_LocHasStrandBoth(const CSeq_loc& feat, bool& both, bool& both_rev);
129     void x_ValidateGeneId();
130     void x_ValidateFeatCit();
131     virtual void x_ValidateFeatComment();
132     void x_ValidateGbQual(const CGb_qual& qual);
133     void x_ReportECNumFileStatus();
134 
135     void x_ValidateExtUserObject();
136 
137     bool x_HasNamedQual(const string& qual_name);
138 
139     void x_ValidateFeatPartialness();
140     virtual void x_ValidateSeqFeatLoc();
141     bool x_AllowFeatureToMatchGapExactly();
142 
143     typedef enum {
144         eLocationGapNoProblems = 0,
145         eLocationGapFeatureMatchesGap = 1,
146         eLocationGapContainedInGap = 4,
147         eLocationGapContainedInGapOfNs = 8,
148         eLocationGapInternalIntervalEndpointInGap = 16,
149         eLocationGapCrossesUnknownGap = 32,
150         eLocationGapMostlyNs = 64
151 
152     } ELocationGap;
153 
154     static size_t x_CalculateLocationGaps(CBioseq_Handle bsh, const CSeq_loc& loc, vector<TSeqPos>& gap_starts);
155     static bool x_IsMostlyNs(const CSeq_loc& loc, CBioseq_Handle bsh);
156     static size_t x_FindStartOfGap(CBioseq_Handle bsh, int pos, CScope* scope);
157 
158     void x_ValidateExcept();
159     virtual void x_ValidateExceptText(const string& text);
160 
161     void x_ValidateGbquals();
162     virtual bool x_ReportOrigProteinId();
163     void x_ValidateRptUnitVal(const string& val, const string& key);
164     void x_ValidateRptUnitSeqVal(const string& val, const string& key);
165     void x_ValidateRptUnitRangeVal(const string& val);
166     void x_ValidateLabelVal(const string& val);
167     void x_ValidateCompareVal(const string& val);
168     void x_ValidateReplaceQual(const string& key, const string& qual_str, const string& val);
169 
170     CBioseq_Handle x_GetFeatureProduct(bool look_far, bool& is_far);
171     CBioseq_Handle x_GetFeatureProduct(bool& is_far);
172 
173     void ValidateCharactersInField (string value, string field_name);
174     void ValidateSplice(bool gene_pseudo, bool check_all);
175     EDiagSev x_SeverityForConsensusSplice(void);
176     void x_ReportSpliceProblems(const CSpliceProblems& problems, const string& label);
177     void x_ReportDonorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem& problem, const string& label);
178     void x_ReportAcceptorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem& problem, const string& label);
179 
180     static bool x_BioseqHasNmAccession (CBioseq_Handle bsh);
181 
182     void x_ValidateNonImpFeat();
183     void x_ValidateGeneXRef();
184     void x_ValidateGeneFeaturePair(const CSeq_feat& gene);
185     void x_ValidateNonGene();
186     void x_ValidateOldLocusTag(const string& old_locus_tag);
187 
188     void x_ValidateImpFeatLoc();
189     void x_ValidateImpFeatQuals();
190     void x_ValidateSeqFeatDataType();
191 
192     void x_ReportPseudogeneConflict(CConstRef <CSeq_feat> gene);
193     void x_ValidateLocusTagGeneralMatch(CConstRef <CSeq_feat> gene);
194 
195     void x_CheckForNonAsciiCharacters();
196 };
197 
198 class CCdregionValidator : public CSingleFeatValidator
199 {
200 public:
201     CCdregionValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp);
202 
203     virtual void Validate() override;
204 
205 protected:
206     virtual void x_ValidateFeatComment() override;
207     virtual void x_ValidateExceptText(const string& text) override;
208     void x_ValidateQuals();
209     virtual bool x_ReportOrigProteinId() override;
210     static bool IsPlastid(int genome);
211     void x_ValidateGeneticCode();
212     void x_ValidateBadMRNAOverlap();
213     bool x_HasGoodParent();
214     virtual void x_ValidateSeqFeatLoc() override;
215     void x_ValidateFarProducts();
216     void x_ValidateCDSPeptides();
217     void x_ValidateCDSPartial();
218     bool x_BypassCDSPartialTest() const;
219     bool x_CDS3primePartialTest() const;
220     bool x_CDS5primePartialTest() const;
221 
222     bool x_IsProductMisplaced() const;
223 
224     typedef pair<TSeqPos, TSeqPos> TShortIntron;
225     static vector<TShortIntron> x_GetShortIntrons(const CSeq_loc& loc, CScope* scope);
226     static void x_AddToIntronList(vector<TShortIntron>& shortlist, TSeqPos last_start, TSeqPos last_stop, TSeqPos this_start, TSeqPos this_stop);
227     static string x_FormatIntronInterval(const TShortIntron& interval);
228     void ReportShortIntrons();
229 
230     void x_ValidateTrans();
231     void x_ValidateCodebreak();
232     void x_ReportTranslationProblems(const CCDSTranslationProblems& problems);
233     void x_ReportTranslExceptProblems(const CCDSTranslationProblems::TTranslExceptProblems& problems, bool has_exception);
234     void x_ReportTranslationMismatches(const CCDSTranslationProblems::TTranslationMismatches& mismatches);
235     string MapToNTCoords(TSeqPos pos);
236 
237     void x_ValidateProductId();
238     void x_ValidateConflict();
239     void x_ValidateCommonProduct();
240 
241     void x_ValidateProductPartials();
242     void x_ValidateParentPartialness(const CSeq_loc& parent_loc, const string& parent_name);
243     void x_ValidateParentPartialness();
244     bool x_CheckPosNOrGap(TSeqPos pos, const CSeqVector& vec);
245 
246     CConstRef<CSeq_feat> m_Gene;
247     bool m_GeneIsPseudo;
248 };
249 
250 
251 class CGeneValidator : public CSingleFeatValidator
252 {
253 public:
254     using CSingleFeatValidator::CSingleFeatValidator;
255 
256     virtual void Validate() override;
257 
258 protected:
259     virtual void x_ValidateExceptText(const string& text) override;
260     void x_ValidateOperon();
261     void x_ValidateMultiIntervalGene();
262     bool x_AllIntervalGapsAreMobileElements();
263 };
264 
265 
266 class CProtValidator : public CSingleFeatValidator
267 {
268 public:
269     using CSingleFeatValidator::CSingleFeatValidator;
270 
271     virtual void Validate() override;
272 
273 protected:
274     void x_CheckForEmpty();
275     void x_ReportUninformativeNames();
276     void x_ValidateECNumbers();
277     void x_ValidateProteinName(const string& prot_name);
278     void x_ValidateMolinfoPartials();
279 };
280 
281 
282 class CRNAValidator : public CSingleFeatValidator
283 {
284 public:
CRNAValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)285     CRNAValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp)
286         : CSingleFeatValidator(feat, scope, imp) {}
287     using CSingleFeatValidator::CSingleFeatValidator;
288 
289     virtual void Validate() override;
290 
291 protected:
292     void x_ValidateRnaProduct(bool feat_pseudo, bool pseudo);
293     void x_ValidateRnaProductType();
294     void x_ReportRNATranslationProblems(size_t problems, size_t mismatches);
295     void x_ValidateRnaTrans();
296 
297     // for tRNAs
298     void x_ValidateAnticodon(const CSeq_loc& anticodon);
299     void x_ValidateTrnaCodons();
300     void x_ValidateTrnaType();
301     void x_ValidateTrnaData();
302     void x_ValidateTrnaOverlap();
303 
304 };
305 
306 
307 class CMRNAValidator : public CRNAValidator
308 {
309 public:
310     CMRNAValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp);
311 
312     virtual void Validate() override;
313 
314 protected:
315     // for mRNAs
316     void x_ValidateMrna();
317     void x_ValidateCommonMRNAProduct();
318     void x_ValidateMrnaGene();
319 
320     CConstRef<CSeq_feat> m_Gene;
321     bool m_GeneIsPseudo;
322     bool m_FeatIsPseudo;
323 };
324 
325 
326 class CPubFeatValidator : public CSingleFeatValidator
327 {
328 public:
329     using CSingleFeatValidator::CSingleFeatValidator;
330 
331     virtual void Validate() override;
332 
333 protected:
334 };
335 
336 
337 class CSrcFeatValidator : public CSingleFeatValidator
338 {
339 public:
340     using CSingleFeatValidator::CSingleFeatValidator;
341 
342     virtual void Validate() override;
343 
344 protected:
345 };
346 
347 
348 class CPolyASiteValidator : public CSingleFeatValidator
349 {
350 public:
351     using CSingleFeatValidator::CSingleFeatValidator;
352 
353     virtual void x_ValidateSeqFeatLoc() override;
354 
355 protected:
356 };
357 
358 
359 class CPolyASignalValidator : public CSingleFeatValidator
360 {
361 public:
362     using CSingleFeatValidator::CSingleFeatValidator;
363 
364     virtual void x_ValidateSeqFeatLoc() override;
365 
366 protected:
367 };
368 
369 
370 class CPeptideValidator : public CSingleFeatValidator
371 {
372 public:
373     CPeptideValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp);
374 
375     virtual void Validate() override;
376 
377 protected:
378     void x_ValidatePeptideOnCodonBoundary();
379 
380     CConstRef<CSeq_feat> m_CDS;
381 };
382 
383 
384 
385 class CExonValidator : public CSingleFeatValidator
386 {
387 public:
388     using CSingleFeatValidator::CSingleFeatValidator;
389 
390     virtual void Validate() override;
391 
392 protected:
393 };
394 
395 
396 class CIntronValidator : public CSingleFeatValidator
397 {
398 public:
399     using CSingleFeatValidator::CSingleFeatValidator;
400 
401     virtual void Validate() override;
402 
403 protected:
404     bool x_IsIntronShort(bool pseudo);
405 };
406 
407 
408 class CMiscFeatValidator : public CSingleFeatValidator
409 {
410 public:
411     using CSingleFeatValidator::CSingleFeatValidator;
412 
413     virtual void Validate() override;
414 
415 protected:
416 };
417 
418 
419 class CAssemblyGapValidator : public CSingleFeatValidator
420 {
421 public:
422     using CSingleFeatValidator::CSingleFeatValidator;
423 
424     virtual void Validate() override;
425 
426 protected:
427 };
428 
429 
430 class CGapFeatValidator : public CSingleFeatValidator
431 {
432 public:
433     using CSingleFeatValidator::CSingleFeatValidator;
434 
435     virtual void Validate() override;
436 
437 protected:
438 };
439 
440 
441 class CImpFeatValidator : public CSingleFeatValidator
442 {
443 public:
444     using CSingleFeatValidator::CSingleFeatValidator;
445     virtual void Validate() override;
446 protected:
447 };
448 
449 CSingleFeatValidator* FeatValidatorFactory(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp);
450 
451 
452 END_SCOPE(validator)
453 END_SCOPE(objects)
454 END_NCBI_SCOPE
455 
456 #endif  /* VALIDATOR___SINGLE_FEAT_VALIDATOR__HPP */
457