1 /*  $Id: single_feat_validator.cpp 632625 2021-06-03 17:38:33Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  *   validation of Seq_feat
30  *   .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <objtools/validator/validatorp.hpp>
37 #include <objtools/validator/single_feat_validator.hpp>
38 #include <objtools/validator/validerror_feat.hpp>
39 #include <objtools/validator/utilities.hpp>
40 #include <objtools/validator/go_term_validation_and_cleanup.hpp>
41 #include <objtools/format/items/gene_finder.hpp>
42 
43 #include <serial/serialbase.hpp>
44 
45 #include <objmgr/seqdesc_ci.hpp>
46 #include <objects/seq/seqport_util.hpp>
47 #include <util/sequtil/sequtil_convert.hpp>
48 #include <util/sgml_entity.hpp>
49 
50 
51 BEGIN_NCBI_SCOPE
52 BEGIN_SCOPE(objects)
53 BEGIN_SCOPE(validator)
54 using namespace sequence;
55 
CSingleFeatValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)56 CSingleFeatValidator::CSingleFeatValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp)
57     : m_Feat(feat), m_Scope(scope), m_Imp(imp), m_ProductIsFar(false)
58 {
59 
60 }
61 
62 
Validate()63 void CSingleFeatValidator::Validate()
64 {
65     if (!m_Feat.IsSetLocation()) {
66         PostErr(eDiag_Critical, eErr_SEQ_FEAT_MissingLocation,
67             "The feature is missing a location");
68         return;
69     }
70 
71     m_LocationBioseq = x_GetBioseqByLocation(m_Feat.GetLocation());
72     bool lowerSev = false;
73     if (m_Imp.IsRefSeq() && m_Feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_variation) {
74         if ( m_Feat.IsSetDbxref() ) {
75             ITERATE( CSeq_feat::TDbxref, it, m_Feat.GetDbxref() ) {
76                 const CDbtag& dbtag = **it;
77                 if ( dbtag.GetDb() == "dbSNP" ) {
78                     lowerSev = true;
79                 }
80             }
81         }
82     }
83     m_Imp.ValidateSeqLoc(m_Feat.GetLocation(), m_LocationBioseq,
84         (m_Feat.GetData().IsGene() || !m_Imp.IsGpipe()),
85         "Location", m_Feat, lowerSev);
86 
87     x_ValidateSeqFeatLoc();
88     x_ValidateImpFeatLoc();
89 
90     if (m_Feat.IsSetProduct()) {
91         m_ProductBioseq = x_GetFeatureProduct(m_ProductIsFar);
92         if (m_ProductBioseq == m_LocationBioseq) {
93             PostErr(eDiag_Error, eErr_SEQ_FEAT_SelfReferentialProduct, "Self-referential feature product");
94         }
95         x_ValidateSeqFeatProduct();
96     }
97 
98     x_ValidateFeatPartialness();
99 
100     x_ValidateBothStrands();
101 
102     if (m_Feat.CanGetDbxref()) {
103         m_Imp.ValidateDbxref(m_Feat.GetDbxref(), m_Feat);
104         x_ValidateGeneId();
105     }
106 
107     x_ValidateFeatComment();
108 
109     x_ValidateFeatCit();
110 
111     if (m_Feat.IsSetQual()) {
112         for (auto it = m_Feat.GetQual().begin(); it != m_Feat.GetQual().end(); it++) {
113             x_ValidateGbQual(**it);
114         }
115     }
116 
117     x_ValidateExtUserObject();
118 
119     if (m_Feat.IsSetExp_ev() && m_Feat.GetExp_ev() > 0 &&
120         !x_HasNamedQual("inference") &&
121         !x_HasNamedQual("experiment") &&
122         !m_Imp.DoesAnyFeatLocHaveGI()) {
123         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidInferenceValue,
124             "Inference or experiment qualifier missing but obsolete experimental evidence qualifier set");
125     }
126 
127     x_ValidateExcept();
128 
129     x_ValidateGbquals();
130     x_ValidateImpFeatQuals();
131 
132     x_ValidateSeqFeatDataType();
133 
134     x_ValidateNonImpFeat();
135 
136     x_ValidateNonGene();
137 
138     x_CheckForNonAsciiCharacters();
139 }
140 
PostErr(EDiagSev sv,EErrType et,const string & msg)141 void CSingleFeatValidator::PostErr(EDiagSev sv, EErrType et, const string& msg)
142 {
143     m_Imp.PostErr(sv, et, msg, m_Feat);
144 }
145 
146 
147 CBioseq_Handle
x_GetBioseqByLocation(const CSeq_loc & loc)148 CSingleFeatValidator::x_GetBioseqByLocation(const CSeq_loc& loc)
149 {
150     if (loc.IsInt() || loc.IsWhole()) {
151         return m_Scope.GetBioseqHandle(loc);
152     }
153     CBioseq_Handle rval;
154     CConstRef<CSeq_id> prev(NULL);
155     for (CSeq_loc_CI citer(loc); citer; ++citer) {
156         const CSeq_id& this_id = citer.GetSeq_id();
157         if (!prev || !prev->Equals(this_id)) {
158             rval = m_Scope.GetBioseqHandle(this_id);
159             if (rval) {
160                 break;
161             }
162             prev.Reset(&this_id);
163         }
164     }
165     return rval;
166 }
167 
168 
x_ValidateSeqFeatProduct()169 void CSingleFeatValidator::x_ValidateSeqFeatProduct()
170 {
171     if (!m_Feat.IsSetProduct()) {
172         return;
173     }
174     const CSeq_id& sid = GetId(m_Feat.GetProduct(), &m_Scope);
175 
176     switch (sid.Which()) {
177     case CSeq_id::e_Genbank:
178     case CSeq_id::e_Embl:
179     case CSeq_id::e_Ddbj:
180     case CSeq_id::e_Tpg:
181     case CSeq_id::e_Tpe:
182     case CSeq_id::e_Tpd:
183     {
184         const CTextseq_id* tsid = sid.GetTextseq_Id();
185         if (tsid != NULL) {
186             if (!tsid->CanGetAccession() && tsid->CanGetName()) {
187                 if (ValidateAccessionString(tsid->GetName(), false) == eAccessionFormat_valid) {
188                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
189                         "Feature product should not put an accession in the Textseq-id 'name' slot");
190                 } else {
191                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
192                         "Feature product should not use "
193                         "Textseq-id 'name' slot");
194                 }
195             }
196         }
197     }
198     break;
199 
200     default:
201         break;
202     }
203 
204     if (m_ProductBioseq) {
205         m_Imp.ValidateSeqLoc(m_Feat.GetProduct(), m_ProductBioseq, true, "Product", m_Feat);
206 
207         for (auto id : m_ProductBioseq.GetCompleteBioseq()->GetId()) {
208             if (id->Which() == sid.Which()) {
209                 // check to make sure capitalization is the same
210                 string from_seq = id->AsFastaString();
211                 string from_loc = sid.AsFastaString();
212                 if (!NStr::EqualCase(from_seq, from_loc) &&
213                     NStr::EqualNocase(from_seq, from_loc)) {
214                     PostErr(eDiag_Critical, eErr_SEQ_FEAT_BadProductSeqId,
215                         "Capitalization change from product location on feature to product sequence");
216                 }
217             }
218             switch (id->Which()) {
219             case CSeq_id::e_Genbank:
220             case CSeq_id::e_Embl:
221             case CSeq_id::e_Ddbj:
222             case CSeq_id::e_Tpg:
223             case CSeq_id::e_Tpe:
224             case CSeq_id::e_Tpd:
225             {
226                 const CTextseq_id* tsid = id->GetTextseq_Id();
227                 if (tsid != NULL) {
228                     if (!tsid->IsSetAccession() && tsid->IsSetName()) {
229                         if (ValidateAccessionString(tsid->GetName(), false) == eAccessionFormat_valid) {
230                             PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
231                                 "Protein bioseq has Textseq-id 'name' that "
232                                 "looks like it is derived from a nucleotide "
233                                 "accession");
234                         } else {
235                             PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
236                                 "Protein bioseq has Textseq-id 'name' and no accession");
237                         }
238                     }
239                 }
240             }
241             break;
242             default:
243                 break;
244             }
245         }
246     }
247 }
248 
249 
x_HasSeqLocBond(const CSeq_feat & feat)250 bool CSingleFeatValidator::x_HasSeqLocBond(const CSeq_feat& feat)
251 {
252     // check for bond locations - only allowable in bond feature and under special circumstances for het
253     bool is_seqloc_bond = false;
254     if (feat.IsSetData()) {
255         if (feat.GetData().IsHet()) {
256             // heterogen can have mix of bonds with just "a" point specified */
257             for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
258                 if (it.GetEmbeddingSeq_loc().IsBond()
259                     && (!it.GetEmbeddingSeq_loc().GetBond().IsSetA()
260                     || it.GetEmbeddingSeq_loc().GetBond().IsSetB())) {
261                     is_seqloc_bond = true;
262                     break;
263                 }
264             }
265         } else if (!feat.GetData().IsBond()) {
266             for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
267                 if (it.GetEmbeddingSeq_loc().IsBond()) {
268                     is_seqloc_bond = true;
269                     break;
270                 }
271             }
272         }
273     } else {
274         for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
275             if (it.GetEmbeddingSeq_loc().IsBond()) {
276                 is_seqloc_bond = true;
277                 break;
278             }
279         }
280     }
281     return is_seqloc_bond;
282 }
283 
284 
x_ValidateBothStrands()285 void CSingleFeatValidator::x_ValidateBothStrands()
286 {
287     if (!m_Feat.IsSetLocation() || CSeqFeatData::AllowStrandBoth(m_Feat.GetData().GetSubtype())) {
288         return;
289     }
290     bool both, both_rev;
291     x_LocHasStrandBoth(m_Feat.GetLocation(), both, both_rev);
292     if (both || both_rev) {
293         string suffix;
294         if (both && both_rev) {
295             suffix = "(forward and reverse)";
296         } else if (both) {
297             suffix = "(forward)";
298         } else if (both_rev) {
299             suffix = "(reverse)";
300         }
301 
302         string label = CSeqFeatData::SubtypeValueToName(m_Feat.GetData().GetSubtype());
303 
304         PostErr(eDiag_Error, eErr_SEQ_FEAT_BothStrands,
305             label + " may not be on both " + suffix + " strands");
306     }
307 }
308 
309 
x_LocHasStrandBoth(const CSeq_loc & loc,bool & both,bool & both_rev)310 void CSingleFeatValidator::x_LocHasStrandBoth(const CSeq_loc& loc, bool& both, bool& both_rev)
311 {
312     both = false;
313     both_rev = false;
314     for (CSeq_loc_CI it(loc); it; ++it) {
315         if (it.IsSetStrand()) {
316             ENa_strand s = it.GetStrand();
317             if (s == eNa_strand_both && !both) {
318                 both = true;
319             } else if (s == eNa_strand_both_rev && !both_rev) {
320                 both_rev = true;
321             }
322         }
323         if (both && both_rev) {
324             break;
325         }
326     }
327 }
328 
329 
HasGeneIdXref(const CMappedFeat & sf,const CObject_id & tag,bool & has_parent_gene_id)330 bool HasGeneIdXref(const CMappedFeat& sf, const CObject_id& tag, bool& has_parent_gene_id)
331 {
332     has_parent_gene_id = false;
333     if (!sf.IsSetDbxref()) {
334         return false;
335     }
336     ITERATE(CSeq_feat::TDbxref, it, sf.GetDbxref()) {
337         if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "GeneID")) {
338             has_parent_gene_id = true;
339             if ((*it)->IsSetTag() && (*it)->GetTag().Equals(tag)) {
340                 return true;
341             }
342         }
343     }
344     return false;
345 }
346 
347 
x_ValidateGeneId()348 void CSingleFeatValidator::x_ValidateGeneId()
349 {
350     if (!m_Feat.IsSetDbxref()) {
351         return;
352     }
353 
354     // no tse, no feat-handle
355     auto tse = m_Imp.GetTSE_Handle();
356     if (!tse) {
357         return;
358     }
359 
360     CRef<feature::CFeatTree> feat_tree(NULL);
361     CMappedFeat mf = m_Scope.GetSeq_featHandle(m_Feat);
362     ITERATE(CSeq_feat::TDbxref, it, m_Feat.GetDbxref()) {
363         if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "GeneID") &&
364             (*it)->IsSetTag()) {
365             if (!feat_tree) {
366                 feat_tree = m_Imp.GetGeneCache().GetFeatTreeFromCache(m_Feat, m_Scope);
367             }
368             if (feat_tree) {
369                 CMappedFeat parent = feat_tree->GetParent(mf);
370                 while (parent) {
371                     bool has_parent_gene_id = false;
372                     if (!HasGeneIdXref(parent, (*it)->GetTag(), has_parent_gene_id)) {
373                         if (has_parent_gene_id ||
374                             parent.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene) {
375                             PostErr(eDiag_Error, eErr_SEQ_FEAT_GeneIdMismatch,
376                                 "GeneID mismatch");
377                         }
378                     }
379                     parent = feat_tree->GetParent(parent);
380                 }
381             }
382         }
383     }
384 
385 }
386 
387 
x_ValidateFeatCit()388 void CSingleFeatValidator::x_ValidateFeatCit()
389 {
390     if (!m_Feat.IsSetCit()) {
391         return;
392     }
393 
394     if (m_Feat.GetCit().IsPub()) {
395         ITERATE(CPub_set::TPub, pi, m_Feat.GetCit().GetPub()) {
396             if ((*pi)->IsEquiv()) {
397                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryCitPubEquiv,
398                     "Citation on feature has unexpected internal Pub-equiv");
399                 return;
400             }
401         }
402     }
403 }
404 
405 
406 const string kInferenceMessage[] = {
407   "unknown error",
408   "empty inference string",
409   "bad inference prefix",
410   "bad inference body",
411   "single inference field",
412   "spaces in inference",
413   "possible comment in inference",
414   "same species misused",
415   "the value in the accession field is not legal. The only allowed value is accession.version, eg AF123456.1. Problem =",
416   "bad inference accession version",
417   "accession.version not public",
418   "bad accession type",
419   "unrecognized database",
420 };
421 
422 
x_ValidateGbQual(const CGb_qual & qual)423 void CSingleFeatValidator::x_ValidateGbQual(const CGb_qual& qual)
424 {
425     if (!qual.IsSetQual()) {
426         return;
427     }
428     /* first check for anything other than replace */
429     if (!qual.IsSetVal() || NStr::IsBlank(qual.GetVal())) {
430         if (NStr::EqualNocase(qual.GetQual(), "replace")) {
431             /* ok for replace */
432         } else {
433             PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidPunctuation,
434                 "Qualifier other than replace has just quotation marks");
435             if (NStr::EqualNocase(qual.GetQual(), "EC_number")) {
436                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberEmpty, "EC number should not be empty");
437             }
438         }
439         if (NStr::EqualNocase(qual.GetQual(), "inference")) {
440             PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidInferenceValue,
441                 "Inference qualifier problem - empty inference string ()");
442         } else if (NStr::EqualNocase(qual.GetQual(), "pseudogene")) {
443             PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidPseudoQualifier, "/pseudogene value should not be empty");
444         }
445     } else if (NStr::EqualNocase(qual.GetQual(), "EC_number")) {
446         if (!CProt_ref::IsValidECNumberFormat(qual.GetVal())) {
447             PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadEcNumberFormat,
448                 qual.GetVal() + " is not in proper EC_number format");
449         } else {
450             string ec_number = qual.GetVal();
451             CProt_ref::EECNumberStatus status = CProt_ref::GetECNumberStatus(ec_number);
452             x_ReportECNumFileStatus();
453             switch (status) {
454             case CProt_ref::eEC_deleted:
455                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_DeletedEcNumber,
456                     "EC_number " + ec_number + " was deleted");
457                 break;
458             case CProt_ref::eEC_replaced:
459                 PostErr(eDiag_Warning,
460                     CProt_ref::IsECNumberSplit(ec_number) ? eErr_SEQ_FEAT_SplitEcNumber : eErr_SEQ_FEAT_ReplacedEcNumber,
461                     "EC_number " + ec_number + " was replaced");
462                 break;
463             case CProt_ref::eEC_unknown:
464             {
465                 size_t pos = NStr::Find(ec_number, "n");
466                 if (pos == string::npos || !isdigit(ec_number.c_str()[pos + 1])) {
467                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadEcNumberValue,
468                         ec_number + " is not a legal value for qualifier EC_number");
469                 } else {
470                     PostErr(eDiag_Info, eErr_SEQ_FEAT_BadEcNumberValue,
471                         ec_number + " is not a legal preliminary value for qualifier EC_number");
472                 }
473             }
474             break;
475             default:
476                 break;
477             }
478         }
479     } else if (NStr::EqualNocase(qual.GetQual(), "inference")) {
480         /* TODO: Validate inference */
481         string val;
482         if (qual.IsSetVal()) {
483             val = qual.GetVal();
484         }
485         CValidError_feat::EInferenceValidCode rsult = CValidError_feat::ValidateInference(val, m_Imp.ValidateInferenceAccessions());
486         if (rsult > CValidError_feat::eInferenceValidCode_valid) {
487             if (NStr::IsBlank(val)) {
488                 val = "?";
489             }
490             PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidInferenceValue,
491                 "Inference qualifier problem - " + kInferenceMessage[(int)rsult] + " ("
492                 + val + ")");
493         }
494     } else if (NStr::EqualNocase(qual.GetQual(), "pseudogene")) {
495         m_Imp.IncrementPseudogeneCount();
496         if (!CGb_qual::IsValidPseudogeneValue(qual.GetVal())) {
497             m_Imp.PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidPseudoQualifier,
498                 "/pseudogene value should not be '" + qual.GetVal() + "'", m_Feat);
499         }
500     } else if (NStr::EqualNocase(qual.GetQual(), "number")) {
501         bool has_space = false;
502         bool has_char_after_space = false;
503         ITERATE(string, it, qual.GetVal()) {
504             if (isspace((unsigned char)(*it))) {
505                 has_space = true;
506             } else if (has_space) {
507                 // non-space after space
508                 has_char_after_space = true;
509                 break;
510             }
511         }
512         if (has_char_after_space) {
513             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidNumberQualifier,
514                 "Number qualifiers should not contain spaces");
515         }
516     }
517     if (qual.IsSetVal() && ContainsSgml(qual.GetVal())) {
518         PostErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
519             "feature qualifier " + qual.GetVal() + " has SGML");
520     }
521 
522 }
523 
524 
x_ReportECNumFileStatus()525 void CSingleFeatValidator::x_ReportECNumFileStatus()
526 {
527     static bool file_status_reported = false;
528 
529     if (!file_status_reported) {
530         if (CProt_ref::GetECNumAmbiguousStatus() == CProt_ref::eECFile_not_found) {
531             PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberDataMissing,
532                 "Unable to find EC number file 'ecnum_ambiguous.txt' in data directory");
533         }
534         if (CProt_ref::GetECNumDeletedStatus() == CProt_ref::eECFile_not_found) {
535             PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberDataMissing,
536                 "Unable to find EC number file 'ecnum_deleted.txt' in data directory");
537         }
538         if (CProt_ref::GetECNumReplacedStatus() == CProt_ref::eECFile_not_found) {
539             PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberDataMissing,
540                 "Unable to find EC number file 'ecnum_replaced.txt' in data directory");
541         }
542         if (CProt_ref::GetECNumSpecificStatus() == CProt_ref::eECFile_not_found) {
543             PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberDataMissing,
544                 "Unable to find EC number file 'ecnum_specific.txt' in data directory");
545         }
546         file_status_reported = true;
547     }
548 }
549 
550 
x_ValidateExtUserObject()551 void CSingleFeatValidator::x_ValidateExtUserObject()
552 {
553     vector<TGoTermError> errors = GetGoTermErrors(m_Feat);
554     for (auto it : errors) {
555         PostErr(it.first == eErr_SEQ_FEAT_DuplicateGeneOntologyTerm ? eDiag_Info : eDiag_Warning,
556             it.first, it.second);
557     }
558 }
559 
560 
x_HasNamedQual(const string & qual_name)561 bool CSingleFeatValidator::x_HasNamedQual(const string& qual_name)
562 {
563     if (!m_Feat.IsSetQual()) {
564         return false;
565     }
566     ITERATE(CSeq_feat::TQual, it, m_Feat.GetQual()) {
567         if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), qual_name)) {
568             return true;
569         }
570     }
571     return false;
572 }
573 
574 
x_ValidateFeatComment()575 void CSingleFeatValidator::x_ValidateFeatComment()
576 {
577     if (!m_Feat.IsSetComment()) {
578         return;
579     }
580     const string& comment = m_Feat.GetComment();
581     if (m_Imp.IsSerialNumberInComment(comment)) {
582         m_Imp.PostErr(eDiag_Info, eErr_SEQ_FEAT_SerialInComment,
583             "Feature comment may refer to reference by serial number - "
584             "attach reference specific comments to the reference "
585             "REMARK instead.", m_Feat);
586     }
587     if (ContainsSgml(comment)) {
588         m_Imp.PostErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
589             "feature comment " + comment + " has SGML",
590             m_Feat);
591     }
592 }
593 
594 
x_ValidateFeatPartialness()595 void CSingleFeatValidator::x_ValidateFeatPartialness()
596 {
597     unsigned int  partial_prod = eSeqlocPartial_Complete,
598         partial_loc = eSeqlocPartial_Complete;
599 
600     bool is_partial = m_Feat.IsSetPartial() && m_Feat.GetPartial();
601     partial_loc = SeqLocPartialCheck(m_Feat.GetLocation(), &m_Scope);
602 
603     if (m_ProductBioseq) {
604         partial_prod = SeqLocPartialCheck(m_Feat.GetProduct(), &(m_ProductBioseq.GetScope()));
605     }
606 
607     if ((partial_loc != eSeqlocPartial_Complete) ||
608         (partial_prod != eSeqlocPartial_Complete) ||
609         is_partial) {
610 
611         // a feature on a partial sequence should be partial -- it often isn't
612         if (!is_partial &&
613             partial_loc != eSeqlocPartial_Complete  &&
614             m_Feat.IsSetLocation() &&
615             m_Feat.GetLocation().IsWhole()) {
616             PostErr(eDiag_Info, eErr_SEQ_FEAT_PartialProblem,
617                 "On partial Bioseq, SeqFeat.partial should be TRUE");
618         }
619         // a partial feature, with complete location, but partial product
620         else if (is_partial                        &&
621             partial_loc == eSeqlocPartial_Complete  &&
622             m_Feat.IsSetProduct() &&
623             m_Feat.GetProduct().IsWhole() &&
624             partial_prod != eSeqlocPartial_Complete) {
625             if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
626                 // suppress in gpipe genomic
627             } else {
628                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
629                     "When SeqFeat.product is a partial Bioseq, SeqFeat.location "
630                     "should also be partial");
631             }
632         }
633         // gene on segmented set is now 'order', should also be partial
634         else if (m_Feat.GetData().IsGene() &&
635             !is_partial                      &&
636             partial_loc == eSeqlocPartial_Internal) {
637             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
638                 "Gene of 'order' with otherwise complete location should "
639                 "have partial flag set");
640         }
641         // inconsistent combination of partial/complete product,location,partial flag - part 1
642         else if (partial_prod == eSeqlocPartial_Complete  &&  m_Feat.IsSetProduct()) {
643             // if not local bioseq product, lower severity
644             EDiagSev sev = eDiag_Warning;
645             bool is_far_fail = false;
646             if (!m_ProductBioseq || m_ProductIsFar) {
647                 sev = eDiag_Info;
648                 if (!m_ProductBioseq && m_Imp.x_IsFarFetchFailure(m_Feat.GetProduct())) {
649                     is_far_fail = true;
650                 }
651             }
652 
653             string str("Inconsistent: Product= complete, Location= ");
654             str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
655             str += "Feature.partial= ";
656             str += is_partial ? "TRUE" : "FALSE";
657             if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
658                 // suppress for genomic gpipe
659             } else if (is_far_fail) {
660                 m_Imp.SetFarFetchFailure();
661             } else {
662                 PostErr(sev, eErr_SEQ_FEAT_PartialsInconsistent, str);
663             }
664         }
665         // inconsistent combination of partial/complete product,location,partial flag - part 2
666         else if (partial_loc == eSeqlocPartial_Complete || !is_partial) {
667             string str("Inconsistent: ");
668             if (m_Feat.IsSetProduct()) {
669                 str += "Product= ";
670                 str += (partial_prod != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
671             }
672             str += "Location= ";
673             str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
674             str += "Feature.partial= ";
675             str += is_partial ? "TRUE" : "FALSE";
676             if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
677                 // suppress for genomic gpipe
678             } else {
679                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialsInconsistent, str);
680             }
681         }
682         // 5' or 3' partial location giving unclassified partial product
683         else if ((((partial_loc & eSeqlocPartial_Start) != 0) ||
684             ((partial_loc & eSeqlocPartial_Stop) != 0)) &&
685             ((partial_prod & eSeqlocPartial_Other) != 0) &&
686             is_partial) {
687             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
688                 "5' or 3' partial location should not have unclassified"
689                 " partial in product molinfo descriptor");
690         }
691 
692         // note - in analogous C Toolkit function there is additional code for ensuring
693         // that partial intervals are partial at splice sites, gaps, or the ends of the
694         // sequence.  This has been moved to CValidError_bioseq::ValidateFeatPartialInContext.
695     }
696 
697 }
698 
699 
x_ValidateSeqFeatLoc()700 void CSingleFeatValidator::x_ValidateSeqFeatLoc()
701 {
702     if (x_HasSeqLocBond(m_Feat)) {
703         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ImproperBondLocation,
704             "Bond location should only be on bond features");
705     }
706 
707     // feature location should not be whole
708     if (m_Feat.GetLocation().IsWhole()) {
709         string prefix = "Feature";
710         if (m_Feat.IsSetData()) {
711             if (m_Feat.GetData().IsCdregion()) {
712                 prefix = "CDS";
713             } else if (m_Feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
714                 prefix = "mRNA";
715             }
716         }
717         PostErr(eDiag_Warning, eErr_SEQ_FEAT_WholeLocation, prefix + " may not have whole location");
718     }
719 
720     if (m_LocationBioseq) {
721         // look for mismatch in capitalization for IDs
722         CNcbiOstrstream os;
723         const CSeq_id *id = m_Feat.GetLocation().GetId();
724         if (id) {
725             id->WriteAsFasta(os);
726             string loc_id = CNcbiOstrstreamToString(os);
727             FOR_EACH_SEQID_ON_BIOSEQ(it, *(m_LocationBioseq.GetCompleteBioseq())) {
728                 if ((*it)->IsGi() || (*it)->IsGibbsq() || (*it)->IsGibbmt()) {
729                     continue;
730                 }
731                 CNcbiOstrstream os2;
732                 (*it)->WriteAsFasta(os2);
733                 string bs_id = CNcbiOstrstreamToString(os2);
734                 if (NStr::EqualNocase(loc_id, bs_id) && !NStr::EqualCase(loc_id, bs_id)) {
735                     PostErr(eDiag_Error, eErr_SEQ_FEAT_FeatureSeqIDCaseDifference,
736                         "Sequence identifier in feature location differs in capitalization with identifier on Bioseq");
737                 }
738             }
739         }
740         // look for protein features on the minus strand
741         if (m_LocationBioseq.IsAa() && m_Feat.GetLocation().IsSetStrand() &&
742             m_Feat.GetLocation().GetStrand() == eNa_strand_minus) {
743             PostErr(eDiag_Warning, eErr_SEQ_FEAT_MinusStrandProtein,
744                 "Feature on protein indicates negative strand");
745         }
746 
747         if (!m_Feat.GetData().IsImp()
748             || !m_Feat.GetData().GetImp().IsSetKey()
749             || !NStr::EqualNocase(m_Feat.GetData().GetImp().GetKey(), "gap")) {
750             try {
751                 vector<TSeqPos> gap_starts;
752                 size_t rval = x_CalculateLocationGaps(m_LocationBioseq, m_Feat.GetLocation(), gap_starts);
753                 bool mostly_raw_ns = x_IsMostlyNs(m_Feat.GetLocation(), m_LocationBioseq);
754 
755                 if ((rval & eLocationGapMostlyNs) || mostly_raw_ns) {
756                     PostErr(eDiag_Info, eErr_SEQ_FEAT_FeatureIsMostlyNs,
757                         "Feature contains more than 50% Ns");
758                 }
759                 for (auto gap_start : gap_starts) {
760                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap,
761                         "Feature begins or ends in gap starting at " + NStr::NumericToString(gap_start + 1));
762                 }
763                 if (rval & eLocationGapContainedInGap &&
764                     (!(rval & eLocationGapFeatureMatchesGap) || !x_AllowFeatureToMatchGapExactly())) {
765                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_FeatureInsideGap,
766                         "Feature inside sequence gap");
767                 }
768                 if (m_Feat.GetData().IsCdregion() || m_Feat.GetData().IsRna()) {
769                     if (rval & eLocationGapInternalIntervalEndpointInGap) {
770                         PostErr(eDiag_Info, eErr_SEQ_FEAT_IntervalBeginsOrEndsInGap,
771                             "Internal interval begins or ends in gap");
772                     }
773                     if (rval & eLocationGapCrossesUnknownGap) {
774                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_FeatureCrossesGap,
775                             "Feature crosses gap of unknown length");
776                     }
777                 }
778             } catch (CException &e) {
779                 PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
780                     string("Exception while checking for intervals in gaps. EXCEPTION: ") +
781                     e.what());
782             } catch (std::exception) {
783             }
784         }
785     }
786 
787 }
788 
789 
x_AllowFeatureToMatchGapExactly()790 bool CSingleFeatValidator::x_AllowFeatureToMatchGapExactly()
791 {
792     if (m_Feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature ||
793         m_Feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_assembly_gap) {
794         return true;
795     } else {
796         return false;
797     }
798 }
799 
800 
801 class CGapCache {
802 public:
803     CGapCache(const CSeq_loc& loc, CBioseq_Handle bsh);
~CGapCache()804     ~CGapCache() {};
805     bool IsUnknownGap(size_t offset);
806     bool IsKnownGap(size_t offset);
807     bool IsGap(size_t offset);
808 
809 private:
810     typedef enum {
811         eGapType_unknown = 0,
812         eGapType_known
813     } EGapType;
814     typedef map <size_t, EGapType> TGapTypeMap;
815     TGapTypeMap m_Map;
816     size_t m_NumUnknown;
817     size_t m_NumKnown;
818 };
819 
CGapCache(const CSeq_loc & loc,CBioseq_Handle bsh)820 CGapCache::CGapCache(const CSeq_loc& loc, CBioseq_Handle bsh)
821 {
822     TSeqPos start = loc.GetStart(eExtreme_Positional);
823     TSeqPos stop = loc.GetStop(eExtreme_Positional);
824     CRange<TSeqPos> range(start, stop);
825     CSeqMap_CI map_iter(bsh, SSeqMapSelector(CSeqMap::fDefaultFlags, 100), range);
826     TSeqPos pos = start;
827     while (map_iter && pos <= stop) {
828         TSeqPos map_end = map_iter.GetPosition() + map_iter.GetLength();
829         if (map_iter.GetType() == CSeqMap::eSeqGap) {
830             for (; pos < map_end && pos <= stop; pos++) {
831                 if (map_iter.IsUnknownLength()) {
832                     m_Map[pos - start] = eGapType_unknown;
833                     m_NumUnknown++;
834                 } else {
835                     m_Map[pos - start] = eGapType_known;
836                     m_NumKnown++;
837                 }
838             }
839         } else {
840             pos = map_end;
841         }
842         ++map_iter;
843     }
844 }
845 
IsGap(size_t pos)846 bool CGapCache::IsGap(size_t pos)
847 {
848     if (m_Map.find(pos) != m_Map.end()) {
849         return true;
850     } else {
851         return false;
852     }
853 }
854 
855 
IsKnownGap(size_t pos)856 bool CGapCache::IsKnownGap(size_t pos)
857 {
858     TGapTypeMap::iterator it = m_Map.find(pos);
859     if (it == m_Map.end()) {
860         return false;
861     } else if (it->second == eGapType_known) {
862         return true;
863     } else {
864         return false;
865     }
866 }
867 
868 
IsUnknownGap(size_t pos)869 bool CGapCache::IsUnknownGap(size_t pos)
870 {
871     TGapTypeMap::iterator it = m_Map.find(pos);
872     if (it == m_Map.end()) {
873         return false;
874     } else if (it->second == eGapType_unknown) {
875         return true;
876     } else {
877         return false;
878     }
879 }
880 
881 
xf_IsDeltaLitOnly(CBioseq_Handle bsh)882 static bool xf_IsDeltaLitOnly (CBioseq_Handle bsh)
883 
884 {
885     if ( bsh.IsSetInst_Ext() ) {
886         const CBioseq_Handle::TInst_Ext& ext = bsh.GetInst_Ext();
887         if ( ext.IsDelta() ) {
888             ITERATE (CDelta_ext::Tdata, it, ext.GetDelta().Get()) {
889                 if ( (*it)->IsLoc() ) {
890                     return false;
891                 }
892             }
893         }
894     }
895     return true;
896 }
897 
898 
x_CalculateLocationGaps(CBioseq_Handle bsh,const CSeq_loc & loc,vector<TSeqPos> & gap_starts)899 size_t CSingleFeatValidator::x_CalculateLocationGaps(CBioseq_Handle bsh, const CSeq_loc& loc, vector<TSeqPos>& gap_starts)
900 {
901     size_t rval = eLocationGapNoProblems;
902     if (!bsh.IsNa() || !bsh.IsSetInst_Repr() || bsh.GetInst().GetRepr() != CSeq_inst::eRepr_delta) {
903         return rval;
904     }
905     // look for features inside gaps, crossing unknown gaps, or starting or ending in gaps
906     // ignore gap features for this
907     int num_n = 0;
908     int num_real = 0;
909     int num_gap = 0;
910     int num_unknown_gap = 0;
911     bool first_in_gap = false, last_in_gap = false;
912     bool local_first_gap = false, local_last_gap = false;
913     bool startsOrEndsInGap = false;
914     bool first = true;
915 
916     for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
917         CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
918         CSeqVector vec = GetSequenceFromLoc(*this_loc, bsh.GetScope());
919         if (!vec.empty()) {
920             CBioseq_Handle ph;
921             bool match = false;
922             for (auto id_it : bsh.GetBioseqCore()->GetId()) {
923                 if (id_it->Equals(loc_it.GetSeq_id())) {
924                     match = true;
925                     break;
926                 }
927             }
928             if (match) {
929                 ph = bsh;
930             } else {
931                 ph = bsh.GetScope().GetBioseqHandle(*this_loc);
932             }
933             try {
934                 CGapCache gap_cache(*this_loc, ph);
935                 string vec_data;
936                 vec.GetSeqData(0, vec.size(), vec_data);
937 
938                 local_first_gap = false;
939                 local_last_gap = false;
940                 TSeqLength len = loc_it.GetRange().GetLength();
941                 ENa_strand strand = loc_it.GetStrand();
942 
943                 size_t pos = 0;
944                 string::iterator it = vec_data.begin();
945                 while (it != vec_data.end() && pos < len) {
946                     bool is_gap = false;
947                     bool unknown_length = false;
948                     if (strand == eNa_strand_minus) {
949                         if (gap_cache.IsKnownGap(len - pos - 1)) {
950                             is_gap = true;
951                         } else if (gap_cache.IsUnknownGap(len - pos - 1)) {
952                             is_gap = true;
953                             unknown_length = true;
954                         }
955                     } else {
956                         if (gap_cache.IsKnownGap(pos)) {
957                             is_gap = true;
958                         } else if (gap_cache.IsUnknownGap(pos)) {
959                             is_gap = true;
960                             unknown_length = true;
961                         }
962 
963                     }
964                     if (is_gap) {
965                         if (pos == 0) {
966                             local_first_gap = true;
967                         } else if (pos == len - 1) {
968                             local_last_gap = true;
969                         }
970                         if (unknown_length) {
971                             num_unknown_gap++;
972                         } else {
973                             num_gap++;
974                         }
975                     } else if (*it == 'N') {
976                         num_n++;
977                     } else {
978                         num_real++;
979                     }
980                     ++it;
981                     ++pos;
982                 }
983             } catch (CException&/* ex*/) {
984                 /*
985                 PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
986                 string("Exception while checking for intervals in gaps. EXCEPTION: ") +
987                 ex.what(), feat);
988                 */
989             }
990         }
991         if (first) {
992             first_in_gap = local_first_gap;
993             first = false;
994         }
995         last_in_gap = local_last_gap;
996         if (local_first_gap || local_last_gap) {
997             startsOrEndsInGap = true;
998         }
999     }
1000 
1001     if (num_real == 0 && num_n == 0) {
1002         TSeqPos start = loc.GetStart(eExtreme_Positional);
1003         TSeqPos stop = loc.GetStop(eExtreme_Positional);
1004         if ((start == 0 || CSeqMap_CI(bsh, SSeqMapSelector(), start - 1).GetType() != CSeqMap::eSeqGap)
1005             && (stop == bsh.GetBioseqLength() - 1 || CSeqMap_CI(bsh, SSeqMapSelector(), stop + 1).GetType() != CSeqMap::eSeqGap)) {
1006             rval |= eLocationGapFeatureMatchesGap;
1007         }
1008     }
1009 
1010 
1011     if (num_gap == 0 && num_unknown_gap == 0 && num_n == 0) {
1012         // ignore features that do not cover any gap characters
1013     } else if (first_in_gap || last_in_gap) {
1014         if (num_real > 0) {
1015             TSeqPos gap_start = x_FindStartOfGap(bsh,
1016                 first_in_gap ? loc.GetStart(eExtreme_Biological)
1017                 : loc.GetStop(eExtreme_Biological), &(bsh.GetScope()));
1018             gap_starts.push_back(gap_start);
1019         } else {
1020             rval |= eLocationGapContainedInGap;
1021         }
1022     } else if (num_real == 0 && num_gap == 0 && num_unknown_gap == 0 && num_n >= 50) {
1023         rval |= eLocationGapContainedInGapOfNs;
1024     } else if (startsOrEndsInGap) {
1025         rval |= eLocationGapInternalIntervalEndpointInGap;
1026     } else if (num_unknown_gap > 0) {
1027         rval |= eLocationGapCrossesUnknownGap;
1028     }
1029 
1030     if (num_n > num_real && xf_IsDeltaLitOnly(bsh)) {
1031         rval |= eLocationGapMostlyNs;
1032     }
1033 
1034     return rval;
1035 }
1036 
1037 
x_FindStartOfGap(CBioseq_Handle bsh,int pos,CScope * scope)1038 size_t CSingleFeatValidator::x_FindStartOfGap(CBioseq_Handle bsh, int pos, CScope* scope)
1039 {
1040     if (!bsh || !bsh.IsNa() || !bsh.IsSetInst_Repr()
1041         || bsh.GetInst_Repr() != CSeq_inst::eRepr_delta
1042         || !bsh.GetInst().IsSetExt()
1043         || !bsh.GetInst().GetExt().IsDelta()) {
1044         return bsh.GetInst_Length();
1045     }
1046     size_t offset = 0;
1047 
1048     ITERATE(CSeq_inst::TExt::TDelta::Tdata, it, bsh.GetInst_Ext().GetDelta().Get()) {
1049         unsigned int len = 0;
1050         if ((*it)->IsLiteral()) {
1051             len = (*it)->GetLiteral().GetLength();
1052         } else if ((*it)->IsLoc()) {
1053             len = sequence::GetLength((*it)->GetLoc(), scope);
1054         }
1055         if ((unsigned int)pos >= offset && (unsigned int)pos < offset + len) {
1056             return offset;
1057         } else {
1058             offset += len;
1059         }
1060     }
1061     return offset;
1062 }
1063 
1064 
x_IsMostlyNs(const CSeq_loc & loc,CBioseq_Handle bsh)1065 bool CSingleFeatValidator::x_IsMostlyNs(const CSeq_loc& loc, CBioseq_Handle bsh)
1066 {
1067     if (!bsh.IsNa() || !bsh.IsSetInst_Repr() || bsh.GetInst_Repr() != CSeq_inst::eRepr_raw) {
1068         return false;
1069     }
1070     int num_n = 0;
1071     int real_bases = 0;
1072 
1073     for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
1074         CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
1075         CSeqVector vec = GetSequenceFromLoc(*this_loc, bsh.GetScope());
1076         if (!vec.empty()) {
1077             CBioseq_Handle ph;
1078             bool match = false;
1079             for (auto id_it : bsh.GetBioseqCore()->GetId()) {
1080                 if (id_it->Equals(loc_it.GetSeq_id())) {
1081                     match = true;
1082                     break;
1083                 }
1084             }
1085             if (match) {
1086                 ph = bsh;
1087             } else {
1088                 ph = bsh.GetScope().GetBioseqHandle(*this_loc);
1089             }
1090             TSeqPos offset = this_loc->GetStart(eExtreme_Positional);
1091             string vec_data;
1092             try {
1093                 vec.GetSeqData(0, vec.size(), vec_data);
1094 
1095                 int pos = 0;
1096                 string::iterator it = vec_data.begin();
1097                 while (it != vec_data.end()) {
1098                     if (*it == 'N') {
1099                         CSeqMap_CI map_iter(ph, SSeqMapSelector(), offset + pos);
1100                         if (map_iter.GetType() == CSeqMap::eSeqGap) {
1101                         } else {
1102                             num_n++;
1103                         }
1104                     } else {
1105                         if ((unsigned)(*it + 1) <= 256 && isalpha(*it)) {
1106                             real_bases++;
1107                         }
1108                     }
1109                     ++it;
1110                     ++pos;
1111                 }
1112             } catch (CException) {
1113             } catch (std::exception) {
1114             }
1115         }
1116     }
1117 
1118     return (num_n > real_bases);
1119 }
1120 
1121 
x_GetFeatureProduct(bool look_far,bool & is_far)1122 CBioseq_Handle CSingleFeatValidator::x_GetFeatureProduct(bool look_far, bool& is_far)
1123 {
1124     CBioseq_Handle prot_handle;
1125     is_far = false;
1126     if (!m_Feat.IsSetProduct()) {
1127         return prot_handle;
1128     }
1129     const CSeq_id* protid = NULL;
1130     try {
1131         protid = &sequence::GetId(m_Feat.GetProduct(), &m_Scope);
1132     } catch (CException&) {}
1133 
1134     if (!protid) {
1135         return prot_handle;
1136     }
1137 
1138     // try "local" scope
1139     prot_handle = m_Scope.GetBioseqHandleFromTSE(*protid, m_LocationBioseq.GetTSE_Handle());
1140     if (!prot_handle) {
1141         prot_handle = m_Scope.GetBioseqHandleFromTSE(*protid, m_Imp.GetTSE_Handle());
1142     }
1143     if (!prot_handle  &&  look_far) {
1144         prot_handle = m_Scope.GetBioseqHandle(*protid);
1145         if (prot_handle) {
1146             is_far = true;
1147         }
1148     }
1149 
1150     return prot_handle;
1151 }
1152 
1153 
x_GetFeatureProduct(bool & is_far)1154 CBioseq_Handle CSingleFeatValidator::x_GetFeatureProduct(bool& is_far)
1155 {
1156     bool look_far = false;
1157 
1158     if (m_Feat.IsSetData()) {
1159         if (m_Feat.GetData().IsCdregion()) {
1160             look_far = m_Imp.IsFarFetchCDSproducts();
1161         } else if (m_Feat.GetData().IsRna()) {
1162             look_far = m_Imp.IsFarFetchMRNAproducts();
1163         } else {
1164             look_far = m_Imp.IsRemoteFetch();
1165         }
1166     }
1167 
1168     return x_GetFeatureProduct(look_far, is_far);
1169 }
1170 
1171 
x_ValidateExcept()1172 void CSingleFeatValidator::x_ValidateExcept()
1173 {
1174     if (m_Feat.IsSetExcept_text() && !NStr::IsBlank(m_Feat.GetExcept_text()) &&
1175         (!m_Feat.IsSetExcept() || !m_Feat.GetExcept())) {
1176         PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingExceptionFlag,
1177             "Exception text is present, but exception flag is not set");
1178     } else if (m_Feat.IsSetExcept() && m_Feat.GetExcept() &&
1179         (!m_Feat.IsSetExcept_text() || NStr::IsBlank(m_Feat.GetExcept_text()))) {
1180         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionMissingText,
1181             "Exception flag is set, but exception text is empty");
1182     }
1183     if (m_Feat.IsSetExcept_text() && !m_Feat.GetExcept_text().empty()) {
1184         x_ValidateExceptText(m_Feat.GetExcept_text());
1185     }
1186 }
1187 
1188 
x_ValidateExceptText(const string & text)1189 void CSingleFeatValidator::x_ValidateExceptText(const string& text)
1190 {
1191     if (text.empty()) return;
1192 
1193     EDiagSev sev = eDiag_Error;
1194     bool found = false;
1195 
1196     string str;
1197 
1198     bool reasons_in_cit = false;
1199     bool annotated_by_transcript_or_proteomic = false;
1200     bool redundant_with_comment = false;
1201     bool refseq_except = false;
1202     vector<string> exceptions;
1203     NStr::Split(text, ",", exceptions, 0);
1204     ITERATE(vector<string>, it, exceptions) {
1205         found = false;
1206         str = NStr::TruncateSpaces(*it);
1207         if (NStr::IsBlank(*it)) {
1208             continue;
1209         }
1210         found = CSeq_feat::IsExceptionTextInLegalList(str, false);
1211 
1212         if (found) {
1213             if (NStr::EqualNocase(str, "reasons given in citation")) {
1214                 reasons_in_cit = true;
1215             } else if (NStr::EqualNocase(str, "annotated by transcript or proteomic data")) {
1216                 annotated_by_transcript_or_proteomic = true;
1217             }
1218         }
1219         if (!found) {
1220             if (m_LocationBioseq) {
1221                 bool check_refseq = false;
1222                 if (m_Imp.IsRefSeqConventions()) {
1223                     check_refseq = true;
1224                 } else if (GetGenProdSetParent(m_LocationBioseq)) {
1225                     check_refseq = true;
1226                 } else {
1227                     FOR_EACH_SEQID_ON_BIOSEQ(id_it, *(m_LocationBioseq.GetCompleteBioseq())) {
1228                         if ((*id_it)->IsOther()) {
1229                             check_refseq = true;
1230                             break;
1231                         }
1232                     }
1233                 }
1234 
1235                 if (check_refseq) {
1236                     if (CSeq_feat::IsExceptionTextRefSeqOnly(str)) {
1237                         found = true;
1238                         refseq_except = true;
1239                     }
1240                 }
1241             }
1242         }
1243         if (!found) {
1244             // lower to warning for genomic refseq
1245             const CSeq_id *id = m_Feat.GetLocation().GetId();
1246             if ((id != NULL && IsNTNCNWACAccession(*id)) ||
1247                 (m_LocationBioseq && IsNTNCNWACAccession(*(m_LocationBioseq.GetCompleteBioseq())))) {
1248                 sev = eDiag_Warning;
1249             }
1250             PostErr(sev, eErr_SEQ_FEAT_ExceptionProblem,
1251                 str + " is not a legal exception explanation");
1252         }
1253         if (m_Feat.IsSetComment() && NStr::Find(m_Feat.GetComment(), str) != string::npos) {
1254             if (!NStr::EqualNocase(str, "ribosomal slippage") &&
1255                 !NStr::EqualNocase(str, "trans-splicing") &&
1256                 !NStr::EqualNocase(str, "RNA editing") &&
1257                 !NStr::EqualNocase(str, "artificial location")) {
1258                 redundant_with_comment = true;
1259             } else if (NStr::EqualNocase(m_Feat.GetComment(), str)) {
1260                 redundant_with_comment = true;
1261             }
1262         }
1263     }
1264     if (redundant_with_comment) {
1265         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionProblem,
1266             "Exception explanation text is also found in feature comment");
1267     }
1268     if (refseq_except) {
1269         bool found_just_the_exception = CSeq_feat::IsExceptionTextRefSeqOnly(str);
1270 
1271         if (!found_just_the_exception) {
1272             PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionProblem,
1273                 "Genome processing exception should not be combined with other explanations");
1274         }
1275     }
1276 
1277     if (reasons_in_cit && !m_Feat.IsSetCit()) {
1278         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionProblem,
1279             "Reasons given in citation exception does not have the required citation");
1280     }
1281     if (annotated_by_transcript_or_proteomic) {
1282         bool has_inference = false;
1283         FOR_EACH_GBQUAL_ON_SEQFEAT(it, m_Feat) {
1284             if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "inference")) {
1285                 has_inference = true;
1286                 break;
1287             }
1288         }
1289         if (!has_inference) {
1290             PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionProblem,
1291                 "Annotated by transcript or proteomic data exception does not have the required inference qualifier");
1292         }
1293     }
1294 }
1295 
1296 
1297 const string kOrigProteinId = "orig_protein_id";
1298 
x_ReportOrigProteinId()1299 bool CSingleFeatValidator::x_ReportOrigProteinId()
1300 {
1301     if (!m_Feat.GetData().IsRna()) {
1302         return true;
1303     } else {
1304         return false;
1305     }
1306 }
1307 
1308 
x_ValidateGbquals()1309 void CSingleFeatValidator::x_ValidateGbquals()
1310 {
1311     if (!m_Feat.IsSetQual()) {
1312         return;
1313     }
1314     string key;
1315     bool is_imp = false;
1316     CSeqFeatData::ESubtype ftype = m_Feat.GetData().GetSubtype();
1317 
1318     if (m_Feat.IsSetData() && m_Feat.GetData().IsImp()) {
1319         is_imp = true;
1320         key = m_Feat.GetData().GetImp().GetKey();
1321         if (ftype == CSeqFeatData::eSubtype_imp && NStr::EqualNocase (key, "gene")) {
1322             ftype = CSeqFeatData::eSubtype_gene;
1323         } else if (ftype == CSeqFeatData::eSubtype_imp) {
1324             ftype = CSeqFeatData::eSubtype_bad;
1325         } else if (ftype == CSeqFeatData::eSubtype_Imp_CDS
1326                    || ftype == CSeqFeatData::eSubtype_site_ref
1327                    || ftype == CSeqFeatData::eSubtype_org) {
1328             ftype = CSeqFeatData::eSubtype_bad;
1329         }
1330     }
1331     else {
1332         key = m_Feat.GetData().GetKey();
1333         if (NStr::Equal (key, "Gene")) {
1334             key = "gene";
1335         }
1336     }
1337 
1338     for (auto gbq : m_Feat.GetQual()) {
1339         const string& qual_str = gbq->GetQual();
1340 
1341         if ( NStr::Equal (qual_str, "gsdb_id")) {
1342             continue;
1343         }
1344         auto gbqual_and_value = CSeqFeatData::GetQualifierTypeAndValue(qual_str);
1345         auto gbqual = gbqual_and_value.first;
1346         bool same_case = (gbqual == CSeqFeatData::eQual_bad) || NStr::EqualCase(gbqual_and_value.second, qual_str);
1347 
1348         if ( !same_case ) {
1349             PostErr(eDiag_Warning, eErr_SEQ_FEAT_IncorrectQualifierCapitalization,
1350                 qual_str + " is improperly capitalized");
1351         }
1352 
1353         if ( gbqual == CSeqFeatData::eQual_bad ) {
1354             if (is_imp) {
1355                 if (!gbq->IsSetQual() || NStr::IsBlank(gbq->GetQual())) {
1356                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatQual,
1357                         "NULL qualifier");
1358                 }
1359                 else {
1360                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatQual,
1361                         "Unknown qualifier " + qual_str);
1362                 }
1363             } else if (NStr::Equal(qual_str, kOrigProteinId)) {
1364                 if (x_ReportOrigProteinId()) {
1365                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownFeatureQual, "Unknown qualifier " + kOrigProteinId);
1366                 }
1367             } else {
1368                 CSeqFeatData::E_Choice chs = m_Feat.GetData().Which();
1369                 if (chs == CSeqFeatData::e_Gene) {
1370                     if (NStr::Equal(qual_str, "gen_map")
1371                         || NStr::Equal(qual_str, "cyt_map")
1372                         || NStr::Equal(qual_str, "rad_map")) {
1373                         continue;
1374                     }
1375                 } else if (chs == CSeqFeatData::e_Cdregion) {
1376                     if (NStr::Equal(qual_str, "orig_transcript_id")) {
1377                         continue;
1378                     }
1379                 } else if (chs == CSeqFeatData::e_Rna) {
1380                     if (NStr::Equal(qual_str, "orig_transcript_id")) {
1381                         continue;
1382                     }
1383                 }
1384                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownFeatureQual, "Unknown qualifier " + qual_str);
1385             }
1386         } else {
1387             if ( ftype != CSeqFeatData::eSubtype_bad && !CSeqFeatData::IsLegalQualifier(ftype, gbqual) ) {
1388                 PostErr(eDiag_Warning,
1389                     is_imp ? eErr_SEQ_FEAT_WrongQualOnImpFeat : eErr_SEQ_FEAT_WrongQualOnFeature,
1390                     "Wrong qualifier " + qual_str + " for feature " +
1391                     key);
1392             }
1393 
1394             if (gbq->IsSetVal() && !NStr::IsBlank(gbq->GetVal())) {
1395                 // validate value of gbqual
1396                 const string& val = gbq->GetVal();
1397                 switch (gbqual) {
1398 
1399                     case CSeqFeatData::eQual_rpt_type:
1400                         if (!CGb_qual::IsValidRptTypeValue(val)) {
1401                             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
1402                                 val + " is not a legal value for qualifier " + qual_str);
1403                         }
1404                         break;
1405 
1406                     case CSeqFeatData::eQual_rpt_unit:
1407                         x_ValidateRptUnitVal (val, key);
1408                         break;
1409 
1410                     case CSeqFeatData::eQual_rpt_unit_seq:
1411                         x_ValidateRptUnitSeqVal (val, key);
1412                         break;
1413 
1414                     case CSeqFeatData::eQual_rpt_unit_range:
1415                         x_ValidateRptUnitRangeVal (val);
1416                         break;
1417 
1418                     case CSeqFeatData::eQual_label:
1419                         x_ValidateLabelVal (val);
1420                         break;
1421 
1422                     case CSeqFeatData::eQual_replace:
1423                         if (is_imp) {
1424                             x_ValidateReplaceQual(key, qual_str, val);
1425                         }
1426                         break;
1427 
1428                     case CSeqFeatData::eQual_mobile_element:
1429                     case CSeqFeatData::eQual_mobile_element_type:
1430                         if (is_imp && !CGb_qual::IsLegalMobileElementValue(val)) {
1431                             PostErr(eDiag_Warning, eErr_SEQ_FEAT_MobileElementInvalidQualifier,
1432                                   val + " is not a legal value for qualifier " + qual_str);
1433                         }
1434                         break;
1435 
1436                     case CSeqFeatData::eQual_compare:
1437                         x_ValidateCompareVal (val);
1438                         break;
1439 
1440                     case CSeqFeatData::eQual_standard_name:
1441                         if (is_imp && ftype == CSeqFeatData::eSubtype_misc_feature
1442                             && NStr::EqualCase (val, "Vector Contamination")) {
1443                             PostErr (eDiag_Warning, eErr_SEQ_FEAT_VectorContamination,
1444                                      "Vector Contamination region should be trimmed from sequence");
1445                         }
1446                         break;
1447 
1448                     case CSeqFeatData::eQual_product:
1449                         if (!is_imp) {
1450                             CSeqFeatData::E_Choice chs = m_Feat.GetData().Which();
1451                             if (chs == CSeqFeatData::e_Gene) {
1452                                 PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidProductOnGene,
1453                                     "A product qualifier is not used on a gene feature");
1454                             }
1455                         }
1456                         break;
1457 
1458                     // for VR-825
1459                     case CSeqFeatData::eQual_locus_tag:
1460                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
1461                             "locus-tag values should be on genes");
1462                         break;
1463                     default:
1464                         break;
1465                 } // end of switch statement
1466             }
1467         }
1468     }
1469 }
1470 
1471 
x_ValidateRptUnitVal(const string & val,const string & key)1472 void CSingleFeatValidator::x_ValidateRptUnitVal (const string& val, const string& key)
1473 {
1474     bool /* found = false, */ multiple_rpt_unit = false;
1475     ITERATE(string, it, val) {
1476         if ( *it <= ' ' ) {
1477             /* found = true; */
1478         } else if ( *it == '('  ||  *it == ')'  ||
1479             *it == ','  ||  *it == '.'  ||
1480             isdigit((unsigned char)(*it)) ) {
1481             multiple_rpt_unit = true;
1482         }
1483     }
1484     /*
1485     if ( found ||
1486     (!multiple_rpt_unit && val.length() > 48) ) {
1487     error = true;
1488     }
1489     */
1490     if ( NStr::CompareNocase(key, "repeat_region") == 0  &&
1491          !multiple_rpt_unit ) {
1492         if (val.length() <= GetLength(m_Feat.GetLocation(), &m_Scope) ) {
1493             bool just_nuc_letters = true;
1494             static const string nuc_letters = "ACGTNacgtn";
1495             ITERATE(string, it, val) {
1496                 if ( nuc_letters.find(*it) == NPOS ) {
1497                     just_nuc_letters = false;
1498                     break;
1499                 }
1500             }
1501 
1502             if ( just_nuc_letters ) {
1503                 CSeqVector vec = GetSequenceFromFeature(m_Feat, m_Scope);
1504                 if ( !vec.empty() ) {
1505                     string vec_data;
1506                     vec.GetSeqData(0, vec.size(), vec_data);
1507                     if (NStr::FindNoCase (vec_data, val) == string::npos) {
1508                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_RepeatSeqDoNotMatch,
1509                             "repeat_region /rpt_unit and underlying "
1510                             "sequence do not match");
1511                     }
1512                 }
1513             }
1514         } else {
1515             PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidRepeatUnitLength,
1516                 "Length of rpt_unit_seq is greater than feature length");
1517         }
1518     }
1519 }
1520 
1521 
x_ValidateRptUnitSeqVal(const string & val,const string & key)1522 void CSingleFeatValidator::x_ValidateRptUnitSeqVal (const string& val, const string& key)
1523 {
1524     // do validation common to rpt_unit
1525     x_ValidateRptUnitVal (val, key);
1526 
1527     // do the validation specific to rpt_unit_seq
1528     const char *cp = val.c_str();
1529     bool badchars = false;
1530     while (*cp != 0 && !badchars) {
1531         if (*cp < ' ') {
1532             badchars = true;
1533         } else if (*cp != '(' && *cp != ')'
1534                    && !isdigit (*cp) && !isalpha (*cp)
1535                    && *cp != ',' && *cp != ';') {
1536             badchars = true;
1537         }
1538         cp++;
1539     }
1540     if (badchars) {
1541         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidRptUnitSeqCharacters,
1542             "/rpt_unit_seq has illegal characters");
1543     }
1544 }
1545 
1546 
s_RptUnitIsBaseRange(string str,TSeqPos & from,TSeqPos & to)1547 static bool s_RptUnitIsBaseRange (string str, TSeqPos& from, TSeqPos& to)
1548 
1549 {
1550     if (str.length() > 25) {
1551         return false;
1552     }
1553     SIZE_TYPE pos = NStr::Find (str, "..");
1554     if (pos == string::npos) {
1555         return false;
1556     }
1557 
1558     int tmp_from, tmp_to;
1559     try {
1560         tmp_from = NStr::StringToInt (str.substr(0, pos));
1561         from = tmp_from;
1562         tmp_to = NStr::StringToInt (str.substr (pos + 2));
1563         to = tmp_to;
1564       } catch (CException ) {
1565           return false;
1566       } catch (std::exception ) {
1567         return false;
1568     }
1569     if (tmp_from < 0 || tmp_to < 0) {
1570         return false;
1571     }
1572     return true;
1573 }
1574 
1575 
x_ValidateRptUnitRangeVal(const string & val)1576 void CSingleFeatValidator::x_ValidateRptUnitRangeVal (const string& val)
1577 {
1578     TSeqPos from = kInvalidSeqPos, to = kInvalidSeqPos;
1579     if (!s_RptUnitIsBaseRange(val, from, to)) {
1580         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidRptUnitRange,
1581                  "/rpt_unit_range is not a base range");
1582     } else {
1583         CSeq_loc::TRange range = m_Feat.GetLocation().GetTotalRange();
1584         if (from - 1 < range.GetFrom() || from - 1> range.GetTo() || to - 1 < range.GetFrom() || to - 1 > range.GetTo()) {
1585             PostErr (eDiag_Warning, eErr_SEQ_FEAT_RptUnitRangeProblem,
1586                      "/rpt_unit_range is not within sequence length");
1587         } else {
1588             bool nulls_between = false;
1589             for ( CTypeConstIterator<CSeq_loc> lit = ConstBegin(m_Feat.GetLocation()); lit; ++lit ) {
1590                 if ( lit->Which() == CSeq_loc::e_Null ) {
1591                     nulls_between = true;
1592                 }
1593             }
1594             if (nulls_between) {
1595                 bool in_range = false;
1596                 for ( CSeq_loc_CI it(m_Feat.GetLocation()); it; ++it ) {
1597                     range = it.GetEmbeddingSeq_loc().GetTotalRange();
1598                     if (from - 1 < range.GetFrom() || from - 1> range.GetTo() || to - 1 < range.GetFrom() || to - 1 > range.GetTo()) {
1599                     } else {
1600                         in_range = true;
1601                     }
1602                 }
1603                 if (! in_range) {
1604                     PostErr (eDiag_Warning, eErr_SEQ_FEAT_RptUnitRangeProblem,
1605                              "/rpt_unit_range is not within ordered intervals");
1606                 }
1607             }
1608         }
1609     }
1610 }
1611 
1612 
x_ValidateLabelVal(const string & val)1613 void CSingleFeatValidator::x_ValidateLabelVal (const string& val)
1614 {
1615     bool only_digits = true,
1616         has_spaces = false;
1617 
1618     ITERATE(string, it, val) {
1619         if ( isspace((unsigned char)(*it)) ) {
1620             has_spaces = true;
1621         }
1622         if ( !isdigit((unsigned char)(*it)) ) {
1623             only_digits = false;
1624         }
1625     }
1626     if (only_digits  ||  has_spaces) {
1627         PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier label");
1628     }
1629 }
1630 
1631 
x_ValidateCompareVal(const string & val)1632 void CSingleFeatValidator::x_ValidateCompareVal (const string& val)
1633 {
1634     if (!NStr::StartsWith (val, "(")) {
1635         EAccessionFormatError valid_accession = ValidateAccessionString (val, true);
1636         if (valid_accession == eAccessionFormat_missing_version) {
1637             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidCompareMissingVersion,
1638                      val + " accession missing version for qualifier compare");
1639         } else if (valid_accession == eAccessionFormat_bad_version) {
1640             PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
1641                      val + " accession has bad version for qualifier compare");
1642         } else if (valid_accession != eAccessionFormat_valid) {
1643             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidCompareBadAccession,
1644                      val + " is not a legal accession for qualifier compare");
1645         } else if (m_Imp.IsINSDInSep() && NStr::Find (val, "_") != string::npos) {
1646             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidCompareRefSeqAccession,
1647                      "RefSeq accession " + val + " cannot be used for qualifier compare");
1648         }
1649     }
1650 }
1651 
1652 
s_StringConsistsOf(string str,string consist)1653 static bool s_StringConsistsOf (string str, string consist)
1654 {
1655     const char *src = str.c_str();
1656     const char *find = consist.c_str();
1657     bool rval = true;
1658 
1659     while (*src != 0 && rval) {
1660         if (strchr (find, *src) == NULL) {
1661             rval = false;
1662         }
1663         src++;
1664     }
1665     return rval;
1666 }
1667 
1668 
x_ValidateReplaceQual(const string & key,const string & qual_str,const string & val)1669 void CSingleFeatValidator::x_ValidateReplaceQual(const string& key, const string& qual_str, const string& val)
1670 {
1671     if (m_LocationBioseq) {
1672         if (m_LocationBioseq.IsNa()) {
1673             if (NStr::Equal(key, "variation")) {
1674                 if (!s_StringConsistsOf (val, "acgtACGT")) {
1675                     PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidVariationReplace,
1676                                 val + " is not a legal value for qualifier " + qual_str
1677                                 + " - should only be composed of acgt unambiguous nucleotide bases");
1678                 }
1679             } else if (!s_StringConsistsOf (val, "acgtmrwsykvhdbn")) {
1680                     PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidReplace,
1681                             val + " is not a legal value for qualifier " + qual_str
1682                             + " - should only be composed of acgtmrwsykvhdbn nucleotide bases");
1683             }
1684         } else if (m_LocationBioseq.IsAa()) {
1685             if (!s_StringConsistsOf (val, "acdefghiklmnpqrstuvwy*")) {
1686                 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidReplace,
1687                             val + " is not a legal value for qualifier " + qual_str
1688                             + " - should only be composed of acdefghiklmnpqrstuvwy* amino acids");
1689             }
1690         }
1691 
1692         // if no point in location with fuzz, info if text matches sequence
1693         bool has_fuzz = false;
1694         for( objects::CSeq_loc_CI it(m_Feat.GetLocation()); it && !has_fuzz; ++it) {
1695             if (it.IsPoint() && (it.GetFuzzFrom() != NULL || it.GetFuzzTo() != NULL)) {
1696                 has_fuzz = true;
1697             }
1698         }
1699         if (!has_fuzz && val.length() == GetLength (m_Feat.GetLocation(), &m_Scope)) {
1700             try {
1701                 CSeqVector nuc_vec(m_Feat.GetLocation(), m_Scope, CBioseq_Handle::eCoding_Iupac);
1702                 string bases;
1703                 nuc_vec.GetSeqData(0, nuc_vec.size(), bases);
1704                 if (NStr::EqualNocase(val, bases)) {
1705                     PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidMatchingReplace,
1706                                 "/replace already matches underlying sequence (" + val + ")");
1707                 }
1708             } catch (CException ) {
1709             } catch (std::exception ) {
1710             }
1711         }
1712     }
1713 }
1714 
1715 
ValidateCharactersInField(string value,string field_name)1716 void CSingleFeatValidator::ValidateCharactersInField (string value, string field_name)
1717 {
1718     if (HasBadCharacter (value)) {
1719         PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadInternalCharacter,
1720                  field_name + " contains undesired character");
1721     }
1722     if (EndsWithBadCharacter (value)) {
1723         PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadTrailingCharacter,
1724                  field_name + " ends with undesired character");
1725     }
1726     if (NStr::EndsWith (value, "-")) {
1727         if (!m_Imp.IsGpipe() || !m_Imp.BioSourceKind().IsOrganismEukaryote() )
1728             PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadTrailingHyphen,
1729                 field_name + " ends with hyphen");
1730     }
1731 }
1732 
1733 
ValidateSplice(bool gene_pseudo,bool check_all)1734 void CSingleFeatValidator::ValidateSplice(bool gene_pseudo, bool check_all)
1735 {
1736     if (!m_LocationBioseq) {
1737         return;
1738     }
1739 
1740     CSpliceProblems splice_problems;
1741     splice_problems.CalculateSpliceProblems(m_Feat, check_all, gene_pseudo, m_LocationBioseq);
1742 
1743     if (splice_problems.AreErrorsUnexpected()) {
1744         string label = GetBioseqIdLabel(*(m_LocationBioseq.GetCompleteBioseq()), true);
1745         x_ReportSpliceProblems(splice_problems, label);
1746     }
1747 
1748     if (splice_problems.IsExceptionUnnecessary()) {
1749         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
1750             "feature has exception but passes splice site test");
1751     }
1752 }
1753 
1754 
x_SeverityForConsensusSplice(void)1755 EDiagSev CSingleFeatValidator::x_SeverityForConsensusSplice(void)
1756 {
1757     EDiagSev sev = eDiag_Warning;
1758     if (m_Imp.IsGpipe() && m_Imp.IsGenomic()) {
1759         sev = eDiag_Info;
1760     } else if ((m_Imp.IsGPS() || m_Imp.IsRefSeq()) && !m_Imp.ReportSpliceAsError()) {
1761         sev = eDiag_Warning;
1762     }
1763     return sev;
1764 }
1765 
1766 
x_ReportDonorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem & problem,const string & label)1767 void CSingleFeatValidator::x_ReportDonorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem& problem, const string& label)
1768 {
1769     if (problem.first == CSpliceProblems::eSpliceSiteRead_BadSeq) {
1770         PostErr(eDiag_Warning, eErr_SEQ_FEAT_NotSpliceConsensusDonor,
1771             "Bad sequence at splice donor after exon ending at position "
1772             + NStr::IntToString(problem.second + 1) + " of " + label);
1773     } else if (problem.first == CSpliceProblems::eSpliceSiteRead_WrongNT) {
1774         PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusDonor,
1775             "Splice donor consensus (GT) not found after exon ending at position "
1776             + NStr::IntToString(problem.second + 1) + " of " + label);
1777     }
1778 
1779 }
1780 
1781 
x_ReportAcceptorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem & problem,const string & label)1782 void CSingleFeatValidator::x_ReportAcceptorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem& problem, const string& label)
1783 {
1784     if (problem.first == CSpliceProblems::eSpliceSiteRead_BadSeq) {
1785         PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusAcceptor,
1786             "Bad sequence at splice acceptor before exon starting at position "
1787             + NStr::IntToString(problem.second + 1) + " of " + label);
1788     } else if (problem.first == CSpliceProblems::eSpliceSiteRead_WrongNT) {
1789         PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusAcceptor,
1790             "Splice acceptor consensus (AG) not found before exon starting at position "
1791             + NStr::IntToString(problem.second + 1) + " of " + label);
1792     }
1793 
1794 }
1795 
1796 
x_ReportSpliceProblems(const CSpliceProblems & problems,const string & label)1797 void CSingleFeatValidator::x_ReportSpliceProblems
1798 (const CSpliceProblems& problems, const string& label)
1799 {
1800     const CSpliceProblems::TSpliceProblemList& donor_problems = problems.GetDonorProblems();
1801     for (auto it = donor_problems.begin(); it != donor_problems.end(); it++) {
1802         x_ReportDonorSpliceSiteReadErrors(*it, label);
1803     }
1804     const CSpliceProblems::TSpliceProblemList& acceptor_problems = problems.GetAcceptorProblems();
1805     for (auto it = acceptor_problems.begin(); it != acceptor_problems.end(); it++) {
1806         x_ReportAcceptorSpliceSiteReadErrors(*it, label);
1807     }
1808 }
1809 
1810 
x_BioseqHasNmAccession(CBioseq_Handle bsh)1811 bool CSingleFeatValidator::x_BioseqHasNmAccession (CBioseq_Handle bsh)
1812 {
1813     if (bsh) {
1814         FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
1815             if ((*it)->IsOther() && (*it)->GetOther().IsSetAccession()
1816                 && NStr::StartsWith ((*it)->GetOther().GetAccession(), "NM_")) {
1817                 return true;
1818             }
1819         }
1820     }
1821     return false;
1822 }
1823 
1824 
x_ValidateNonImpFeat()1825 void CSingleFeatValidator::x_ValidateNonImpFeat ()
1826 {
1827     if (m_Feat.GetData().IsImp()) {
1828         return;
1829     }
1830     string key = m_Feat.GetData().GetKey();
1831 
1832     CSeqFeatData::ESubtype subtype = m_Feat.GetData().GetSubtype();
1833 
1834     // look for mandatory qualifiers
1835     EDiagSev sev = eDiag_Warning;
1836 
1837     for (auto required : CSeqFeatData::GetMandatoryQualifiers(subtype))
1838     {
1839         bool found = false;
1840         if (m_Feat.IsSetQual()) {
1841             for (auto qual : m_Feat.GetQual()) {
1842                 if (qual->IsSetQual() && CSeqFeatData::GetQualifierType(qual->GetQual()) == required) {
1843                     found = true;
1844                     break;
1845                 }
1846             }
1847         }
1848 
1849         if (!found) {
1850             if (required == CSeqFeatData::eQual_citation) {
1851                 if (m_Feat.IsSetCit()) {
1852                     found = true;
1853                 } else if (m_Feat.IsSetComment() && NStr::EqualNocase (key, "conflict")) {
1854                     // RefSeq allows conflict with accession in comment instead of sfp->cit
1855                     FOR_EACH_SEQID_ON_BIOSEQ (it, *(m_LocationBioseq.GetCompleteBioseq())) {
1856                         if ((*it)->IsOther()) {
1857                             found = true;
1858                             break;
1859                         }
1860                     }
1861                 }
1862             }
1863         }
1864         if (!found && (NStr::EqualNocase (key, "conflict") || NStr::EqualNocase (key, "old_sequence"))) {
1865             if (m_Feat.IsSetQual()) {
1866                 for (auto qual : m_Feat.GetQual()) {
1867                     if (qual->IsSetQual() && NStr::EqualNocase(qual->GetQual(), "compare")
1868                         && qual->IsSetVal() && !NStr::IsBlank(qual->GetVal())) {
1869                         found = true;
1870                         break;
1871                     }
1872                 }
1873             }
1874         }
1875         if (!found && required == CSeqFeatData::eQual_ncRNA_class) {
1876             sev = eDiag_Error;
1877             if (m_Feat.GetData().IsRna() && m_Feat.GetData().GetRna().IsSetExt()
1878                 && m_Feat.GetData().GetRna().GetExt().IsGen()
1879                 && m_Feat.GetData().GetRna().GetExt().GetGen().IsSetClass()
1880                 && !NStr::IsBlank(m_Feat.GetData().GetRna().GetExt().GetGen().GetClass())) {
1881                 found = true;
1882             }
1883         }
1884 
1885         if (!found) {
1886             PostErr(sev, eErr_SEQ_FEAT_MissingQualOnFeature,
1887                 "Missing qualifier " + CSeqFeatData::GetQualifierAsString(required) +
1888                 " for feature " + key);
1889         }
1890     }
1891 }
1892 
1893 
s_LocationStrandsIncompatible(const CSeq_loc & loc1,const CSeq_loc & loc2,CScope * scope)1894 static bool s_LocationStrandsIncompatible (const CSeq_loc& loc1, const CSeq_loc& loc2, CScope * scope)
1895 {
1896     ENa_strand strand1 = loc1.GetStrand();
1897     ENa_strand strand2 = loc2.GetStrand();
1898 
1899     if (strand1 == strand2) {
1900         return false;
1901     }
1902     if ((strand1 == eNa_strand_unknown || strand1 == eNa_strand_plus) &&
1903         (strand2 == eNa_strand_unknown || strand2 == eNa_strand_plus)) {
1904             return false;
1905     }
1906     if (strand1 == eNa_strand_other) {
1907         ECompare comp = Compare(loc1, loc2, scope, fCompareOverlapping);
1908         if (comp == eContains) {
1909             return false;
1910         }
1911     } else if (strand2 == eNa_strand_other) {
1912         ECompare comp = Compare(loc1, loc2, scope, fCompareOverlapping);
1913         if (comp == eContained) {
1914             return false;
1915         }
1916     }
1917 
1918     return true;
1919 }
1920 
1921 
x_ValidateGeneFeaturePair(const CSeq_feat & gene)1922 void CSingleFeatValidator::x_ValidateGeneFeaturePair(const CSeq_feat& gene)
1923 {
1924     bool bad_strand = s_LocationStrandsIncompatible(gene.GetLocation(), m_Feat.GetLocation(), &m_Scope);
1925     if (bad_strand) {
1926         PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneXrefStrandProblem,
1927                 "Gene cross-reference is not on expected strand");
1928     }
1929 
1930 }
1931 
1932 
s_GeneRefsAreEquivalent(const CGene_ref & g1,const CGene_ref & g2,string & label)1933 bool CSingleFeatValidator::s_GeneRefsAreEquivalent (const CGene_ref& g1, const CGene_ref& g2, string& label)
1934 {
1935     bool equivalent = false;
1936     if (g1.IsSetLocus_tag()
1937         && g2.IsSetLocus_tag()) {
1938         if (NStr::EqualNocase(g1.GetLocus_tag(),
1939                               g2.GetLocus_tag())) {
1940             label = g1.GetLocus_tag();
1941             equivalent = true;
1942         }
1943     } else if (g1.IsSetLocus()
1944                && g2.IsSetLocus()) {
1945         if (NStr::EqualNocase(g1.GetLocus(),
1946                               g2.GetLocus())) {
1947             label = g1.GetLocus();
1948             equivalent = true;
1949         }
1950     } else if (g1.IsSetSyn()
1951                && g2.IsSetSyn()) {
1952         if (NStr::EqualNocase (g1.GetSyn().front(),
1953                                g2.GetSyn().front())) {
1954             label = g1.GetSyn().front();
1955             equivalent = true;
1956         }
1957     }
1958     return equivalent;
1959 }
1960 
1961 
1962 // Check for redundant gene Xref
1963 // Do not call if feat is gene
x_ValidateGeneXRef()1964 void CSingleFeatValidator::x_ValidateGeneXRef()
1965 {
1966     if (m_Feat.IsSetData() && m_Feat.GetData().IsGene()) {
1967         return;
1968     }
1969     auto tse = m_Imp.GetTSE_Handle();
1970     if (!tse) {
1971         return;
1972     }
1973 
1974     // first, look for gene by feature id xref
1975     bool has_gene_id_xref = false;
1976     if (m_Feat.IsSetXref()) {
1977         ITERATE(CSeq_feat::TXref, xref, m_Feat.GetXref()) {
1978             if ((*xref)->IsSetId() && (*xref)->GetId().IsLocal()) {
1979                 CTSE_Handle::TSeq_feat_Handles gene_feats =
1980                     tse.GetFeaturesWithId(CSeqFeatData::eSubtype_gene, (*xref)->GetId().GetLocal());
1981                 if (gene_feats.size() > 0) {
1982                     has_gene_id_xref = true;
1983                     ITERATE(CTSE_Handle::TSeq_feat_Handles, gene, gene_feats) {
1984                         x_ValidateGeneFeaturePair(*(gene->GetSeq_feat()));
1985                     }
1986                 }
1987             }
1988         }
1989     }
1990     if (has_gene_id_xref) {
1991         return;
1992     }
1993 
1994     // if we can't get the bioseq on which the gene is located, we can't check for
1995     // overlapping/ambiguous/redundant conditions
1996     if (!m_LocationBioseq) {
1997         return;
1998     }
1999 
2000     const CGene_ref* gene_xref = m_Feat.GetGeneXref();
2001 
2002     size_t num_genes = 0;
2003     size_t max = 0;
2004     size_t num_trans_spliced = 0;
2005     bool equivalent = false;
2006     /*
2007     CFeat_CI gene_it(bsh, CSeqFeatData::e_Gene);
2008     */
2009 
2010     //CFeat_CI gene_it(*m_Scope, feat.GetLocation(), SAnnotSelector (CSeqFeatData::e_Gene));
2011     CFeat_CI gene_it(m_LocationBioseq,
2012                      CRange<TSeqPos>(m_Feat.GetLocation().GetStart(eExtreme_Positional),
2013                                      m_Feat.GetLocation().GetStop(eExtreme_Positional)),
2014                      SAnnotSelector(CSeqFeatData::e_Gene));
2015     CFeat_CI prev_gene;
2016     string label = "?";
2017     size_t num_match_by_locus = 0;
2018     size_t num_match_by_locus_tag = 0;
2019 
2020     for ( ; gene_it; ++gene_it) {
2021         if (gene_xref && gene_xref->IsSetLocus() &&
2022             gene_it->GetData().GetGene().IsSetLocus() &&
2023             NStr::Equal(gene_xref->GetLocus(), gene_it->GetData().GetGene().GetLocus())) {
2024             num_match_by_locus++;
2025             x_ValidateGeneFeaturePair(*(gene_it->GetSeq_feat()));
2026         }
2027         if (gene_xref && gene_xref->IsSetLocus_tag() &&
2028             gene_it->GetData().GetGene().IsSetLocus_tag() &&
2029             NStr::Equal(gene_xref->GetLocus_tag(), gene_it->GetData().GetGene().GetLocus_tag())) {
2030             num_match_by_locus_tag++;
2031             x_ValidateGeneFeaturePair(*(gene_it->GetSeq_feat()));
2032             if ((!gene_xref->IsSetLocus() || NStr::IsBlank(gene_xref->GetLocus())) &&
2033                 gene_it->GetData().GetGene().IsSetLocus() &&
2034                 !NStr::IsBlank(gene_it->GetData().GetGene().GetLocus())) {
2035                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneXrefWithoutLocus,
2036                     "Feature has Gene Xref with locus_tag but no locus, gene with locus_tag and locus exists");
2037             }
2038         }
2039 
2040         if (TestForOverlapEx (gene_it->GetLocation(), m_Feat.GetLocation(),
2041           gene_it->GetLocation().IsInt() ? eOverlap_Contained : eOverlap_Subset, &m_Scope) >= 0) {
2042             size_t len = GetLength(gene_it->GetLocation(), &m_Scope);
2043             if (len < max || num_genes == 0) {
2044                 num_genes = 1;
2045                 max = len;
2046                 num_trans_spliced = 0;
2047                 if (gene_it->IsSetExcept() && gene_it->IsSetExcept_text() &&
2048                     NStr::FindNoCase (gene_it->GetExcept_text(), "trans-splicing") != string::npos) {
2049                     num_trans_spliced++;
2050                 }
2051                 equivalent = false;
2052                 prev_gene = gene_it;
2053             } else if (len == max) {
2054                 equivalent |= s_GeneRefsAreEquivalent(gene_it->GetData().GetGene(), prev_gene->GetData().GetGene(), label);
2055                 num_genes++;
2056                 if (gene_it->IsSetExcept() && gene_it->IsSetExcept_text() &&
2057                     NStr::FindNoCase (gene_it->GetExcept_text(), "trans-splicing") != string::npos) {
2058                     num_trans_spliced++;
2059                 }
2060             }
2061         }
2062     }
2063 
2064     if ( gene_xref == 0) {
2065         // if there is no gene xref, then there should be 0 or 1 overlapping genes
2066         // so that mapping by overlap is unambiguous
2067         if (num_genes > 1 &&
2068             m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_repeat_region &&
2069             m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_mobile_element) {
2070             if (m_Imp.IsSmallGenomeSet() && num_genes == num_trans_spliced) {
2071                 /* suppress for trans-spliced genes on small genome set */
2072             } else if (equivalent) {
2073                 PostErr (eDiag_Warning, eErr_SEQ_FEAT_GeneXrefNeeded,
2074                          "Feature overlapped by "
2075                          + NStr::SizetToString(num_genes)
2076                          + " identical-length equivalent genes but has no cross-reference");
2077             } else {
2078                 PostErr (eDiag_Warning, eErr_SEQ_FEAT_MissingGeneXref,
2079                          "Feature overlapped by "
2080                          + NStr::SizetToString(num_genes)
2081                          + " identical-length genes but has no cross-reference");
2082             }
2083         } else if (num_genes == 1
2084                    && prev_gene->GetData().GetGene().IsSetAllele()
2085                    && !NStr::IsBlank(prev_gene->GetData().GetGene().GetAllele())) {
2086             const string& allele = prev_gene->GetData().GetGene().GetAllele();
2087             // overlapping gene should not conflict with allele qualifier
2088             FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, m_Feat) {
2089                 const CGb_qual& qual = **qual_iter;
2090                 if ( qual.IsSetQual()  &&
2091                      NStr::Compare(qual.GetQual(), "allele") == 0 ) {
2092                     if ( qual.CanGetVal()  &&
2093                          NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
2094                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidAlleleDuplicates,
2095                             "Redundant allele qualifier (" + allele +
2096                             ") on gene and feature");
2097                     } else if (m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation) {
2098                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_MismatchedAllele,
2099                             "Mismatched allele qualifier on gene (" + allele +
2100                             ") and feature (" + qual.GetVal() +")");
2101                     }
2102                 }
2103             }
2104         }
2105     } else if ( !gene_xref->IsSuppressed() ) {
2106         // we are counting features with gene xrefs
2107         m_Imp.IncrementGeneXrefCount();
2108 
2109         // make sure overlapping gene and gene xref do not conflict
2110         if (gene_xref->IsSetAllele() && !NStr::IsBlank(gene_xref->GetAllele())) {
2111             const string& allele = gene_xref->GetAllele();
2112 
2113             FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, m_Feat) {
2114                 const CGb_qual& qual = **qual_iter;
2115                 if ( qual.CanGetQual()  &&
2116                      NStr::Compare(qual.GetQual(), "allele") == 0 ) {
2117                     if ( qual.CanGetVal()  &&
2118                          NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
2119                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidAlleleDuplicates,
2120                             "Redundant allele qualifier (" + allele +
2121                             ") on gene and feature");
2122                     } else if (m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation) {
2123                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_MismatchedAllele,
2124                             "Mismatched allele qualifier on gene (" + allele +
2125                             ") and feature (" + qual.GetVal() +")");
2126                     }
2127                 }
2128             }
2129         }
2130 
2131         if (num_match_by_locus == 0 && num_match_by_locus_tag == 0) {
2132             // find gene on bioseq to match genexref
2133             if ((gene_xref->IsSetLocus_tag() &&
2134                 !NStr::IsBlank(gene_xref->GetLocus_tag())) ||
2135                 (gene_xref->IsSetLocus() &&
2136                     !NStr::IsBlank(gene_xref->GetLocus()))) {
2137                 CConstRef<CSeq_feat> gene = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
2138                 if (!gene && m_LocationBioseq && m_LocationBioseq.IsAa()) {
2139                     const CSeq_feat* cds = GetCDSForProduct(m_LocationBioseq);
2140                     if (cds != 0) {
2141                         if (cds->IsSetLocation()) {
2142                             const CSeq_loc& loc = cds->GetLocation();
2143                             const CSeq_id* id = loc.GetId();
2144                             if (id != NULL) {
2145                                 CBioseq_Handle nbsh = m_LocationBioseq.GetScope().GetBioseqHandle(*id);
2146                                 if (nbsh) {
2147                                     gene = m_Imp.GetGeneCache().GetGeneFromCache(cds, m_Scope);
2148                                 }
2149                             }
2150                         }
2151                     }
2152                 }
2153                 string label;
2154                 if (gene && !CSingleFeatValidator::s_GeneRefsAreEquivalent(*gene_xref, gene->GetData().GetGene(), label)) {
2155                     gene.Reset(NULL);
2156                 }
2157                 if (gene_xref->IsSetLocus_tag() &&
2158                     !NStr::IsBlank(gene_xref->GetLocus_tag()) &&
2159                     !gene) {
2160                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneXrefWithoutGene,
2161                         "Feature has gene locus_tag cross-reference but no equivalent gene feature exists");
2162                 } else if (gene_xref->IsSetLocus() &&
2163                     !NStr::IsBlank(gene_xref->GetLocus()) &&
2164                     !gene) {
2165                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneXrefWithoutGene,
2166                         "Feature has gene locus cross-reference but no equivalent gene feature exists");
2167                 }
2168             }
2169         }
2170     }
2171 
2172 }
2173 
2174 
x_ValidateNonGene()2175 void CSingleFeatValidator::x_ValidateNonGene()
2176 {
2177     if (m_Feat.GetData().IsGene()) {
2178         return;
2179     }
2180     x_ValidateGeneXRef();
2181 
2182     if (m_Feat.IsSetQual()) {
2183         // check old locus tag on feature and overlapping gene
2184         for (auto it : m_Feat.GetQual()) {
2185             if (it->IsSetQual() && NStr::Equal(it->GetQual(), "old_locus_tag")
2186                 && it->IsSetVal() && !NStr::IsBlank(it->GetVal())) {
2187                 x_ValidateOldLocusTag(it->GetVal());
2188             }
2189         }
2190     }
2191 }
2192 
2193 
s_IsPseudo(const CGene_ref & ref)2194 bool CSingleFeatValidator::s_IsPseudo(const CGene_ref& ref)
2195 {
2196     if (ref.IsSetPseudo() && ref.GetPseudo()) {
2197         return true;
2198     } else {
2199         return false;
2200     }
2201 }
2202 
2203 
s_HasNamedQual(const CSeq_feat & feat,const string & qual)2204 bool s_HasNamedQual(const CSeq_feat& feat, const string& qual)
2205 {
2206     bool rval = false;
2207     if (feat.IsSetQual()) {
2208         for (auto it : feat.GetQual()) {
2209             if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), qual)) {
2210                 rval = true;
2211                 break;
2212             }
2213         }
2214     }
2215     return rval;
2216 }
2217 
2218 
s_IsPseudo(const CSeq_feat & feat)2219 bool CSingleFeatValidator::s_IsPseudo(const CSeq_feat& feat)
2220 {
2221     if (feat.IsSetPseudo() && feat.GetPseudo()) {
2222         return true;
2223     } else if (s_HasNamedQual(feat, "pseudogene")) {
2224         return true;
2225     } else if (feat.IsSetData() && feat.GetData().IsGene() &&
2226         s_IsPseudo(feat.GetData().GetGene())) {
2227         return true;
2228     } else {
2229         return false;
2230     }
2231 }
2232 
2233 
x_ValidateOldLocusTag(const string & old_locus_tag)2234 void CSingleFeatValidator::x_ValidateOldLocusTag(const string& old_locus_tag)
2235 {
2236     if (NStr::IsBlank(old_locus_tag)) {
2237         return;
2238     }
2239     bool pseudo = s_IsPseudo(m_Feat);
2240     const CGene_ref* grp = m_Feat.GetGeneXref();
2241     if ( !grp) {
2242         // check overlapping gene
2243         CConstRef<CSeq_feat> overlap = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
2244         if ( overlap ) {
2245             if (s_IsPseudo(*overlap)) {
2246                 pseudo = true;
2247             }
2248             string gene_old_locus_tag;
2249 
2250             FOR_EACH_GBQUAL_ON_SEQFEAT (it, *overlap) {
2251                 if ((*it)->IsSetQual() && NStr::Equal ((*it)->GetQual(), "old_locus_tag")
2252                     && (*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
2253                     gene_old_locus_tag = (*it)->GetVal();
2254                     break;
2255                 }
2256             }
2257             if (!NStr::IsBlank (gene_old_locus_tag)
2258                 && !NStr::EqualNocase (gene_old_locus_tag, old_locus_tag)) {
2259                 PostErr (eDiag_Warning, eErr_SEQ_FEAT_OldLocusTagMismtach,
2260                             "Old locus tag on feature (" + old_locus_tag
2261                             + ") does not match that on gene (" + gene_old_locus_tag + ")");
2262             }
2263             grp = &(overlap->GetData().GetGene());
2264         }
2265     }
2266     if (grp && s_IsPseudo(*grp)) {
2267         pseudo = true;
2268     }
2269     if (grp == 0 || ! grp->IsSetLocus_tag() || NStr::IsBlank (grp->GetLocus_tag())) {
2270         if (! pseudo) {
2271             PostErr(eDiag_Error, eErr_SEQ_FEAT_OldLocusTagWithoutLocusTag,
2272                     "old_locus_tag without inherited locus_tag");
2273         }
2274     }
2275 }
2276 
2277 
x_ValidateImpFeatLoc()2278 void CSingleFeatValidator::x_ValidateImpFeatLoc()
2279 {
2280     if (!m_Feat.GetData().IsImp()) {
2281         return;
2282     }
2283     const string& key = m_Feat.GetData().GetImp().GetKey();
2284     // validate the feature's location
2285     if ( m_Feat.GetData().GetImp().IsSetLoc() ) {
2286         const string& imp_loc = m_Feat.GetData().GetImp().GetLoc();
2287         if ( imp_loc.find("one-of") != string::npos ) {
2288             PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpFeatBadLoc,
2289                 "ImpFeat loc " + imp_loc +
2290                 " has obsolete 'one-of' text for feature " + key);
2291         } else if ( m_Feat.GetLocation().IsInt() ) {
2292             const CSeq_interval& seq_int = m_Feat.GetLocation().GetInt();
2293             string temp_loc = NStr::IntToString(seq_int.GetFrom() + 1) +
2294                 ".." + NStr::IntToString(seq_int.GetTo() + 1);
2295             if ( imp_loc != temp_loc ) {
2296                 PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpFeatBadLoc,
2297                     "ImpFeat loc " + imp_loc + " does not equal feature location " +
2298                     temp_loc + " for feature " + key);
2299             }
2300         }
2301     }
2302 
2303 }
2304 
2305 
x_ValidateImpFeatQuals()2306 void CSingleFeatValidator::x_ValidateImpFeatQuals()
2307 {
2308     if (!m_Feat.GetData().IsImp()) {
2309         return;
2310     }
2311     const string& key = m_Feat.GetData().GetImp().GetKey();
2312 
2313     // Make sure a feature has its mandatory qualifiers
2314     for (auto required : CSeqFeatData::GetMandatoryQualifiers(m_Feat.GetData().GetSubtype())) {
2315         bool found = false;
2316         if (m_Feat.IsSetQual()) {
2317             for (auto qual : m_Feat.GetQual()) {
2318                 if (qual->IsSetQual() && CSeqFeatData::GetQualifierType(qual->GetQual()) == required) {
2319                     found = true;
2320                     break;
2321                 }
2322             }
2323             if (!found && required == CSeqFeatData::eQual_citation) {
2324                 if (m_Feat.IsSetCit()) {
2325                     found = true;
2326                 }
2327                 else if (m_Feat.IsSetComment() && !NStr::IsBlank(m_Feat.GetComment())) {
2328                     // RefSeq allows conflict with accession in comment instead of sfp->cit
2329                     if (m_LocationBioseq) {
2330                         FOR_EACH_SEQID_ON_BIOSEQ(it, *(m_LocationBioseq.GetCompleteBioseq())) {
2331                             if ((*it)->IsOther()) {
2332                                 found = true;
2333                                 break;
2334                             }
2335                         }
2336                     }
2337                 }
2338                 if (!found
2339                     && (NStr::EqualNocase(key, "conflict")
2340                         || NStr::EqualNocase(key, "old_sequence"))) {
2341                     // compare qualifier can now substitute for citation qualifier for conflict and old_sequence
2342                     FOR_EACH_GBQUAL_ON_FEATURE(qual, m_Feat) {
2343                         if ((*qual)->IsSetQual() && CSeqFeatData::GetQualifierType((*qual)->GetQual()) == CSeqFeatData::eQual_compare) {
2344                             found = true;
2345                             break;
2346                         }
2347                     }
2348                 }
2349             }
2350         }
2351         if (!found) {
2352             PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingQualOnImpFeat,
2353                 "Missing qualifier " + CSeqFeatData::GetQualifierAsString(required) +
2354                 " for feature " + key);
2355         }
2356     }
2357 }
2358 
2359 
x_ValidateSeqFeatDataType()2360 void CSingleFeatValidator::x_ValidateSeqFeatDataType()
2361 {
2362     switch ( m_Feat.GetData().Which () ) {
2363     case CSeqFeatData::e_Gene:
2364     case CSeqFeatData::e_Cdregion:
2365     case CSeqFeatData::e_Prot:
2366     case CSeqFeatData::e_Rna:
2367     case CSeqFeatData::e_Pub:
2368     case CSeqFeatData::e_Imp:
2369     case CSeqFeatData::e_Biosrc:
2370     case CSeqFeatData::e_Org:
2371     case CSeqFeatData::e_Region:
2372     case CSeqFeatData::e_Seq:
2373     case CSeqFeatData::e_Comment:
2374     case CSeqFeatData::e_Bond:
2375     case CSeqFeatData::e_Site:
2376     case CSeqFeatData::e_Rsite:
2377     case CSeqFeatData::e_User:
2378     case CSeqFeatData::e_Txinit:
2379     case CSeqFeatData::e_Num:
2380     case CSeqFeatData::e_Psec_str:
2381     case CSeqFeatData::e_Non_std_residue:
2382     case CSeqFeatData::e_Het:
2383     case CSeqFeatData::e_Clone:
2384     case CSeqFeatData::e_Variation:
2385         break;
2386     default:
2387         PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidType,
2388             "Invalid SeqFeat type [" + NStr::IntToString(m_Feat.GetData().Which()) + "]");
2389         break;
2390     }
2391 }
2392 
2393 
s_BioseqHasRefSeqThatStartsWithPrefix(CBioseq_Handle bsh,string prefix)2394 bool CSingleFeatValidator::s_BioseqHasRefSeqThatStartsWithPrefix (CBioseq_Handle bsh, string prefix)
2395 {
2396     bool rval = false;
2397     FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
2398         if ((*it)->IsOther() && (*it)->GetTextseq_Id()->IsSetAccession()
2399             && NStr::StartsWith ((*it)->GetTextseq_Id()->GetAccession(), prefix)) {
2400             rval = true;
2401             break;
2402         }
2403     }
2404     return rval;
2405 }
2406 
2407 
x_ReportPseudogeneConflict(CConstRef<CSeq_feat> gene)2408 void CSingleFeatValidator::x_ReportPseudogeneConflict(CConstRef <CSeq_feat> gene)
2409 {
2410     if (!m_Feat.IsSetData() ||
2411         (m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_mRNA &&
2412         m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_cdregion)) {
2413         return;
2414     }
2415     string sfp_pseudo;
2416     string gene_pseudo;
2417     bool has_sfp_pseudo = false;
2418     bool has_gene_pseudo = false;
2419     if (m_Feat.IsSetQual()) {
2420         for (auto it : m_Feat.GetQual()) {
2421             if (it->IsSetQual() &&
2422                 NStr::EqualNocase(it->GetQual(), "pseudogene") &&
2423                 it->IsSetVal()) {
2424                 sfp_pseudo = it->GetVal();
2425                 has_sfp_pseudo = true;
2426             }
2427         }
2428     }
2429     if (gene && gene->IsSetQual()) {
2430         for (auto it : gene->GetQual()) {
2431             if (it->IsSetQual() &&
2432                 NStr::EqualNocase(it->GetQual(), "pseudogene") &&
2433                 it->IsSetVal()) {
2434                 gene_pseudo = it->GetVal();
2435                 has_gene_pseudo = true;
2436             }
2437         }
2438     }
2439 
2440     if (!has_sfp_pseudo && !has_gene_pseudo) {
2441         return;
2442     } else if (!has_sfp_pseudo) {
2443         return;
2444     } else if (has_sfp_pseudo && !has_gene_pseudo) {
2445         string msg = m_Feat.GetData().IsCdregion() ? "CDS" : "mRNA";
2446         msg += " has pseudogene qualifier, gene does not";
2447         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InconsistentPseudogeneValue,
2448                 msg);
2449     } else if (!NStr::EqualNocase(sfp_pseudo, gene_pseudo)) {
2450         string msg = "Different pseudogene values on ";
2451         msg += m_Feat.GetData().IsCdregion() ? "CDS" : "mRNA";
2452         msg += " (" + sfp_pseudo + ") and gene (" + gene_pseudo + ")";
2453         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InconsistentPseudogeneValue,
2454                 msg);
2455     }
2456 }
2457 
2458 
2459 // grp is from gene xref or from overlapping gene
x_ValidateLocusTagGeneralMatch(CConstRef<CSeq_feat> gene)2460 void CSingleFeatValidator::x_ValidateLocusTagGeneralMatch(CConstRef <CSeq_feat> gene)
2461 {
2462     if (!m_Imp.IsLocusTagGeneralMatch()) {
2463         return;
2464     }
2465     if (!m_Feat.IsSetProduct()) {
2466         return;
2467     }
2468 
2469     CTempString locus_tag = kEmptyStr;
2470     // obtain the gene-ref from the feature or the overlapping gene
2471     const CGene_ref* grp = m_Feat.GetGeneXref();
2472     if (grp && grp->IsSuppressed()) {
2473         return;
2474     } else if (grp && grp->IsSetLocus_tag() &&
2475                !NStr::IsBlank(grp->GetLocus_tag())) {
2476         locus_tag = grp->GetLocus_tag();
2477     } else if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
2478                !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
2479         locus_tag = gene->GetData().GetGene().GetLocus_tag();
2480     } else {
2481         return;
2482     }
2483 
2484     if (!m_ProductBioseq) {
2485         return;
2486     }
2487 
2488     for (auto id : m_ProductBioseq.GetId()) {
2489         CConstRef<CSeq_id> seqid = id.GetSeqId();
2490         if (!seqid || !seqid->IsGeneral()) {
2491             continue;
2492         }
2493         const CDbtag& dbt = seqid->GetGeneral();
2494         if (!dbt.IsSetDb() || dbt.IsSkippable()) {
2495             continue;
2496         }
2497 
2498         if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
2499             SIZE_TYPE pos = dbt.GetTag().GetStr().find('-');
2500             string str = dbt.GetTag().GetStr().substr(0, pos);
2501             if (!NStr::EqualNocase(locus_tag, str)) {
2502                 PostErr(eDiag_Error, eErr_SEQ_FEAT_LocusTagProductMismatch,
2503                     "Gene locus_tag does not match general ID of product");
2504             }
2505         }
2506     }
2507 }
2508 
2509 
x_CheckForNonAsciiCharacters()2510 void CSingleFeatValidator::x_CheckForNonAsciiCharacters()
2511 {
2512     CStdTypeConstIterator<string> it(m_Feat);
2513 
2514     for (; it; ++it) {
2515         const string& str = *it;
2516         FOR_EACH_CHAR_IN_STRING(c_it, str) {
2517             const char& ch = *c_it;
2518             unsigned char chu = ch;
2519             if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
2520                 PostErr(eDiag_Fatal, eErr_GENERIC_NonAsciiAsn,
2521                     "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in feature (" + str + ")");
2522                 break;
2523             }
2524         }
2525     }
2526 }
2527 
Validate()2528 void CProtValidator::Validate()
2529 {
2530     CSingleFeatValidator::Validate();
2531 
2532     x_CheckForEmpty();
2533 
2534     const CProt_ref& prot = m_Feat.GetData().GetProt();
2535     for (auto it : prot.GetName()) {
2536         if (prot.IsSetEc() && !prot.IsSetProcessed()
2537             && (NStr::EqualCase (it, "Hypothetical protein")
2538                 || NStr::EqualCase (it, "hypothetical protein")
2539                 || NStr::EqualCase (it, "Unknown protein")
2540                 || NStr::EqualCase (it, "unknown protein"))) {
2541             PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadProteinName,
2542                      "Unknown or hypothetical protein should not have EC number");
2543         }
2544 
2545     }
2546 
2547     if (prot.IsSetDesc() && ContainsSgml(prot.GetDesc())) {
2548         PostErr (eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
2549                  "protein description " + prot.GetDesc() + " has SGML");
2550     }
2551 
2552     if (prot.IsSetDesc() && m_Feat.IsSetComment()
2553         && NStr::EqualCase(prot.GetDesc(), m_Feat.GetComment())) {
2554         PostErr (eDiag_Warning, eErr_SEQ_FEAT_RedundantFields,
2555                  "Comment has same value as protein description");
2556     }
2557 
2558     if (m_Feat.IsSetComment() && HasECnumberPattern(m_Feat.GetComment())) {
2559         PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberInProteinComment,
2560                  "Apparent EC number in protein comment");
2561     }
2562 
2563     x_ReportUninformativeNames();
2564 
2565     // only look for EC numbers in first protein name
2566     // only look for brackets and hypothetical protein XP_ in first protein name
2567     if (prot.IsSetName() && prot.GetName().size() > 0) {
2568         if (HasECnumberPattern(prot.GetName().front())) {
2569             PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberInProteinName,
2570                 "Apparent EC number in protein title");
2571         }
2572         x_ValidateProteinName(prot.GetName().front());
2573     }
2574 
2575     if ( prot.CanGetDb () ) {
2576         m_Imp.ValidateDbxref(prot.GetDb(), m_Feat);
2577     }
2578     if ( (!prot.IsSetName() || prot.GetName().empty()) &&
2579          (!prot.IsSetProcessed()
2580           || (prot.GetProcessed() != CProt_ref::eProcessed_signal_peptide
2581               && prot.GetProcessed() !=  CProt_ref::eProcessed_transit_peptide))) {
2582         if (prot.IsSetDesc() && !NStr::IsBlank (prot.GetDesc())) {
2583             PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
2584                 "Protein feature has description but no name");
2585         } else if (prot.IsSetActivity() && !prot.GetActivity().empty()) {
2586             PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
2587                 "Protein feature has function but no name");
2588         } else if (prot.IsSetEc() && !prot.GetEc().empty()) {
2589             PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
2590                 "Protein feature has EC number but no name");
2591         } else {
2592             PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
2593                 "Protein feature has no name");
2594         }
2595     }
2596 
2597     x_ValidateECNumbers();
2598 
2599     x_ValidateMolinfoPartials();
2600 }
2601 
2602 
x_CheckForEmpty()2603 void CProtValidator::x_CheckForEmpty()
2604 {
2605     const CProt_ref& prot = m_Feat.GetData().GetProt();
2606     CProt_ref::EProcessed processed = CProt_ref::eProcessed_not_set;
2607 
2608     if ( prot.IsSetProcessed() ) {
2609         processed = prot.GetProcessed();
2610     }
2611 
2612     bool empty = true;
2613     if ( processed != CProt_ref::eProcessed_signal_peptide  &&
2614          processed != CProt_ref::eProcessed_transit_peptide ) {
2615         if ( prot.IsSetName()  &&
2616             !prot.GetName().empty()  &&
2617             !prot.GetName().front().empty() ) {
2618             empty = false;
2619         }
2620         if ( prot.CanGetDesc()  &&  !prot.GetDesc().empty() ) {
2621             empty = false;
2622         }
2623         if ( prot.CanGetEc()  &&  !prot.GetEc().empty() ) {
2624             empty = false;
2625         }
2626         if ( prot.CanGetActivity()  &&  !prot.GetActivity().empty() ) {
2627             empty = false;
2628         }
2629         if ( prot.CanGetDb()  &&  !prot.GetDb().empty() ) {
2630             empty = false;
2631         }
2632 
2633         if ( empty ) {
2634             PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProtRefHasNoData,
2635                 "There is a protein feature where all fields are empty");
2636         }
2637     }
2638 }
2639 
2640 
2641 // note - list bad protein names in lower case, as search term is converted to lower case
2642 // before looking for exact match
2643 static const char* const sc_BadProtNameText [] = {
2644   "'hypothetical protein",
2645   "alpha",
2646   "alternative",
2647   "alternatively spliced",
2648   "bacteriophage hypothetical protein",
2649   "beta",
2650   "cellular",
2651   "cnserved hypothetical protein",
2652   "conesrved hypothetical protein",
2653   "conserevd hypothetical protein",
2654   "conserved archaeal protein",
2655   "conserved domain protein",
2656   "conserved hypohetical protein",
2657   "conserved hypotehtical protein",
2658   "conserved hypotheical protein",
2659   "conserved hypothertical protein",
2660   "conserved hypothetcial protein",
2661   "conserved hypothetical",
2662   "conserved hypothetical exported protein",
2663   "conserved hypothetical integral membrane protein",
2664   "conserved hypothetical membrane protein",
2665   "conserved hypothetical phage protein",
2666   "conserved hypothetical prophage protein",
2667   "conserved hypothetical protein",
2668   "conserved hypothetical protein - phage associated",
2669   "conserved hypothetical protein fragment 3",
2670   "conserved hypothetical protein, fragment",
2671   "conserved hypothetical protein, putative",
2672   "conserved hypothetical protein, truncated",
2673   "conserved hypothetical protein, truncation",
2674   "conserved hypothetical protein.",
2675   "conserved hypothetical protein; possible membrane protein",
2676   "conserved hypothetical protein; putative membrane protein",
2677   "conserved hypothetical proteins",
2678   "conserved hypothetical protien",
2679   "conserved hypothetical transmembrane protein",
2680   "conserved hypotheticcal protein",
2681   "conserved hypthetical protein",
2682   "conserved in bacteria",
2683   "conserved membrane protein",
2684   "conserved protein",
2685   "conserved protein of unknown function",
2686   "conserved protein of unknown function ; putative membrane protein",
2687   "conserved unknown protein",
2688   "conservedhypothetical protein",
2689   "conserverd hypothetical protein",
2690   "conservered hypothetical protein",
2691   "consrved hypothetical protein",
2692   "converved hypothetical protein",
2693   "cytokine",
2694   "delta",
2695   "drosophila",
2696   "duplicated hypothetical protein",
2697   "epsilon",
2698   "gamma",
2699   "hla",
2700   "homeodomain",
2701   "homeodomain protein",
2702   "homolog",
2703   "hyopthetical protein",
2704   "hypotethical",
2705   "hypotheical protein",
2706   "hypothertical protein",
2707   "hypothetcical protein",
2708   "hypothetical",
2709   "hypothetical  protein",
2710   "hypothetical conserved protein",
2711   "hypothetical exported protein",
2712   "hypothetical novel protein",
2713   "hypothetical orf",
2714   "hypothetical phage protein",
2715   "hypothetical prophage protein",
2716   "hypothetical protein (fragment)",
2717   "hypothetical protein (multi-domain)",
2718   "hypothetical protein (phage associated)",
2719   "hypothetical protein - phage associated",
2720   "hypothetical protein fragment",
2721   "hypothetical protein fragment 1",
2722   "hypothetical protein predicted by genemark",
2723   "hypothetical protein predicted by glimmer",
2724   "hypothetical protein predicted by glimmer/critica",
2725   "hypothetical protein, conserved",
2726   "hypothetical protein, phage associated",
2727   "hypothetical protein, truncated",
2728   "hypothetical protein-putative conserved hypothetical protein",
2729   "hypothetical protein.",
2730   "hypothetical proteins",
2731   "hypothetical protien",
2732   "hypothetical transmembrane protein",
2733   "hypothetoical protein",
2734   "hypothteical protein",
2735   "identified by sequence similarity; putative; orf located~using blastx/framed",
2736   "identified by sequence similarity; putative; orf located~using blastx/glimmer/genemark",
2737   "ion channel",
2738   "membrane protein, putative",
2739   "mouse",
2740   "narrowly conserved hypothetical protein",
2741   "novel protein",
2742   "orf",
2743   "orf, conserved hypothetical protein",
2744   "orf, hypothetical",
2745   "orf, hypothetical protein",
2746   "orf, hypothetical, fragment",
2747   "orf, partial conserved hypothetical protein",
2748   "orf; hypothetical protein",
2749   "orf; unknown function",
2750   "partial",
2751   "partial cds, hypothetical",
2752   "partially conserved hypothetical protein",
2753   "phage hypothetical protein",
2754   "phage-related conserved hypothetical protein",
2755   "phage-related protein",
2756   "plasma",
2757   "possible hypothetical protein",
2758   "precursor",
2759   "predicted coding region",
2760   "predicted protein",
2761   "predicted protein (pseudogene)",
2762   "predicted protein family",
2763   "product uncharacterised protein family",
2764   "protein family",
2765   "protein of unknown function",
2766   "pseudogene",
2767   "putative",
2768   "putative conserved protein",
2769   "putative exported protein",
2770   "putative hypothetical protein",
2771   "putative membrane protein",
2772   "putative orf; unknown function",
2773   "putative phage protein",
2774   "putative protein",
2775   "rearranged",
2776   "repeats containing protein",
2777   "reserved",
2778   "ribosomal protein",
2779   "similar to",
2780   "small",
2781   "small hypothetical protein",
2782   "transmembrane protein",
2783   "trna",
2784   "trp repeat",
2785   "trp-repeat protein",
2786   "truncated conserved hypothetical protein",
2787   "truncated hypothetical protein",
2788   "uncharacterized conserved membrane protein",
2789   "uncharacterized conserved protein",
2790   "uncharacterized conserved secreted protein",
2791   "uncharacterized protein",
2792   "uncharacterized protein conserved in archaea",
2793   "uncharacterized protein conserved in bacteria",
2794   "unique hypothetical",
2795   "unique hypothetical protein",
2796   "unknown",
2797   "unknown cds",
2798   "unknown function",
2799   "unknown gene",
2800   "unknown protein",
2801   "unknown, conserved protein",
2802   "unknown, hypothetical",
2803   "unknown-related protein",
2804   "unknown; predicted coding region",
2805   "unnamed",
2806   "unnamed protein product",
2807   "very hypothetical protein"
2808 };
2809 typedef CStaticArraySet<const char*, PCase_CStr> TBadProtNameSet;
2810 DEFINE_STATIC_ARRAY_MAP(TBadProtNameSet, sc_BadProtName, sc_BadProtNameText);
2811 
2812 
x_ReportUninformativeNames()2813 void CProtValidator::x_ReportUninformativeNames()
2814 {
2815     if (!m_Imp.IsRefSeq()) {
2816         return;
2817     }
2818     const CProt_ref& prot = m_Feat.GetData().GetProt();
2819     if (!prot.IsSetName()) {
2820         if (!prot.IsSetProcessed() ||
2821             (prot.GetProcessed() != CProt_ref::eProcessed_signal_peptide &&
2822              prot.GetProcessed() !=  CProt_ref::eProcessed_transit_peptide)) {
2823                 PostErr (eDiag_Error, eErr_SEQ_FEAT_NoNameForProtein,
2824                          "Protein name is not set");
2825               }
2826         return;
2827     }
2828     for (auto it : m_Feat.GetData().GetProt().GetName()) {
2829         string search = it;
2830         search = NStr::ToLower(search);
2831         if (search.empty()) {
2832             PostErr (eDiag_Error, eErr_SEQ_FEAT_NoNameForProtein,
2833                      "Protein name is empty");
2834         } else if (sc_BadProtName.find (search.c_str()) != sc_BadProtName.end()
2835             || NStr::Find(search, "=") != string::npos
2836             || NStr::Find(search, "~") != string::npos
2837             || NStr::FindNoCase(search, "uniprot") != string::npos
2838             || NStr::FindNoCase(search, "uniprotkb") != string::npos
2839             || NStr::FindNoCase(search, "pmid") != string::npos
2840             || NStr::FindNoCase(search, "dbxref") != string::npos) {
2841             PostErr (eDiag_Warning, eErr_SEQ_FEAT_UndesiredProteinName,
2842                      "Uninformative protein name '" + it + "'");
2843         }
2844     }
2845 }
2846 
2847 
x_ValidateECNumbers()2848 void CProtValidator::x_ValidateECNumbers()
2849 {
2850     if (!m_Feat.GetData().GetProt().IsSetEc()) {
2851         return;
2852     }
2853     for (auto it : m_Feat.GetData().GetProt().GetEc()) {
2854         if (NStr::IsBlank (it)) {
2855             PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberEmpty, "EC number should not be empty");
2856         } else if (!CProt_ref::IsValidECNumberFormat(it)) {
2857             PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadEcNumberFormat,
2858                     (it) + " is not in proper EC_number format");
2859         } else {
2860             const string& ec_number = it;
2861             CProt_ref::EECNumberStatus status = CProt_ref::GetECNumberStatus (ec_number);
2862             x_ReportECNumFileStatus();
2863             switch (status) {
2864                 case CProt_ref::eEC_deleted:
2865                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_DeletedEcNumber,
2866                              "EC_number " + ec_number + " was deleted");
2867                     break;
2868                 case CProt_ref::eEC_replaced:
2869                     PostErr (eDiag_Warning,
2870                              CProt_ref::IsECNumberSplit(ec_number) ? eErr_SEQ_FEAT_SplitEcNumber : eErr_SEQ_FEAT_ReplacedEcNumber,
2871                              "EC_number " + ec_number + " was transferred and is no longer valid");
2872                     break;
2873                 case CProt_ref::eEC_unknown:
2874                   {
2875                       size_t pos = NStr::Find (ec_number, "n");
2876                       if (pos == string::npos || !isdigit (ec_number.c_str()[pos + 1])) {
2877                             PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadEcNumberValue,
2878                                    ec_number + " is not a legal value for qualifier EC_number");
2879                       } else {
2880                             PostErr (eDiag_Info, eErr_SEQ_FEAT_BadEcNumberValue,
2881                                    ec_number + " is not a legal preliminary value for qualifier EC_number");
2882                       }
2883                   }
2884                     break;
2885                 default:
2886                     break;
2887             }
2888         }
2889     }
2890 
2891 }
2892 
2893 
x_ValidateProteinName(const string & prot_name)2894 void CProtValidator::x_ValidateProteinName(const string& prot_name)
2895 {
2896     if (NStr::EndsWith(prot_name, "]")) {
2897         bool report_name = true;
2898         size_t pos = NStr::Find(prot_name, "[", NStr::eNocase, NStr::eReverseSearch);
2899         if (pos == string::npos) {
2900             // no disqualifying text
2901         } else if (prot_name.length() - pos < 5) {
2902             // no disqualifying text
2903         } else if (NStr::EqualCase(prot_name, pos, 4, "[NAD")) {
2904             report_name = false;
2905         }
2906         if (!m_Imp.IsEmbl() && !m_Imp.IsTPE()) {
2907             if (report_name) {
2908                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProteinNameEndsInBracket,
2909                     "Protein name ends with bracket and may contain organism name");
2910             }
2911         }
2912     }
2913     if (NStr::StartsWith(prot_name, "hypothetical protein XP_") && m_LocationBioseq) {
2914         for (auto id_it : m_LocationBioseq.GetCompleteBioseq()->GetId()) {
2915             if (id_it->IsOther()
2916                 && id_it->GetOther().IsSetAccession()
2917                 && !NStr::EqualNocase(id_it->GetOther().GetAccession(),
2918                 prot_name.substr(21))) {
2919                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_HypotheticalProteinMismatch,
2920                     "Hypothetical protein reference does not match accession");
2921             }
2922         }
2923     }
2924     if (!m_Imp.IsRefSeq() && NStr::FindNoCase(prot_name, "RefSeq") != string::npos) {
2925         PostErr(eDiag_Error, eErr_SEQ_FEAT_RefSeqInText, "Protein name contains 'RefSeq'");
2926     }
2927     if (m_Feat.IsSetComment() && NStr::EqualCase(m_Feat.GetComment(), prot_name)) {
2928         PostErr(eDiag_Warning, eErr_SEQ_FEAT_RedundantFields,
2929             "Comment has same value as protein name");
2930     }
2931 
2932     if (s_StringHasPMID(prot_name)) {
2933         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProteinNameHasPMID,
2934             "Protein name has internal PMID");
2935     }
2936 
2937     if (m_Imp.DoRubiscoTest()) {
2938         if (NStr::FindCase(prot_name, "ribulose") != string::npos
2939             && NStr::FindCase(prot_name, "bisphosphate") != string::npos
2940             && NStr::FindCase(prot_name, "methyltransferase") == string::npos
2941             && NStr::FindCase(prot_name, "activase") == string::npos) {
2942             if (NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase")) {
2943                 // allow standard name without large or small subunit designation - later need kingdom test
2944             } else if (!NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit")
2945                 && !NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit")) {
2946                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_RubiscoProblem,
2947                     "Nonstandard ribulose bisphosphate protein name");
2948             }
2949         }
2950     }
2951 
2952 
2953 
2954     ValidateCharactersInField(prot_name, "Protein name");
2955     if (ContainsSgml(prot_name)) {
2956         PostErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
2957             "protein name " + prot_name + " has SGML");
2958     }
2959 
2960 }
2961 
2962 
x_ValidateMolinfoPartials()2963 void CProtValidator::x_ValidateMolinfoPartials()
2964 {
2965     if (!m_LocationBioseq) {
2966         return;
2967     }
2968     const CBioseq& pbioseq = *(m_LocationBioseq.GetCompleteBioseq());
2969     // if there is a coding region for this bioseq, this type of error
2970     // will be handled there
2971     const CSeq_feat* cds = m_Imp.GetCDSGivenProduct(pbioseq);
2972     if (cds) return;
2973     CFeat_CI prot(m_LocationBioseq, CSeqFeatData::eSubtype_prot);
2974     if (! prot) return;
2975 
2976     CSeqdesc_CI mi_i(m_LocationBioseq, CSeqdesc::e_Molinfo);
2977     if (! mi_i) return;
2978     const CMolInfo& mi = mi_i->GetMolinfo();
2979     if (! mi.IsSetCompleteness()) return;
2980     int completeness = mi.GetCompleteness();
2981 
2982     const CSeq_loc& prot_loc = prot->GetLocation();
2983     bool prot_partial5 = prot_loc.IsPartialStart(eExtreme_Biological);
2984     bool prot_partial3 = prot_loc.IsPartialStop(eExtreme_Biological);
2985 
2986     bool conflict = false;
2987     if (completeness == CMolInfo::eCompleteness_partial && ((! prot_partial5) && (! prot_partial3))) {
2988         conflict = true;
2989     } else if (completeness == CMolInfo::eCompleteness_no_left && ((! prot_partial5) || prot_partial3)) {
2990         conflict = true;
2991     } else if (completeness == CMolInfo::eCompleteness_no_right && (prot_partial5 || (! prot_partial3))) {
2992         conflict = true;
2993     } else if (completeness == CMolInfo::eCompleteness_no_ends && ((! prot_partial5) || (! prot_partial3))) {
2994         conflict = true;
2995     } else if ((completeness < CMolInfo::eCompleteness_partial || completeness > CMolInfo::eCompleteness_no_ends) && (prot_partial5 || prot_partial3)) {
2996         conflict = true;
2997     }
2998 
2999     if (conflict) {
3000         PostErr (eDiag_Warning, eErr_SEQ_FEAT_PartialsInconsistent,
3001             "Molinfo completeness and protein feature partials conflict");
3002     }
3003 }
3004 
Validate()3005 void CRNAValidator::Validate()
3006 {
3007     CSingleFeatValidator::Validate();
3008 
3009     const CRNA_ref& rna = m_Feat.GetData().GetRna();
3010 
3011     CRNA_ref::EType rna_type = CRNA_ref::eType_unknown;
3012     if (rna.IsSetType()) {
3013         rna_type = rna.GetType();
3014     }
3015 
3016     if (rna_type == CRNA_ref::eType_rRNA) {
3017         if (rna.CanGetExt() && rna.GetExt().IsName()) {
3018             const string& rna_name = rna.GetExt().GetName();
3019             ValidateCharactersInField (rna_name, "rRNA name");
3020             if (ContainsSgml(rna_name)) {
3021                 PostErr (eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
3022                     "rRNA name " + rna_name + " has SGML");
3023             }
3024         }
3025     }
3026 
3027     x_ValidateTrnaData();
3028     x_ValidateTrnaType();
3029 
3030     bool feat_pseudo = s_IsPseudo(m_Feat);
3031     bool pseudo = feat_pseudo;
3032     if (!pseudo) {
3033         CConstRef<CSeq_feat> gene = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
3034         if (gene) {
3035             pseudo = s_IsPseudo(*gene);
3036         }
3037     }
3038 
3039     if (!pseudo) {
3040         x_ValidateRnaTrans();
3041     }
3042 
3043     x_ValidateRnaProduct(feat_pseudo, pseudo);
3044 
3045     if (rna_type == CRNA_ref::eType_rRNA
3046         || rna_type == CRNA_ref::eType_snRNA
3047         || rna_type == CRNA_ref::eType_scRNA
3048         || rna_type == CRNA_ref::eType_snoRNA) {
3049         if (!rna.IsSetExt() || !rna.GetExt().IsName() || NStr::IsBlank(rna.GetExt().GetName())) {
3050             if (!pseudo) {
3051                 string rna_typename = CRNA_ref::GetRnaTypeName(rna_type);
3052                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_rRNADoesNotHaveProduct,
3053                          rna_typename + " has no name");
3054             }
3055         }
3056     }
3057 
3058 
3059     if ( rna_type == CRNA_ref::eType_unknown ) {
3060         PostErr(eDiag_Warning, eErr_SEQ_FEAT_RNAtype0,
3061             "RNA type 0 (unknown) not supported");
3062     }
3063 
3064 
3065 }
3066 
3067 
x_ValidateRnaProduct(bool feat_pseudo,bool pseudo)3068 void CRNAValidator::x_ValidateRnaProduct(bool feat_pseudo, bool pseudo)
3069 {
3070     if (!m_Feat.IsSetProduct()) {
3071         return;
3072     }
3073 
3074     x_ValidateRnaProductType();
3075 
3076     if ((!m_Feat.IsSetExcept_text()
3077          || NStr::FindNoCase (m_Feat.GetExcept_text(), "transcribed pseudogene") == string::npos)
3078         && !m_Imp.IsRefSeq()) {
3079         if (feat_pseudo) {
3080             PostErr (eDiag_Warning, eErr_SEQ_FEAT_PseudoRnaHasProduct,
3081                      "A pseudo RNA should not have a product");
3082         } else if (pseudo) {
3083             PostErr (eDiag_Warning, eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct,
3084                      "An RNA overlapped by a pseudogene should not have a product");
3085         }
3086     }
3087 
3088 }
3089 
3090 
x_ValidateRnaProductType()3091 void CRNAValidator::x_ValidateRnaProductType()
3092 {
3093     if ( !m_Feat.GetData().GetRna().IsSetType() || !m_ProductBioseq ) {
3094         return;
3095     }
3096     CSeqdesc_CI di(m_ProductBioseq, CSeqdesc::e_Molinfo);
3097     if ( !di ) {
3098         return;
3099     }
3100     const CMolInfo& mol_info = di->GetMolinfo();
3101     if ( !mol_info.CanGetBiomol() ) {
3102         return;
3103     }
3104     int biomol = mol_info.GetBiomol();
3105 
3106     switch ( m_Feat.GetData().GetRna().GetType() ) {
3107 
3108     case CRNA_ref::eType_mRNA:
3109         if ( biomol == CMolInfo::eBiomol_mRNA ) {
3110             return;
3111         }
3112         break;
3113 
3114     case CRNA_ref::eType_tRNA:
3115         if ( biomol == CMolInfo::eBiomol_tRNA ) {
3116             return;
3117         }
3118         break;
3119 
3120     case CRNA_ref::eType_rRNA:
3121         if ( biomol == CMolInfo::eBiomol_rRNA ) {
3122             return;
3123         }
3124         break;
3125 
3126     default:
3127         return;
3128     }
3129 
3130     PostErr(eDiag_Error, eErr_SEQ_FEAT_RnaProductMismatch,
3131         "Type of RNA does not match MolInfo of product Bioseq");
3132 }
3133 
3134 
s_IsBioseqPartial(CBioseq_Handle bsh)3135 static bool s_IsBioseqPartial (CBioseq_Handle bsh)
3136 {
3137     CSeqdesc_CI sd(bsh, CSeqdesc::e_Molinfo);
3138     if ( !sd ) {
3139         return false;
3140     }
3141     const CMolInfo& molinfo = sd->GetMolinfo();
3142     if (!molinfo.IsSetCompleteness ()) {
3143         return false;
3144     }
3145     CMolInfo::TCompleteness completeness = molinfo.GetCompleteness();
3146     if (completeness == CMolInfo::eCompleteness_partial
3147         || completeness == CMolInfo::eCompleteness_no_ends
3148         || completeness == CMolInfo::eCompleteness_no_left
3149         || completeness == CMolInfo::eCompleteness_no_right) {
3150         return true;
3151     } else {
3152         return false;
3153     }
3154 }
3155 
3156 
x_ValidateTrnaData()3157 void CRNAValidator::x_ValidateTrnaData()
3158 {
3159     if (!m_Feat.GetData().GetRna().IsSetExt() || !m_Feat.GetData().GetRna().GetExt().IsTRNA()) {
3160         return;
3161     }
3162     if ( !m_Feat.GetData().GetRna().IsSetType() ||
3163           m_Feat.GetData().GetRna().GetType() != CRNA_ref::eType_tRNA ) {
3164         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidTRNAdata,
3165                 "tRNA data structure on non-tRNA feature");
3166     }
3167 
3168     const CTrna_ext& trna = m_Feat.GetData().GetRna().GetExt ().GetTRNA ();
3169     if ( trna.CanGetAnticodon () ) {
3170         const CSeq_loc& anticodon = trna.GetAnticodon();
3171         size_t anticodon_len = GetLength(anticodon, &m_Scope);
3172         if ( anticodon_len != 3 ) {
3173             PostErr (eDiag_Warning, eErr_SEQ_FEAT_tRNArange,
3174                 "Anticodon is not 3 bases in length");
3175         }
3176         ECompare comp = sequence::Compare(anticodon,
3177                                             m_Feat.GetLocation(),
3178                                             &m_Scope,
3179                                             sequence::fCompareOverlapping);
3180         if ( comp != eContained  &&  comp != eSame ) {
3181             PostErr (eDiag_Error, eErr_SEQ_FEAT_tRNArange,
3182                 "Anticodon location not in tRNA");
3183         }
3184         x_ValidateAnticodon(anticodon);
3185     }
3186     x_ValidateTrnaCodons();
3187 
3188 }
3189 
3190 
x_ValidateTrnaType()3191 void CRNAValidator::x_ValidateTrnaType()
3192 {
3193     if (!m_Feat.GetData().GetRna().IsSetType() ||
3194         m_Feat.GetData().GetRna().GetType() != CRNA_ref::eType_tRNA) {
3195         return;
3196     }
3197     const CRNA_ref& rna = m_Feat.GetData().GetRna();
3198 
3199     // check for unparsed qualifiers
3200     for (auto& gbqual : m_Feat.GetQual()) {
3201         if ( NStr::CompareNocase(gbqual->GetQual (), "anticodon") == 0 ) {
3202             PostErr(eDiag_Error, eErr_SEQ_FEAT_UnparsedtRNAAnticodon,
3203                 "Unparsed anticodon qualifier in tRNA");
3204         } else if (NStr::CompareNocase (gbqual->GetQual (), "product") == 0 ) {
3205             if (NStr::CompareNocase (gbqual->GetVal (), "tRNA-fMet") != 0 &&
3206                 NStr::CompareNocase (gbqual->GetVal (), "tRNA-iMet") != 0 &&
3207                 NStr::CompareNocase (gbqual->GetVal (), "tRNA-Ile2") != 0) {
3208                 PostErr(eDiag_Error, eErr_SEQ_FEAT_UnparsedtRNAProduct,
3209                     "Unparsed product qualifier in tRNA");
3210             }
3211         }
3212     }
3213 
3214 
3215     /* tRNA with string extension */
3216     if ( rna.IsSetExt() &&
3217             rna.GetExt().Which () == CRNA_ref::C_Ext::e_Name ) {
3218         PostErr(eDiag_Error, eErr_SEQ_FEAT_UnparsedtRNAProduct,
3219             "Unparsed product qualifier in tRNA");
3220     } else if (!rna.IsSetExt() || rna.GetExt().Which() == CRNA_ref::C_Ext::e_not_set ) {
3221         PostErr (eDiag_Warning, eErr_SEQ_FEAT_MissingTrnaAA,
3222             "Missing encoded amino acid qualifier in tRNA");
3223     }
3224 
3225     x_ValidateTrnaOverlap();
3226 }
3227 
3228 
x_ValidateAnticodon(const CSeq_loc & anticodon)3229 void CRNAValidator::x_ValidateAnticodon(const CSeq_loc& anticodon)
3230 {
3231     bool ordered = true;
3232     bool adjacent = false;
3233     bool unmarked_strand = false;
3234     bool mixed_strand = false;
3235 
3236     CSeq_loc_CI prev;
3237     for (CSeq_loc_CI curr(anticodon); curr; ++curr) {
3238         bool chk = true;
3239         if (curr.GetEmbeddingSeq_loc().IsInt()) {
3240             chk = sequence::IsValid(curr.GetEmbeddingSeq_loc().GetInt(), &m_Scope);
3241         } else if (curr.GetEmbeddingSeq_loc().IsPnt()) {
3242             chk = sequence::IsValid(curr.GetEmbeddingSeq_loc().GetPnt(), &m_Scope);
3243         } else {
3244             continue;
3245         }
3246 
3247         if ( !chk ) {
3248             string lbl;
3249             curr.GetEmbeddingSeq_loc().GetLabel(&lbl);
3250             PostErr(eDiag_Critical, eErr_SEQ_FEAT_tRNArange,
3251                 "Anticodon location [" + lbl + "] out of range");
3252         }
3253 
3254         if ( prev  &&  curr  &&
3255              IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), &m_Scope) ) {
3256             CSeq_loc_CI::TRange prev_range = prev.GetRange();
3257             CSeq_loc_CI::TRange curr_range = curr.GetRange();
3258             if ( ordered ) {
3259                 if ( curr.GetStrand() == eNa_strand_minus ) {
3260                     if (prev_range.GetTo() < curr_range.GetTo()) {
3261                         ordered = false;
3262                     }
3263                     if (curr_range.GetTo() + 1 == prev_range.GetFrom()) {
3264                         adjacent = true;
3265                     }
3266                 } else {
3267                     if (prev_range.GetTo() > curr_range.GetTo()) {
3268                         ordered = false;
3269                     }
3270                     if (prev_range.GetTo() + 1 == curr_range.GetFrom()) {
3271                         adjacent = true;
3272                     }
3273                 }
3274             }
3275             ENa_strand curr_strand = curr.GetStrand();
3276             ENa_strand prev_strand = prev.GetStrand();
3277             if ( curr_range == prev_range  &&  curr_strand == prev_strand ) {
3278                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_DuplicateAnticodonInterval,
3279                     "Duplicate anticodon exons in location");
3280             }
3281             if ( curr_strand != prev_strand ) {
3282                 if (curr_strand == eNa_strand_plus  &&  prev_strand == eNa_strand_unknown) {
3283                     unmarked_strand = true;
3284                 } else if (curr_strand == eNa_strand_unknown  &&  prev_strand == eNa_strand_plus) {
3285                     unmarked_strand = true;
3286                 } else {
3287                     mixed_strand = true;
3288                 }
3289             }
3290         }
3291         prev = curr;
3292     }
3293     if (adjacent) {
3294         PostErr(eDiag_Warning, eErr_SEQ_FEAT_AbuttingIntervals,
3295             "Adjacent intervals in Anticodon");
3296     }
3297 
3298     ENa_strand loc_strand = m_Feat.GetLocation().GetStrand();
3299     ENa_strand ac_strand = anticodon.GetStrand();
3300     if (loc_strand == eNa_strand_minus && ac_strand != eNa_strand_minus) {
3301         PostErr (eDiag_Error, eErr_SEQ_FEAT_AnticodonStrandConflict,
3302                  "Anticodon strand and tRNA strand do not match.");
3303     } else if (loc_strand != eNa_strand_minus && ac_strand == eNa_strand_minus) {
3304         PostErr (eDiag_Error, eErr_SEQ_FEAT_AnticodonStrandConflict,
3305                  "Anticodon strand and tRNA strand do not match.");
3306     }
3307 
3308     // trans splicing exception turns off both mixed_strand and out_of_order messages
3309     bool trans_splice = false;
3310     if (m_Feat.CanGetExcept()  &&  m_Feat.GetExcept()  && m_Feat.CanGetExcept_text()) {
3311         if (NStr::FindNoCase(m_Feat.GetExcept_text(), "trans-splicing") != NPOS) {
3312             trans_splice = true;
3313         }
3314     }
3315     if (!trans_splice) {
3316         string loc_lbl;
3317         anticodon.GetLabel(&loc_lbl);
3318         if (mixed_strand) {
3319             PostErr(eDiag_Warning, eErr_SEQ_FEAT_AnticodonMixedStrand,
3320                 "Mixed strands in Anticodon [" + loc_lbl + "]");
3321         }
3322         if (unmarked_strand) {
3323             PostErr(eDiag_Warning, eErr_SEQ_FEAT_AnticodonMixedStrand,
3324                 "Mixed plus and unknown strands in Anticodon [" + loc_lbl + "]");
3325         }
3326         if (!ordered) {
3327             PostErr(eDiag_Warning, eErr_SEQ_FEAT_SeqLocOrder,
3328                 "Intervals out of order in Anticodon [" + loc_lbl + "]");
3329         }
3330     }
3331 }
3332 
3333 
3334 int s_LegalNcbieaaValues[] = { 42, 65, 66, 67, 68, 69, 70, 71, 72, 73,
3335                                74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
3336                                84, 85, 86, 87, 88, 89, 90 };
3337 
3338 static const char* kAANames[] = {
3339     "---", "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
3340     "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
3341     "Val", "Trp", "OTHER", "Tyr", "Glx", "Sec", "TERM", "Pyl", "Xle"
3342 };
3343 
3344 
GetAAName(unsigned char aa,bool is_ascii)3345 const char* GetAAName(unsigned char aa, bool is_ascii)
3346 {
3347     try {
3348         if (is_ascii) {
3349             aa = CSeqportUtil::GetMapToIndex
3350                 (CSeq_data::e_Ncbieaa, CSeq_data::e_Ncbistdaa, aa);
3351         }
3352         return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
3353     } catch (CException ) {
3354         return "OTHER";
3355     } catch (std::exception ) {
3356         return "OTHER";
3357     }
3358 }
3359 
3360 
GetGeneticCodeName(int gcode)3361 static string GetGeneticCodeName (int gcode)
3362 {
3363     const CGenetic_code_table& code_table = CGen_code_table::GetCodeTable();
3364     const list<CRef<CGenetic_code> >& codes = code_table.Get();
3365 
3366     for ( list<CRef<CGenetic_code> >::const_iterator code_it = codes.begin(), code_it_end = codes.end();  code_it != code_it_end;  ++code_it ) {
3367         if ((*code_it)->GetId() == gcode) {
3368             return (*code_it)->GetName();
3369         }
3370     }
3371     return "unknown";
3372 }
3373 
3374 
x_ValidateTrnaCodons()3375 void CRNAValidator::x_ValidateTrnaCodons()
3376 {
3377     if (!m_Feat.IsSetData() || !m_Feat.GetData().IsRna() ||
3378         !m_Feat.GetData().GetRna().IsSetExt() ||
3379         !m_Feat.GetData().GetRna().GetExt().IsTRNA()) {
3380         return;
3381     }
3382     const CTrna_ext& trna = m_Feat.GetData().GetRna().GetExt().GetTRNA();
3383 
3384     if (!trna.IsSetAa()) {
3385         PostErr (eDiag_Error, eErr_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
3386         return;
3387     }
3388 
3389     unsigned char aa = 0, orig_aa;
3390     vector<char> seqData;
3391     string str;
3392 
3393     switch (trna.GetAa().Which()) {
3394         case CTrna_ext::C_Aa::e_Iupacaa:
3395             str = trna.GetAa().GetIupacaa();
3396             CSeqConvert::Convert(str, CSeqUtil::e_Iupacaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
3397             aa = seqData[0];
3398             break;
3399         case CTrna_ext::C_Aa::e_Ncbi8aa:
3400             str = trna.GetAa().GetNcbi8aa();
3401             CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
3402             aa = seqData[0];
3403             break;
3404         case CTrna_ext::C_Aa::e_Ncbistdaa:
3405             str = trna.GetAa().GetNcbi8aa();
3406             CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
3407             aa = seqData[0];
3408             break;
3409         case CTrna_ext::C_Aa::e_Ncbieaa:
3410             seqData.push_back(trna.GetAa().GetNcbieaa());
3411             aa = seqData[0];
3412             break;
3413         default:
3414             NCBI_THROW (CCoreException, eCore, "Unrecognized tRNA aa coding");
3415             break;
3416     }
3417 
3418     // make sure the amino acid is valid
3419     bool found = false;
3420     for ( unsigned int i = 0; i < sizeof (s_LegalNcbieaaValues) / sizeof (int); ++i ) {
3421         if ( aa == s_LegalNcbieaaValues[i] ) {
3422             found = true;
3423             break;
3424         }
3425     }
3426     orig_aa = aa;
3427     if ( !found ) {
3428         aa = ' ';
3429     }
3430 
3431     if (m_Feat.GetData().GetRna().IsSetType() &&
3432         m_Feat.GetData().GetRna().GetType() == CRNA_ref::eType_tRNA) {
3433         bool mustbemethionine = false;
3434         for (auto gbqual : m_Feat.GetQual()) {
3435             if (NStr::CompareNocase(gbqual->GetQual(), "product") == 0 &&
3436                 (NStr::CompareNocase(gbqual->GetVal(), "tRNA-fMet") == 0 ||
3437                     NStr::CompareNocase(gbqual->GetVal(), "tRNA-iMet") == 0)) {
3438                 mustbemethionine = true;
3439                 break;
3440             }
3441         }
3442         if (mustbemethionine) {
3443             if (aa != 'M') {
3444                 string aanm = GetAAName(aa, true);
3445                 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
3446                     "Initiation tRNA claims to be tRNA-" + aanm +
3447                     ", but should be tRNA-Met");
3448             }
3449         }
3450     }
3451 
3452     // Retrive the Genetic code id for the tRNA
3453     int gcode = 1;
3454     if ( m_LocationBioseq ) {
3455         // need only the closest biosoure.
3456         CSeqdesc_CI diter(m_LocationBioseq, CSeqdesc::e_Source);
3457         if ( diter ) {
3458             gcode = diter->GetSource().GetGenCode();
3459         }
3460     }
3461 
3462     const string& ncbieaa = CGen_code_table::GetNcbieaa(gcode);
3463     if ( ncbieaa.length() != 64 ) {
3464         return;
3465     }
3466 
3467     string codename = GetGeneticCodeName (gcode);
3468     char buf[2];
3469     buf[0] = aa;
3470     buf[1] = 0;
3471     string aaname = buf;
3472     aaname += "/";
3473     aaname += GetAAName (aa, true);
3474 
3475     EDiagSev sev = (aa == 'U' || aa == 'O') ? eDiag_Warning : eDiag_Error;
3476 
3477     bool modified_codon_recognition = false;
3478     bool rna_editing = false;
3479     if ( m_Feat.IsSetExcept_text() ) {
3480         string excpt_text = m_Feat.GetExcept_text();
3481         if ( NStr::FindNoCase(excpt_text, "modified codon recognition") != NPOS ) {
3482             modified_codon_recognition = true;
3483         }
3484         if ( NStr::FindNoCase(excpt_text, "RNA editing") != NPOS ) {
3485             rna_editing = true;
3486         }
3487     }
3488 
3489     vector<string> recognized_codon_values;
3490     vector<unsigned char> recognized_taa_values;
3491 
3492     ITERATE( CTrna_ext::TCodon, iter, trna.GetCodon() ) {
3493         if (*iter == 255) continue;
3494         // test that codon value is in range 0 - 63
3495         if ( *iter > 63 ) {
3496             PostErr(sev, eErr_SEQ_FEAT_BadTrnaCodon,
3497                 "tRNA codon value " + NStr::IntToString(*iter) +
3498                 " is greater than maximum 63");
3499             continue;
3500         } else if (*iter < 0) {
3501             PostErr(sev, eErr_SEQ_FEAT_BadTrnaCodon,
3502                 "tRNA codon value " + NStr::IntToString(*iter) +
3503                 " is less than 0");
3504             continue;
3505         }
3506 
3507         if ( !modified_codon_recognition && !rna_editing ) {
3508             unsigned char taa = ncbieaa[*iter];
3509             string codon = CGen_code_table::IndexToCodon(*iter);
3510             recognized_codon_values.push_back (codon);
3511             recognized_taa_values.push_back (taa);
3512 
3513             if ( taa != aa ) {
3514                 if ( (aa == 'U')  &&  (taa == '*')  &&  (*iter == 14) ) {
3515                     // selenocysteine normally uses TGA (14), so ignore without requiring exception in record
3516                     // TAG (11) is used for pyrrolysine in archaebacteria
3517                     // TAA (10) is not yet known to be used for an exceptional amino acid
3518                 } else {
3519                     NStr::ReplaceInPlace (codon, "T", "U");
3520 
3521                     PostErr(sev, eErr_SEQ_FEAT_TrnaCodonWrong,
3522                       "Codon recognized by tRNA (" + codon + ") does not match amino acid ("
3523                        + aaname + ") specified by genetic code ("
3524                        + NStr::IntToString (gcode) + "/" + codename + ")");
3525                 }
3526             }
3527         }
3528     }
3529 
3530     // see if anticodon is compatible with codons recognized and amino acid
3531     string anticodon = "?";
3532     vector<string> codon_values;
3533     vector<unsigned char> taa_values;
3534 
3535     if (trna.IsSetAnticodon() && GetLength (trna.GetAnticodon(), &m_Scope) == 3) {
3536         try {
3537             anticodon = GetSequenceStringFromLoc(trna.GetAnticodon(), m_Scope);
3538             // get reverse complement sequence for location
3539             CRef<CSeq_loc> codon_loc(SeqLocRevCmpl(trna.GetAnticodon(), &m_Scope));
3540             string codon = GetSequenceStringFromLoc(*codon_loc, m_Scope);
3541             if (codon.length() > 3) {
3542                 codon = codon.substr (0, 3);
3543             }
3544 
3545             // expand wobble base to known binding partners
3546             string wobble;
3547 
3548             char ch = anticodon.c_str()[0];
3549             switch (ch) {
3550                 case 'A' :
3551                     wobble = "ACT";
3552                     break;
3553                 case 'C' :
3554                     wobble = "G";
3555                     break;
3556                 case 'G' :
3557                     wobble = "CT";
3558                     break;
3559                 case 'T' :
3560                     wobble = "AG";
3561                     break;
3562                 default :
3563                     break;
3564             }
3565             if (!NStr::IsBlank(wobble)) {
3566                 string::iterator str_it = wobble.begin();
3567                 while (str_it != wobble.end()) {
3568                     codon[2] = *str_it;
3569                     int index = CGen_code_table::CodonToIndex (codon);
3570                     if (index < 64 && index > -1) {
3571                         unsigned char taa = ncbieaa[index];
3572                         taa_values.push_back(taa);
3573                         codon_values.push_back(codon);
3574                     }
3575                     ++str_it;
3576                 }
3577             }
3578             NStr::ReplaceInPlace (anticodon, "T", "U");
3579             if (anticodon.length() > 3) {
3580                 anticodon = anticodon.substr(0, 3);
3581             }
3582             } catch (CException ) {
3583             } catch (std::exception ) {
3584         }
3585 
3586         if (codon_values.size() > 0) {
3587             bool ok = false;
3588             // check that codons predicted from anticodon can transfer indicated amino acid
3589             for (size_t i = 0; i < codon_values.size(); i++) {
3590                 if (!NStr::IsBlank (codon_values[i]) && aa == taa_values[i]) {
3591                     ok = true;
3592                 }
3593             }
3594             if (!ok) {
3595                 if (aa == 'U' && NStr::Equal (anticodon, "UCA")) {
3596                     // ignore TGA codon for selenocysteine
3597                 } else if (aa == 'O' && NStr::Equal (anticodon, "CUA")) {
3598                     // ignore TAG codon for pyrrolysine
3599                 } else if (aa == 'I' && NStr::Equal (anticodon, "CAU")) {
3600                     // ignore ATG predicted codon for Ile2
3601                 } else if (!m_Feat.IsSetExcept_text()
3602                           || (NStr::FindNoCase(m_Feat.GetExcept_text(), "modified codon recognition") == string::npos
3603                               &&NStr::FindNoCase(m_Feat.GetExcept_text(), "RNA editing") == string::npos)) {
3604                     PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadAnticodonAA,
3605                               "Codons predicted from anticodon (" + anticodon
3606                               + ") cannot produce amino acid (" + aaname + ")");
3607                 }
3608             }
3609 
3610             // check that codons recognized match codons predicted from anticodon
3611             if (recognized_codon_values.size() > 0) {
3612                 bool ok = false;
3613                 for (size_t i = 0; i < codon_values.size() && !ok; i++) {
3614                     for (size_t j = 0; j < recognized_codon_values.size() && !ok; j++) {
3615                         if (NStr::Equal (codon_values[i], recognized_codon_values[j])) {
3616                             ok = true;
3617                         } else if ( NStr::Equal (codon_values[i], "ATG") && aa == 'I') {
3618                             // allow ATG recognized codon (pre-RNA-editing) for Ile2
3619                             ok = true;
3620                         }
3621                     }
3622                 }
3623                 if (!ok
3624                     && (!m_Feat.IsSetExcept_text()
3625                         || NStr::FindNoCase (m_Feat.GetExcept_text(), "RNA editing") == string::npos)) {
3626                     PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadAnticodonCodon,
3627                              "Codon recognized cannot be produced from anticodon ("
3628                              + anticodon + ")");
3629                 }
3630             }
3631         }
3632     }
3633 
3634     if (!m_Feat.IsSetPseudo() || !m_Feat.GetPseudo()) {
3635         if (orig_aa == 0 || orig_aa == 255) {
3636             PostErr (sev, eErr_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
3637         } else {
3638             // verify that legal amino acid is indicated
3639             unsigned int idx;
3640             if (aa != '*') {
3641                 idx = aa - 64;
3642             } else {
3643                 idx = 25;
3644             }
3645             if (idx == 0 || idx >= 28) {
3646                 PostErr (sev, eErr_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid");
3647             }
3648         }
3649     }
3650 }
3651 
3652 
x_ValidateTrnaOverlap()3653 void CRNAValidator::x_ValidateTrnaOverlap()
3654 {
3655     if (!m_Feat.GetData().GetRna().IsSetType() ||
3656         m_Feat.GetData().GetRna().GetType() != CRNA_ref::eType_tRNA) {
3657         return;
3658     }
3659     TFeatScores scores;
3660     GetOverlappingFeatures(m_Feat.GetLocation(),
3661                             CSeqFeatData::e_Rna,
3662                             CSeqFeatData::eSubtype_rRNA,
3663                             eOverlap_Interval,
3664                             scores, m_Scope);
3665     bool found_bad = false;
3666     for (auto it : scores) {
3667         CRef<CSeq_loc> intersection = it.second->GetLocation().Intersect(m_Feat.GetLocation(),
3668             0 /* flags*/,
3669             NULL /* synonym mapper */);
3670         if (intersection) {
3671             TSeqPos length = sequence::GetLength(*intersection, &m_Scope);
3672             if (length >= 5) {
3673                 found_bad = true;
3674                 break;
3675             }
3676         }
3677     }
3678     if (found_bad) {
3679         PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadRRNAcomponentOverlapTRNA,
3680             "tRNA-rRNA overlap");
3681     }
3682 }
3683 
3684 
x_ValidateRnaTrans()3685 void CRNAValidator::x_ValidateRnaTrans()
3686 {
3687     size_t mismatches = 0;
3688     size_t problems = GetMRNATranslationProblems
3689         (m_Feat, mismatches, m_Imp.IgnoreExceptions(),
3690          m_LocationBioseq, m_ProductBioseq,
3691          m_Imp.IsFarFetchMRNAproducts(), m_Imp.IsGpipe(),
3692          m_Imp.IsGenomic(), &m_Scope);
3693     x_ReportRNATranslationProblems(problems, mismatches);
3694 }
3695 
3696 
x_ReportRNATranslationProblems(size_t problems,size_t mismatches)3697 void CRNAValidator::x_ReportRNATranslationProblems(size_t problems, size_t mismatches)
3698 {
3699     if (problems & eMRNAProblem_TransFail) {
3700         PostErr(eDiag_Error, eErr_SEQ_FEAT_MrnaTransFail,
3701             "Unable to transcribe mRNA");
3702     }
3703     if (problems & eMRNAProblem_UnableToFetch) {
3704         const CSeq_id& product_id = GetId(m_Feat.GetProduct(), &m_Scope);
3705         string label = product_id.AsFastaString();
3706         PostErr(eDiag_Error, eErr_SEQ_FEAT_ProductFetchFailure,
3707             "Unable to fetch mRNA transcript '" + label + "'");
3708     }
3709 
3710     bool is_refseq = m_Imp.IsRefSeqConventions();
3711     if (m_LocationBioseq) {
3712         FOR_EACH_SEQID_ON_BIOSEQ(it, *(m_LocationBioseq.GetCompleteBioseq())) {
3713             if ((*it)->IsOther()) {
3714                 is_refseq = true;
3715                 break;
3716             }
3717         }
3718     }
3719 
3720     TSeqPos feat_len = sequence::GetLength(m_Feat.GetLocation(), &m_Scope);
3721 
3722     string farstr;
3723     EDiagSev sev = eDiag_Error;
3724 
3725     // if not local bioseq product, lower severity (with the exception of Refseq)
3726     if (m_ProductIsFar && !is_refseq) {
3727         sev = eDiag_Warning;
3728     }
3729     if (m_ProductIsFar) {
3730         farstr = "(far) ";
3731         if (m_Feat.IsSetPartial()
3732             && !s_IsBioseqPartial(m_ProductBioseq)
3733             && s_BioseqHasRefSeqThatStartsWithPrefix(m_ProductBioseq, "NM_")) {
3734             sev = eDiag_Warning;
3735         }
3736     }
3737 
3738     if (problems & eMRNAProblem_TranscriptLenLess) {
3739         PostErr(sev, eErr_SEQ_FEAT_TranscriptLen,
3740             "Transcript length [" + NStr::SizetToString(feat_len) +
3741             "] less than " + farstr + "product length [" +
3742             NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], and tail < 95% polyA");
3743     }
3744 
3745     if (problems & eMRNAProblem_PolyATail100) {
3746         PostErr(eDiag_Info, eErr_SEQ_FEAT_PolyATail,
3747             "Transcript length [" + NStr::SizetToString(feat_len)
3748             + "] less than " + farstr + "product length ["
3749             + NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], but tail is 100% polyA");
3750     }
3751     if (problems & eMRNAProblem_PolyATail95) {
3752         PostErr(eDiag_Info, eErr_SEQ_FEAT_PolyATail,
3753             "Transcript length [" + NStr::SizetToString(feat_len) +
3754             "] less than " + farstr + "product length [" +
3755             NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], but tail >= 95% polyA");
3756     }
3757     if (problems & eMRNAProblem_TranscriptLenMore) {
3758         PostErr(sev, eErr_SEQ_FEAT_TranscriptLen,
3759             "Transcript length [" + NStr::IntToString(feat_len) + "] " +
3760             "greater than " + farstr + "product length [" +
3761             NStr::IntToString(m_ProductBioseq.GetInst_Length()) + "]");
3762     }
3763     if ((problems & eMRNAProblem_Mismatch) && mismatches > 0) {
3764         PostErr(sev, eErr_SEQ_FEAT_TranscriptMismatches,
3765             "There are " + NStr::SizetToString(mismatches) +
3766             " mismatches out of " + NStr::SizetToString(feat_len) +
3767             " bases between the transcript and " + farstr + "product sequence");
3768     }
3769     if (problems & eMRNAProblem_UnnecessaryException) {
3770         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
3771             "mRNA has exception but passes transcription test");
3772     }
3773     if (problems & eMRNAProblem_ErroneousException) {
3774         size_t total = min(feat_len, m_ProductBioseq.GetInst_Length());
3775         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ErroneousException,
3776             "mRNA has unclassified exception but only difference is " + NStr::SizetToString(mismatches)
3777             + " mismatches out of " + NStr::SizetToString(total) + " bases");
3778     }
3779     if (problems & eMRNAProblem_ProductReplaced) {
3780         PostErr(eDiag_Warning, eErr_SEQ_FEAT_mRNAUnnecessaryException,
3781             "mRNA has transcribed product replaced exception");
3782     }
3783 }
3784 
3785 
CMRNAValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)3786 CMRNAValidator::CMRNAValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp) :
3787 CRNAValidator(feat, scope, imp)
3788 {
3789     m_Gene = m_Imp.GetGeneCache().GetGeneFromCache(&feat, m_Scope);
3790     if (m_Gene) {
3791         m_GeneIsPseudo = s_IsPseudo(*m_Gene);
3792     } else {
3793         m_GeneIsPseudo = false;
3794     }
3795     m_FeatIsPseudo = s_IsPseudo(m_Feat);
3796 }
3797 
3798 
Validate()3799 void CMRNAValidator::Validate()
3800 {
3801     CRNAValidator::Validate();
3802 
3803     x_ReportPseudogeneConflict(m_Gene);
3804     x_ValidateLocusTagGeneralMatch(m_Gene);
3805 
3806     x_ValidateMrna();
3807 
3808     if (!m_GeneIsPseudo && !m_FeatIsPseudo) {
3809         x_ValidateCommonMRNAProduct();
3810     }
3811     x_ValidateMrnaGene();
3812 }
3813 
3814 
x_ValidateMrna()3815 void CMRNAValidator::x_ValidateMrna()
3816 {
3817     bool pseudo = m_GeneIsPseudo;
3818     if (!pseudo) {
3819         pseudo = s_IsPseudo(m_Feat);
3820     }
3821     ValidateSplice(pseudo, false);
3822 
3823     const CRNA_ref& rna = m_Feat.GetData().GetRna();
3824 
3825     if (m_Feat.IsSetQual()) {
3826         for (auto it : m_Feat.GetQual()) {
3827             const CGb_qual& qual = *it;
3828             if (qual.CanGetQual()) {
3829                 const string& key = qual.GetQual();
3830                 if (NStr::EqualNocase(key, "protein_id")) {
3831                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
3832                         "protein_id should not be a gbqual on an mRNA feature");
3833                 }
3834                 else if (NStr::EqualNocase(key, "transcript_id")) {
3835                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
3836                         "transcript_id should not be a gbqual on an mRNA feature");
3837                 }
3838             }
3839         }
3840     }
3841 
3842     if (rna.IsSetExt() && rna.GetExt().IsName()) {
3843         const string& rna_name = rna.GetExt().GetName();
3844         if (NStr::StartsWith(rna_name, "transfer RNA ") &&
3845             (!NStr::EqualNocase(rna_name, "transfer RNA nucleotidyltransferase")) &&
3846             (!NStr::EqualNocase(rna_name, "transfer RNA methyltransferase"))) {
3847             PostErr(eDiag_Warning, eErr_SEQ_FEAT_tRNAmRNAmixup,
3848                 "mRNA feature product indicates it should be a tRNA feature");
3849         }
3850         ValidateCharactersInField(rna_name, "mRNA name");
3851         if (ContainsSgml(rna_name)) {
3852             PostErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
3853                 "mRNA name " + rna_name + " has SGML");
3854         }
3855     }
3856 }
3857 
3858 
x_ValidateCommonMRNAProduct()3859 void CMRNAValidator::x_ValidateCommonMRNAProduct()
3860 {
3861     if (!m_Feat.IsSetProduct()) {
3862         return;
3863     }
3864     if ( !m_ProductBioseq) {
3865         if (m_LocationBioseq) {
3866             CSeq_entry_Handle seh = m_LocationBioseq.GetTopLevelEntry();
3867             if (seh.IsSet() && seh.GetSet().IsSetClass()
3868                 && (seh.GetSet().GetClass() == CBioseq_set::eClass_gen_prod_set
3869                     || seh.GetSet().GetClass() == CBioseq_set::eClass_other)) {
3870                 PostErr(eDiag_Error, eErr_SEQ_FEAT_MissingMRNAproduct,
3871                     "Product Bioseq of mRNA feature is not "
3872                     "packaged in the record");
3873             }
3874         }
3875     } else {
3876 
3877         CConstRef<CSeq_feat> mrna = m_Imp.GetmRNAGivenProduct (*(m_ProductBioseq.GetCompleteBioseq()));
3878         if (mrna && mrna.GetPointer() != &m_Feat) {
3879             PostErr(eDiag_Critical, eErr_SEQ_FEAT_IdenticalMRNAtranscriptIDs,
3880                     "Identical transcript IDs found on multiple mRNAs");
3881         }
3882     }
3883 }
3884 
3885 
s_EqualGene_ref(const CGene_ref & genomic,const CGene_ref & mrna)3886 static bool s_EqualGene_ref(const CGene_ref& genomic, const CGene_ref& mrna)
3887 {
3888     bool locus = (!genomic.CanGetLocus()  &&  !mrna.CanGetLocus())  ||
3889         (genomic.CanGetLocus()  &&  mrna.CanGetLocus()  &&
3890         genomic.GetLocus() == mrna.GetLocus());
3891     bool allele = (!genomic.CanGetAllele()  &&  !mrna.CanGetAllele())  ||
3892         (genomic.CanGetAllele()  &&  mrna.CanGetAllele()  &&
3893         genomic.GetAllele() == mrna.GetAllele());
3894     bool desc = (!genomic.CanGetDesc()  &&  !mrna.CanGetDesc())  ||
3895         (genomic.CanGetDesc()  &&  mrna.CanGetDesc()  &&
3896         genomic.GetDesc() == mrna.GetDesc());
3897     bool locus_tag = (!genomic.CanGetLocus_tag()  &&  !mrna.CanGetLocus_tag())  ||
3898         (genomic.CanGetLocus_tag()  &&  mrna.CanGetLocus_tag()  &&
3899         genomic.GetLocus_tag() == mrna.GetLocus_tag());
3900 
3901     return locus  &&  allele  &&  desc  && locus_tag;
3902 }
3903 
3904 
3905 // check that there is no conflict between the gene on the genomic
3906 // and the gene on the mrna.
x_ValidateMrnaGene()3907 void CMRNAValidator::x_ValidateMrnaGene()
3908 {
3909     if (!m_ProductBioseq) {
3910         return;
3911     }
3912     const CGene_ref* genomicgrp = NULL;
3913     if (m_Gene) {
3914         genomicgrp = &(m_Gene->GetData().GetGene());
3915     } else {
3916         genomicgrp = m_Feat.GetGeneXref();
3917     }
3918     if (!genomicgrp) {
3919         return;
3920     }
3921     CFeat_CI mrna_gene(m_ProductBioseq, CSeqFeatData::e_Gene);
3922     if ( mrna_gene ) {
3923         const CGene_ref& mrnagrp = mrna_gene->GetData().GetGene();
3924         if ( !s_EqualGene_ref(*genomicgrp, mrnagrp) ) {
3925             m_Imp.PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenesInconsistent,
3926                 "Gene on mRNA bioseq does not match gene on genomic bioseq",
3927                 mrna_gene->GetOriginalFeature());
3928         }
3929     }
3930 
3931 }
3932 
3933 
Validate()3934 void CPubFeatValidator::Validate()
3935 {
3936     CSingleFeatValidator::Validate();
3937     m_Imp.ValidatePubdesc(m_Feat.GetData().GetPub (), m_Feat);
3938 }
3939 
3940 
Validate()3941 void CSrcFeatValidator::Validate()
3942 {
3943     CSingleFeatValidator::Validate();
3944     const CBioSource& bsrc = m_Feat.GetData().GetBiosrc();
3945     if ( bsrc.IsSetIs_focus() ) {
3946         PostErr(eDiag_Error, eErr_SEQ_FEAT_FocusOnBioSourceFeature,
3947             "Focus must be on BioSource descriptor, not BioSource feature.");
3948     }
3949 
3950     m_Imp.ValidateBioSource(bsrc, m_Feat);
3951 
3952     CSeqdesc_CI dbsrc_i(m_LocationBioseq, CSeqdesc::e_Source);
3953     if ( !dbsrc_i ) {
3954         return;
3955     }
3956 
3957     const COrg_ref& org = bsrc.GetOrg();
3958     const CBioSource& dbsrc = dbsrc_i->GetSource();
3959     const COrg_ref& dorg = dbsrc.GetOrg();
3960 
3961     if ( org.CanGetTaxname()  &&  !org.GetTaxname().empty()  &&
3962             dorg.CanGetTaxname() ) {
3963         string taxname = org.GetTaxname();
3964         string dtaxname = dorg.GetTaxname();
3965         if ( NStr::CompareNocase(taxname, dtaxname) != 0 ) {
3966             if ( !dbsrc.IsSetIs_focus()  &&  !m_Imp.IsTransgenic(dbsrc) ) {
3967                 PostErr(eDiag_Error, eErr_SEQ_DESCR_BioSourceNeedsFocus,
3968                     "BioSource descriptor must have focus or transgenic "
3969                     "when BioSource feature with different taxname is "
3970                     "present.");
3971             }
3972         }
3973     }
3974 }
3975 
3976 
x_ValidateSeqFeatLoc()3977 void CPolyASiteValidator::x_ValidateSeqFeatLoc()
3978 {
3979     CSingleFeatValidator::x_ValidateSeqFeatLoc();
3980     CSeq_loc::TRange range = m_Feat.GetLocation().GetTotalRange();
3981     if ( range.GetFrom() != range.GetTo() ) {
3982         EDiagSev sev = eDiag_Warning;
3983         if (m_Imp.IsRefSeq()) {
3984             sev = eDiag_Error;
3985         }
3986         PostErr(sev, eErr_SEQ_FEAT_PolyAsiteNotPoint,
3987             "PolyA_site should be a single point");
3988     }
3989 
3990 }
3991 
3992 
x_ValidateSeqFeatLoc()3993 void CPolyASignalValidator::x_ValidateSeqFeatLoc()
3994 {
3995     CSeq_loc::TRange range = m_Feat.GetLocation().GetTotalRange();
3996     if ( range.GetFrom() == range.GetTo() ) {
3997         EDiagSev sev = eDiag_Warning;
3998         if (m_Imp.IsRefSeq()) {
3999             sev = eDiag_Error;
4000         }
4001         PostErr (sev, eErr_SEQ_FEAT_PolyAsignalNotRange, "PolyA_signal should be a range");
4002     }
4003 }
4004 
4005 
CPeptideValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)4006 CPeptideValidator::CPeptideValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp) :
4007         CSingleFeatValidator(feat, scope, imp)
4008 {
4009     m_CDS = GetOverlappingCDS(m_Feat.GetLocation(), m_Scope);
4010 }
4011 
4012 
Validate()4013 void CPeptideValidator::Validate()
4014 {
4015     CSingleFeatValidator::Validate();
4016 
4017     if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
4018         PostErr(!m_CDS ? eDiag_Error : eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatureLacksCDS,
4019             "sig/mat/transit_peptide feature cannot be associated with a "
4020             "protein product of a coding region feature");
4021     } else {
4022         PostErr(m_Imp.IsRefSeq() ? eDiag_Error : eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatureLacksCDS,
4023             "Peptide processing feature should be converted to the "
4024             "appropriate protein feature subtype");
4025     }
4026     x_ValidatePeptideOnCodonBoundary();
4027 }
4028 
4029 
x_ValidatePeptideOnCodonBoundary()4030 void CPeptideValidator::x_ValidatePeptideOnCodonBoundary()
4031 {
4032     if (!m_CDS) {
4033         return;
4034     }
4035 
4036     const string& key = m_Feat.GetData().GetImp().GetKey();
4037 
4038     feature::ELocationInFrame in_frame = feature::IsLocationInFrame(m_Scope.GetSeq_featHandle(*m_CDS), m_Feat.GetLocation());
4039     if (NStr::Equal(key, "sig_peptide") && in_frame == feature::eLocationInFrame_NotIn) {
4040         return;
4041     }
4042     switch (in_frame) {
4043         case feature::eLocationInFrame_NotIn:
4044             if (NStr::Equal(key, "sig_peptide")) {
4045                 // ignore
4046             } else {
4047                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
4048                     "Start and stop of " + key + " are out of frame with CDS codons");
4049             }
4050             break;
4051         case feature::eLocationInFrame_BadStartAndStop:
4052             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
4053                 "Start and stop of " + key + " are out of frame with CDS codons");
4054             break;
4055         case feature::eLocationInFrame_BadStart:
4056             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
4057                 "Start of " + key + " is out of frame with CDS codons");
4058             break;
4059         case feature::eLocationInFrame_BadStop:
4060             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
4061                 "Stop of " + key + " is out of frame with CDS codons");
4062             break;
4063         case feature::eLocationInFrame_InFrame:
4064             break;
4065     }
4066 }
4067 
4068 
Validate()4069 void CExonValidator::Validate()
4070 {
4071     CSingleFeatValidator::Validate();
4072     bool feat_pseudo = s_IsPseudo(m_Feat);
4073     bool pseudo = feat_pseudo;
4074     if (!pseudo) {
4075         CConstRef<CSeq_feat> gene = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
4076         if (gene) {
4077             pseudo = s_IsPseudo(*gene);
4078         }
4079     }
4080     if (m_Imp.IsValidateExons()) {
4081         ValidateSplice(pseudo, true);
4082     }
4083 }
4084 
4085 
Validate()4086 void CIntronValidator::Validate()
4087 {
4088     CSingleFeatValidator::Validate();
4089     bool feat_pseudo = s_IsPseudo(m_Feat);
4090     bool pseudo = feat_pseudo;
4091     if (!pseudo) {
4092         CConstRef<CSeq_feat> gene = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
4093         if (gene) {
4094             pseudo = s_IsPseudo(*gene);
4095         }
4096     }
4097 
4098     if (x_IsIntronShort(pseudo)) {
4099         PostErr (eDiag_Warning, eErr_SEQ_FEAT_ShortIntron,
4100                  "Introns should be at least 10 nt long");
4101     }
4102 
4103     if (m_Feat.IsSetExcept() && m_Feat.IsSetExcept_text()
4104         && NStr::FindNoCase (m_Feat.GetExcept_text(), "nonconsensus splice site") != string::npos) {
4105         return;
4106     }
4107 
4108     const CSeq_loc& loc = m_Feat.GetLocation();
4109 
4110     bool partial5 = loc.IsPartialStart(eExtreme_Biological);
4111     bool partial3 = loc.IsPartialStop(eExtreme_Biological);
4112     if (partial5 && partial3) {
4113         return;
4114     }
4115 
4116     // suppress if contained by rRNA - different consensus splice site
4117     TFeatScores scores;
4118     GetOverlappingFeatures(loc,
4119                            CSeqFeatData::e_Rna,
4120                            CSeqFeatData::eSubtype_rRNA,
4121                            eOverlap_Contained,
4122                            scores, m_Scope);
4123     if (scores.size() > 0) {
4124         return;
4125     }
4126 
4127     // suppress if contained by tRNA - different consensus splice site
4128     scores.clear();
4129     GetOverlappingFeatures(loc,
4130                            CSeqFeatData::e_Rna,
4131                            CSeqFeatData::eSubtype_tRNA,
4132                            eOverlap_Contained,
4133                            scores, m_Scope);
4134     if (scores.size() > 0) {
4135         return;
4136     }
4137 
4138     // skip if more than one bioseq
4139     if (!IsOneBioseq(loc, &m_Scope)) {
4140         return;
4141     }
4142 
4143     // skip if organelle
4144     if (!m_LocationBioseq || IsOrganelle(m_LocationBioseq)) {
4145         return;
4146     }
4147 
4148     TSeqPos seq_len = m_LocationBioseq.GetBioseqLength();
4149     string label;
4150     m_LocationBioseq.GetId().front().GetSeqId()->GetLabel(&label);
4151     CSeqVector vec = m_LocationBioseq.GetSeqVector (CBioseq_Handle::eCoding_Iupac);
4152 
4153     ENa_strand strand = loc.GetStrand();
4154 
4155     if (eNa_strand_minus != strand && eNa_strand_plus != strand) {
4156         strand = eNa_strand_plus;
4157     }
4158 
4159     bool donor_in_gap = false;
4160     bool acceptor_in_gap = false;
4161 
4162     TSeqPos end5 = loc.GetStart (eExtreme_Biological);
4163     if (vec.IsInGap(end5)) {
4164         donor_in_gap = true;
4165     }
4166 
4167     TSeqPos end3 = loc.GetStop (eExtreme_Biological);
4168     if (vec.IsInGap(end3)) {
4169         acceptor_in_gap = true;
4170     }
4171 
4172     if (!partial5 && !partial3) {
4173         if (donor_in_gap && acceptor_in_gap) {
4174             return;
4175         }
4176     }
4177 
4178     Char donor[2];  // donor site signature
4179     Char acceptor[2];  // acceptor site signature
4180     bool donor_good = false;  // flag == "true" indicates that donor signature is in @donor
4181     bool acceptor_good = false;  // flag == "true" indicates that acceptor signature is in @acceptor
4182 
4183     // Read donor signature into @donor
4184     if (!partial5 && !donor_in_gap) {
4185         if (eNa_strand_minus == strand) {
4186             if (end5 > 0 && IsResidue (vec[end5 - 1]) && IsResidue (vec[end5])) {
4187                 donor[0] = vec[end5 - 1];
4188                 donor[1] = vec[end5];
4189                 donor_good = true;
4190             }
4191         }
4192         else {
4193             if( end5 < seq_len - 1 && IsResidue (vec[end5]) && IsResidue (vec[end5 + 1])) {
4194                 donor[0] = vec[end5];
4195                 donor[1] = vec[end5 + 1];
4196                 donor_good = true;
4197             }
4198         }
4199     }
4200 
4201     // Read acceptor signature into @acceptor
4202     if (!partial3 && !acceptor_in_gap) {
4203         if (eNa_strand_minus == strand) {
4204             if (end3 <  seq_len - 1 && IsResidue (vec[end3]) && IsResidue (vec[end3 + 1])) {
4205                 acceptor[0] = vec[end3];
4206                 acceptor[1] = vec[end3 + 1];
4207                 acceptor_good = true;
4208             }
4209         }
4210         else {
4211             if (end3 > 0 && IsResidue (vec[end3 - 1]) && IsResidue (vec[end3])) {
4212                 acceptor[0] = vec[end3 - 1];
4213                 acceptor[1] = vec[end3];
4214                 acceptor_good = true;
4215             }
4216         }
4217     }
4218 
4219     // Check intron's both ends.
4220     if (!partial5 && !partial3) {
4221         if (donor_good && acceptor_good) {
4222             if (CheckIntronSpliceSites(strand, donor, acceptor)) {
4223                 return;
4224             }
4225         }
4226     }
4227 
4228     // Check 5'-most
4229     if (!partial5) {
4230         if (!donor_in_gap) {
4231             bool not_found = true;
4232 
4233             if (donor_good) {
4234                 if (CheckIntronDonor(strand, donor)) {
4235                     not_found = false;
4236                 }
4237             }
4238             //
4239             if (not_found) {
4240                 if ((strand == eNa_strand_minus && end5 == seq_len - 1) ||
4241                     (strand == eNa_strand_plus && end5 == 0)) {
4242 
4243                     PostErr(eDiag_Info, eErr_SEQ_FEAT_NotSpliceConsensusDonorTerminalIntron,
4244                             "Splice donor consensus (GT) not found at start of terminal intron, position "
4245                             + NStr::IntToString (end5 + 1) + " of " + label);
4246                 }
4247                 else {
4248                     PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusDonor,
4249                              "Splice donor consensus (GT) not found at start of intron, position "
4250                               + NStr::IntToString (end5 + 1) + " of " + label);
4251                 }
4252             }
4253         }
4254     }
4255 
4256     // Check 3'-most
4257     if (!partial3) {
4258         if (!acceptor_in_gap) {
4259             bool not_found = true;
4260 
4261             if (acceptor_good) {
4262                 if (CheckIntronAcceptor(strand, acceptor)) {
4263                     not_found = false;
4264                 }
4265             }
4266 
4267             if (not_found) {
4268                 if ((strand == eNa_strand_minus && end3 == 0) ||
4269                     (strand == eNa_strand_plus && end3 == seq_len - 1)) {
4270                     PostErr(eDiag_Info, eErr_SEQ_FEAT_NotSpliceConsensusAcceptorTerminalIntron,
4271                               "Splice acceptor consensus (AG) not found at end of terminal intron, position "
4272                               + NStr::IntToString (end3 + 1) + " of " + label + ", but at end of sequence");
4273                 }
4274                 else {
4275                     PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusAcceptor,
4276                              "Splice acceptor consensus (AG) not found at end of intron, position "
4277                               + NStr::IntToString (end3 + 1) + " of " + label);
4278                }
4279             }
4280         }
4281     }
4282 
4283 }
4284 
4285 
x_IsIntronShort(bool pseudo)4286 bool CIntronValidator::x_IsIntronShort(bool pseudo)
4287 {
4288     if (!m_Feat.IsSetData()
4289         || m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_intron
4290         || !m_Feat.IsSetLocation()
4291         || pseudo) {
4292         return false;
4293     }
4294 
4295     const CSeq_loc& loc = m_Feat.GetLocation();
4296     bool is_short = false;
4297 
4298     if (! m_Imp.IsIndexerVersion()) {
4299         if (!m_LocationBioseq || IsOrganelle(m_LocationBioseq)) return is_short;
4300     }
4301 
4302     if (GetLength(loc, &m_Scope) < 11) {
4303         bool partial_left = loc.IsPartialStart(eExtreme_Positional);
4304         bool partial_right = loc.IsPartialStop(eExtreme_Positional);
4305 
4306         CBioseq_Handle bsh;
4307         if (partial_left && loc.GetStart(eExtreme_Positional) == 0) {
4308             // partial at beginning of sequence, ok
4309         } else if (partial_right &&
4310             (m_LocationBioseq) &&
4311                    loc.GetStop(eExtreme_Positional) == (
4312                        m_LocationBioseq.GetBioseqLength() - 1))
4313         {
4314             // partial at end of sequence
4315         } else {
4316             is_short = true;
4317         }
4318     }
4319     return is_short;
4320 }
4321 
4322 
Validate()4323 void CMiscFeatValidator::Validate()
4324 {
4325     CSingleFeatValidator::Validate();
4326     if ((!m_Feat.IsSetComment() || NStr::IsBlank (m_Feat.GetComment()))
4327             && (!m_Feat.IsSetQual() || m_Feat.GetQual().empty())
4328             && (!m_Feat.IsSetDbxref() || m_Feat.GetDbxref().empty())) {
4329         PostErr(eDiag_Warning, eErr_SEQ_FEAT_MiscFeatureNeedsNote,
4330                 "A note or other qualifier is required for a misc_feature");
4331     }
4332     if (m_Feat.IsSetComment() && ! NStr::IsBlank (m_Feat.GetComment())) {
4333         if (NStr::FindWord(m_Feat.GetComment(), "cspA") != NPOS) {
4334             CConstRef<CSeq_feat> cds = GetBestOverlappingFeat(m_Feat.GetLocation(), CSeqFeatData::eSubtype_cdregion, eOverlap_Simple, m_Scope);
4335             if (cds) {
4336                 string content_label;
4337                 feature::GetLabel(*cds, &content_label, feature::fFGL_Content, &m_Scope);
4338                 if (NStr::Equal(content_label, "cold-shock protein")) {
4339                     PostErr(eDiag_Error, eErr_SEQ_FEAT_ColdShockProteinProblem,
4340                             "cspA misc_feature overlapped by cold-shock protein CDS");
4341                 }
4342             }
4343         }
4344     }
4345 
4346 }
4347 
4348 
Validate()4349 void CAssemblyGapValidator::Validate()
4350 {
4351     CSingleFeatValidator::Validate();
4352 
4353     bool is_far_delta = false;
4354     if ( m_LocationBioseq.IsSetInst_Repr()) {
4355         CBioseq_Handle::TInst::TRepr repr = m_LocationBioseq.GetInst_Repr();
4356         if ( repr == CSeq_inst::eRepr_delta ) {
4357             is_far_delta = true;
4358             const CBioseq& seq = *(m_LocationBioseq.GetCompleteBioseq());
4359             const CSeq_inst& inst = seq.GetInst();
4360             ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
4361                 if ( !(*sg) ) continue;
4362                 if (( (**sg).Which() == CDelta_seq::e_Loc )) continue;
4363                 is_far_delta = false;
4364             }
4365         }
4366     }
4367     if (! is_far_delta) {
4368         PostErr(eDiag_Warning, eErr_SEQ_FEAT_AssemblyGapFeatureProblem,
4369                 "An assembly_gap feature should only be on a contig record");
4370     }
4371     if (m_Feat.GetLocation().IsInt()) {
4372         TSeqPos from = m_Feat.GetLocation().GetInt().GetFrom();
4373         TSeqPos to = m_Feat.GetLocation().GetInt().GetTo();
4374         CSeqVector vec = m_LocationBioseq.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
4375         string sequence;
4376         bool is5 = false;
4377         bool is3 = false;
4378         long int count = 0;
4379         vec.GetSeqData(from - 1, from, sequence);
4380         if (NStr::Equal (sequence, "N")) {
4381             is5 = true;
4382         }
4383         vec.GetSeqData(to + 1, to + 2, sequence);
4384         if (NStr::Equal (sequence, "N")) {
4385             is3 = true;
4386         }
4387         EDiagSev sv = eDiag_Warning;
4388         if (m_Imp.IsGenomeSubmission()) {
4389             sv = eDiag_Error;
4390         }
4391         if (is5 && is3) {
4392             PostErr(sv, eErr_SEQ_FEAT_AssemblyGapAdjacentToNs,
4393                     "Assembly_gap flanked by Ns on 5' and 3' sides");
4394         } else if (is5) {
4395             PostErr(sv, eErr_SEQ_FEAT_AssemblyGapAdjacentToNs,
4396                     "Assembly_gap flanked by Ns on 5' side");
4397         } else if (is3) {
4398             PostErr(sv, eErr_SEQ_FEAT_AssemblyGapAdjacentToNs,
4399                     "Assembly_gap flanked by Ns on 3' side");
4400         }
4401         vec.GetSeqData(from, to + 1, sequence);
4402         for (size_t i = 0; i < sequence.size(); i++) {
4403             if (sequence[i] != 'N') {
4404                 count++;
4405             }
4406         }
4407         if (count > 0) {
4408             PostErr(eDiag_Error, eErr_SEQ_FEAT_AssemblyGapCoversSequence, "Assembly_gap extends into sequence");
4409         }
4410     }
4411 }
4412 
4413 
Validate()4414 void CGapFeatValidator::Validate()
4415 {
4416     CSingleFeatValidator::Validate();
4417     int loc_len = GetLength (m_Feat.GetLocation(), &m_Scope);
4418     // look for estimated length qualifier
4419     FOR_EACH_GBQUAL_ON_SEQFEAT (it, m_Feat) {
4420         if ((*it)->IsSetQual() && NStr::EqualNocase ((*it)->GetQual(), "estimated_length")
4421             && (*it)->IsSetVal() && !NStr::EqualNocase ((*it)->GetVal(), "unknown")) {
4422             try {
4423                 int estimated_length = NStr::StringToInt ((*it)->GetVal());
4424                 if (estimated_length != loc_len) {
4425                     PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4426                              "Gap feature estimated_length " + NStr::IntToString (estimated_length)
4427                              + " does not match " + NStr::IntToString (loc_len) + " feature length");
4428                     return;
4429                 }
4430             } catch (CException ) {
4431             } catch (std::exception ) {
4432             }
4433         }
4434     }
4435     try {
4436         string s_data = GetSequenceStringFromLoc(m_Feat.GetLocation(), m_Scope);
4437         CSeqVector vec = GetSequenceFromFeature(m_Feat, m_Scope);
4438         if ( !vec.empty() ) {
4439             string vec_data;
4440             vec.GetSeqData(0, vec.size(), vec_data);
4441             int num_n = 0;
4442             int num_real = 0;
4443             unsigned int num_gap = 0;
4444             int pos = 0;
4445             string::iterator it = vec_data.begin();
4446             while (it != vec_data.end()) {
4447                 if (*it == 'N') {
4448                     if (vec.IsInGap(pos)) {
4449                         // gap not N
4450                         num_gap++;
4451                     } else {
4452                         num_n++;
4453                     }
4454                 } else if (*it != '-') {
4455                     num_real++;
4456                 }
4457                 ++it;
4458                 ++pos;
4459             }
4460             if (num_real > 0 && num_n > 0) {
4461                 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4462                          "Gap feature over " + NStr::IntToString (num_real)
4463                          + " real bases and " + NStr::IntToString (num_n)
4464                          + " Ns");
4465             } else if (num_real > 0) {
4466                 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4467                          "Gap feature over " + NStr::IntToString (num_real)
4468                          + " real bases");
4469             } else if (num_n > 0) {
4470                 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4471                          "Gap feature over " + NStr::IntToString (num_n)
4472                          + " Ns");
4473             } else if (num_gap != GetLength (m_Feat.GetLocation(), &m_Scope)) {
4474                 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4475                          "Gap feature estimated_length " + NStr::IntToString (loc_len)
4476                          + " does not match " + NStr::IntToString (num_gap)
4477                          + " gap characters");
4478             }
4479         }
4480 
4481     } catch (CException ) {
4482     } catch (std::exception ) {
4483     }
4484 }
4485 
4486 
Validate()4487 void CImpFeatValidator::Validate()
4488 {
4489     CSingleFeatValidator::Validate();
4490     CSeqFeatData::ESubtype subtype = m_Feat.GetData().GetSubtype();
4491 
4492     const string& key = m_Feat.GetData().GetImp().GetKey();
4493     if (NStr::IsBlank(key)) {
4494         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatKey,
4495                 "NULL feature key");
4496         return;
4497     }
4498 
4499     if (subtype == CSeqFeatData::eSubtype_imp || subtype == CSeqFeatData::eSubtype_bad) {
4500         if (NStr::Equal(key, "mRNA")) {
4501             subtype = CSeqFeatData::eSubtype_mRNA;
4502         } else if (NStr::Equal(key, "tRNA")) {
4503             subtype = CSeqFeatData::eSubtype_tRNA;
4504         } else if (NStr::Equal(key, "tRNA")) {
4505             subtype = CSeqFeatData::eSubtype_tRNA;
4506         } else if (NStr::Equal(key, "rRNA")) {
4507             subtype = CSeqFeatData::eSubtype_rRNA;
4508         } else if (NStr::Equal(key, "snRNA")) {
4509             subtype = CSeqFeatData::eSubtype_snRNA;
4510         } else if (NStr::Equal(key, "scRNA")) {
4511             subtype = CSeqFeatData::eSubtype_scRNA;
4512         } else if (NStr::Equal(key, "snoRNA")) {
4513             subtype = CSeqFeatData::eSubtype_snoRNA;
4514         } else if (NStr::Equal(key, "misc_RNA")) {
4515             subtype = CSeqFeatData::eSubtype_misc_RNA;
4516         } else if (NStr::Equal(key, "precursor_RNA")) {
4517             subtype = CSeqFeatData::eSubtype_precursor_RNA;
4518         } else if (NStr::EqualNocase (key, "mat_peptide")) {
4519             subtype = CSeqFeatData::eSubtype_mat_peptide;
4520         } else if (NStr::EqualNocase (key, "propeptide")) {
4521             subtype = CSeqFeatData::eSubtype_propeptide;
4522         } else if (NStr::EqualNocase (key, "sig_peptide")) {
4523             subtype = CSeqFeatData::eSubtype_sig_peptide;
4524         } else if (NStr::EqualNocase (key, "transit_peptide")) {
4525             subtype = CSeqFeatData::eSubtype_transit_peptide;
4526         } else if (NStr::EqualNocase (key, "preprotein")
4527             || NStr::EqualNocase(key, "proprotein")) {
4528             subtype = CSeqFeatData::eSubtype_preprotein;
4529         } else if (NStr::EqualNocase (key, "virion")) {
4530             subtype = CSeqFeatData::eSubtype_virion;
4531         } else if (NStr::EqualNocase(key, "mutation")) {
4532             subtype = CSeqFeatData::eSubtype_mutation;
4533         } else if (NStr::EqualNocase(key, "allele")) {
4534             subtype = CSeqFeatData::eSubtype_allele;
4535         } else if (NStr::EqualNocase (key, "CDS")) {
4536             subtype = CSeqFeatData::eSubtype_Imp_CDS;
4537         } else if (NStr::EqualNocase(key, "Import")) {
4538             PostErr(eDiag_Error, eErr_SEQ_FEAT_UnknownImpFeatKey,
4539                     "Feature key Import is no longer legal");
4540             return;
4541         }
4542     }
4543 
4544     switch ( subtype ) {
4545 
4546     case CSeqFeatData::eSubtype_bad:
4547     case CSeqFeatData::eSubtype_site_ref:
4548     case CSeqFeatData::eSubtype_gene:
4549         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatKey,
4550             "Unknown feature key " + key);
4551         break;
4552 
4553     case CSeqFeatData::eSubtype_virion:
4554     case CSeqFeatData::eSubtype_mutation:
4555     case CSeqFeatData::eSubtype_allele:
4556         PostErr(eDiag_Error, eErr_SEQ_FEAT_UnknownImpFeatKey,
4557             "Feature key " + key + " is no longer legal");
4558         break;
4559 
4560 
4561     case CSeqFeatData::eSubtype_preprotein:
4562         if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
4563             const CSeq_loc& loc = m_Feat.GetLocation();
4564             CConstRef<CSeq_feat> cds = GetOverlappingCDS(loc, m_Scope);
4565             PostErr(!cds ? eDiag_Error : eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatureLacksCDS,
4566                 "Pre/pro protein feature cannot be associated with a "
4567                 "protein product of a coding region feature");
4568         } else {
4569             PostErr(m_Imp.IsRefSeq() ? eDiag_Error : eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatureLacksCDS,
4570                 "Peptide processing feature should be converted to the "
4571                 "appropriate protein feature subtype");
4572         }
4573         break;
4574 
4575     case CSeqFeatData::eSubtype_mRNA:
4576     case CSeqFeatData::eSubtype_tRNA:
4577     case CSeqFeatData::eSubtype_rRNA:
4578     case CSeqFeatData::eSubtype_snRNA:
4579     case CSeqFeatData::eSubtype_scRNA:
4580     case CSeqFeatData::eSubtype_snoRNA:
4581     case CSeqFeatData::eSubtype_misc_RNA:
4582     case CSeqFeatData::eSubtype_precursor_RNA:
4583     // !!! what about other RNA types (preRNA, otherRNA)?
4584         PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidRNAFeature,
4585               "RNA feature should be converted to the appropriate RNA feature "
4586               "subtype, location should be converted manually");
4587         break;
4588 
4589     case CSeqFeatData::eSubtype_Imp_CDS:
4590         {
4591             // impfeat CDS must be pseudo; fail if not
4592             bool pseudo = sequence::IsPseudo(m_Feat, m_Scope);
4593             if ( !pseudo ) {
4594                 PostErr(eDiag_Info, eErr_SEQ_FEAT_ImpCDSnotPseudo,
4595                     "ImpFeat CDS should be pseudo");
4596             }
4597 
4598             FOR_EACH_GBQUAL_ON_FEATURE (gbqual, m_Feat) {
4599                 if ( NStr::CompareNocase( (*gbqual)->GetQual(), "translation") == 0 ) {
4600                     PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpCDShasTranslation,
4601                         "ImpFeat CDS with /translation found");
4602                 }
4603             }
4604         }
4605         break;
4606     case CSeqFeatData::eSubtype_imp:
4607         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatKey,
4608             "Unknown feature key " + key);
4609         break;
4610     case CSeqFeatData::eSubtype_repeat_region:
4611         if ((!m_Feat.IsSetComment() || NStr::IsBlank (m_Feat.GetComment()))
4612             && (!m_Feat.IsSetDbxref() || m_Feat.GetDbxref().empty())) {
4613             if (!m_Feat.IsSetQual() || m_Feat.GetQual().empty()) {
4614                PostErr(eDiag_Warning, eErr_SEQ_FEAT_RepeatRegionNeedsNote,
4615                         "repeat_region has no qualifiers");
4616             } else if ( ! m_Imp.IsGenomeSubmission() ) {
4617                 bool okay = false;
4618                 FOR_EACH_GBQUAL_ON_FEATURE (gbqual, m_Feat) {
4619                     if ( ! NStr::EqualNocase((*gbqual)->GetQual(), "rpt_type") ) {
4620                         okay = true;
4621                         break;
4622                     }
4623                     const string& val = (*gbqual)->GetVal();
4624                     if ( ! NStr::Equal (val, "other") ) {
4625                         okay = true;
4626                         break;
4627                     }
4628                 }
4629                 if ( ! okay ) {
4630                    PostErr(eDiag_Warning, eErr_SEQ_FEAT_RepeatRegionNeedsNote,
4631                             "repeat_region has no qualifiers except rpt_type other");
4632                 }
4633             }
4634          }
4635         break;
4636     case CSeqFeatData::eSubtype_regulatory:
4637         {
4638             vector<string> valid_types = CSeqFeatData::GetRegulatoryClassList();
4639             FOR_EACH_GBQUAL_ON_FEATURE (gbqual, m_Feat) {
4640                 if ( NStr::CompareNocase( (*gbqual)->GetQual(), "regulatory_class") != 0 ) continue;
4641                 const string& val = (*gbqual)->GetVal();
4642                 bool missing = true;
4643                 FOR_EACH_STRING_IN_VECTOR (itr, valid_types) {
4644                     string str = *itr;
4645                     if ( NStr::Equal (str, val) ) {
4646                         missing = false;
4647                     }
4648                 }
4649                 if ( missing ) {
4650                     if ( NStr::Equal (val, "other") && !m_Feat.IsSetComment() ) {
4651                         PostErr(eDiag_Error, eErr_SEQ_FEAT_RegulatoryClassOtherNeedsNote,
4652                             "The regulatory_class 'other' is missing the required /note");
4653                     }
4654                 }
4655             }
4656         }
4657         break;
4658     case CSeqFeatData::eSubtype_misc_recomb:
4659         {
4660             const CGb_qual::TLegalRecombinationClassSet recomb_values = CGb_qual::GetSetOfLegalRecombinationClassValues();
4661             FOR_EACH_GBQUAL_ON_FEATURE (gbqual, m_Feat) {
4662                 if ( NStr::CompareNocase( (*gbqual)->GetQual(), "recombination_class") != 0 ) continue;
4663                 const string& val = (*gbqual)->GetVal();
4664                if ( recomb_values.find(val.c_str()) == recomb_values.end() ) {
4665                     if ( NStr::Equal (val, "other")) {
4666                         if (!m_Feat.IsSetComment()) {
4667                             PostErr(eDiag_Error, eErr_SEQ_FEAT_RecombinationClassOtherNeedsNote,
4668                                 "The recombination_class 'other' is missing the required /note");
4669                         }
4670                     } else {
4671                         // Removed per VR-770. FlatFile will automatically
4672                         // display the unrecognized recombination_class value
4673                         // in the note and list the recombination_class as other
4674 //                        PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidQualifierValue,
4675 //                            "'" + val + "' is not a legal value for recombination_class", feat);
4676                     }
4677                 }
4678             }
4679         }
4680         break;
4681     default:
4682         break;
4683     }// end of switch statement
4684 }
4685 
4686 
FeatValidatorFactory(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)4687 CSingleFeatValidator* FeatValidatorFactory(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp)
4688 {
4689     if (!feat.IsSetData()) {
4690         return new CSingleFeatValidator(feat, scope, imp);
4691     } else if (feat.GetData().IsCdregion()) {
4692         return new CCdregionValidator(feat, scope, imp);
4693     } else if (feat.GetData().IsGene()) {
4694         return new CGeneValidator(feat, scope, imp);
4695     } else if (feat.GetData().IsProt()) {
4696         return new CProtValidator(feat, scope, imp);
4697     } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
4698         return new CMRNAValidator(feat, scope, imp);
4699     } else if (feat.GetData().IsRna()) {
4700         return new CRNAValidator(feat, scope, imp);
4701     } else if (feat.GetData().IsPub()) {
4702         return new CPubFeatValidator(feat, scope, imp);
4703     } else if (feat.GetData().IsBiosrc()) {
4704         return new CSrcFeatValidator(feat, scope, imp);
4705     } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_exon) {
4706         return new CExonValidator(feat, scope, imp);
4707     } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_intron) {
4708         return new CIntronValidator(feat, scope, imp);
4709     } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
4710         return new CMiscFeatValidator(feat, scope, imp);
4711     } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_assembly_gap) {
4712         return new CAssemblyGapValidator(feat, scope, imp);
4713     } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_polyA_site) {
4714         return new CPolyASiteValidator(feat, scope, imp);
4715     }
4716     else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_polyA_signal) {
4717         return new CPolyASignalValidator(feat, scope, imp);
4718     } else if (feat.GetData().IsImp()) {
4719         CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
4720         switch (subtype) {
4721             case CSeqFeatData::eSubtype_mat_peptide:
4722             case CSeqFeatData::eSubtype_propeptide:
4723             case CSeqFeatData::eSubtype_sig_peptide:
4724             case CSeqFeatData::eSubtype_transit_peptide:
4725                 return new CPeptideValidator(feat, scope, imp);
4726                 break;
4727             case CSeqFeatData::eSubtype_gap:
4728                 return new CGapFeatValidator(feat, scope, imp);
4729                 break;
4730             default:
4731                 return new CImpFeatValidator(feat, scope, imp);
4732                 break;
4733         }
4734     } else {
4735         return new CSingleFeatValidator(feat, scope, imp);
4736     }
4737 }
4738 
4739 
4740 
4741 
4742 END_SCOPE(validator)
4743 END_SCOPE(objects)
4744 END_NCBI_SCOPE
4745