1 /* $Id: single_feat_validator.cpp 632625 2021-06-03 17:38:33Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27 *
28 * File Description:
29 * validation of Seq_feat
30 * .......
31 *
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <objtools/validator/validatorp.hpp>
37 #include <objtools/validator/single_feat_validator.hpp>
38 #include <objtools/validator/validerror_feat.hpp>
39 #include <objtools/validator/utilities.hpp>
40 #include <objtools/validator/go_term_validation_and_cleanup.hpp>
41 #include <objtools/format/items/gene_finder.hpp>
42
43 #include <serial/serialbase.hpp>
44
45 #include <objmgr/seqdesc_ci.hpp>
46 #include <objects/seq/seqport_util.hpp>
47 #include <util/sequtil/sequtil_convert.hpp>
48 #include <util/sgml_entity.hpp>
49
50
51 BEGIN_NCBI_SCOPE
52 BEGIN_SCOPE(objects)
53 BEGIN_SCOPE(validator)
54 using namespace sequence;
55
CSingleFeatValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)56 CSingleFeatValidator::CSingleFeatValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp)
57 : m_Feat(feat), m_Scope(scope), m_Imp(imp), m_ProductIsFar(false)
58 {
59
60 }
61
62
Validate()63 void CSingleFeatValidator::Validate()
64 {
65 if (!m_Feat.IsSetLocation()) {
66 PostErr(eDiag_Critical, eErr_SEQ_FEAT_MissingLocation,
67 "The feature is missing a location");
68 return;
69 }
70
71 m_LocationBioseq = x_GetBioseqByLocation(m_Feat.GetLocation());
72 bool lowerSev = false;
73 if (m_Imp.IsRefSeq() && m_Feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_variation) {
74 if ( m_Feat.IsSetDbxref() ) {
75 ITERATE( CSeq_feat::TDbxref, it, m_Feat.GetDbxref() ) {
76 const CDbtag& dbtag = **it;
77 if ( dbtag.GetDb() == "dbSNP" ) {
78 lowerSev = true;
79 }
80 }
81 }
82 }
83 m_Imp.ValidateSeqLoc(m_Feat.GetLocation(), m_LocationBioseq,
84 (m_Feat.GetData().IsGene() || !m_Imp.IsGpipe()),
85 "Location", m_Feat, lowerSev);
86
87 x_ValidateSeqFeatLoc();
88 x_ValidateImpFeatLoc();
89
90 if (m_Feat.IsSetProduct()) {
91 m_ProductBioseq = x_GetFeatureProduct(m_ProductIsFar);
92 if (m_ProductBioseq == m_LocationBioseq) {
93 PostErr(eDiag_Error, eErr_SEQ_FEAT_SelfReferentialProduct, "Self-referential feature product");
94 }
95 x_ValidateSeqFeatProduct();
96 }
97
98 x_ValidateFeatPartialness();
99
100 x_ValidateBothStrands();
101
102 if (m_Feat.CanGetDbxref()) {
103 m_Imp.ValidateDbxref(m_Feat.GetDbxref(), m_Feat);
104 x_ValidateGeneId();
105 }
106
107 x_ValidateFeatComment();
108
109 x_ValidateFeatCit();
110
111 if (m_Feat.IsSetQual()) {
112 for (auto it = m_Feat.GetQual().begin(); it != m_Feat.GetQual().end(); it++) {
113 x_ValidateGbQual(**it);
114 }
115 }
116
117 x_ValidateExtUserObject();
118
119 if (m_Feat.IsSetExp_ev() && m_Feat.GetExp_ev() > 0 &&
120 !x_HasNamedQual("inference") &&
121 !x_HasNamedQual("experiment") &&
122 !m_Imp.DoesAnyFeatLocHaveGI()) {
123 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidInferenceValue,
124 "Inference or experiment qualifier missing but obsolete experimental evidence qualifier set");
125 }
126
127 x_ValidateExcept();
128
129 x_ValidateGbquals();
130 x_ValidateImpFeatQuals();
131
132 x_ValidateSeqFeatDataType();
133
134 x_ValidateNonImpFeat();
135
136 x_ValidateNonGene();
137
138 x_CheckForNonAsciiCharacters();
139 }
140
PostErr(EDiagSev sv,EErrType et,const string & msg)141 void CSingleFeatValidator::PostErr(EDiagSev sv, EErrType et, const string& msg)
142 {
143 m_Imp.PostErr(sv, et, msg, m_Feat);
144 }
145
146
147 CBioseq_Handle
x_GetBioseqByLocation(const CSeq_loc & loc)148 CSingleFeatValidator::x_GetBioseqByLocation(const CSeq_loc& loc)
149 {
150 if (loc.IsInt() || loc.IsWhole()) {
151 return m_Scope.GetBioseqHandle(loc);
152 }
153 CBioseq_Handle rval;
154 CConstRef<CSeq_id> prev(NULL);
155 for (CSeq_loc_CI citer(loc); citer; ++citer) {
156 const CSeq_id& this_id = citer.GetSeq_id();
157 if (!prev || !prev->Equals(this_id)) {
158 rval = m_Scope.GetBioseqHandle(this_id);
159 if (rval) {
160 break;
161 }
162 prev.Reset(&this_id);
163 }
164 }
165 return rval;
166 }
167
168
x_ValidateSeqFeatProduct()169 void CSingleFeatValidator::x_ValidateSeqFeatProduct()
170 {
171 if (!m_Feat.IsSetProduct()) {
172 return;
173 }
174 const CSeq_id& sid = GetId(m_Feat.GetProduct(), &m_Scope);
175
176 switch (sid.Which()) {
177 case CSeq_id::e_Genbank:
178 case CSeq_id::e_Embl:
179 case CSeq_id::e_Ddbj:
180 case CSeq_id::e_Tpg:
181 case CSeq_id::e_Tpe:
182 case CSeq_id::e_Tpd:
183 {
184 const CTextseq_id* tsid = sid.GetTextseq_Id();
185 if (tsid != NULL) {
186 if (!tsid->CanGetAccession() && tsid->CanGetName()) {
187 if (ValidateAccessionString(tsid->GetName(), false) == eAccessionFormat_valid) {
188 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
189 "Feature product should not put an accession in the Textseq-id 'name' slot");
190 } else {
191 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
192 "Feature product should not use "
193 "Textseq-id 'name' slot");
194 }
195 }
196 }
197 }
198 break;
199
200 default:
201 break;
202 }
203
204 if (m_ProductBioseq) {
205 m_Imp.ValidateSeqLoc(m_Feat.GetProduct(), m_ProductBioseq, true, "Product", m_Feat);
206
207 for (auto id : m_ProductBioseq.GetCompleteBioseq()->GetId()) {
208 if (id->Which() == sid.Which()) {
209 // check to make sure capitalization is the same
210 string from_seq = id->AsFastaString();
211 string from_loc = sid.AsFastaString();
212 if (!NStr::EqualCase(from_seq, from_loc) &&
213 NStr::EqualNocase(from_seq, from_loc)) {
214 PostErr(eDiag_Critical, eErr_SEQ_FEAT_BadProductSeqId,
215 "Capitalization change from product location on feature to product sequence");
216 }
217 }
218 switch (id->Which()) {
219 case CSeq_id::e_Genbank:
220 case CSeq_id::e_Embl:
221 case CSeq_id::e_Ddbj:
222 case CSeq_id::e_Tpg:
223 case CSeq_id::e_Tpe:
224 case CSeq_id::e_Tpd:
225 {
226 const CTextseq_id* tsid = id->GetTextseq_Id();
227 if (tsid != NULL) {
228 if (!tsid->IsSetAccession() && tsid->IsSetName()) {
229 if (ValidateAccessionString(tsid->GetName(), false) == eAccessionFormat_valid) {
230 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
231 "Protein bioseq has Textseq-id 'name' that "
232 "looks like it is derived from a nucleotide "
233 "accession");
234 } else {
235 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
236 "Protein bioseq has Textseq-id 'name' and no accession");
237 }
238 }
239 }
240 }
241 break;
242 default:
243 break;
244 }
245 }
246 }
247 }
248
249
x_HasSeqLocBond(const CSeq_feat & feat)250 bool CSingleFeatValidator::x_HasSeqLocBond(const CSeq_feat& feat)
251 {
252 // check for bond locations - only allowable in bond feature and under special circumstances for het
253 bool is_seqloc_bond = false;
254 if (feat.IsSetData()) {
255 if (feat.GetData().IsHet()) {
256 // heterogen can have mix of bonds with just "a" point specified */
257 for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
258 if (it.GetEmbeddingSeq_loc().IsBond()
259 && (!it.GetEmbeddingSeq_loc().GetBond().IsSetA()
260 || it.GetEmbeddingSeq_loc().GetBond().IsSetB())) {
261 is_seqloc_bond = true;
262 break;
263 }
264 }
265 } else if (!feat.GetData().IsBond()) {
266 for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
267 if (it.GetEmbeddingSeq_loc().IsBond()) {
268 is_seqloc_bond = true;
269 break;
270 }
271 }
272 }
273 } else {
274 for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
275 if (it.GetEmbeddingSeq_loc().IsBond()) {
276 is_seqloc_bond = true;
277 break;
278 }
279 }
280 }
281 return is_seqloc_bond;
282 }
283
284
x_ValidateBothStrands()285 void CSingleFeatValidator::x_ValidateBothStrands()
286 {
287 if (!m_Feat.IsSetLocation() || CSeqFeatData::AllowStrandBoth(m_Feat.GetData().GetSubtype())) {
288 return;
289 }
290 bool both, both_rev;
291 x_LocHasStrandBoth(m_Feat.GetLocation(), both, both_rev);
292 if (both || both_rev) {
293 string suffix;
294 if (both && both_rev) {
295 suffix = "(forward and reverse)";
296 } else if (both) {
297 suffix = "(forward)";
298 } else if (both_rev) {
299 suffix = "(reverse)";
300 }
301
302 string label = CSeqFeatData::SubtypeValueToName(m_Feat.GetData().GetSubtype());
303
304 PostErr(eDiag_Error, eErr_SEQ_FEAT_BothStrands,
305 label + " may not be on both " + suffix + " strands");
306 }
307 }
308
309
x_LocHasStrandBoth(const CSeq_loc & loc,bool & both,bool & both_rev)310 void CSingleFeatValidator::x_LocHasStrandBoth(const CSeq_loc& loc, bool& both, bool& both_rev)
311 {
312 both = false;
313 both_rev = false;
314 for (CSeq_loc_CI it(loc); it; ++it) {
315 if (it.IsSetStrand()) {
316 ENa_strand s = it.GetStrand();
317 if (s == eNa_strand_both && !both) {
318 both = true;
319 } else if (s == eNa_strand_both_rev && !both_rev) {
320 both_rev = true;
321 }
322 }
323 if (both && both_rev) {
324 break;
325 }
326 }
327 }
328
329
HasGeneIdXref(const CMappedFeat & sf,const CObject_id & tag,bool & has_parent_gene_id)330 bool HasGeneIdXref(const CMappedFeat& sf, const CObject_id& tag, bool& has_parent_gene_id)
331 {
332 has_parent_gene_id = false;
333 if (!sf.IsSetDbxref()) {
334 return false;
335 }
336 ITERATE(CSeq_feat::TDbxref, it, sf.GetDbxref()) {
337 if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "GeneID")) {
338 has_parent_gene_id = true;
339 if ((*it)->IsSetTag() && (*it)->GetTag().Equals(tag)) {
340 return true;
341 }
342 }
343 }
344 return false;
345 }
346
347
x_ValidateGeneId()348 void CSingleFeatValidator::x_ValidateGeneId()
349 {
350 if (!m_Feat.IsSetDbxref()) {
351 return;
352 }
353
354 // no tse, no feat-handle
355 auto tse = m_Imp.GetTSE_Handle();
356 if (!tse) {
357 return;
358 }
359
360 CRef<feature::CFeatTree> feat_tree(NULL);
361 CMappedFeat mf = m_Scope.GetSeq_featHandle(m_Feat);
362 ITERATE(CSeq_feat::TDbxref, it, m_Feat.GetDbxref()) {
363 if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "GeneID") &&
364 (*it)->IsSetTag()) {
365 if (!feat_tree) {
366 feat_tree = m_Imp.GetGeneCache().GetFeatTreeFromCache(m_Feat, m_Scope);
367 }
368 if (feat_tree) {
369 CMappedFeat parent = feat_tree->GetParent(mf);
370 while (parent) {
371 bool has_parent_gene_id = false;
372 if (!HasGeneIdXref(parent, (*it)->GetTag(), has_parent_gene_id)) {
373 if (has_parent_gene_id ||
374 parent.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene) {
375 PostErr(eDiag_Error, eErr_SEQ_FEAT_GeneIdMismatch,
376 "GeneID mismatch");
377 }
378 }
379 parent = feat_tree->GetParent(parent);
380 }
381 }
382 }
383 }
384
385 }
386
387
x_ValidateFeatCit()388 void CSingleFeatValidator::x_ValidateFeatCit()
389 {
390 if (!m_Feat.IsSetCit()) {
391 return;
392 }
393
394 if (m_Feat.GetCit().IsPub()) {
395 ITERATE(CPub_set::TPub, pi, m_Feat.GetCit().GetPub()) {
396 if ((*pi)->IsEquiv()) {
397 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryCitPubEquiv,
398 "Citation on feature has unexpected internal Pub-equiv");
399 return;
400 }
401 }
402 }
403 }
404
405
406 const string kInferenceMessage[] = {
407 "unknown error",
408 "empty inference string",
409 "bad inference prefix",
410 "bad inference body",
411 "single inference field",
412 "spaces in inference",
413 "possible comment in inference",
414 "same species misused",
415 "the value in the accession field is not legal. The only allowed value is accession.version, eg AF123456.1. Problem =",
416 "bad inference accession version",
417 "accession.version not public",
418 "bad accession type",
419 "unrecognized database",
420 };
421
422
x_ValidateGbQual(const CGb_qual & qual)423 void CSingleFeatValidator::x_ValidateGbQual(const CGb_qual& qual)
424 {
425 if (!qual.IsSetQual()) {
426 return;
427 }
428 /* first check for anything other than replace */
429 if (!qual.IsSetVal() || NStr::IsBlank(qual.GetVal())) {
430 if (NStr::EqualNocase(qual.GetQual(), "replace")) {
431 /* ok for replace */
432 } else {
433 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidPunctuation,
434 "Qualifier other than replace has just quotation marks");
435 if (NStr::EqualNocase(qual.GetQual(), "EC_number")) {
436 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberEmpty, "EC number should not be empty");
437 }
438 }
439 if (NStr::EqualNocase(qual.GetQual(), "inference")) {
440 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidInferenceValue,
441 "Inference qualifier problem - empty inference string ()");
442 } else if (NStr::EqualNocase(qual.GetQual(), "pseudogene")) {
443 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidPseudoQualifier, "/pseudogene value should not be empty");
444 }
445 } else if (NStr::EqualNocase(qual.GetQual(), "EC_number")) {
446 if (!CProt_ref::IsValidECNumberFormat(qual.GetVal())) {
447 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadEcNumberFormat,
448 qual.GetVal() + " is not in proper EC_number format");
449 } else {
450 string ec_number = qual.GetVal();
451 CProt_ref::EECNumberStatus status = CProt_ref::GetECNumberStatus(ec_number);
452 x_ReportECNumFileStatus();
453 switch (status) {
454 case CProt_ref::eEC_deleted:
455 PostErr(eDiag_Warning, eErr_SEQ_FEAT_DeletedEcNumber,
456 "EC_number " + ec_number + " was deleted");
457 break;
458 case CProt_ref::eEC_replaced:
459 PostErr(eDiag_Warning,
460 CProt_ref::IsECNumberSplit(ec_number) ? eErr_SEQ_FEAT_SplitEcNumber : eErr_SEQ_FEAT_ReplacedEcNumber,
461 "EC_number " + ec_number + " was replaced");
462 break;
463 case CProt_ref::eEC_unknown:
464 {
465 size_t pos = NStr::Find(ec_number, "n");
466 if (pos == string::npos || !isdigit(ec_number.c_str()[pos + 1])) {
467 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadEcNumberValue,
468 ec_number + " is not a legal value for qualifier EC_number");
469 } else {
470 PostErr(eDiag_Info, eErr_SEQ_FEAT_BadEcNumberValue,
471 ec_number + " is not a legal preliminary value for qualifier EC_number");
472 }
473 }
474 break;
475 default:
476 break;
477 }
478 }
479 } else if (NStr::EqualNocase(qual.GetQual(), "inference")) {
480 /* TODO: Validate inference */
481 string val;
482 if (qual.IsSetVal()) {
483 val = qual.GetVal();
484 }
485 CValidError_feat::EInferenceValidCode rsult = CValidError_feat::ValidateInference(val, m_Imp.ValidateInferenceAccessions());
486 if (rsult > CValidError_feat::eInferenceValidCode_valid) {
487 if (NStr::IsBlank(val)) {
488 val = "?";
489 }
490 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidInferenceValue,
491 "Inference qualifier problem - " + kInferenceMessage[(int)rsult] + " ("
492 + val + ")");
493 }
494 } else if (NStr::EqualNocase(qual.GetQual(), "pseudogene")) {
495 m_Imp.IncrementPseudogeneCount();
496 if (!CGb_qual::IsValidPseudogeneValue(qual.GetVal())) {
497 m_Imp.PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidPseudoQualifier,
498 "/pseudogene value should not be '" + qual.GetVal() + "'", m_Feat);
499 }
500 } else if (NStr::EqualNocase(qual.GetQual(), "number")) {
501 bool has_space = false;
502 bool has_char_after_space = false;
503 ITERATE(string, it, qual.GetVal()) {
504 if (isspace((unsigned char)(*it))) {
505 has_space = true;
506 } else if (has_space) {
507 // non-space after space
508 has_char_after_space = true;
509 break;
510 }
511 }
512 if (has_char_after_space) {
513 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidNumberQualifier,
514 "Number qualifiers should not contain spaces");
515 }
516 }
517 if (qual.IsSetVal() && ContainsSgml(qual.GetVal())) {
518 PostErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
519 "feature qualifier " + qual.GetVal() + " has SGML");
520 }
521
522 }
523
524
x_ReportECNumFileStatus()525 void CSingleFeatValidator::x_ReportECNumFileStatus()
526 {
527 static bool file_status_reported = false;
528
529 if (!file_status_reported) {
530 if (CProt_ref::GetECNumAmbiguousStatus() == CProt_ref::eECFile_not_found) {
531 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberDataMissing,
532 "Unable to find EC number file 'ecnum_ambiguous.txt' in data directory");
533 }
534 if (CProt_ref::GetECNumDeletedStatus() == CProt_ref::eECFile_not_found) {
535 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberDataMissing,
536 "Unable to find EC number file 'ecnum_deleted.txt' in data directory");
537 }
538 if (CProt_ref::GetECNumReplacedStatus() == CProt_ref::eECFile_not_found) {
539 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberDataMissing,
540 "Unable to find EC number file 'ecnum_replaced.txt' in data directory");
541 }
542 if (CProt_ref::GetECNumSpecificStatus() == CProt_ref::eECFile_not_found) {
543 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberDataMissing,
544 "Unable to find EC number file 'ecnum_specific.txt' in data directory");
545 }
546 file_status_reported = true;
547 }
548 }
549
550
x_ValidateExtUserObject()551 void CSingleFeatValidator::x_ValidateExtUserObject()
552 {
553 vector<TGoTermError> errors = GetGoTermErrors(m_Feat);
554 for (auto it : errors) {
555 PostErr(it.first == eErr_SEQ_FEAT_DuplicateGeneOntologyTerm ? eDiag_Info : eDiag_Warning,
556 it.first, it.second);
557 }
558 }
559
560
x_HasNamedQual(const string & qual_name)561 bool CSingleFeatValidator::x_HasNamedQual(const string& qual_name)
562 {
563 if (!m_Feat.IsSetQual()) {
564 return false;
565 }
566 ITERATE(CSeq_feat::TQual, it, m_Feat.GetQual()) {
567 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), qual_name)) {
568 return true;
569 }
570 }
571 return false;
572 }
573
574
x_ValidateFeatComment()575 void CSingleFeatValidator::x_ValidateFeatComment()
576 {
577 if (!m_Feat.IsSetComment()) {
578 return;
579 }
580 const string& comment = m_Feat.GetComment();
581 if (m_Imp.IsSerialNumberInComment(comment)) {
582 m_Imp.PostErr(eDiag_Info, eErr_SEQ_FEAT_SerialInComment,
583 "Feature comment may refer to reference by serial number - "
584 "attach reference specific comments to the reference "
585 "REMARK instead.", m_Feat);
586 }
587 if (ContainsSgml(comment)) {
588 m_Imp.PostErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
589 "feature comment " + comment + " has SGML",
590 m_Feat);
591 }
592 }
593
594
x_ValidateFeatPartialness()595 void CSingleFeatValidator::x_ValidateFeatPartialness()
596 {
597 unsigned int partial_prod = eSeqlocPartial_Complete,
598 partial_loc = eSeqlocPartial_Complete;
599
600 bool is_partial = m_Feat.IsSetPartial() && m_Feat.GetPartial();
601 partial_loc = SeqLocPartialCheck(m_Feat.GetLocation(), &m_Scope);
602
603 if (m_ProductBioseq) {
604 partial_prod = SeqLocPartialCheck(m_Feat.GetProduct(), &(m_ProductBioseq.GetScope()));
605 }
606
607 if ((partial_loc != eSeqlocPartial_Complete) ||
608 (partial_prod != eSeqlocPartial_Complete) ||
609 is_partial) {
610
611 // a feature on a partial sequence should be partial -- it often isn't
612 if (!is_partial &&
613 partial_loc != eSeqlocPartial_Complete &&
614 m_Feat.IsSetLocation() &&
615 m_Feat.GetLocation().IsWhole()) {
616 PostErr(eDiag_Info, eErr_SEQ_FEAT_PartialProblem,
617 "On partial Bioseq, SeqFeat.partial should be TRUE");
618 }
619 // a partial feature, with complete location, but partial product
620 else if (is_partial &&
621 partial_loc == eSeqlocPartial_Complete &&
622 m_Feat.IsSetProduct() &&
623 m_Feat.GetProduct().IsWhole() &&
624 partial_prod != eSeqlocPartial_Complete) {
625 if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
626 // suppress in gpipe genomic
627 } else {
628 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
629 "When SeqFeat.product is a partial Bioseq, SeqFeat.location "
630 "should also be partial");
631 }
632 }
633 // gene on segmented set is now 'order', should also be partial
634 else if (m_Feat.GetData().IsGene() &&
635 !is_partial &&
636 partial_loc == eSeqlocPartial_Internal) {
637 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
638 "Gene of 'order' with otherwise complete location should "
639 "have partial flag set");
640 }
641 // inconsistent combination of partial/complete product,location,partial flag - part 1
642 else if (partial_prod == eSeqlocPartial_Complete && m_Feat.IsSetProduct()) {
643 // if not local bioseq product, lower severity
644 EDiagSev sev = eDiag_Warning;
645 bool is_far_fail = false;
646 if (!m_ProductBioseq || m_ProductIsFar) {
647 sev = eDiag_Info;
648 if (!m_ProductBioseq && m_Imp.x_IsFarFetchFailure(m_Feat.GetProduct())) {
649 is_far_fail = true;
650 }
651 }
652
653 string str("Inconsistent: Product= complete, Location= ");
654 str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
655 str += "Feature.partial= ";
656 str += is_partial ? "TRUE" : "FALSE";
657 if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
658 // suppress for genomic gpipe
659 } else if (is_far_fail) {
660 m_Imp.SetFarFetchFailure();
661 } else {
662 PostErr(sev, eErr_SEQ_FEAT_PartialsInconsistent, str);
663 }
664 }
665 // inconsistent combination of partial/complete product,location,partial flag - part 2
666 else if (partial_loc == eSeqlocPartial_Complete || !is_partial) {
667 string str("Inconsistent: ");
668 if (m_Feat.IsSetProduct()) {
669 str += "Product= ";
670 str += (partial_prod != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
671 }
672 str += "Location= ";
673 str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
674 str += "Feature.partial= ";
675 str += is_partial ? "TRUE" : "FALSE";
676 if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
677 // suppress for genomic gpipe
678 } else {
679 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialsInconsistent, str);
680 }
681 }
682 // 5' or 3' partial location giving unclassified partial product
683 else if ((((partial_loc & eSeqlocPartial_Start) != 0) ||
684 ((partial_loc & eSeqlocPartial_Stop) != 0)) &&
685 ((partial_prod & eSeqlocPartial_Other) != 0) &&
686 is_partial) {
687 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
688 "5' or 3' partial location should not have unclassified"
689 " partial in product molinfo descriptor");
690 }
691
692 // note - in analogous C Toolkit function there is additional code for ensuring
693 // that partial intervals are partial at splice sites, gaps, or the ends of the
694 // sequence. This has been moved to CValidError_bioseq::ValidateFeatPartialInContext.
695 }
696
697 }
698
699
x_ValidateSeqFeatLoc()700 void CSingleFeatValidator::x_ValidateSeqFeatLoc()
701 {
702 if (x_HasSeqLocBond(m_Feat)) {
703 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ImproperBondLocation,
704 "Bond location should only be on bond features");
705 }
706
707 // feature location should not be whole
708 if (m_Feat.GetLocation().IsWhole()) {
709 string prefix = "Feature";
710 if (m_Feat.IsSetData()) {
711 if (m_Feat.GetData().IsCdregion()) {
712 prefix = "CDS";
713 } else if (m_Feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
714 prefix = "mRNA";
715 }
716 }
717 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WholeLocation, prefix + " may not have whole location");
718 }
719
720 if (m_LocationBioseq) {
721 // look for mismatch in capitalization for IDs
722 CNcbiOstrstream os;
723 const CSeq_id *id = m_Feat.GetLocation().GetId();
724 if (id) {
725 id->WriteAsFasta(os);
726 string loc_id = CNcbiOstrstreamToString(os);
727 FOR_EACH_SEQID_ON_BIOSEQ(it, *(m_LocationBioseq.GetCompleteBioseq())) {
728 if ((*it)->IsGi() || (*it)->IsGibbsq() || (*it)->IsGibbmt()) {
729 continue;
730 }
731 CNcbiOstrstream os2;
732 (*it)->WriteAsFasta(os2);
733 string bs_id = CNcbiOstrstreamToString(os2);
734 if (NStr::EqualNocase(loc_id, bs_id) && !NStr::EqualCase(loc_id, bs_id)) {
735 PostErr(eDiag_Error, eErr_SEQ_FEAT_FeatureSeqIDCaseDifference,
736 "Sequence identifier in feature location differs in capitalization with identifier on Bioseq");
737 }
738 }
739 }
740 // look for protein features on the minus strand
741 if (m_LocationBioseq.IsAa() && m_Feat.GetLocation().IsSetStrand() &&
742 m_Feat.GetLocation().GetStrand() == eNa_strand_minus) {
743 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MinusStrandProtein,
744 "Feature on protein indicates negative strand");
745 }
746
747 if (!m_Feat.GetData().IsImp()
748 || !m_Feat.GetData().GetImp().IsSetKey()
749 || !NStr::EqualNocase(m_Feat.GetData().GetImp().GetKey(), "gap")) {
750 try {
751 vector<TSeqPos> gap_starts;
752 size_t rval = x_CalculateLocationGaps(m_LocationBioseq, m_Feat.GetLocation(), gap_starts);
753 bool mostly_raw_ns = x_IsMostlyNs(m_Feat.GetLocation(), m_LocationBioseq);
754
755 if ((rval & eLocationGapMostlyNs) || mostly_raw_ns) {
756 PostErr(eDiag_Info, eErr_SEQ_FEAT_FeatureIsMostlyNs,
757 "Feature contains more than 50% Ns");
758 }
759 for (auto gap_start : gap_starts) {
760 PostErr(eDiag_Warning, eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap,
761 "Feature begins or ends in gap starting at " + NStr::NumericToString(gap_start + 1));
762 }
763 if (rval & eLocationGapContainedInGap &&
764 (!(rval & eLocationGapFeatureMatchesGap) || !x_AllowFeatureToMatchGapExactly())) {
765 PostErr(eDiag_Warning, eErr_SEQ_FEAT_FeatureInsideGap,
766 "Feature inside sequence gap");
767 }
768 if (m_Feat.GetData().IsCdregion() || m_Feat.GetData().IsRna()) {
769 if (rval & eLocationGapInternalIntervalEndpointInGap) {
770 PostErr(eDiag_Info, eErr_SEQ_FEAT_IntervalBeginsOrEndsInGap,
771 "Internal interval begins or ends in gap");
772 }
773 if (rval & eLocationGapCrossesUnknownGap) {
774 PostErr(eDiag_Warning, eErr_SEQ_FEAT_FeatureCrossesGap,
775 "Feature crosses gap of unknown length");
776 }
777 }
778 } catch (CException &e) {
779 PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
780 string("Exception while checking for intervals in gaps. EXCEPTION: ") +
781 e.what());
782 } catch (std::exception) {
783 }
784 }
785 }
786
787 }
788
789
x_AllowFeatureToMatchGapExactly()790 bool CSingleFeatValidator::x_AllowFeatureToMatchGapExactly()
791 {
792 if (m_Feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature ||
793 m_Feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_assembly_gap) {
794 return true;
795 } else {
796 return false;
797 }
798 }
799
800
801 class CGapCache {
802 public:
803 CGapCache(const CSeq_loc& loc, CBioseq_Handle bsh);
~CGapCache()804 ~CGapCache() {};
805 bool IsUnknownGap(size_t offset);
806 bool IsKnownGap(size_t offset);
807 bool IsGap(size_t offset);
808
809 private:
810 typedef enum {
811 eGapType_unknown = 0,
812 eGapType_known
813 } EGapType;
814 typedef map <size_t, EGapType> TGapTypeMap;
815 TGapTypeMap m_Map;
816 size_t m_NumUnknown;
817 size_t m_NumKnown;
818 };
819
CGapCache(const CSeq_loc & loc,CBioseq_Handle bsh)820 CGapCache::CGapCache(const CSeq_loc& loc, CBioseq_Handle bsh)
821 {
822 TSeqPos start = loc.GetStart(eExtreme_Positional);
823 TSeqPos stop = loc.GetStop(eExtreme_Positional);
824 CRange<TSeqPos> range(start, stop);
825 CSeqMap_CI map_iter(bsh, SSeqMapSelector(CSeqMap::fDefaultFlags, 100), range);
826 TSeqPos pos = start;
827 while (map_iter && pos <= stop) {
828 TSeqPos map_end = map_iter.GetPosition() + map_iter.GetLength();
829 if (map_iter.GetType() == CSeqMap::eSeqGap) {
830 for (; pos < map_end && pos <= stop; pos++) {
831 if (map_iter.IsUnknownLength()) {
832 m_Map[pos - start] = eGapType_unknown;
833 m_NumUnknown++;
834 } else {
835 m_Map[pos - start] = eGapType_known;
836 m_NumKnown++;
837 }
838 }
839 } else {
840 pos = map_end;
841 }
842 ++map_iter;
843 }
844 }
845
IsGap(size_t pos)846 bool CGapCache::IsGap(size_t pos)
847 {
848 if (m_Map.find(pos) != m_Map.end()) {
849 return true;
850 } else {
851 return false;
852 }
853 }
854
855
IsKnownGap(size_t pos)856 bool CGapCache::IsKnownGap(size_t pos)
857 {
858 TGapTypeMap::iterator it = m_Map.find(pos);
859 if (it == m_Map.end()) {
860 return false;
861 } else if (it->second == eGapType_known) {
862 return true;
863 } else {
864 return false;
865 }
866 }
867
868
IsUnknownGap(size_t pos)869 bool CGapCache::IsUnknownGap(size_t pos)
870 {
871 TGapTypeMap::iterator it = m_Map.find(pos);
872 if (it == m_Map.end()) {
873 return false;
874 } else if (it->second == eGapType_unknown) {
875 return true;
876 } else {
877 return false;
878 }
879 }
880
881
xf_IsDeltaLitOnly(CBioseq_Handle bsh)882 static bool xf_IsDeltaLitOnly (CBioseq_Handle bsh)
883
884 {
885 if ( bsh.IsSetInst_Ext() ) {
886 const CBioseq_Handle::TInst_Ext& ext = bsh.GetInst_Ext();
887 if ( ext.IsDelta() ) {
888 ITERATE (CDelta_ext::Tdata, it, ext.GetDelta().Get()) {
889 if ( (*it)->IsLoc() ) {
890 return false;
891 }
892 }
893 }
894 }
895 return true;
896 }
897
898
x_CalculateLocationGaps(CBioseq_Handle bsh,const CSeq_loc & loc,vector<TSeqPos> & gap_starts)899 size_t CSingleFeatValidator::x_CalculateLocationGaps(CBioseq_Handle bsh, const CSeq_loc& loc, vector<TSeqPos>& gap_starts)
900 {
901 size_t rval = eLocationGapNoProblems;
902 if (!bsh.IsNa() || !bsh.IsSetInst_Repr() || bsh.GetInst().GetRepr() != CSeq_inst::eRepr_delta) {
903 return rval;
904 }
905 // look for features inside gaps, crossing unknown gaps, or starting or ending in gaps
906 // ignore gap features for this
907 int num_n = 0;
908 int num_real = 0;
909 int num_gap = 0;
910 int num_unknown_gap = 0;
911 bool first_in_gap = false, last_in_gap = false;
912 bool local_first_gap = false, local_last_gap = false;
913 bool startsOrEndsInGap = false;
914 bool first = true;
915
916 for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
917 CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
918 CSeqVector vec = GetSequenceFromLoc(*this_loc, bsh.GetScope());
919 if (!vec.empty()) {
920 CBioseq_Handle ph;
921 bool match = false;
922 for (auto id_it : bsh.GetBioseqCore()->GetId()) {
923 if (id_it->Equals(loc_it.GetSeq_id())) {
924 match = true;
925 break;
926 }
927 }
928 if (match) {
929 ph = bsh;
930 } else {
931 ph = bsh.GetScope().GetBioseqHandle(*this_loc);
932 }
933 try {
934 CGapCache gap_cache(*this_loc, ph);
935 string vec_data;
936 vec.GetSeqData(0, vec.size(), vec_data);
937
938 local_first_gap = false;
939 local_last_gap = false;
940 TSeqLength len = loc_it.GetRange().GetLength();
941 ENa_strand strand = loc_it.GetStrand();
942
943 size_t pos = 0;
944 string::iterator it = vec_data.begin();
945 while (it != vec_data.end() && pos < len) {
946 bool is_gap = false;
947 bool unknown_length = false;
948 if (strand == eNa_strand_minus) {
949 if (gap_cache.IsKnownGap(len - pos - 1)) {
950 is_gap = true;
951 } else if (gap_cache.IsUnknownGap(len - pos - 1)) {
952 is_gap = true;
953 unknown_length = true;
954 }
955 } else {
956 if (gap_cache.IsKnownGap(pos)) {
957 is_gap = true;
958 } else if (gap_cache.IsUnknownGap(pos)) {
959 is_gap = true;
960 unknown_length = true;
961 }
962
963 }
964 if (is_gap) {
965 if (pos == 0) {
966 local_first_gap = true;
967 } else if (pos == len - 1) {
968 local_last_gap = true;
969 }
970 if (unknown_length) {
971 num_unknown_gap++;
972 } else {
973 num_gap++;
974 }
975 } else if (*it == 'N') {
976 num_n++;
977 } else {
978 num_real++;
979 }
980 ++it;
981 ++pos;
982 }
983 } catch (CException&/* ex*/) {
984 /*
985 PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
986 string("Exception while checking for intervals in gaps. EXCEPTION: ") +
987 ex.what(), feat);
988 */
989 }
990 }
991 if (first) {
992 first_in_gap = local_first_gap;
993 first = false;
994 }
995 last_in_gap = local_last_gap;
996 if (local_first_gap || local_last_gap) {
997 startsOrEndsInGap = true;
998 }
999 }
1000
1001 if (num_real == 0 && num_n == 0) {
1002 TSeqPos start = loc.GetStart(eExtreme_Positional);
1003 TSeqPos stop = loc.GetStop(eExtreme_Positional);
1004 if ((start == 0 || CSeqMap_CI(bsh, SSeqMapSelector(), start - 1).GetType() != CSeqMap::eSeqGap)
1005 && (stop == bsh.GetBioseqLength() - 1 || CSeqMap_CI(bsh, SSeqMapSelector(), stop + 1).GetType() != CSeqMap::eSeqGap)) {
1006 rval |= eLocationGapFeatureMatchesGap;
1007 }
1008 }
1009
1010
1011 if (num_gap == 0 && num_unknown_gap == 0 && num_n == 0) {
1012 // ignore features that do not cover any gap characters
1013 } else if (first_in_gap || last_in_gap) {
1014 if (num_real > 0) {
1015 TSeqPos gap_start = x_FindStartOfGap(bsh,
1016 first_in_gap ? loc.GetStart(eExtreme_Biological)
1017 : loc.GetStop(eExtreme_Biological), &(bsh.GetScope()));
1018 gap_starts.push_back(gap_start);
1019 } else {
1020 rval |= eLocationGapContainedInGap;
1021 }
1022 } else if (num_real == 0 && num_gap == 0 && num_unknown_gap == 0 && num_n >= 50) {
1023 rval |= eLocationGapContainedInGapOfNs;
1024 } else if (startsOrEndsInGap) {
1025 rval |= eLocationGapInternalIntervalEndpointInGap;
1026 } else if (num_unknown_gap > 0) {
1027 rval |= eLocationGapCrossesUnknownGap;
1028 }
1029
1030 if (num_n > num_real && xf_IsDeltaLitOnly(bsh)) {
1031 rval |= eLocationGapMostlyNs;
1032 }
1033
1034 return rval;
1035 }
1036
1037
x_FindStartOfGap(CBioseq_Handle bsh,int pos,CScope * scope)1038 size_t CSingleFeatValidator::x_FindStartOfGap(CBioseq_Handle bsh, int pos, CScope* scope)
1039 {
1040 if (!bsh || !bsh.IsNa() || !bsh.IsSetInst_Repr()
1041 || bsh.GetInst_Repr() != CSeq_inst::eRepr_delta
1042 || !bsh.GetInst().IsSetExt()
1043 || !bsh.GetInst().GetExt().IsDelta()) {
1044 return bsh.GetInst_Length();
1045 }
1046 size_t offset = 0;
1047
1048 ITERATE(CSeq_inst::TExt::TDelta::Tdata, it, bsh.GetInst_Ext().GetDelta().Get()) {
1049 unsigned int len = 0;
1050 if ((*it)->IsLiteral()) {
1051 len = (*it)->GetLiteral().GetLength();
1052 } else if ((*it)->IsLoc()) {
1053 len = sequence::GetLength((*it)->GetLoc(), scope);
1054 }
1055 if ((unsigned int)pos >= offset && (unsigned int)pos < offset + len) {
1056 return offset;
1057 } else {
1058 offset += len;
1059 }
1060 }
1061 return offset;
1062 }
1063
1064
x_IsMostlyNs(const CSeq_loc & loc,CBioseq_Handle bsh)1065 bool CSingleFeatValidator::x_IsMostlyNs(const CSeq_loc& loc, CBioseq_Handle bsh)
1066 {
1067 if (!bsh.IsNa() || !bsh.IsSetInst_Repr() || bsh.GetInst_Repr() != CSeq_inst::eRepr_raw) {
1068 return false;
1069 }
1070 int num_n = 0;
1071 int real_bases = 0;
1072
1073 for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
1074 CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
1075 CSeqVector vec = GetSequenceFromLoc(*this_loc, bsh.GetScope());
1076 if (!vec.empty()) {
1077 CBioseq_Handle ph;
1078 bool match = false;
1079 for (auto id_it : bsh.GetBioseqCore()->GetId()) {
1080 if (id_it->Equals(loc_it.GetSeq_id())) {
1081 match = true;
1082 break;
1083 }
1084 }
1085 if (match) {
1086 ph = bsh;
1087 } else {
1088 ph = bsh.GetScope().GetBioseqHandle(*this_loc);
1089 }
1090 TSeqPos offset = this_loc->GetStart(eExtreme_Positional);
1091 string vec_data;
1092 try {
1093 vec.GetSeqData(0, vec.size(), vec_data);
1094
1095 int pos = 0;
1096 string::iterator it = vec_data.begin();
1097 while (it != vec_data.end()) {
1098 if (*it == 'N') {
1099 CSeqMap_CI map_iter(ph, SSeqMapSelector(), offset + pos);
1100 if (map_iter.GetType() == CSeqMap::eSeqGap) {
1101 } else {
1102 num_n++;
1103 }
1104 } else {
1105 if ((unsigned)(*it + 1) <= 256 && isalpha(*it)) {
1106 real_bases++;
1107 }
1108 }
1109 ++it;
1110 ++pos;
1111 }
1112 } catch (CException) {
1113 } catch (std::exception) {
1114 }
1115 }
1116 }
1117
1118 return (num_n > real_bases);
1119 }
1120
1121
x_GetFeatureProduct(bool look_far,bool & is_far)1122 CBioseq_Handle CSingleFeatValidator::x_GetFeatureProduct(bool look_far, bool& is_far)
1123 {
1124 CBioseq_Handle prot_handle;
1125 is_far = false;
1126 if (!m_Feat.IsSetProduct()) {
1127 return prot_handle;
1128 }
1129 const CSeq_id* protid = NULL;
1130 try {
1131 protid = &sequence::GetId(m_Feat.GetProduct(), &m_Scope);
1132 } catch (CException&) {}
1133
1134 if (!protid) {
1135 return prot_handle;
1136 }
1137
1138 // try "local" scope
1139 prot_handle = m_Scope.GetBioseqHandleFromTSE(*protid, m_LocationBioseq.GetTSE_Handle());
1140 if (!prot_handle) {
1141 prot_handle = m_Scope.GetBioseqHandleFromTSE(*protid, m_Imp.GetTSE_Handle());
1142 }
1143 if (!prot_handle && look_far) {
1144 prot_handle = m_Scope.GetBioseqHandle(*protid);
1145 if (prot_handle) {
1146 is_far = true;
1147 }
1148 }
1149
1150 return prot_handle;
1151 }
1152
1153
x_GetFeatureProduct(bool & is_far)1154 CBioseq_Handle CSingleFeatValidator::x_GetFeatureProduct(bool& is_far)
1155 {
1156 bool look_far = false;
1157
1158 if (m_Feat.IsSetData()) {
1159 if (m_Feat.GetData().IsCdregion()) {
1160 look_far = m_Imp.IsFarFetchCDSproducts();
1161 } else if (m_Feat.GetData().IsRna()) {
1162 look_far = m_Imp.IsFarFetchMRNAproducts();
1163 } else {
1164 look_far = m_Imp.IsRemoteFetch();
1165 }
1166 }
1167
1168 return x_GetFeatureProduct(look_far, is_far);
1169 }
1170
1171
x_ValidateExcept()1172 void CSingleFeatValidator::x_ValidateExcept()
1173 {
1174 if (m_Feat.IsSetExcept_text() && !NStr::IsBlank(m_Feat.GetExcept_text()) &&
1175 (!m_Feat.IsSetExcept() || !m_Feat.GetExcept())) {
1176 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingExceptionFlag,
1177 "Exception text is present, but exception flag is not set");
1178 } else if (m_Feat.IsSetExcept() && m_Feat.GetExcept() &&
1179 (!m_Feat.IsSetExcept_text() || NStr::IsBlank(m_Feat.GetExcept_text()))) {
1180 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionMissingText,
1181 "Exception flag is set, but exception text is empty");
1182 }
1183 if (m_Feat.IsSetExcept_text() && !m_Feat.GetExcept_text().empty()) {
1184 x_ValidateExceptText(m_Feat.GetExcept_text());
1185 }
1186 }
1187
1188
x_ValidateExceptText(const string & text)1189 void CSingleFeatValidator::x_ValidateExceptText(const string& text)
1190 {
1191 if (text.empty()) return;
1192
1193 EDiagSev sev = eDiag_Error;
1194 bool found = false;
1195
1196 string str;
1197
1198 bool reasons_in_cit = false;
1199 bool annotated_by_transcript_or_proteomic = false;
1200 bool redundant_with_comment = false;
1201 bool refseq_except = false;
1202 vector<string> exceptions;
1203 NStr::Split(text, ",", exceptions, 0);
1204 ITERATE(vector<string>, it, exceptions) {
1205 found = false;
1206 str = NStr::TruncateSpaces(*it);
1207 if (NStr::IsBlank(*it)) {
1208 continue;
1209 }
1210 found = CSeq_feat::IsExceptionTextInLegalList(str, false);
1211
1212 if (found) {
1213 if (NStr::EqualNocase(str, "reasons given in citation")) {
1214 reasons_in_cit = true;
1215 } else if (NStr::EqualNocase(str, "annotated by transcript or proteomic data")) {
1216 annotated_by_transcript_or_proteomic = true;
1217 }
1218 }
1219 if (!found) {
1220 if (m_LocationBioseq) {
1221 bool check_refseq = false;
1222 if (m_Imp.IsRefSeqConventions()) {
1223 check_refseq = true;
1224 } else if (GetGenProdSetParent(m_LocationBioseq)) {
1225 check_refseq = true;
1226 } else {
1227 FOR_EACH_SEQID_ON_BIOSEQ(id_it, *(m_LocationBioseq.GetCompleteBioseq())) {
1228 if ((*id_it)->IsOther()) {
1229 check_refseq = true;
1230 break;
1231 }
1232 }
1233 }
1234
1235 if (check_refseq) {
1236 if (CSeq_feat::IsExceptionTextRefSeqOnly(str)) {
1237 found = true;
1238 refseq_except = true;
1239 }
1240 }
1241 }
1242 }
1243 if (!found) {
1244 // lower to warning for genomic refseq
1245 const CSeq_id *id = m_Feat.GetLocation().GetId();
1246 if ((id != NULL && IsNTNCNWACAccession(*id)) ||
1247 (m_LocationBioseq && IsNTNCNWACAccession(*(m_LocationBioseq.GetCompleteBioseq())))) {
1248 sev = eDiag_Warning;
1249 }
1250 PostErr(sev, eErr_SEQ_FEAT_ExceptionProblem,
1251 str + " is not a legal exception explanation");
1252 }
1253 if (m_Feat.IsSetComment() && NStr::Find(m_Feat.GetComment(), str) != string::npos) {
1254 if (!NStr::EqualNocase(str, "ribosomal slippage") &&
1255 !NStr::EqualNocase(str, "trans-splicing") &&
1256 !NStr::EqualNocase(str, "RNA editing") &&
1257 !NStr::EqualNocase(str, "artificial location")) {
1258 redundant_with_comment = true;
1259 } else if (NStr::EqualNocase(m_Feat.GetComment(), str)) {
1260 redundant_with_comment = true;
1261 }
1262 }
1263 }
1264 if (redundant_with_comment) {
1265 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionProblem,
1266 "Exception explanation text is also found in feature comment");
1267 }
1268 if (refseq_except) {
1269 bool found_just_the_exception = CSeq_feat::IsExceptionTextRefSeqOnly(str);
1270
1271 if (!found_just_the_exception) {
1272 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionProblem,
1273 "Genome processing exception should not be combined with other explanations");
1274 }
1275 }
1276
1277 if (reasons_in_cit && !m_Feat.IsSetCit()) {
1278 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionProblem,
1279 "Reasons given in citation exception does not have the required citation");
1280 }
1281 if (annotated_by_transcript_or_proteomic) {
1282 bool has_inference = false;
1283 FOR_EACH_GBQUAL_ON_SEQFEAT(it, m_Feat) {
1284 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "inference")) {
1285 has_inference = true;
1286 break;
1287 }
1288 }
1289 if (!has_inference) {
1290 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptionProblem,
1291 "Annotated by transcript or proteomic data exception does not have the required inference qualifier");
1292 }
1293 }
1294 }
1295
1296
1297 const string kOrigProteinId = "orig_protein_id";
1298
x_ReportOrigProteinId()1299 bool CSingleFeatValidator::x_ReportOrigProteinId()
1300 {
1301 if (!m_Feat.GetData().IsRna()) {
1302 return true;
1303 } else {
1304 return false;
1305 }
1306 }
1307
1308
x_ValidateGbquals()1309 void CSingleFeatValidator::x_ValidateGbquals()
1310 {
1311 if (!m_Feat.IsSetQual()) {
1312 return;
1313 }
1314 string key;
1315 bool is_imp = false;
1316 CSeqFeatData::ESubtype ftype = m_Feat.GetData().GetSubtype();
1317
1318 if (m_Feat.IsSetData() && m_Feat.GetData().IsImp()) {
1319 is_imp = true;
1320 key = m_Feat.GetData().GetImp().GetKey();
1321 if (ftype == CSeqFeatData::eSubtype_imp && NStr::EqualNocase (key, "gene")) {
1322 ftype = CSeqFeatData::eSubtype_gene;
1323 } else if (ftype == CSeqFeatData::eSubtype_imp) {
1324 ftype = CSeqFeatData::eSubtype_bad;
1325 } else if (ftype == CSeqFeatData::eSubtype_Imp_CDS
1326 || ftype == CSeqFeatData::eSubtype_site_ref
1327 || ftype == CSeqFeatData::eSubtype_org) {
1328 ftype = CSeqFeatData::eSubtype_bad;
1329 }
1330 }
1331 else {
1332 key = m_Feat.GetData().GetKey();
1333 if (NStr::Equal (key, "Gene")) {
1334 key = "gene";
1335 }
1336 }
1337
1338 for (auto gbq : m_Feat.GetQual()) {
1339 const string& qual_str = gbq->GetQual();
1340
1341 if ( NStr::Equal (qual_str, "gsdb_id")) {
1342 continue;
1343 }
1344 auto gbqual_and_value = CSeqFeatData::GetQualifierTypeAndValue(qual_str);
1345 auto gbqual = gbqual_and_value.first;
1346 bool same_case = (gbqual == CSeqFeatData::eQual_bad) || NStr::EqualCase(gbqual_and_value.second, qual_str);
1347
1348 if ( !same_case ) {
1349 PostErr(eDiag_Warning, eErr_SEQ_FEAT_IncorrectQualifierCapitalization,
1350 qual_str + " is improperly capitalized");
1351 }
1352
1353 if ( gbqual == CSeqFeatData::eQual_bad ) {
1354 if (is_imp) {
1355 if (!gbq->IsSetQual() || NStr::IsBlank(gbq->GetQual())) {
1356 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatQual,
1357 "NULL qualifier");
1358 }
1359 else {
1360 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatQual,
1361 "Unknown qualifier " + qual_str);
1362 }
1363 } else if (NStr::Equal(qual_str, kOrigProteinId)) {
1364 if (x_ReportOrigProteinId()) {
1365 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownFeatureQual, "Unknown qualifier " + kOrigProteinId);
1366 }
1367 } else {
1368 CSeqFeatData::E_Choice chs = m_Feat.GetData().Which();
1369 if (chs == CSeqFeatData::e_Gene) {
1370 if (NStr::Equal(qual_str, "gen_map")
1371 || NStr::Equal(qual_str, "cyt_map")
1372 || NStr::Equal(qual_str, "rad_map")) {
1373 continue;
1374 }
1375 } else if (chs == CSeqFeatData::e_Cdregion) {
1376 if (NStr::Equal(qual_str, "orig_transcript_id")) {
1377 continue;
1378 }
1379 } else if (chs == CSeqFeatData::e_Rna) {
1380 if (NStr::Equal(qual_str, "orig_transcript_id")) {
1381 continue;
1382 }
1383 }
1384 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownFeatureQual, "Unknown qualifier " + qual_str);
1385 }
1386 } else {
1387 if ( ftype != CSeqFeatData::eSubtype_bad && !CSeqFeatData::IsLegalQualifier(ftype, gbqual) ) {
1388 PostErr(eDiag_Warning,
1389 is_imp ? eErr_SEQ_FEAT_WrongQualOnImpFeat : eErr_SEQ_FEAT_WrongQualOnFeature,
1390 "Wrong qualifier " + qual_str + " for feature " +
1391 key);
1392 }
1393
1394 if (gbq->IsSetVal() && !NStr::IsBlank(gbq->GetVal())) {
1395 // validate value of gbqual
1396 const string& val = gbq->GetVal();
1397 switch (gbqual) {
1398
1399 case CSeqFeatData::eQual_rpt_type:
1400 if (!CGb_qual::IsValidRptTypeValue(val)) {
1401 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
1402 val + " is not a legal value for qualifier " + qual_str);
1403 }
1404 break;
1405
1406 case CSeqFeatData::eQual_rpt_unit:
1407 x_ValidateRptUnitVal (val, key);
1408 break;
1409
1410 case CSeqFeatData::eQual_rpt_unit_seq:
1411 x_ValidateRptUnitSeqVal (val, key);
1412 break;
1413
1414 case CSeqFeatData::eQual_rpt_unit_range:
1415 x_ValidateRptUnitRangeVal (val);
1416 break;
1417
1418 case CSeqFeatData::eQual_label:
1419 x_ValidateLabelVal (val);
1420 break;
1421
1422 case CSeqFeatData::eQual_replace:
1423 if (is_imp) {
1424 x_ValidateReplaceQual(key, qual_str, val);
1425 }
1426 break;
1427
1428 case CSeqFeatData::eQual_mobile_element:
1429 case CSeqFeatData::eQual_mobile_element_type:
1430 if (is_imp && !CGb_qual::IsLegalMobileElementValue(val)) {
1431 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MobileElementInvalidQualifier,
1432 val + " is not a legal value for qualifier " + qual_str);
1433 }
1434 break;
1435
1436 case CSeqFeatData::eQual_compare:
1437 x_ValidateCompareVal (val);
1438 break;
1439
1440 case CSeqFeatData::eQual_standard_name:
1441 if (is_imp && ftype == CSeqFeatData::eSubtype_misc_feature
1442 && NStr::EqualCase (val, "Vector Contamination")) {
1443 PostErr (eDiag_Warning, eErr_SEQ_FEAT_VectorContamination,
1444 "Vector Contamination region should be trimmed from sequence");
1445 }
1446 break;
1447
1448 case CSeqFeatData::eQual_product:
1449 if (!is_imp) {
1450 CSeqFeatData::E_Choice chs = m_Feat.GetData().Which();
1451 if (chs == CSeqFeatData::e_Gene) {
1452 PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidProductOnGene,
1453 "A product qualifier is not used on a gene feature");
1454 }
1455 }
1456 break;
1457
1458 // for VR-825
1459 case CSeqFeatData::eQual_locus_tag:
1460 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
1461 "locus-tag values should be on genes");
1462 break;
1463 default:
1464 break;
1465 } // end of switch statement
1466 }
1467 }
1468 }
1469 }
1470
1471
x_ValidateRptUnitVal(const string & val,const string & key)1472 void CSingleFeatValidator::x_ValidateRptUnitVal (const string& val, const string& key)
1473 {
1474 bool /* found = false, */ multiple_rpt_unit = false;
1475 ITERATE(string, it, val) {
1476 if ( *it <= ' ' ) {
1477 /* found = true; */
1478 } else if ( *it == '(' || *it == ')' ||
1479 *it == ',' || *it == '.' ||
1480 isdigit((unsigned char)(*it)) ) {
1481 multiple_rpt_unit = true;
1482 }
1483 }
1484 /*
1485 if ( found ||
1486 (!multiple_rpt_unit && val.length() > 48) ) {
1487 error = true;
1488 }
1489 */
1490 if ( NStr::CompareNocase(key, "repeat_region") == 0 &&
1491 !multiple_rpt_unit ) {
1492 if (val.length() <= GetLength(m_Feat.GetLocation(), &m_Scope) ) {
1493 bool just_nuc_letters = true;
1494 static const string nuc_letters = "ACGTNacgtn";
1495 ITERATE(string, it, val) {
1496 if ( nuc_letters.find(*it) == NPOS ) {
1497 just_nuc_letters = false;
1498 break;
1499 }
1500 }
1501
1502 if ( just_nuc_letters ) {
1503 CSeqVector vec = GetSequenceFromFeature(m_Feat, m_Scope);
1504 if ( !vec.empty() ) {
1505 string vec_data;
1506 vec.GetSeqData(0, vec.size(), vec_data);
1507 if (NStr::FindNoCase (vec_data, val) == string::npos) {
1508 PostErr(eDiag_Warning, eErr_SEQ_FEAT_RepeatSeqDoNotMatch,
1509 "repeat_region /rpt_unit and underlying "
1510 "sequence do not match");
1511 }
1512 }
1513 }
1514 } else {
1515 PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidRepeatUnitLength,
1516 "Length of rpt_unit_seq is greater than feature length");
1517 }
1518 }
1519 }
1520
1521
x_ValidateRptUnitSeqVal(const string & val,const string & key)1522 void CSingleFeatValidator::x_ValidateRptUnitSeqVal (const string& val, const string& key)
1523 {
1524 // do validation common to rpt_unit
1525 x_ValidateRptUnitVal (val, key);
1526
1527 // do the validation specific to rpt_unit_seq
1528 const char *cp = val.c_str();
1529 bool badchars = false;
1530 while (*cp != 0 && !badchars) {
1531 if (*cp < ' ') {
1532 badchars = true;
1533 } else if (*cp != '(' && *cp != ')'
1534 && !isdigit (*cp) && !isalpha (*cp)
1535 && *cp != ',' && *cp != ';') {
1536 badchars = true;
1537 }
1538 cp++;
1539 }
1540 if (badchars) {
1541 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidRptUnitSeqCharacters,
1542 "/rpt_unit_seq has illegal characters");
1543 }
1544 }
1545
1546
s_RptUnitIsBaseRange(string str,TSeqPos & from,TSeqPos & to)1547 static bool s_RptUnitIsBaseRange (string str, TSeqPos& from, TSeqPos& to)
1548
1549 {
1550 if (str.length() > 25) {
1551 return false;
1552 }
1553 SIZE_TYPE pos = NStr::Find (str, "..");
1554 if (pos == string::npos) {
1555 return false;
1556 }
1557
1558 int tmp_from, tmp_to;
1559 try {
1560 tmp_from = NStr::StringToInt (str.substr(0, pos));
1561 from = tmp_from;
1562 tmp_to = NStr::StringToInt (str.substr (pos + 2));
1563 to = tmp_to;
1564 } catch (CException ) {
1565 return false;
1566 } catch (std::exception ) {
1567 return false;
1568 }
1569 if (tmp_from < 0 || tmp_to < 0) {
1570 return false;
1571 }
1572 return true;
1573 }
1574
1575
x_ValidateRptUnitRangeVal(const string & val)1576 void CSingleFeatValidator::x_ValidateRptUnitRangeVal (const string& val)
1577 {
1578 TSeqPos from = kInvalidSeqPos, to = kInvalidSeqPos;
1579 if (!s_RptUnitIsBaseRange(val, from, to)) {
1580 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidRptUnitRange,
1581 "/rpt_unit_range is not a base range");
1582 } else {
1583 CSeq_loc::TRange range = m_Feat.GetLocation().GetTotalRange();
1584 if (from - 1 < range.GetFrom() || from - 1> range.GetTo() || to - 1 < range.GetFrom() || to - 1 > range.GetTo()) {
1585 PostErr (eDiag_Warning, eErr_SEQ_FEAT_RptUnitRangeProblem,
1586 "/rpt_unit_range is not within sequence length");
1587 } else {
1588 bool nulls_between = false;
1589 for ( CTypeConstIterator<CSeq_loc> lit = ConstBegin(m_Feat.GetLocation()); lit; ++lit ) {
1590 if ( lit->Which() == CSeq_loc::e_Null ) {
1591 nulls_between = true;
1592 }
1593 }
1594 if (nulls_between) {
1595 bool in_range = false;
1596 for ( CSeq_loc_CI it(m_Feat.GetLocation()); it; ++it ) {
1597 range = it.GetEmbeddingSeq_loc().GetTotalRange();
1598 if (from - 1 < range.GetFrom() || from - 1> range.GetTo() || to - 1 < range.GetFrom() || to - 1 > range.GetTo()) {
1599 } else {
1600 in_range = true;
1601 }
1602 }
1603 if (! in_range) {
1604 PostErr (eDiag_Warning, eErr_SEQ_FEAT_RptUnitRangeProblem,
1605 "/rpt_unit_range is not within ordered intervals");
1606 }
1607 }
1608 }
1609 }
1610 }
1611
1612
x_ValidateLabelVal(const string & val)1613 void CSingleFeatValidator::x_ValidateLabelVal (const string& val)
1614 {
1615 bool only_digits = true,
1616 has_spaces = false;
1617
1618 ITERATE(string, it, val) {
1619 if ( isspace((unsigned char)(*it)) ) {
1620 has_spaces = true;
1621 }
1622 if ( !isdigit((unsigned char)(*it)) ) {
1623 only_digits = false;
1624 }
1625 }
1626 if (only_digits || has_spaces) {
1627 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier label");
1628 }
1629 }
1630
1631
x_ValidateCompareVal(const string & val)1632 void CSingleFeatValidator::x_ValidateCompareVal (const string& val)
1633 {
1634 if (!NStr::StartsWith (val, "(")) {
1635 EAccessionFormatError valid_accession = ValidateAccessionString (val, true);
1636 if (valid_accession == eAccessionFormat_missing_version) {
1637 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidCompareMissingVersion,
1638 val + " accession missing version for qualifier compare");
1639 } else if (valid_accession == eAccessionFormat_bad_version) {
1640 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
1641 val + " accession has bad version for qualifier compare");
1642 } else if (valid_accession != eAccessionFormat_valid) {
1643 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidCompareBadAccession,
1644 val + " is not a legal accession for qualifier compare");
1645 } else if (m_Imp.IsINSDInSep() && NStr::Find (val, "_") != string::npos) {
1646 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidCompareRefSeqAccession,
1647 "RefSeq accession " + val + " cannot be used for qualifier compare");
1648 }
1649 }
1650 }
1651
1652
s_StringConsistsOf(string str,string consist)1653 static bool s_StringConsistsOf (string str, string consist)
1654 {
1655 const char *src = str.c_str();
1656 const char *find = consist.c_str();
1657 bool rval = true;
1658
1659 while (*src != 0 && rval) {
1660 if (strchr (find, *src) == NULL) {
1661 rval = false;
1662 }
1663 src++;
1664 }
1665 return rval;
1666 }
1667
1668
x_ValidateReplaceQual(const string & key,const string & qual_str,const string & val)1669 void CSingleFeatValidator::x_ValidateReplaceQual(const string& key, const string& qual_str, const string& val)
1670 {
1671 if (m_LocationBioseq) {
1672 if (m_LocationBioseq.IsNa()) {
1673 if (NStr::Equal(key, "variation")) {
1674 if (!s_StringConsistsOf (val, "acgtACGT")) {
1675 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidVariationReplace,
1676 val + " is not a legal value for qualifier " + qual_str
1677 + " - should only be composed of acgt unambiguous nucleotide bases");
1678 }
1679 } else if (!s_StringConsistsOf (val, "acgtmrwsykvhdbn")) {
1680 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidReplace,
1681 val + " is not a legal value for qualifier " + qual_str
1682 + " - should only be composed of acgtmrwsykvhdbn nucleotide bases");
1683 }
1684 } else if (m_LocationBioseq.IsAa()) {
1685 if (!s_StringConsistsOf (val, "acdefghiklmnpqrstuvwy*")) {
1686 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidReplace,
1687 val + " is not a legal value for qualifier " + qual_str
1688 + " - should only be composed of acdefghiklmnpqrstuvwy* amino acids");
1689 }
1690 }
1691
1692 // if no point in location with fuzz, info if text matches sequence
1693 bool has_fuzz = false;
1694 for( objects::CSeq_loc_CI it(m_Feat.GetLocation()); it && !has_fuzz; ++it) {
1695 if (it.IsPoint() && (it.GetFuzzFrom() != NULL || it.GetFuzzTo() != NULL)) {
1696 has_fuzz = true;
1697 }
1698 }
1699 if (!has_fuzz && val.length() == GetLength (m_Feat.GetLocation(), &m_Scope)) {
1700 try {
1701 CSeqVector nuc_vec(m_Feat.GetLocation(), m_Scope, CBioseq_Handle::eCoding_Iupac);
1702 string bases;
1703 nuc_vec.GetSeqData(0, nuc_vec.size(), bases);
1704 if (NStr::EqualNocase(val, bases)) {
1705 PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidMatchingReplace,
1706 "/replace already matches underlying sequence (" + val + ")");
1707 }
1708 } catch (CException ) {
1709 } catch (std::exception ) {
1710 }
1711 }
1712 }
1713 }
1714
1715
ValidateCharactersInField(string value,string field_name)1716 void CSingleFeatValidator::ValidateCharactersInField (string value, string field_name)
1717 {
1718 if (HasBadCharacter (value)) {
1719 PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadInternalCharacter,
1720 field_name + " contains undesired character");
1721 }
1722 if (EndsWithBadCharacter (value)) {
1723 PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadTrailingCharacter,
1724 field_name + " ends with undesired character");
1725 }
1726 if (NStr::EndsWith (value, "-")) {
1727 if (!m_Imp.IsGpipe() || !m_Imp.BioSourceKind().IsOrganismEukaryote() )
1728 PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadTrailingHyphen,
1729 field_name + " ends with hyphen");
1730 }
1731 }
1732
1733
ValidateSplice(bool gene_pseudo,bool check_all)1734 void CSingleFeatValidator::ValidateSplice(bool gene_pseudo, bool check_all)
1735 {
1736 if (!m_LocationBioseq) {
1737 return;
1738 }
1739
1740 CSpliceProblems splice_problems;
1741 splice_problems.CalculateSpliceProblems(m_Feat, check_all, gene_pseudo, m_LocationBioseq);
1742
1743 if (splice_problems.AreErrorsUnexpected()) {
1744 string label = GetBioseqIdLabel(*(m_LocationBioseq.GetCompleteBioseq()), true);
1745 x_ReportSpliceProblems(splice_problems, label);
1746 }
1747
1748 if (splice_problems.IsExceptionUnnecessary()) {
1749 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
1750 "feature has exception but passes splice site test");
1751 }
1752 }
1753
1754
x_SeverityForConsensusSplice(void)1755 EDiagSev CSingleFeatValidator::x_SeverityForConsensusSplice(void)
1756 {
1757 EDiagSev sev = eDiag_Warning;
1758 if (m_Imp.IsGpipe() && m_Imp.IsGenomic()) {
1759 sev = eDiag_Info;
1760 } else if ((m_Imp.IsGPS() || m_Imp.IsRefSeq()) && !m_Imp.ReportSpliceAsError()) {
1761 sev = eDiag_Warning;
1762 }
1763 return sev;
1764 }
1765
1766
x_ReportDonorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem & problem,const string & label)1767 void CSingleFeatValidator::x_ReportDonorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem& problem, const string& label)
1768 {
1769 if (problem.first == CSpliceProblems::eSpliceSiteRead_BadSeq) {
1770 PostErr(eDiag_Warning, eErr_SEQ_FEAT_NotSpliceConsensusDonor,
1771 "Bad sequence at splice donor after exon ending at position "
1772 + NStr::IntToString(problem.second + 1) + " of " + label);
1773 } else if (problem.first == CSpliceProblems::eSpliceSiteRead_WrongNT) {
1774 PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusDonor,
1775 "Splice donor consensus (GT) not found after exon ending at position "
1776 + NStr::IntToString(problem.second + 1) + " of " + label);
1777 }
1778
1779 }
1780
1781
x_ReportAcceptorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem & problem,const string & label)1782 void CSingleFeatValidator::x_ReportAcceptorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem& problem, const string& label)
1783 {
1784 if (problem.first == CSpliceProblems::eSpliceSiteRead_BadSeq) {
1785 PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusAcceptor,
1786 "Bad sequence at splice acceptor before exon starting at position "
1787 + NStr::IntToString(problem.second + 1) + " of " + label);
1788 } else if (problem.first == CSpliceProblems::eSpliceSiteRead_WrongNT) {
1789 PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusAcceptor,
1790 "Splice acceptor consensus (AG) not found before exon starting at position "
1791 + NStr::IntToString(problem.second + 1) + " of " + label);
1792 }
1793
1794 }
1795
1796
x_ReportSpliceProblems(const CSpliceProblems & problems,const string & label)1797 void CSingleFeatValidator::x_ReportSpliceProblems
1798 (const CSpliceProblems& problems, const string& label)
1799 {
1800 const CSpliceProblems::TSpliceProblemList& donor_problems = problems.GetDonorProblems();
1801 for (auto it = donor_problems.begin(); it != donor_problems.end(); it++) {
1802 x_ReportDonorSpliceSiteReadErrors(*it, label);
1803 }
1804 const CSpliceProblems::TSpliceProblemList& acceptor_problems = problems.GetAcceptorProblems();
1805 for (auto it = acceptor_problems.begin(); it != acceptor_problems.end(); it++) {
1806 x_ReportAcceptorSpliceSiteReadErrors(*it, label);
1807 }
1808 }
1809
1810
x_BioseqHasNmAccession(CBioseq_Handle bsh)1811 bool CSingleFeatValidator::x_BioseqHasNmAccession (CBioseq_Handle bsh)
1812 {
1813 if (bsh) {
1814 FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
1815 if ((*it)->IsOther() && (*it)->GetOther().IsSetAccession()
1816 && NStr::StartsWith ((*it)->GetOther().GetAccession(), "NM_")) {
1817 return true;
1818 }
1819 }
1820 }
1821 return false;
1822 }
1823
1824
x_ValidateNonImpFeat()1825 void CSingleFeatValidator::x_ValidateNonImpFeat ()
1826 {
1827 if (m_Feat.GetData().IsImp()) {
1828 return;
1829 }
1830 string key = m_Feat.GetData().GetKey();
1831
1832 CSeqFeatData::ESubtype subtype = m_Feat.GetData().GetSubtype();
1833
1834 // look for mandatory qualifiers
1835 EDiagSev sev = eDiag_Warning;
1836
1837 for (auto required : CSeqFeatData::GetMandatoryQualifiers(subtype))
1838 {
1839 bool found = false;
1840 if (m_Feat.IsSetQual()) {
1841 for (auto qual : m_Feat.GetQual()) {
1842 if (qual->IsSetQual() && CSeqFeatData::GetQualifierType(qual->GetQual()) == required) {
1843 found = true;
1844 break;
1845 }
1846 }
1847 }
1848
1849 if (!found) {
1850 if (required == CSeqFeatData::eQual_citation) {
1851 if (m_Feat.IsSetCit()) {
1852 found = true;
1853 } else if (m_Feat.IsSetComment() && NStr::EqualNocase (key, "conflict")) {
1854 // RefSeq allows conflict with accession in comment instead of sfp->cit
1855 FOR_EACH_SEQID_ON_BIOSEQ (it, *(m_LocationBioseq.GetCompleteBioseq())) {
1856 if ((*it)->IsOther()) {
1857 found = true;
1858 break;
1859 }
1860 }
1861 }
1862 }
1863 }
1864 if (!found && (NStr::EqualNocase (key, "conflict") || NStr::EqualNocase (key, "old_sequence"))) {
1865 if (m_Feat.IsSetQual()) {
1866 for (auto qual : m_Feat.GetQual()) {
1867 if (qual->IsSetQual() && NStr::EqualNocase(qual->GetQual(), "compare")
1868 && qual->IsSetVal() && !NStr::IsBlank(qual->GetVal())) {
1869 found = true;
1870 break;
1871 }
1872 }
1873 }
1874 }
1875 if (!found && required == CSeqFeatData::eQual_ncRNA_class) {
1876 sev = eDiag_Error;
1877 if (m_Feat.GetData().IsRna() && m_Feat.GetData().GetRna().IsSetExt()
1878 && m_Feat.GetData().GetRna().GetExt().IsGen()
1879 && m_Feat.GetData().GetRna().GetExt().GetGen().IsSetClass()
1880 && !NStr::IsBlank(m_Feat.GetData().GetRna().GetExt().GetGen().GetClass())) {
1881 found = true;
1882 }
1883 }
1884
1885 if (!found) {
1886 PostErr(sev, eErr_SEQ_FEAT_MissingQualOnFeature,
1887 "Missing qualifier " + CSeqFeatData::GetQualifierAsString(required) +
1888 " for feature " + key);
1889 }
1890 }
1891 }
1892
1893
s_LocationStrandsIncompatible(const CSeq_loc & loc1,const CSeq_loc & loc2,CScope * scope)1894 static bool s_LocationStrandsIncompatible (const CSeq_loc& loc1, const CSeq_loc& loc2, CScope * scope)
1895 {
1896 ENa_strand strand1 = loc1.GetStrand();
1897 ENa_strand strand2 = loc2.GetStrand();
1898
1899 if (strand1 == strand2) {
1900 return false;
1901 }
1902 if ((strand1 == eNa_strand_unknown || strand1 == eNa_strand_plus) &&
1903 (strand2 == eNa_strand_unknown || strand2 == eNa_strand_plus)) {
1904 return false;
1905 }
1906 if (strand1 == eNa_strand_other) {
1907 ECompare comp = Compare(loc1, loc2, scope, fCompareOverlapping);
1908 if (comp == eContains) {
1909 return false;
1910 }
1911 } else if (strand2 == eNa_strand_other) {
1912 ECompare comp = Compare(loc1, loc2, scope, fCompareOverlapping);
1913 if (comp == eContained) {
1914 return false;
1915 }
1916 }
1917
1918 return true;
1919 }
1920
1921
x_ValidateGeneFeaturePair(const CSeq_feat & gene)1922 void CSingleFeatValidator::x_ValidateGeneFeaturePair(const CSeq_feat& gene)
1923 {
1924 bool bad_strand = s_LocationStrandsIncompatible(gene.GetLocation(), m_Feat.GetLocation(), &m_Scope);
1925 if (bad_strand) {
1926 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneXrefStrandProblem,
1927 "Gene cross-reference is not on expected strand");
1928 }
1929
1930 }
1931
1932
s_GeneRefsAreEquivalent(const CGene_ref & g1,const CGene_ref & g2,string & label)1933 bool CSingleFeatValidator::s_GeneRefsAreEquivalent (const CGene_ref& g1, const CGene_ref& g2, string& label)
1934 {
1935 bool equivalent = false;
1936 if (g1.IsSetLocus_tag()
1937 && g2.IsSetLocus_tag()) {
1938 if (NStr::EqualNocase(g1.GetLocus_tag(),
1939 g2.GetLocus_tag())) {
1940 label = g1.GetLocus_tag();
1941 equivalent = true;
1942 }
1943 } else if (g1.IsSetLocus()
1944 && g2.IsSetLocus()) {
1945 if (NStr::EqualNocase(g1.GetLocus(),
1946 g2.GetLocus())) {
1947 label = g1.GetLocus();
1948 equivalent = true;
1949 }
1950 } else if (g1.IsSetSyn()
1951 && g2.IsSetSyn()) {
1952 if (NStr::EqualNocase (g1.GetSyn().front(),
1953 g2.GetSyn().front())) {
1954 label = g1.GetSyn().front();
1955 equivalent = true;
1956 }
1957 }
1958 return equivalent;
1959 }
1960
1961
1962 // Check for redundant gene Xref
1963 // Do not call if feat is gene
x_ValidateGeneXRef()1964 void CSingleFeatValidator::x_ValidateGeneXRef()
1965 {
1966 if (m_Feat.IsSetData() && m_Feat.GetData().IsGene()) {
1967 return;
1968 }
1969 auto tse = m_Imp.GetTSE_Handle();
1970 if (!tse) {
1971 return;
1972 }
1973
1974 // first, look for gene by feature id xref
1975 bool has_gene_id_xref = false;
1976 if (m_Feat.IsSetXref()) {
1977 ITERATE(CSeq_feat::TXref, xref, m_Feat.GetXref()) {
1978 if ((*xref)->IsSetId() && (*xref)->GetId().IsLocal()) {
1979 CTSE_Handle::TSeq_feat_Handles gene_feats =
1980 tse.GetFeaturesWithId(CSeqFeatData::eSubtype_gene, (*xref)->GetId().GetLocal());
1981 if (gene_feats.size() > 0) {
1982 has_gene_id_xref = true;
1983 ITERATE(CTSE_Handle::TSeq_feat_Handles, gene, gene_feats) {
1984 x_ValidateGeneFeaturePair(*(gene->GetSeq_feat()));
1985 }
1986 }
1987 }
1988 }
1989 }
1990 if (has_gene_id_xref) {
1991 return;
1992 }
1993
1994 // if we can't get the bioseq on which the gene is located, we can't check for
1995 // overlapping/ambiguous/redundant conditions
1996 if (!m_LocationBioseq) {
1997 return;
1998 }
1999
2000 const CGene_ref* gene_xref = m_Feat.GetGeneXref();
2001
2002 size_t num_genes = 0;
2003 size_t max = 0;
2004 size_t num_trans_spliced = 0;
2005 bool equivalent = false;
2006 /*
2007 CFeat_CI gene_it(bsh, CSeqFeatData::e_Gene);
2008 */
2009
2010 //CFeat_CI gene_it(*m_Scope, feat.GetLocation(), SAnnotSelector (CSeqFeatData::e_Gene));
2011 CFeat_CI gene_it(m_LocationBioseq,
2012 CRange<TSeqPos>(m_Feat.GetLocation().GetStart(eExtreme_Positional),
2013 m_Feat.GetLocation().GetStop(eExtreme_Positional)),
2014 SAnnotSelector(CSeqFeatData::e_Gene));
2015 CFeat_CI prev_gene;
2016 string label = "?";
2017 size_t num_match_by_locus = 0;
2018 size_t num_match_by_locus_tag = 0;
2019
2020 for ( ; gene_it; ++gene_it) {
2021 if (gene_xref && gene_xref->IsSetLocus() &&
2022 gene_it->GetData().GetGene().IsSetLocus() &&
2023 NStr::Equal(gene_xref->GetLocus(), gene_it->GetData().GetGene().GetLocus())) {
2024 num_match_by_locus++;
2025 x_ValidateGeneFeaturePair(*(gene_it->GetSeq_feat()));
2026 }
2027 if (gene_xref && gene_xref->IsSetLocus_tag() &&
2028 gene_it->GetData().GetGene().IsSetLocus_tag() &&
2029 NStr::Equal(gene_xref->GetLocus_tag(), gene_it->GetData().GetGene().GetLocus_tag())) {
2030 num_match_by_locus_tag++;
2031 x_ValidateGeneFeaturePair(*(gene_it->GetSeq_feat()));
2032 if ((!gene_xref->IsSetLocus() || NStr::IsBlank(gene_xref->GetLocus())) &&
2033 gene_it->GetData().GetGene().IsSetLocus() &&
2034 !NStr::IsBlank(gene_it->GetData().GetGene().GetLocus())) {
2035 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneXrefWithoutLocus,
2036 "Feature has Gene Xref with locus_tag but no locus, gene with locus_tag and locus exists");
2037 }
2038 }
2039
2040 if (TestForOverlapEx (gene_it->GetLocation(), m_Feat.GetLocation(),
2041 gene_it->GetLocation().IsInt() ? eOverlap_Contained : eOverlap_Subset, &m_Scope) >= 0) {
2042 size_t len = GetLength(gene_it->GetLocation(), &m_Scope);
2043 if (len < max || num_genes == 0) {
2044 num_genes = 1;
2045 max = len;
2046 num_trans_spliced = 0;
2047 if (gene_it->IsSetExcept() && gene_it->IsSetExcept_text() &&
2048 NStr::FindNoCase (gene_it->GetExcept_text(), "trans-splicing") != string::npos) {
2049 num_trans_spliced++;
2050 }
2051 equivalent = false;
2052 prev_gene = gene_it;
2053 } else if (len == max) {
2054 equivalent |= s_GeneRefsAreEquivalent(gene_it->GetData().GetGene(), prev_gene->GetData().GetGene(), label);
2055 num_genes++;
2056 if (gene_it->IsSetExcept() && gene_it->IsSetExcept_text() &&
2057 NStr::FindNoCase (gene_it->GetExcept_text(), "trans-splicing") != string::npos) {
2058 num_trans_spliced++;
2059 }
2060 }
2061 }
2062 }
2063
2064 if ( gene_xref == 0) {
2065 // if there is no gene xref, then there should be 0 or 1 overlapping genes
2066 // so that mapping by overlap is unambiguous
2067 if (num_genes > 1 &&
2068 m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_repeat_region &&
2069 m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_mobile_element) {
2070 if (m_Imp.IsSmallGenomeSet() && num_genes == num_trans_spliced) {
2071 /* suppress for trans-spliced genes on small genome set */
2072 } else if (equivalent) {
2073 PostErr (eDiag_Warning, eErr_SEQ_FEAT_GeneXrefNeeded,
2074 "Feature overlapped by "
2075 + NStr::SizetToString(num_genes)
2076 + " identical-length equivalent genes but has no cross-reference");
2077 } else {
2078 PostErr (eDiag_Warning, eErr_SEQ_FEAT_MissingGeneXref,
2079 "Feature overlapped by "
2080 + NStr::SizetToString(num_genes)
2081 + " identical-length genes but has no cross-reference");
2082 }
2083 } else if (num_genes == 1
2084 && prev_gene->GetData().GetGene().IsSetAllele()
2085 && !NStr::IsBlank(prev_gene->GetData().GetGene().GetAllele())) {
2086 const string& allele = prev_gene->GetData().GetGene().GetAllele();
2087 // overlapping gene should not conflict with allele qualifier
2088 FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, m_Feat) {
2089 const CGb_qual& qual = **qual_iter;
2090 if ( qual.IsSetQual() &&
2091 NStr::Compare(qual.GetQual(), "allele") == 0 ) {
2092 if ( qual.CanGetVal() &&
2093 NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
2094 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidAlleleDuplicates,
2095 "Redundant allele qualifier (" + allele +
2096 ") on gene and feature");
2097 } else if (m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation) {
2098 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MismatchedAllele,
2099 "Mismatched allele qualifier on gene (" + allele +
2100 ") and feature (" + qual.GetVal() +")");
2101 }
2102 }
2103 }
2104 }
2105 } else if ( !gene_xref->IsSuppressed() ) {
2106 // we are counting features with gene xrefs
2107 m_Imp.IncrementGeneXrefCount();
2108
2109 // make sure overlapping gene and gene xref do not conflict
2110 if (gene_xref->IsSetAllele() && !NStr::IsBlank(gene_xref->GetAllele())) {
2111 const string& allele = gene_xref->GetAllele();
2112
2113 FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, m_Feat) {
2114 const CGb_qual& qual = **qual_iter;
2115 if ( qual.CanGetQual() &&
2116 NStr::Compare(qual.GetQual(), "allele") == 0 ) {
2117 if ( qual.CanGetVal() &&
2118 NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
2119 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidAlleleDuplicates,
2120 "Redundant allele qualifier (" + allele +
2121 ") on gene and feature");
2122 } else if (m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation) {
2123 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MismatchedAllele,
2124 "Mismatched allele qualifier on gene (" + allele +
2125 ") and feature (" + qual.GetVal() +")");
2126 }
2127 }
2128 }
2129 }
2130
2131 if (num_match_by_locus == 0 && num_match_by_locus_tag == 0) {
2132 // find gene on bioseq to match genexref
2133 if ((gene_xref->IsSetLocus_tag() &&
2134 !NStr::IsBlank(gene_xref->GetLocus_tag())) ||
2135 (gene_xref->IsSetLocus() &&
2136 !NStr::IsBlank(gene_xref->GetLocus()))) {
2137 CConstRef<CSeq_feat> gene = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
2138 if (!gene && m_LocationBioseq && m_LocationBioseq.IsAa()) {
2139 const CSeq_feat* cds = GetCDSForProduct(m_LocationBioseq);
2140 if (cds != 0) {
2141 if (cds->IsSetLocation()) {
2142 const CSeq_loc& loc = cds->GetLocation();
2143 const CSeq_id* id = loc.GetId();
2144 if (id != NULL) {
2145 CBioseq_Handle nbsh = m_LocationBioseq.GetScope().GetBioseqHandle(*id);
2146 if (nbsh) {
2147 gene = m_Imp.GetGeneCache().GetGeneFromCache(cds, m_Scope);
2148 }
2149 }
2150 }
2151 }
2152 }
2153 string label;
2154 if (gene && !CSingleFeatValidator::s_GeneRefsAreEquivalent(*gene_xref, gene->GetData().GetGene(), label)) {
2155 gene.Reset(NULL);
2156 }
2157 if (gene_xref->IsSetLocus_tag() &&
2158 !NStr::IsBlank(gene_xref->GetLocus_tag()) &&
2159 !gene) {
2160 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneXrefWithoutGene,
2161 "Feature has gene locus_tag cross-reference but no equivalent gene feature exists");
2162 } else if (gene_xref->IsSetLocus() &&
2163 !NStr::IsBlank(gene_xref->GetLocus()) &&
2164 !gene) {
2165 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneXrefWithoutGene,
2166 "Feature has gene locus cross-reference but no equivalent gene feature exists");
2167 }
2168 }
2169 }
2170 }
2171
2172 }
2173
2174
x_ValidateNonGene()2175 void CSingleFeatValidator::x_ValidateNonGene()
2176 {
2177 if (m_Feat.GetData().IsGene()) {
2178 return;
2179 }
2180 x_ValidateGeneXRef();
2181
2182 if (m_Feat.IsSetQual()) {
2183 // check old locus tag on feature and overlapping gene
2184 for (auto it : m_Feat.GetQual()) {
2185 if (it->IsSetQual() && NStr::Equal(it->GetQual(), "old_locus_tag")
2186 && it->IsSetVal() && !NStr::IsBlank(it->GetVal())) {
2187 x_ValidateOldLocusTag(it->GetVal());
2188 }
2189 }
2190 }
2191 }
2192
2193
s_IsPseudo(const CGene_ref & ref)2194 bool CSingleFeatValidator::s_IsPseudo(const CGene_ref& ref)
2195 {
2196 if (ref.IsSetPseudo() && ref.GetPseudo()) {
2197 return true;
2198 } else {
2199 return false;
2200 }
2201 }
2202
2203
s_HasNamedQual(const CSeq_feat & feat,const string & qual)2204 bool s_HasNamedQual(const CSeq_feat& feat, const string& qual)
2205 {
2206 bool rval = false;
2207 if (feat.IsSetQual()) {
2208 for (auto it : feat.GetQual()) {
2209 if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), qual)) {
2210 rval = true;
2211 break;
2212 }
2213 }
2214 }
2215 return rval;
2216 }
2217
2218
s_IsPseudo(const CSeq_feat & feat)2219 bool CSingleFeatValidator::s_IsPseudo(const CSeq_feat& feat)
2220 {
2221 if (feat.IsSetPseudo() && feat.GetPseudo()) {
2222 return true;
2223 } else if (s_HasNamedQual(feat, "pseudogene")) {
2224 return true;
2225 } else if (feat.IsSetData() && feat.GetData().IsGene() &&
2226 s_IsPseudo(feat.GetData().GetGene())) {
2227 return true;
2228 } else {
2229 return false;
2230 }
2231 }
2232
2233
x_ValidateOldLocusTag(const string & old_locus_tag)2234 void CSingleFeatValidator::x_ValidateOldLocusTag(const string& old_locus_tag)
2235 {
2236 if (NStr::IsBlank(old_locus_tag)) {
2237 return;
2238 }
2239 bool pseudo = s_IsPseudo(m_Feat);
2240 const CGene_ref* grp = m_Feat.GetGeneXref();
2241 if ( !grp) {
2242 // check overlapping gene
2243 CConstRef<CSeq_feat> overlap = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
2244 if ( overlap ) {
2245 if (s_IsPseudo(*overlap)) {
2246 pseudo = true;
2247 }
2248 string gene_old_locus_tag;
2249
2250 FOR_EACH_GBQUAL_ON_SEQFEAT (it, *overlap) {
2251 if ((*it)->IsSetQual() && NStr::Equal ((*it)->GetQual(), "old_locus_tag")
2252 && (*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
2253 gene_old_locus_tag = (*it)->GetVal();
2254 break;
2255 }
2256 }
2257 if (!NStr::IsBlank (gene_old_locus_tag)
2258 && !NStr::EqualNocase (gene_old_locus_tag, old_locus_tag)) {
2259 PostErr (eDiag_Warning, eErr_SEQ_FEAT_OldLocusTagMismtach,
2260 "Old locus tag on feature (" + old_locus_tag
2261 + ") does not match that on gene (" + gene_old_locus_tag + ")");
2262 }
2263 grp = &(overlap->GetData().GetGene());
2264 }
2265 }
2266 if (grp && s_IsPseudo(*grp)) {
2267 pseudo = true;
2268 }
2269 if (grp == 0 || ! grp->IsSetLocus_tag() || NStr::IsBlank (grp->GetLocus_tag())) {
2270 if (! pseudo) {
2271 PostErr(eDiag_Error, eErr_SEQ_FEAT_OldLocusTagWithoutLocusTag,
2272 "old_locus_tag without inherited locus_tag");
2273 }
2274 }
2275 }
2276
2277
x_ValidateImpFeatLoc()2278 void CSingleFeatValidator::x_ValidateImpFeatLoc()
2279 {
2280 if (!m_Feat.GetData().IsImp()) {
2281 return;
2282 }
2283 const string& key = m_Feat.GetData().GetImp().GetKey();
2284 // validate the feature's location
2285 if ( m_Feat.GetData().GetImp().IsSetLoc() ) {
2286 const string& imp_loc = m_Feat.GetData().GetImp().GetLoc();
2287 if ( imp_loc.find("one-of") != string::npos ) {
2288 PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpFeatBadLoc,
2289 "ImpFeat loc " + imp_loc +
2290 " has obsolete 'one-of' text for feature " + key);
2291 } else if ( m_Feat.GetLocation().IsInt() ) {
2292 const CSeq_interval& seq_int = m_Feat.GetLocation().GetInt();
2293 string temp_loc = NStr::IntToString(seq_int.GetFrom() + 1) +
2294 ".." + NStr::IntToString(seq_int.GetTo() + 1);
2295 if ( imp_loc != temp_loc ) {
2296 PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpFeatBadLoc,
2297 "ImpFeat loc " + imp_loc + " does not equal feature location " +
2298 temp_loc + " for feature " + key);
2299 }
2300 }
2301 }
2302
2303 }
2304
2305
x_ValidateImpFeatQuals()2306 void CSingleFeatValidator::x_ValidateImpFeatQuals()
2307 {
2308 if (!m_Feat.GetData().IsImp()) {
2309 return;
2310 }
2311 const string& key = m_Feat.GetData().GetImp().GetKey();
2312
2313 // Make sure a feature has its mandatory qualifiers
2314 for (auto required : CSeqFeatData::GetMandatoryQualifiers(m_Feat.GetData().GetSubtype())) {
2315 bool found = false;
2316 if (m_Feat.IsSetQual()) {
2317 for (auto qual : m_Feat.GetQual()) {
2318 if (qual->IsSetQual() && CSeqFeatData::GetQualifierType(qual->GetQual()) == required) {
2319 found = true;
2320 break;
2321 }
2322 }
2323 if (!found && required == CSeqFeatData::eQual_citation) {
2324 if (m_Feat.IsSetCit()) {
2325 found = true;
2326 }
2327 else if (m_Feat.IsSetComment() && !NStr::IsBlank(m_Feat.GetComment())) {
2328 // RefSeq allows conflict with accession in comment instead of sfp->cit
2329 if (m_LocationBioseq) {
2330 FOR_EACH_SEQID_ON_BIOSEQ(it, *(m_LocationBioseq.GetCompleteBioseq())) {
2331 if ((*it)->IsOther()) {
2332 found = true;
2333 break;
2334 }
2335 }
2336 }
2337 }
2338 if (!found
2339 && (NStr::EqualNocase(key, "conflict")
2340 || NStr::EqualNocase(key, "old_sequence"))) {
2341 // compare qualifier can now substitute for citation qualifier for conflict and old_sequence
2342 FOR_EACH_GBQUAL_ON_FEATURE(qual, m_Feat) {
2343 if ((*qual)->IsSetQual() && CSeqFeatData::GetQualifierType((*qual)->GetQual()) == CSeqFeatData::eQual_compare) {
2344 found = true;
2345 break;
2346 }
2347 }
2348 }
2349 }
2350 }
2351 if (!found) {
2352 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingQualOnImpFeat,
2353 "Missing qualifier " + CSeqFeatData::GetQualifierAsString(required) +
2354 " for feature " + key);
2355 }
2356 }
2357 }
2358
2359
x_ValidateSeqFeatDataType()2360 void CSingleFeatValidator::x_ValidateSeqFeatDataType()
2361 {
2362 switch ( m_Feat.GetData().Which () ) {
2363 case CSeqFeatData::e_Gene:
2364 case CSeqFeatData::e_Cdregion:
2365 case CSeqFeatData::e_Prot:
2366 case CSeqFeatData::e_Rna:
2367 case CSeqFeatData::e_Pub:
2368 case CSeqFeatData::e_Imp:
2369 case CSeqFeatData::e_Biosrc:
2370 case CSeqFeatData::e_Org:
2371 case CSeqFeatData::e_Region:
2372 case CSeqFeatData::e_Seq:
2373 case CSeqFeatData::e_Comment:
2374 case CSeqFeatData::e_Bond:
2375 case CSeqFeatData::e_Site:
2376 case CSeqFeatData::e_Rsite:
2377 case CSeqFeatData::e_User:
2378 case CSeqFeatData::e_Txinit:
2379 case CSeqFeatData::e_Num:
2380 case CSeqFeatData::e_Psec_str:
2381 case CSeqFeatData::e_Non_std_residue:
2382 case CSeqFeatData::e_Het:
2383 case CSeqFeatData::e_Clone:
2384 case CSeqFeatData::e_Variation:
2385 break;
2386 default:
2387 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidType,
2388 "Invalid SeqFeat type [" + NStr::IntToString(m_Feat.GetData().Which()) + "]");
2389 break;
2390 }
2391 }
2392
2393
s_BioseqHasRefSeqThatStartsWithPrefix(CBioseq_Handle bsh,string prefix)2394 bool CSingleFeatValidator::s_BioseqHasRefSeqThatStartsWithPrefix (CBioseq_Handle bsh, string prefix)
2395 {
2396 bool rval = false;
2397 FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
2398 if ((*it)->IsOther() && (*it)->GetTextseq_Id()->IsSetAccession()
2399 && NStr::StartsWith ((*it)->GetTextseq_Id()->GetAccession(), prefix)) {
2400 rval = true;
2401 break;
2402 }
2403 }
2404 return rval;
2405 }
2406
2407
x_ReportPseudogeneConflict(CConstRef<CSeq_feat> gene)2408 void CSingleFeatValidator::x_ReportPseudogeneConflict(CConstRef <CSeq_feat> gene)
2409 {
2410 if (!m_Feat.IsSetData() ||
2411 (m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_mRNA &&
2412 m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_cdregion)) {
2413 return;
2414 }
2415 string sfp_pseudo;
2416 string gene_pseudo;
2417 bool has_sfp_pseudo = false;
2418 bool has_gene_pseudo = false;
2419 if (m_Feat.IsSetQual()) {
2420 for (auto it : m_Feat.GetQual()) {
2421 if (it->IsSetQual() &&
2422 NStr::EqualNocase(it->GetQual(), "pseudogene") &&
2423 it->IsSetVal()) {
2424 sfp_pseudo = it->GetVal();
2425 has_sfp_pseudo = true;
2426 }
2427 }
2428 }
2429 if (gene && gene->IsSetQual()) {
2430 for (auto it : gene->GetQual()) {
2431 if (it->IsSetQual() &&
2432 NStr::EqualNocase(it->GetQual(), "pseudogene") &&
2433 it->IsSetVal()) {
2434 gene_pseudo = it->GetVal();
2435 has_gene_pseudo = true;
2436 }
2437 }
2438 }
2439
2440 if (!has_sfp_pseudo && !has_gene_pseudo) {
2441 return;
2442 } else if (!has_sfp_pseudo) {
2443 return;
2444 } else if (has_sfp_pseudo && !has_gene_pseudo) {
2445 string msg = m_Feat.GetData().IsCdregion() ? "CDS" : "mRNA";
2446 msg += " has pseudogene qualifier, gene does not";
2447 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InconsistentPseudogeneValue,
2448 msg);
2449 } else if (!NStr::EqualNocase(sfp_pseudo, gene_pseudo)) {
2450 string msg = "Different pseudogene values on ";
2451 msg += m_Feat.GetData().IsCdregion() ? "CDS" : "mRNA";
2452 msg += " (" + sfp_pseudo + ") and gene (" + gene_pseudo + ")";
2453 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InconsistentPseudogeneValue,
2454 msg);
2455 }
2456 }
2457
2458
2459 // grp is from gene xref or from overlapping gene
x_ValidateLocusTagGeneralMatch(CConstRef<CSeq_feat> gene)2460 void CSingleFeatValidator::x_ValidateLocusTagGeneralMatch(CConstRef <CSeq_feat> gene)
2461 {
2462 if (!m_Imp.IsLocusTagGeneralMatch()) {
2463 return;
2464 }
2465 if (!m_Feat.IsSetProduct()) {
2466 return;
2467 }
2468
2469 CTempString locus_tag = kEmptyStr;
2470 // obtain the gene-ref from the feature or the overlapping gene
2471 const CGene_ref* grp = m_Feat.GetGeneXref();
2472 if (grp && grp->IsSuppressed()) {
2473 return;
2474 } else if (grp && grp->IsSetLocus_tag() &&
2475 !NStr::IsBlank(grp->GetLocus_tag())) {
2476 locus_tag = grp->GetLocus_tag();
2477 } else if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
2478 !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
2479 locus_tag = gene->GetData().GetGene().GetLocus_tag();
2480 } else {
2481 return;
2482 }
2483
2484 if (!m_ProductBioseq) {
2485 return;
2486 }
2487
2488 for (auto id : m_ProductBioseq.GetId()) {
2489 CConstRef<CSeq_id> seqid = id.GetSeqId();
2490 if (!seqid || !seqid->IsGeneral()) {
2491 continue;
2492 }
2493 const CDbtag& dbt = seqid->GetGeneral();
2494 if (!dbt.IsSetDb() || dbt.IsSkippable()) {
2495 continue;
2496 }
2497
2498 if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
2499 SIZE_TYPE pos = dbt.GetTag().GetStr().find('-');
2500 string str = dbt.GetTag().GetStr().substr(0, pos);
2501 if (!NStr::EqualNocase(locus_tag, str)) {
2502 PostErr(eDiag_Error, eErr_SEQ_FEAT_LocusTagProductMismatch,
2503 "Gene locus_tag does not match general ID of product");
2504 }
2505 }
2506 }
2507 }
2508
2509
x_CheckForNonAsciiCharacters()2510 void CSingleFeatValidator::x_CheckForNonAsciiCharacters()
2511 {
2512 CStdTypeConstIterator<string> it(m_Feat);
2513
2514 for (; it; ++it) {
2515 const string& str = *it;
2516 FOR_EACH_CHAR_IN_STRING(c_it, str) {
2517 const char& ch = *c_it;
2518 unsigned char chu = ch;
2519 if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
2520 PostErr(eDiag_Fatal, eErr_GENERIC_NonAsciiAsn,
2521 "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in feature (" + str + ")");
2522 break;
2523 }
2524 }
2525 }
2526 }
2527
Validate()2528 void CProtValidator::Validate()
2529 {
2530 CSingleFeatValidator::Validate();
2531
2532 x_CheckForEmpty();
2533
2534 const CProt_ref& prot = m_Feat.GetData().GetProt();
2535 for (auto it : prot.GetName()) {
2536 if (prot.IsSetEc() && !prot.IsSetProcessed()
2537 && (NStr::EqualCase (it, "Hypothetical protein")
2538 || NStr::EqualCase (it, "hypothetical protein")
2539 || NStr::EqualCase (it, "Unknown protein")
2540 || NStr::EqualCase (it, "unknown protein"))) {
2541 PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadProteinName,
2542 "Unknown or hypothetical protein should not have EC number");
2543 }
2544
2545 }
2546
2547 if (prot.IsSetDesc() && ContainsSgml(prot.GetDesc())) {
2548 PostErr (eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
2549 "protein description " + prot.GetDesc() + " has SGML");
2550 }
2551
2552 if (prot.IsSetDesc() && m_Feat.IsSetComment()
2553 && NStr::EqualCase(prot.GetDesc(), m_Feat.GetComment())) {
2554 PostErr (eDiag_Warning, eErr_SEQ_FEAT_RedundantFields,
2555 "Comment has same value as protein description");
2556 }
2557
2558 if (m_Feat.IsSetComment() && HasECnumberPattern(m_Feat.GetComment())) {
2559 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberInProteinComment,
2560 "Apparent EC number in protein comment");
2561 }
2562
2563 x_ReportUninformativeNames();
2564
2565 // only look for EC numbers in first protein name
2566 // only look for brackets and hypothetical protein XP_ in first protein name
2567 if (prot.IsSetName() && prot.GetName().size() > 0) {
2568 if (HasECnumberPattern(prot.GetName().front())) {
2569 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberInProteinName,
2570 "Apparent EC number in protein title");
2571 }
2572 x_ValidateProteinName(prot.GetName().front());
2573 }
2574
2575 if ( prot.CanGetDb () ) {
2576 m_Imp.ValidateDbxref(prot.GetDb(), m_Feat);
2577 }
2578 if ( (!prot.IsSetName() || prot.GetName().empty()) &&
2579 (!prot.IsSetProcessed()
2580 || (prot.GetProcessed() != CProt_ref::eProcessed_signal_peptide
2581 && prot.GetProcessed() != CProt_ref::eProcessed_transit_peptide))) {
2582 if (prot.IsSetDesc() && !NStr::IsBlank (prot.GetDesc())) {
2583 PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
2584 "Protein feature has description but no name");
2585 } else if (prot.IsSetActivity() && !prot.GetActivity().empty()) {
2586 PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
2587 "Protein feature has function but no name");
2588 } else if (prot.IsSetEc() && !prot.GetEc().empty()) {
2589 PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
2590 "Protein feature has EC number but no name");
2591 } else {
2592 PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
2593 "Protein feature has no name");
2594 }
2595 }
2596
2597 x_ValidateECNumbers();
2598
2599 x_ValidateMolinfoPartials();
2600 }
2601
2602
x_CheckForEmpty()2603 void CProtValidator::x_CheckForEmpty()
2604 {
2605 const CProt_ref& prot = m_Feat.GetData().GetProt();
2606 CProt_ref::EProcessed processed = CProt_ref::eProcessed_not_set;
2607
2608 if ( prot.IsSetProcessed() ) {
2609 processed = prot.GetProcessed();
2610 }
2611
2612 bool empty = true;
2613 if ( processed != CProt_ref::eProcessed_signal_peptide &&
2614 processed != CProt_ref::eProcessed_transit_peptide ) {
2615 if ( prot.IsSetName() &&
2616 !prot.GetName().empty() &&
2617 !prot.GetName().front().empty() ) {
2618 empty = false;
2619 }
2620 if ( prot.CanGetDesc() && !prot.GetDesc().empty() ) {
2621 empty = false;
2622 }
2623 if ( prot.CanGetEc() && !prot.GetEc().empty() ) {
2624 empty = false;
2625 }
2626 if ( prot.CanGetActivity() && !prot.GetActivity().empty() ) {
2627 empty = false;
2628 }
2629 if ( prot.CanGetDb() && !prot.GetDb().empty() ) {
2630 empty = false;
2631 }
2632
2633 if ( empty ) {
2634 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProtRefHasNoData,
2635 "There is a protein feature where all fields are empty");
2636 }
2637 }
2638 }
2639
2640
2641 // note - list bad protein names in lower case, as search term is converted to lower case
2642 // before looking for exact match
2643 static const char* const sc_BadProtNameText [] = {
2644 "'hypothetical protein",
2645 "alpha",
2646 "alternative",
2647 "alternatively spliced",
2648 "bacteriophage hypothetical protein",
2649 "beta",
2650 "cellular",
2651 "cnserved hypothetical protein",
2652 "conesrved hypothetical protein",
2653 "conserevd hypothetical protein",
2654 "conserved archaeal protein",
2655 "conserved domain protein",
2656 "conserved hypohetical protein",
2657 "conserved hypotehtical protein",
2658 "conserved hypotheical protein",
2659 "conserved hypothertical protein",
2660 "conserved hypothetcial protein",
2661 "conserved hypothetical",
2662 "conserved hypothetical exported protein",
2663 "conserved hypothetical integral membrane protein",
2664 "conserved hypothetical membrane protein",
2665 "conserved hypothetical phage protein",
2666 "conserved hypothetical prophage protein",
2667 "conserved hypothetical protein",
2668 "conserved hypothetical protein - phage associated",
2669 "conserved hypothetical protein fragment 3",
2670 "conserved hypothetical protein, fragment",
2671 "conserved hypothetical protein, putative",
2672 "conserved hypothetical protein, truncated",
2673 "conserved hypothetical protein, truncation",
2674 "conserved hypothetical protein.",
2675 "conserved hypothetical protein; possible membrane protein",
2676 "conserved hypothetical protein; putative membrane protein",
2677 "conserved hypothetical proteins",
2678 "conserved hypothetical protien",
2679 "conserved hypothetical transmembrane protein",
2680 "conserved hypotheticcal protein",
2681 "conserved hypthetical protein",
2682 "conserved in bacteria",
2683 "conserved membrane protein",
2684 "conserved protein",
2685 "conserved protein of unknown function",
2686 "conserved protein of unknown function ; putative membrane protein",
2687 "conserved unknown protein",
2688 "conservedhypothetical protein",
2689 "conserverd hypothetical protein",
2690 "conservered hypothetical protein",
2691 "consrved hypothetical protein",
2692 "converved hypothetical protein",
2693 "cytokine",
2694 "delta",
2695 "drosophila",
2696 "duplicated hypothetical protein",
2697 "epsilon",
2698 "gamma",
2699 "hla",
2700 "homeodomain",
2701 "homeodomain protein",
2702 "homolog",
2703 "hyopthetical protein",
2704 "hypotethical",
2705 "hypotheical protein",
2706 "hypothertical protein",
2707 "hypothetcical protein",
2708 "hypothetical",
2709 "hypothetical protein",
2710 "hypothetical conserved protein",
2711 "hypothetical exported protein",
2712 "hypothetical novel protein",
2713 "hypothetical orf",
2714 "hypothetical phage protein",
2715 "hypothetical prophage protein",
2716 "hypothetical protein (fragment)",
2717 "hypothetical protein (multi-domain)",
2718 "hypothetical protein (phage associated)",
2719 "hypothetical protein - phage associated",
2720 "hypothetical protein fragment",
2721 "hypothetical protein fragment 1",
2722 "hypothetical protein predicted by genemark",
2723 "hypothetical protein predicted by glimmer",
2724 "hypothetical protein predicted by glimmer/critica",
2725 "hypothetical protein, conserved",
2726 "hypothetical protein, phage associated",
2727 "hypothetical protein, truncated",
2728 "hypothetical protein-putative conserved hypothetical protein",
2729 "hypothetical protein.",
2730 "hypothetical proteins",
2731 "hypothetical protien",
2732 "hypothetical transmembrane protein",
2733 "hypothetoical protein",
2734 "hypothteical protein",
2735 "identified by sequence similarity; putative; orf located~using blastx/framed",
2736 "identified by sequence similarity; putative; orf located~using blastx/glimmer/genemark",
2737 "ion channel",
2738 "membrane protein, putative",
2739 "mouse",
2740 "narrowly conserved hypothetical protein",
2741 "novel protein",
2742 "orf",
2743 "orf, conserved hypothetical protein",
2744 "orf, hypothetical",
2745 "orf, hypothetical protein",
2746 "orf, hypothetical, fragment",
2747 "orf, partial conserved hypothetical protein",
2748 "orf; hypothetical protein",
2749 "orf; unknown function",
2750 "partial",
2751 "partial cds, hypothetical",
2752 "partially conserved hypothetical protein",
2753 "phage hypothetical protein",
2754 "phage-related conserved hypothetical protein",
2755 "phage-related protein",
2756 "plasma",
2757 "possible hypothetical protein",
2758 "precursor",
2759 "predicted coding region",
2760 "predicted protein",
2761 "predicted protein (pseudogene)",
2762 "predicted protein family",
2763 "product uncharacterised protein family",
2764 "protein family",
2765 "protein of unknown function",
2766 "pseudogene",
2767 "putative",
2768 "putative conserved protein",
2769 "putative exported protein",
2770 "putative hypothetical protein",
2771 "putative membrane protein",
2772 "putative orf; unknown function",
2773 "putative phage protein",
2774 "putative protein",
2775 "rearranged",
2776 "repeats containing protein",
2777 "reserved",
2778 "ribosomal protein",
2779 "similar to",
2780 "small",
2781 "small hypothetical protein",
2782 "transmembrane protein",
2783 "trna",
2784 "trp repeat",
2785 "trp-repeat protein",
2786 "truncated conserved hypothetical protein",
2787 "truncated hypothetical protein",
2788 "uncharacterized conserved membrane protein",
2789 "uncharacterized conserved protein",
2790 "uncharacterized conserved secreted protein",
2791 "uncharacterized protein",
2792 "uncharacterized protein conserved in archaea",
2793 "uncharacterized protein conserved in bacteria",
2794 "unique hypothetical",
2795 "unique hypothetical protein",
2796 "unknown",
2797 "unknown cds",
2798 "unknown function",
2799 "unknown gene",
2800 "unknown protein",
2801 "unknown, conserved protein",
2802 "unknown, hypothetical",
2803 "unknown-related protein",
2804 "unknown; predicted coding region",
2805 "unnamed",
2806 "unnamed protein product",
2807 "very hypothetical protein"
2808 };
2809 typedef CStaticArraySet<const char*, PCase_CStr> TBadProtNameSet;
2810 DEFINE_STATIC_ARRAY_MAP(TBadProtNameSet, sc_BadProtName, sc_BadProtNameText);
2811
2812
x_ReportUninformativeNames()2813 void CProtValidator::x_ReportUninformativeNames()
2814 {
2815 if (!m_Imp.IsRefSeq()) {
2816 return;
2817 }
2818 const CProt_ref& prot = m_Feat.GetData().GetProt();
2819 if (!prot.IsSetName()) {
2820 if (!prot.IsSetProcessed() ||
2821 (prot.GetProcessed() != CProt_ref::eProcessed_signal_peptide &&
2822 prot.GetProcessed() != CProt_ref::eProcessed_transit_peptide)) {
2823 PostErr (eDiag_Error, eErr_SEQ_FEAT_NoNameForProtein,
2824 "Protein name is not set");
2825 }
2826 return;
2827 }
2828 for (auto it : m_Feat.GetData().GetProt().GetName()) {
2829 string search = it;
2830 search = NStr::ToLower(search);
2831 if (search.empty()) {
2832 PostErr (eDiag_Error, eErr_SEQ_FEAT_NoNameForProtein,
2833 "Protein name is empty");
2834 } else if (sc_BadProtName.find (search.c_str()) != sc_BadProtName.end()
2835 || NStr::Find(search, "=") != string::npos
2836 || NStr::Find(search, "~") != string::npos
2837 || NStr::FindNoCase(search, "uniprot") != string::npos
2838 || NStr::FindNoCase(search, "uniprotkb") != string::npos
2839 || NStr::FindNoCase(search, "pmid") != string::npos
2840 || NStr::FindNoCase(search, "dbxref") != string::npos) {
2841 PostErr (eDiag_Warning, eErr_SEQ_FEAT_UndesiredProteinName,
2842 "Uninformative protein name '" + it + "'");
2843 }
2844 }
2845 }
2846
2847
x_ValidateECNumbers()2848 void CProtValidator::x_ValidateECNumbers()
2849 {
2850 if (!m_Feat.GetData().GetProt().IsSetEc()) {
2851 return;
2852 }
2853 for (auto it : m_Feat.GetData().GetProt().GetEc()) {
2854 if (NStr::IsBlank (it)) {
2855 PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberEmpty, "EC number should not be empty");
2856 } else if (!CProt_ref::IsValidECNumberFormat(it)) {
2857 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadEcNumberFormat,
2858 (it) + " is not in proper EC_number format");
2859 } else {
2860 const string& ec_number = it;
2861 CProt_ref::EECNumberStatus status = CProt_ref::GetECNumberStatus (ec_number);
2862 x_ReportECNumFileStatus();
2863 switch (status) {
2864 case CProt_ref::eEC_deleted:
2865 PostErr(eDiag_Warning, eErr_SEQ_FEAT_DeletedEcNumber,
2866 "EC_number " + ec_number + " was deleted");
2867 break;
2868 case CProt_ref::eEC_replaced:
2869 PostErr (eDiag_Warning,
2870 CProt_ref::IsECNumberSplit(ec_number) ? eErr_SEQ_FEAT_SplitEcNumber : eErr_SEQ_FEAT_ReplacedEcNumber,
2871 "EC_number " + ec_number + " was transferred and is no longer valid");
2872 break;
2873 case CProt_ref::eEC_unknown:
2874 {
2875 size_t pos = NStr::Find (ec_number, "n");
2876 if (pos == string::npos || !isdigit (ec_number.c_str()[pos + 1])) {
2877 PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadEcNumberValue,
2878 ec_number + " is not a legal value for qualifier EC_number");
2879 } else {
2880 PostErr (eDiag_Info, eErr_SEQ_FEAT_BadEcNumberValue,
2881 ec_number + " is not a legal preliminary value for qualifier EC_number");
2882 }
2883 }
2884 break;
2885 default:
2886 break;
2887 }
2888 }
2889 }
2890
2891 }
2892
2893
x_ValidateProteinName(const string & prot_name)2894 void CProtValidator::x_ValidateProteinName(const string& prot_name)
2895 {
2896 if (NStr::EndsWith(prot_name, "]")) {
2897 bool report_name = true;
2898 size_t pos = NStr::Find(prot_name, "[", NStr::eNocase, NStr::eReverseSearch);
2899 if (pos == string::npos) {
2900 // no disqualifying text
2901 } else if (prot_name.length() - pos < 5) {
2902 // no disqualifying text
2903 } else if (NStr::EqualCase(prot_name, pos, 4, "[NAD")) {
2904 report_name = false;
2905 }
2906 if (!m_Imp.IsEmbl() && !m_Imp.IsTPE()) {
2907 if (report_name) {
2908 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProteinNameEndsInBracket,
2909 "Protein name ends with bracket and may contain organism name");
2910 }
2911 }
2912 }
2913 if (NStr::StartsWith(prot_name, "hypothetical protein XP_") && m_LocationBioseq) {
2914 for (auto id_it : m_LocationBioseq.GetCompleteBioseq()->GetId()) {
2915 if (id_it->IsOther()
2916 && id_it->GetOther().IsSetAccession()
2917 && !NStr::EqualNocase(id_it->GetOther().GetAccession(),
2918 prot_name.substr(21))) {
2919 PostErr(eDiag_Warning, eErr_SEQ_FEAT_HypotheticalProteinMismatch,
2920 "Hypothetical protein reference does not match accession");
2921 }
2922 }
2923 }
2924 if (!m_Imp.IsRefSeq() && NStr::FindNoCase(prot_name, "RefSeq") != string::npos) {
2925 PostErr(eDiag_Error, eErr_SEQ_FEAT_RefSeqInText, "Protein name contains 'RefSeq'");
2926 }
2927 if (m_Feat.IsSetComment() && NStr::EqualCase(m_Feat.GetComment(), prot_name)) {
2928 PostErr(eDiag_Warning, eErr_SEQ_FEAT_RedundantFields,
2929 "Comment has same value as protein name");
2930 }
2931
2932 if (s_StringHasPMID(prot_name)) {
2933 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProteinNameHasPMID,
2934 "Protein name has internal PMID");
2935 }
2936
2937 if (m_Imp.DoRubiscoTest()) {
2938 if (NStr::FindCase(prot_name, "ribulose") != string::npos
2939 && NStr::FindCase(prot_name, "bisphosphate") != string::npos
2940 && NStr::FindCase(prot_name, "methyltransferase") == string::npos
2941 && NStr::FindCase(prot_name, "activase") == string::npos) {
2942 if (NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase")) {
2943 // allow standard name without large or small subunit designation - later need kingdom test
2944 } else if (!NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit")
2945 && !NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit")) {
2946 PostErr(eDiag_Warning, eErr_SEQ_FEAT_RubiscoProblem,
2947 "Nonstandard ribulose bisphosphate protein name");
2948 }
2949 }
2950 }
2951
2952
2953
2954 ValidateCharactersInField(prot_name, "Protein name");
2955 if (ContainsSgml(prot_name)) {
2956 PostErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
2957 "protein name " + prot_name + " has SGML");
2958 }
2959
2960 }
2961
2962
x_ValidateMolinfoPartials()2963 void CProtValidator::x_ValidateMolinfoPartials()
2964 {
2965 if (!m_LocationBioseq) {
2966 return;
2967 }
2968 const CBioseq& pbioseq = *(m_LocationBioseq.GetCompleteBioseq());
2969 // if there is a coding region for this bioseq, this type of error
2970 // will be handled there
2971 const CSeq_feat* cds = m_Imp.GetCDSGivenProduct(pbioseq);
2972 if (cds) return;
2973 CFeat_CI prot(m_LocationBioseq, CSeqFeatData::eSubtype_prot);
2974 if (! prot) return;
2975
2976 CSeqdesc_CI mi_i(m_LocationBioseq, CSeqdesc::e_Molinfo);
2977 if (! mi_i) return;
2978 const CMolInfo& mi = mi_i->GetMolinfo();
2979 if (! mi.IsSetCompleteness()) return;
2980 int completeness = mi.GetCompleteness();
2981
2982 const CSeq_loc& prot_loc = prot->GetLocation();
2983 bool prot_partial5 = prot_loc.IsPartialStart(eExtreme_Biological);
2984 bool prot_partial3 = prot_loc.IsPartialStop(eExtreme_Biological);
2985
2986 bool conflict = false;
2987 if (completeness == CMolInfo::eCompleteness_partial && ((! prot_partial5) && (! prot_partial3))) {
2988 conflict = true;
2989 } else if (completeness == CMolInfo::eCompleteness_no_left && ((! prot_partial5) || prot_partial3)) {
2990 conflict = true;
2991 } else if (completeness == CMolInfo::eCompleteness_no_right && (prot_partial5 || (! prot_partial3))) {
2992 conflict = true;
2993 } else if (completeness == CMolInfo::eCompleteness_no_ends && ((! prot_partial5) || (! prot_partial3))) {
2994 conflict = true;
2995 } else if ((completeness < CMolInfo::eCompleteness_partial || completeness > CMolInfo::eCompleteness_no_ends) && (prot_partial5 || prot_partial3)) {
2996 conflict = true;
2997 }
2998
2999 if (conflict) {
3000 PostErr (eDiag_Warning, eErr_SEQ_FEAT_PartialsInconsistent,
3001 "Molinfo completeness and protein feature partials conflict");
3002 }
3003 }
3004
Validate()3005 void CRNAValidator::Validate()
3006 {
3007 CSingleFeatValidator::Validate();
3008
3009 const CRNA_ref& rna = m_Feat.GetData().GetRna();
3010
3011 CRNA_ref::EType rna_type = CRNA_ref::eType_unknown;
3012 if (rna.IsSetType()) {
3013 rna_type = rna.GetType();
3014 }
3015
3016 if (rna_type == CRNA_ref::eType_rRNA) {
3017 if (rna.CanGetExt() && rna.GetExt().IsName()) {
3018 const string& rna_name = rna.GetExt().GetName();
3019 ValidateCharactersInField (rna_name, "rRNA name");
3020 if (ContainsSgml(rna_name)) {
3021 PostErr (eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
3022 "rRNA name " + rna_name + " has SGML");
3023 }
3024 }
3025 }
3026
3027 x_ValidateTrnaData();
3028 x_ValidateTrnaType();
3029
3030 bool feat_pseudo = s_IsPseudo(m_Feat);
3031 bool pseudo = feat_pseudo;
3032 if (!pseudo) {
3033 CConstRef<CSeq_feat> gene = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
3034 if (gene) {
3035 pseudo = s_IsPseudo(*gene);
3036 }
3037 }
3038
3039 if (!pseudo) {
3040 x_ValidateRnaTrans();
3041 }
3042
3043 x_ValidateRnaProduct(feat_pseudo, pseudo);
3044
3045 if (rna_type == CRNA_ref::eType_rRNA
3046 || rna_type == CRNA_ref::eType_snRNA
3047 || rna_type == CRNA_ref::eType_scRNA
3048 || rna_type == CRNA_ref::eType_snoRNA) {
3049 if (!rna.IsSetExt() || !rna.GetExt().IsName() || NStr::IsBlank(rna.GetExt().GetName())) {
3050 if (!pseudo) {
3051 string rna_typename = CRNA_ref::GetRnaTypeName(rna_type);
3052 PostErr(eDiag_Warning, eErr_SEQ_FEAT_rRNADoesNotHaveProduct,
3053 rna_typename + " has no name");
3054 }
3055 }
3056 }
3057
3058
3059 if ( rna_type == CRNA_ref::eType_unknown ) {
3060 PostErr(eDiag_Warning, eErr_SEQ_FEAT_RNAtype0,
3061 "RNA type 0 (unknown) not supported");
3062 }
3063
3064
3065 }
3066
3067
x_ValidateRnaProduct(bool feat_pseudo,bool pseudo)3068 void CRNAValidator::x_ValidateRnaProduct(bool feat_pseudo, bool pseudo)
3069 {
3070 if (!m_Feat.IsSetProduct()) {
3071 return;
3072 }
3073
3074 x_ValidateRnaProductType();
3075
3076 if ((!m_Feat.IsSetExcept_text()
3077 || NStr::FindNoCase (m_Feat.GetExcept_text(), "transcribed pseudogene") == string::npos)
3078 && !m_Imp.IsRefSeq()) {
3079 if (feat_pseudo) {
3080 PostErr (eDiag_Warning, eErr_SEQ_FEAT_PseudoRnaHasProduct,
3081 "A pseudo RNA should not have a product");
3082 } else if (pseudo) {
3083 PostErr (eDiag_Warning, eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct,
3084 "An RNA overlapped by a pseudogene should not have a product");
3085 }
3086 }
3087
3088 }
3089
3090
x_ValidateRnaProductType()3091 void CRNAValidator::x_ValidateRnaProductType()
3092 {
3093 if ( !m_Feat.GetData().GetRna().IsSetType() || !m_ProductBioseq ) {
3094 return;
3095 }
3096 CSeqdesc_CI di(m_ProductBioseq, CSeqdesc::e_Molinfo);
3097 if ( !di ) {
3098 return;
3099 }
3100 const CMolInfo& mol_info = di->GetMolinfo();
3101 if ( !mol_info.CanGetBiomol() ) {
3102 return;
3103 }
3104 int biomol = mol_info.GetBiomol();
3105
3106 switch ( m_Feat.GetData().GetRna().GetType() ) {
3107
3108 case CRNA_ref::eType_mRNA:
3109 if ( biomol == CMolInfo::eBiomol_mRNA ) {
3110 return;
3111 }
3112 break;
3113
3114 case CRNA_ref::eType_tRNA:
3115 if ( biomol == CMolInfo::eBiomol_tRNA ) {
3116 return;
3117 }
3118 break;
3119
3120 case CRNA_ref::eType_rRNA:
3121 if ( biomol == CMolInfo::eBiomol_rRNA ) {
3122 return;
3123 }
3124 break;
3125
3126 default:
3127 return;
3128 }
3129
3130 PostErr(eDiag_Error, eErr_SEQ_FEAT_RnaProductMismatch,
3131 "Type of RNA does not match MolInfo of product Bioseq");
3132 }
3133
3134
s_IsBioseqPartial(CBioseq_Handle bsh)3135 static bool s_IsBioseqPartial (CBioseq_Handle bsh)
3136 {
3137 CSeqdesc_CI sd(bsh, CSeqdesc::e_Molinfo);
3138 if ( !sd ) {
3139 return false;
3140 }
3141 const CMolInfo& molinfo = sd->GetMolinfo();
3142 if (!molinfo.IsSetCompleteness ()) {
3143 return false;
3144 }
3145 CMolInfo::TCompleteness completeness = molinfo.GetCompleteness();
3146 if (completeness == CMolInfo::eCompleteness_partial
3147 || completeness == CMolInfo::eCompleteness_no_ends
3148 || completeness == CMolInfo::eCompleteness_no_left
3149 || completeness == CMolInfo::eCompleteness_no_right) {
3150 return true;
3151 } else {
3152 return false;
3153 }
3154 }
3155
3156
x_ValidateTrnaData()3157 void CRNAValidator::x_ValidateTrnaData()
3158 {
3159 if (!m_Feat.GetData().GetRna().IsSetExt() || !m_Feat.GetData().GetRna().GetExt().IsTRNA()) {
3160 return;
3161 }
3162 if ( !m_Feat.GetData().GetRna().IsSetType() ||
3163 m_Feat.GetData().GetRna().GetType() != CRNA_ref::eType_tRNA ) {
3164 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidTRNAdata,
3165 "tRNA data structure on non-tRNA feature");
3166 }
3167
3168 const CTrna_ext& trna = m_Feat.GetData().GetRna().GetExt ().GetTRNA ();
3169 if ( trna.CanGetAnticodon () ) {
3170 const CSeq_loc& anticodon = trna.GetAnticodon();
3171 size_t anticodon_len = GetLength(anticodon, &m_Scope);
3172 if ( anticodon_len != 3 ) {
3173 PostErr (eDiag_Warning, eErr_SEQ_FEAT_tRNArange,
3174 "Anticodon is not 3 bases in length");
3175 }
3176 ECompare comp = sequence::Compare(anticodon,
3177 m_Feat.GetLocation(),
3178 &m_Scope,
3179 sequence::fCompareOverlapping);
3180 if ( comp != eContained && comp != eSame ) {
3181 PostErr (eDiag_Error, eErr_SEQ_FEAT_tRNArange,
3182 "Anticodon location not in tRNA");
3183 }
3184 x_ValidateAnticodon(anticodon);
3185 }
3186 x_ValidateTrnaCodons();
3187
3188 }
3189
3190
x_ValidateTrnaType()3191 void CRNAValidator::x_ValidateTrnaType()
3192 {
3193 if (!m_Feat.GetData().GetRna().IsSetType() ||
3194 m_Feat.GetData().GetRna().GetType() != CRNA_ref::eType_tRNA) {
3195 return;
3196 }
3197 const CRNA_ref& rna = m_Feat.GetData().GetRna();
3198
3199 // check for unparsed qualifiers
3200 for (auto& gbqual : m_Feat.GetQual()) {
3201 if ( NStr::CompareNocase(gbqual->GetQual (), "anticodon") == 0 ) {
3202 PostErr(eDiag_Error, eErr_SEQ_FEAT_UnparsedtRNAAnticodon,
3203 "Unparsed anticodon qualifier in tRNA");
3204 } else if (NStr::CompareNocase (gbqual->GetQual (), "product") == 0 ) {
3205 if (NStr::CompareNocase (gbqual->GetVal (), "tRNA-fMet") != 0 &&
3206 NStr::CompareNocase (gbqual->GetVal (), "tRNA-iMet") != 0 &&
3207 NStr::CompareNocase (gbqual->GetVal (), "tRNA-Ile2") != 0) {
3208 PostErr(eDiag_Error, eErr_SEQ_FEAT_UnparsedtRNAProduct,
3209 "Unparsed product qualifier in tRNA");
3210 }
3211 }
3212 }
3213
3214
3215 /* tRNA with string extension */
3216 if ( rna.IsSetExt() &&
3217 rna.GetExt().Which () == CRNA_ref::C_Ext::e_Name ) {
3218 PostErr(eDiag_Error, eErr_SEQ_FEAT_UnparsedtRNAProduct,
3219 "Unparsed product qualifier in tRNA");
3220 } else if (!rna.IsSetExt() || rna.GetExt().Which() == CRNA_ref::C_Ext::e_not_set ) {
3221 PostErr (eDiag_Warning, eErr_SEQ_FEAT_MissingTrnaAA,
3222 "Missing encoded amino acid qualifier in tRNA");
3223 }
3224
3225 x_ValidateTrnaOverlap();
3226 }
3227
3228
x_ValidateAnticodon(const CSeq_loc & anticodon)3229 void CRNAValidator::x_ValidateAnticodon(const CSeq_loc& anticodon)
3230 {
3231 bool ordered = true;
3232 bool adjacent = false;
3233 bool unmarked_strand = false;
3234 bool mixed_strand = false;
3235
3236 CSeq_loc_CI prev;
3237 for (CSeq_loc_CI curr(anticodon); curr; ++curr) {
3238 bool chk = true;
3239 if (curr.GetEmbeddingSeq_loc().IsInt()) {
3240 chk = sequence::IsValid(curr.GetEmbeddingSeq_loc().GetInt(), &m_Scope);
3241 } else if (curr.GetEmbeddingSeq_loc().IsPnt()) {
3242 chk = sequence::IsValid(curr.GetEmbeddingSeq_loc().GetPnt(), &m_Scope);
3243 } else {
3244 continue;
3245 }
3246
3247 if ( !chk ) {
3248 string lbl;
3249 curr.GetEmbeddingSeq_loc().GetLabel(&lbl);
3250 PostErr(eDiag_Critical, eErr_SEQ_FEAT_tRNArange,
3251 "Anticodon location [" + lbl + "] out of range");
3252 }
3253
3254 if ( prev && curr &&
3255 IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), &m_Scope) ) {
3256 CSeq_loc_CI::TRange prev_range = prev.GetRange();
3257 CSeq_loc_CI::TRange curr_range = curr.GetRange();
3258 if ( ordered ) {
3259 if ( curr.GetStrand() == eNa_strand_minus ) {
3260 if (prev_range.GetTo() < curr_range.GetTo()) {
3261 ordered = false;
3262 }
3263 if (curr_range.GetTo() + 1 == prev_range.GetFrom()) {
3264 adjacent = true;
3265 }
3266 } else {
3267 if (prev_range.GetTo() > curr_range.GetTo()) {
3268 ordered = false;
3269 }
3270 if (prev_range.GetTo() + 1 == curr_range.GetFrom()) {
3271 adjacent = true;
3272 }
3273 }
3274 }
3275 ENa_strand curr_strand = curr.GetStrand();
3276 ENa_strand prev_strand = prev.GetStrand();
3277 if ( curr_range == prev_range && curr_strand == prev_strand ) {
3278 PostErr(eDiag_Warning, eErr_SEQ_FEAT_DuplicateAnticodonInterval,
3279 "Duplicate anticodon exons in location");
3280 }
3281 if ( curr_strand != prev_strand ) {
3282 if (curr_strand == eNa_strand_plus && prev_strand == eNa_strand_unknown) {
3283 unmarked_strand = true;
3284 } else if (curr_strand == eNa_strand_unknown && prev_strand == eNa_strand_plus) {
3285 unmarked_strand = true;
3286 } else {
3287 mixed_strand = true;
3288 }
3289 }
3290 }
3291 prev = curr;
3292 }
3293 if (adjacent) {
3294 PostErr(eDiag_Warning, eErr_SEQ_FEAT_AbuttingIntervals,
3295 "Adjacent intervals in Anticodon");
3296 }
3297
3298 ENa_strand loc_strand = m_Feat.GetLocation().GetStrand();
3299 ENa_strand ac_strand = anticodon.GetStrand();
3300 if (loc_strand == eNa_strand_minus && ac_strand != eNa_strand_minus) {
3301 PostErr (eDiag_Error, eErr_SEQ_FEAT_AnticodonStrandConflict,
3302 "Anticodon strand and tRNA strand do not match.");
3303 } else if (loc_strand != eNa_strand_minus && ac_strand == eNa_strand_minus) {
3304 PostErr (eDiag_Error, eErr_SEQ_FEAT_AnticodonStrandConflict,
3305 "Anticodon strand and tRNA strand do not match.");
3306 }
3307
3308 // trans splicing exception turns off both mixed_strand and out_of_order messages
3309 bool trans_splice = false;
3310 if (m_Feat.CanGetExcept() && m_Feat.GetExcept() && m_Feat.CanGetExcept_text()) {
3311 if (NStr::FindNoCase(m_Feat.GetExcept_text(), "trans-splicing") != NPOS) {
3312 trans_splice = true;
3313 }
3314 }
3315 if (!trans_splice) {
3316 string loc_lbl;
3317 anticodon.GetLabel(&loc_lbl);
3318 if (mixed_strand) {
3319 PostErr(eDiag_Warning, eErr_SEQ_FEAT_AnticodonMixedStrand,
3320 "Mixed strands in Anticodon [" + loc_lbl + "]");
3321 }
3322 if (unmarked_strand) {
3323 PostErr(eDiag_Warning, eErr_SEQ_FEAT_AnticodonMixedStrand,
3324 "Mixed plus and unknown strands in Anticodon [" + loc_lbl + "]");
3325 }
3326 if (!ordered) {
3327 PostErr(eDiag_Warning, eErr_SEQ_FEAT_SeqLocOrder,
3328 "Intervals out of order in Anticodon [" + loc_lbl + "]");
3329 }
3330 }
3331 }
3332
3333
3334 int s_LegalNcbieaaValues[] = { 42, 65, 66, 67, 68, 69, 70, 71, 72, 73,
3335 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
3336 84, 85, 86, 87, 88, 89, 90 };
3337
3338 static const char* kAANames[] = {
3339 "---", "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
3340 "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
3341 "Val", "Trp", "OTHER", "Tyr", "Glx", "Sec", "TERM", "Pyl", "Xle"
3342 };
3343
3344
GetAAName(unsigned char aa,bool is_ascii)3345 const char* GetAAName(unsigned char aa, bool is_ascii)
3346 {
3347 try {
3348 if (is_ascii) {
3349 aa = CSeqportUtil::GetMapToIndex
3350 (CSeq_data::e_Ncbieaa, CSeq_data::e_Ncbistdaa, aa);
3351 }
3352 return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
3353 } catch (CException ) {
3354 return "OTHER";
3355 } catch (std::exception ) {
3356 return "OTHER";
3357 }
3358 }
3359
3360
GetGeneticCodeName(int gcode)3361 static string GetGeneticCodeName (int gcode)
3362 {
3363 const CGenetic_code_table& code_table = CGen_code_table::GetCodeTable();
3364 const list<CRef<CGenetic_code> >& codes = code_table.Get();
3365
3366 for ( list<CRef<CGenetic_code> >::const_iterator code_it = codes.begin(), code_it_end = codes.end(); code_it != code_it_end; ++code_it ) {
3367 if ((*code_it)->GetId() == gcode) {
3368 return (*code_it)->GetName();
3369 }
3370 }
3371 return "unknown";
3372 }
3373
3374
x_ValidateTrnaCodons()3375 void CRNAValidator::x_ValidateTrnaCodons()
3376 {
3377 if (!m_Feat.IsSetData() || !m_Feat.GetData().IsRna() ||
3378 !m_Feat.GetData().GetRna().IsSetExt() ||
3379 !m_Feat.GetData().GetRna().GetExt().IsTRNA()) {
3380 return;
3381 }
3382 const CTrna_ext& trna = m_Feat.GetData().GetRna().GetExt().GetTRNA();
3383
3384 if (!trna.IsSetAa()) {
3385 PostErr (eDiag_Error, eErr_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
3386 return;
3387 }
3388
3389 unsigned char aa = 0, orig_aa;
3390 vector<char> seqData;
3391 string str;
3392
3393 switch (trna.GetAa().Which()) {
3394 case CTrna_ext::C_Aa::e_Iupacaa:
3395 str = trna.GetAa().GetIupacaa();
3396 CSeqConvert::Convert(str, CSeqUtil::e_Iupacaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
3397 aa = seqData[0];
3398 break;
3399 case CTrna_ext::C_Aa::e_Ncbi8aa:
3400 str = trna.GetAa().GetNcbi8aa();
3401 CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
3402 aa = seqData[0];
3403 break;
3404 case CTrna_ext::C_Aa::e_Ncbistdaa:
3405 str = trna.GetAa().GetNcbi8aa();
3406 CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
3407 aa = seqData[0];
3408 break;
3409 case CTrna_ext::C_Aa::e_Ncbieaa:
3410 seqData.push_back(trna.GetAa().GetNcbieaa());
3411 aa = seqData[0];
3412 break;
3413 default:
3414 NCBI_THROW (CCoreException, eCore, "Unrecognized tRNA aa coding");
3415 break;
3416 }
3417
3418 // make sure the amino acid is valid
3419 bool found = false;
3420 for ( unsigned int i = 0; i < sizeof (s_LegalNcbieaaValues) / sizeof (int); ++i ) {
3421 if ( aa == s_LegalNcbieaaValues[i] ) {
3422 found = true;
3423 break;
3424 }
3425 }
3426 orig_aa = aa;
3427 if ( !found ) {
3428 aa = ' ';
3429 }
3430
3431 if (m_Feat.GetData().GetRna().IsSetType() &&
3432 m_Feat.GetData().GetRna().GetType() == CRNA_ref::eType_tRNA) {
3433 bool mustbemethionine = false;
3434 for (auto gbqual : m_Feat.GetQual()) {
3435 if (NStr::CompareNocase(gbqual->GetQual(), "product") == 0 &&
3436 (NStr::CompareNocase(gbqual->GetVal(), "tRNA-fMet") == 0 ||
3437 NStr::CompareNocase(gbqual->GetVal(), "tRNA-iMet") == 0)) {
3438 mustbemethionine = true;
3439 break;
3440 }
3441 }
3442 if (mustbemethionine) {
3443 if (aa != 'M') {
3444 string aanm = GetAAName(aa, true);
3445 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
3446 "Initiation tRNA claims to be tRNA-" + aanm +
3447 ", but should be tRNA-Met");
3448 }
3449 }
3450 }
3451
3452 // Retrive the Genetic code id for the tRNA
3453 int gcode = 1;
3454 if ( m_LocationBioseq ) {
3455 // need only the closest biosoure.
3456 CSeqdesc_CI diter(m_LocationBioseq, CSeqdesc::e_Source);
3457 if ( diter ) {
3458 gcode = diter->GetSource().GetGenCode();
3459 }
3460 }
3461
3462 const string& ncbieaa = CGen_code_table::GetNcbieaa(gcode);
3463 if ( ncbieaa.length() != 64 ) {
3464 return;
3465 }
3466
3467 string codename = GetGeneticCodeName (gcode);
3468 char buf[2];
3469 buf[0] = aa;
3470 buf[1] = 0;
3471 string aaname = buf;
3472 aaname += "/";
3473 aaname += GetAAName (aa, true);
3474
3475 EDiagSev sev = (aa == 'U' || aa == 'O') ? eDiag_Warning : eDiag_Error;
3476
3477 bool modified_codon_recognition = false;
3478 bool rna_editing = false;
3479 if ( m_Feat.IsSetExcept_text() ) {
3480 string excpt_text = m_Feat.GetExcept_text();
3481 if ( NStr::FindNoCase(excpt_text, "modified codon recognition") != NPOS ) {
3482 modified_codon_recognition = true;
3483 }
3484 if ( NStr::FindNoCase(excpt_text, "RNA editing") != NPOS ) {
3485 rna_editing = true;
3486 }
3487 }
3488
3489 vector<string> recognized_codon_values;
3490 vector<unsigned char> recognized_taa_values;
3491
3492 ITERATE( CTrna_ext::TCodon, iter, trna.GetCodon() ) {
3493 if (*iter == 255) continue;
3494 // test that codon value is in range 0 - 63
3495 if ( *iter > 63 ) {
3496 PostErr(sev, eErr_SEQ_FEAT_BadTrnaCodon,
3497 "tRNA codon value " + NStr::IntToString(*iter) +
3498 " is greater than maximum 63");
3499 continue;
3500 } else if (*iter < 0) {
3501 PostErr(sev, eErr_SEQ_FEAT_BadTrnaCodon,
3502 "tRNA codon value " + NStr::IntToString(*iter) +
3503 " is less than 0");
3504 continue;
3505 }
3506
3507 if ( !modified_codon_recognition && !rna_editing ) {
3508 unsigned char taa = ncbieaa[*iter];
3509 string codon = CGen_code_table::IndexToCodon(*iter);
3510 recognized_codon_values.push_back (codon);
3511 recognized_taa_values.push_back (taa);
3512
3513 if ( taa != aa ) {
3514 if ( (aa == 'U') && (taa == '*') && (*iter == 14) ) {
3515 // selenocysteine normally uses TGA (14), so ignore without requiring exception in record
3516 // TAG (11) is used for pyrrolysine in archaebacteria
3517 // TAA (10) is not yet known to be used for an exceptional amino acid
3518 } else {
3519 NStr::ReplaceInPlace (codon, "T", "U");
3520
3521 PostErr(sev, eErr_SEQ_FEAT_TrnaCodonWrong,
3522 "Codon recognized by tRNA (" + codon + ") does not match amino acid ("
3523 + aaname + ") specified by genetic code ("
3524 + NStr::IntToString (gcode) + "/" + codename + ")");
3525 }
3526 }
3527 }
3528 }
3529
3530 // see if anticodon is compatible with codons recognized and amino acid
3531 string anticodon = "?";
3532 vector<string> codon_values;
3533 vector<unsigned char> taa_values;
3534
3535 if (trna.IsSetAnticodon() && GetLength (trna.GetAnticodon(), &m_Scope) == 3) {
3536 try {
3537 anticodon = GetSequenceStringFromLoc(trna.GetAnticodon(), m_Scope);
3538 // get reverse complement sequence for location
3539 CRef<CSeq_loc> codon_loc(SeqLocRevCmpl(trna.GetAnticodon(), &m_Scope));
3540 string codon = GetSequenceStringFromLoc(*codon_loc, m_Scope);
3541 if (codon.length() > 3) {
3542 codon = codon.substr (0, 3);
3543 }
3544
3545 // expand wobble base to known binding partners
3546 string wobble;
3547
3548 char ch = anticodon.c_str()[0];
3549 switch (ch) {
3550 case 'A' :
3551 wobble = "ACT";
3552 break;
3553 case 'C' :
3554 wobble = "G";
3555 break;
3556 case 'G' :
3557 wobble = "CT";
3558 break;
3559 case 'T' :
3560 wobble = "AG";
3561 break;
3562 default :
3563 break;
3564 }
3565 if (!NStr::IsBlank(wobble)) {
3566 string::iterator str_it = wobble.begin();
3567 while (str_it != wobble.end()) {
3568 codon[2] = *str_it;
3569 int index = CGen_code_table::CodonToIndex (codon);
3570 if (index < 64 && index > -1) {
3571 unsigned char taa = ncbieaa[index];
3572 taa_values.push_back(taa);
3573 codon_values.push_back(codon);
3574 }
3575 ++str_it;
3576 }
3577 }
3578 NStr::ReplaceInPlace (anticodon, "T", "U");
3579 if (anticodon.length() > 3) {
3580 anticodon = anticodon.substr(0, 3);
3581 }
3582 } catch (CException ) {
3583 } catch (std::exception ) {
3584 }
3585
3586 if (codon_values.size() > 0) {
3587 bool ok = false;
3588 // check that codons predicted from anticodon can transfer indicated amino acid
3589 for (size_t i = 0; i < codon_values.size(); i++) {
3590 if (!NStr::IsBlank (codon_values[i]) && aa == taa_values[i]) {
3591 ok = true;
3592 }
3593 }
3594 if (!ok) {
3595 if (aa == 'U' && NStr::Equal (anticodon, "UCA")) {
3596 // ignore TGA codon for selenocysteine
3597 } else if (aa == 'O' && NStr::Equal (anticodon, "CUA")) {
3598 // ignore TAG codon for pyrrolysine
3599 } else if (aa == 'I' && NStr::Equal (anticodon, "CAU")) {
3600 // ignore ATG predicted codon for Ile2
3601 } else if (!m_Feat.IsSetExcept_text()
3602 || (NStr::FindNoCase(m_Feat.GetExcept_text(), "modified codon recognition") == string::npos
3603 &&NStr::FindNoCase(m_Feat.GetExcept_text(), "RNA editing") == string::npos)) {
3604 PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadAnticodonAA,
3605 "Codons predicted from anticodon (" + anticodon
3606 + ") cannot produce amino acid (" + aaname + ")");
3607 }
3608 }
3609
3610 // check that codons recognized match codons predicted from anticodon
3611 if (recognized_codon_values.size() > 0) {
3612 bool ok = false;
3613 for (size_t i = 0; i < codon_values.size() && !ok; i++) {
3614 for (size_t j = 0; j < recognized_codon_values.size() && !ok; j++) {
3615 if (NStr::Equal (codon_values[i], recognized_codon_values[j])) {
3616 ok = true;
3617 } else if ( NStr::Equal (codon_values[i], "ATG") && aa == 'I') {
3618 // allow ATG recognized codon (pre-RNA-editing) for Ile2
3619 ok = true;
3620 }
3621 }
3622 }
3623 if (!ok
3624 && (!m_Feat.IsSetExcept_text()
3625 || NStr::FindNoCase (m_Feat.GetExcept_text(), "RNA editing") == string::npos)) {
3626 PostErr (eDiag_Warning, eErr_SEQ_FEAT_BadAnticodonCodon,
3627 "Codon recognized cannot be produced from anticodon ("
3628 + anticodon + ")");
3629 }
3630 }
3631 }
3632 }
3633
3634 if (!m_Feat.IsSetPseudo() || !m_Feat.GetPseudo()) {
3635 if (orig_aa == 0 || orig_aa == 255) {
3636 PostErr (sev, eErr_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
3637 } else {
3638 // verify that legal amino acid is indicated
3639 unsigned int idx;
3640 if (aa != '*') {
3641 idx = aa - 64;
3642 } else {
3643 idx = 25;
3644 }
3645 if (idx == 0 || idx >= 28) {
3646 PostErr (sev, eErr_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid");
3647 }
3648 }
3649 }
3650 }
3651
3652
x_ValidateTrnaOverlap()3653 void CRNAValidator::x_ValidateTrnaOverlap()
3654 {
3655 if (!m_Feat.GetData().GetRna().IsSetType() ||
3656 m_Feat.GetData().GetRna().GetType() != CRNA_ref::eType_tRNA) {
3657 return;
3658 }
3659 TFeatScores scores;
3660 GetOverlappingFeatures(m_Feat.GetLocation(),
3661 CSeqFeatData::e_Rna,
3662 CSeqFeatData::eSubtype_rRNA,
3663 eOverlap_Interval,
3664 scores, m_Scope);
3665 bool found_bad = false;
3666 for (auto it : scores) {
3667 CRef<CSeq_loc> intersection = it.second->GetLocation().Intersect(m_Feat.GetLocation(),
3668 0 /* flags*/,
3669 NULL /* synonym mapper */);
3670 if (intersection) {
3671 TSeqPos length = sequence::GetLength(*intersection, &m_Scope);
3672 if (length >= 5) {
3673 found_bad = true;
3674 break;
3675 }
3676 }
3677 }
3678 if (found_bad) {
3679 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadRRNAcomponentOverlapTRNA,
3680 "tRNA-rRNA overlap");
3681 }
3682 }
3683
3684
x_ValidateRnaTrans()3685 void CRNAValidator::x_ValidateRnaTrans()
3686 {
3687 size_t mismatches = 0;
3688 size_t problems = GetMRNATranslationProblems
3689 (m_Feat, mismatches, m_Imp.IgnoreExceptions(),
3690 m_LocationBioseq, m_ProductBioseq,
3691 m_Imp.IsFarFetchMRNAproducts(), m_Imp.IsGpipe(),
3692 m_Imp.IsGenomic(), &m_Scope);
3693 x_ReportRNATranslationProblems(problems, mismatches);
3694 }
3695
3696
x_ReportRNATranslationProblems(size_t problems,size_t mismatches)3697 void CRNAValidator::x_ReportRNATranslationProblems(size_t problems, size_t mismatches)
3698 {
3699 if (problems & eMRNAProblem_TransFail) {
3700 PostErr(eDiag_Error, eErr_SEQ_FEAT_MrnaTransFail,
3701 "Unable to transcribe mRNA");
3702 }
3703 if (problems & eMRNAProblem_UnableToFetch) {
3704 const CSeq_id& product_id = GetId(m_Feat.GetProduct(), &m_Scope);
3705 string label = product_id.AsFastaString();
3706 PostErr(eDiag_Error, eErr_SEQ_FEAT_ProductFetchFailure,
3707 "Unable to fetch mRNA transcript '" + label + "'");
3708 }
3709
3710 bool is_refseq = m_Imp.IsRefSeqConventions();
3711 if (m_LocationBioseq) {
3712 FOR_EACH_SEQID_ON_BIOSEQ(it, *(m_LocationBioseq.GetCompleteBioseq())) {
3713 if ((*it)->IsOther()) {
3714 is_refseq = true;
3715 break;
3716 }
3717 }
3718 }
3719
3720 TSeqPos feat_len = sequence::GetLength(m_Feat.GetLocation(), &m_Scope);
3721
3722 string farstr;
3723 EDiagSev sev = eDiag_Error;
3724
3725 // if not local bioseq product, lower severity (with the exception of Refseq)
3726 if (m_ProductIsFar && !is_refseq) {
3727 sev = eDiag_Warning;
3728 }
3729 if (m_ProductIsFar) {
3730 farstr = "(far) ";
3731 if (m_Feat.IsSetPartial()
3732 && !s_IsBioseqPartial(m_ProductBioseq)
3733 && s_BioseqHasRefSeqThatStartsWithPrefix(m_ProductBioseq, "NM_")) {
3734 sev = eDiag_Warning;
3735 }
3736 }
3737
3738 if (problems & eMRNAProblem_TranscriptLenLess) {
3739 PostErr(sev, eErr_SEQ_FEAT_TranscriptLen,
3740 "Transcript length [" + NStr::SizetToString(feat_len) +
3741 "] less than " + farstr + "product length [" +
3742 NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], and tail < 95% polyA");
3743 }
3744
3745 if (problems & eMRNAProblem_PolyATail100) {
3746 PostErr(eDiag_Info, eErr_SEQ_FEAT_PolyATail,
3747 "Transcript length [" + NStr::SizetToString(feat_len)
3748 + "] less than " + farstr + "product length ["
3749 + NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], but tail is 100% polyA");
3750 }
3751 if (problems & eMRNAProblem_PolyATail95) {
3752 PostErr(eDiag_Info, eErr_SEQ_FEAT_PolyATail,
3753 "Transcript length [" + NStr::SizetToString(feat_len) +
3754 "] less than " + farstr + "product length [" +
3755 NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], but tail >= 95% polyA");
3756 }
3757 if (problems & eMRNAProblem_TranscriptLenMore) {
3758 PostErr(sev, eErr_SEQ_FEAT_TranscriptLen,
3759 "Transcript length [" + NStr::IntToString(feat_len) + "] " +
3760 "greater than " + farstr + "product length [" +
3761 NStr::IntToString(m_ProductBioseq.GetInst_Length()) + "]");
3762 }
3763 if ((problems & eMRNAProblem_Mismatch) && mismatches > 0) {
3764 PostErr(sev, eErr_SEQ_FEAT_TranscriptMismatches,
3765 "There are " + NStr::SizetToString(mismatches) +
3766 " mismatches out of " + NStr::SizetToString(feat_len) +
3767 " bases between the transcript and " + farstr + "product sequence");
3768 }
3769 if (problems & eMRNAProblem_UnnecessaryException) {
3770 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
3771 "mRNA has exception but passes transcription test");
3772 }
3773 if (problems & eMRNAProblem_ErroneousException) {
3774 size_t total = min(feat_len, m_ProductBioseq.GetInst_Length());
3775 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ErroneousException,
3776 "mRNA has unclassified exception but only difference is " + NStr::SizetToString(mismatches)
3777 + " mismatches out of " + NStr::SizetToString(total) + " bases");
3778 }
3779 if (problems & eMRNAProblem_ProductReplaced) {
3780 PostErr(eDiag_Warning, eErr_SEQ_FEAT_mRNAUnnecessaryException,
3781 "mRNA has transcribed product replaced exception");
3782 }
3783 }
3784
3785
CMRNAValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)3786 CMRNAValidator::CMRNAValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp) :
3787 CRNAValidator(feat, scope, imp)
3788 {
3789 m_Gene = m_Imp.GetGeneCache().GetGeneFromCache(&feat, m_Scope);
3790 if (m_Gene) {
3791 m_GeneIsPseudo = s_IsPseudo(*m_Gene);
3792 } else {
3793 m_GeneIsPseudo = false;
3794 }
3795 m_FeatIsPseudo = s_IsPseudo(m_Feat);
3796 }
3797
3798
Validate()3799 void CMRNAValidator::Validate()
3800 {
3801 CRNAValidator::Validate();
3802
3803 x_ReportPseudogeneConflict(m_Gene);
3804 x_ValidateLocusTagGeneralMatch(m_Gene);
3805
3806 x_ValidateMrna();
3807
3808 if (!m_GeneIsPseudo && !m_FeatIsPseudo) {
3809 x_ValidateCommonMRNAProduct();
3810 }
3811 x_ValidateMrnaGene();
3812 }
3813
3814
x_ValidateMrna()3815 void CMRNAValidator::x_ValidateMrna()
3816 {
3817 bool pseudo = m_GeneIsPseudo;
3818 if (!pseudo) {
3819 pseudo = s_IsPseudo(m_Feat);
3820 }
3821 ValidateSplice(pseudo, false);
3822
3823 const CRNA_ref& rna = m_Feat.GetData().GetRna();
3824
3825 if (m_Feat.IsSetQual()) {
3826 for (auto it : m_Feat.GetQual()) {
3827 const CGb_qual& qual = *it;
3828 if (qual.CanGetQual()) {
3829 const string& key = qual.GetQual();
3830 if (NStr::EqualNocase(key, "protein_id")) {
3831 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
3832 "protein_id should not be a gbqual on an mRNA feature");
3833 }
3834 else if (NStr::EqualNocase(key, "transcript_id")) {
3835 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
3836 "transcript_id should not be a gbqual on an mRNA feature");
3837 }
3838 }
3839 }
3840 }
3841
3842 if (rna.IsSetExt() && rna.GetExt().IsName()) {
3843 const string& rna_name = rna.GetExt().GetName();
3844 if (NStr::StartsWith(rna_name, "transfer RNA ") &&
3845 (!NStr::EqualNocase(rna_name, "transfer RNA nucleotidyltransferase")) &&
3846 (!NStr::EqualNocase(rna_name, "transfer RNA methyltransferase"))) {
3847 PostErr(eDiag_Warning, eErr_SEQ_FEAT_tRNAmRNAmixup,
3848 "mRNA feature product indicates it should be a tRNA feature");
3849 }
3850 ValidateCharactersInField(rna_name, "mRNA name");
3851 if (ContainsSgml(rna_name)) {
3852 PostErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
3853 "mRNA name " + rna_name + " has SGML");
3854 }
3855 }
3856 }
3857
3858
x_ValidateCommonMRNAProduct()3859 void CMRNAValidator::x_ValidateCommonMRNAProduct()
3860 {
3861 if (!m_Feat.IsSetProduct()) {
3862 return;
3863 }
3864 if ( !m_ProductBioseq) {
3865 if (m_LocationBioseq) {
3866 CSeq_entry_Handle seh = m_LocationBioseq.GetTopLevelEntry();
3867 if (seh.IsSet() && seh.GetSet().IsSetClass()
3868 && (seh.GetSet().GetClass() == CBioseq_set::eClass_gen_prod_set
3869 || seh.GetSet().GetClass() == CBioseq_set::eClass_other)) {
3870 PostErr(eDiag_Error, eErr_SEQ_FEAT_MissingMRNAproduct,
3871 "Product Bioseq of mRNA feature is not "
3872 "packaged in the record");
3873 }
3874 }
3875 } else {
3876
3877 CConstRef<CSeq_feat> mrna = m_Imp.GetmRNAGivenProduct (*(m_ProductBioseq.GetCompleteBioseq()));
3878 if (mrna && mrna.GetPointer() != &m_Feat) {
3879 PostErr(eDiag_Critical, eErr_SEQ_FEAT_IdenticalMRNAtranscriptIDs,
3880 "Identical transcript IDs found on multiple mRNAs");
3881 }
3882 }
3883 }
3884
3885
s_EqualGene_ref(const CGene_ref & genomic,const CGene_ref & mrna)3886 static bool s_EqualGene_ref(const CGene_ref& genomic, const CGene_ref& mrna)
3887 {
3888 bool locus = (!genomic.CanGetLocus() && !mrna.CanGetLocus()) ||
3889 (genomic.CanGetLocus() && mrna.CanGetLocus() &&
3890 genomic.GetLocus() == mrna.GetLocus());
3891 bool allele = (!genomic.CanGetAllele() && !mrna.CanGetAllele()) ||
3892 (genomic.CanGetAllele() && mrna.CanGetAllele() &&
3893 genomic.GetAllele() == mrna.GetAllele());
3894 bool desc = (!genomic.CanGetDesc() && !mrna.CanGetDesc()) ||
3895 (genomic.CanGetDesc() && mrna.CanGetDesc() &&
3896 genomic.GetDesc() == mrna.GetDesc());
3897 bool locus_tag = (!genomic.CanGetLocus_tag() && !mrna.CanGetLocus_tag()) ||
3898 (genomic.CanGetLocus_tag() && mrna.CanGetLocus_tag() &&
3899 genomic.GetLocus_tag() == mrna.GetLocus_tag());
3900
3901 return locus && allele && desc && locus_tag;
3902 }
3903
3904
3905 // check that there is no conflict between the gene on the genomic
3906 // and the gene on the mrna.
x_ValidateMrnaGene()3907 void CMRNAValidator::x_ValidateMrnaGene()
3908 {
3909 if (!m_ProductBioseq) {
3910 return;
3911 }
3912 const CGene_ref* genomicgrp = NULL;
3913 if (m_Gene) {
3914 genomicgrp = &(m_Gene->GetData().GetGene());
3915 } else {
3916 genomicgrp = m_Feat.GetGeneXref();
3917 }
3918 if (!genomicgrp) {
3919 return;
3920 }
3921 CFeat_CI mrna_gene(m_ProductBioseq, CSeqFeatData::e_Gene);
3922 if ( mrna_gene ) {
3923 const CGene_ref& mrnagrp = mrna_gene->GetData().GetGene();
3924 if ( !s_EqualGene_ref(*genomicgrp, mrnagrp) ) {
3925 m_Imp.PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenesInconsistent,
3926 "Gene on mRNA bioseq does not match gene on genomic bioseq",
3927 mrna_gene->GetOriginalFeature());
3928 }
3929 }
3930
3931 }
3932
3933
Validate()3934 void CPubFeatValidator::Validate()
3935 {
3936 CSingleFeatValidator::Validate();
3937 m_Imp.ValidatePubdesc(m_Feat.GetData().GetPub (), m_Feat);
3938 }
3939
3940
Validate()3941 void CSrcFeatValidator::Validate()
3942 {
3943 CSingleFeatValidator::Validate();
3944 const CBioSource& bsrc = m_Feat.GetData().GetBiosrc();
3945 if ( bsrc.IsSetIs_focus() ) {
3946 PostErr(eDiag_Error, eErr_SEQ_FEAT_FocusOnBioSourceFeature,
3947 "Focus must be on BioSource descriptor, not BioSource feature.");
3948 }
3949
3950 m_Imp.ValidateBioSource(bsrc, m_Feat);
3951
3952 CSeqdesc_CI dbsrc_i(m_LocationBioseq, CSeqdesc::e_Source);
3953 if ( !dbsrc_i ) {
3954 return;
3955 }
3956
3957 const COrg_ref& org = bsrc.GetOrg();
3958 const CBioSource& dbsrc = dbsrc_i->GetSource();
3959 const COrg_ref& dorg = dbsrc.GetOrg();
3960
3961 if ( org.CanGetTaxname() && !org.GetTaxname().empty() &&
3962 dorg.CanGetTaxname() ) {
3963 string taxname = org.GetTaxname();
3964 string dtaxname = dorg.GetTaxname();
3965 if ( NStr::CompareNocase(taxname, dtaxname) != 0 ) {
3966 if ( !dbsrc.IsSetIs_focus() && !m_Imp.IsTransgenic(dbsrc) ) {
3967 PostErr(eDiag_Error, eErr_SEQ_DESCR_BioSourceNeedsFocus,
3968 "BioSource descriptor must have focus or transgenic "
3969 "when BioSource feature with different taxname is "
3970 "present.");
3971 }
3972 }
3973 }
3974 }
3975
3976
x_ValidateSeqFeatLoc()3977 void CPolyASiteValidator::x_ValidateSeqFeatLoc()
3978 {
3979 CSingleFeatValidator::x_ValidateSeqFeatLoc();
3980 CSeq_loc::TRange range = m_Feat.GetLocation().GetTotalRange();
3981 if ( range.GetFrom() != range.GetTo() ) {
3982 EDiagSev sev = eDiag_Warning;
3983 if (m_Imp.IsRefSeq()) {
3984 sev = eDiag_Error;
3985 }
3986 PostErr(sev, eErr_SEQ_FEAT_PolyAsiteNotPoint,
3987 "PolyA_site should be a single point");
3988 }
3989
3990 }
3991
3992
x_ValidateSeqFeatLoc()3993 void CPolyASignalValidator::x_ValidateSeqFeatLoc()
3994 {
3995 CSeq_loc::TRange range = m_Feat.GetLocation().GetTotalRange();
3996 if ( range.GetFrom() == range.GetTo() ) {
3997 EDiagSev sev = eDiag_Warning;
3998 if (m_Imp.IsRefSeq()) {
3999 sev = eDiag_Error;
4000 }
4001 PostErr (sev, eErr_SEQ_FEAT_PolyAsignalNotRange, "PolyA_signal should be a range");
4002 }
4003 }
4004
4005
CPeptideValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)4006 CPeptideValidator::CPeptideValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp) :
4007 CSingleFeatValidator(feat, scope, imp)
4008 {
4009 m_CDS = GetOverlappingCDS(m_Feat.GetLocation(), m_Scope);
4010 }
4011
4012
Validate()4013 void CPeptideValidator::Validate()
4014 {
4015 CSingleFeatValidator::Validate();
4016
4017 if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
4018 PostErr(!m_CDS ? eDiag_Error : eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatureLacksCDS,
4019 "sig/mat/transit_peptide feature cannot be associated with a "
4020 "protein product of a coding region feature");
4021 } else {
4022 PostErr(m_Imp.IsRefSeq() ? eDiag_Error : eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatureLacksCDS,
4023 "Peptide processing feature should be converted to the "
4024 "appropriate protein feature subtype");
4025 }
4026 x_ValidatePeptideOnCodonBoundary();
4027 }
4028
4029
x_ValidatePeptideOnCodonBoundary()4030 void CPeptideValidator::x_ValidatePeptideOnCodonBoundary()
4031 {
4032 if (!m_CDS) {
4033 return;
4034 }
4035
4036 const string& key = m_Feat.GetData().GetImp().GetKey();
4037
4038 feature::ELocationInFrame in_frame = feature::IsLocationInFrame(m_Scope.GetSeq_featHandle(*m_CDS), m_Feat.GetLocation());
4039 if (NStr::Equal(key, "sig_peptide") && in_frame == feature::eLocationInFrame_NotIn) {
4040 return;
4041 }
4042 switch (in_frame) {
4043 case feature::eLocationInFrame_NotIn:
4044 if (NStr::Equal(key, "sig_peptide")) {
4045 // ignore
4046 } else {
4047 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
4048 "Start and stop of " + key + " are out of frame with CDS codons");
4049 }
4050 break;
4051 case feature::eLocationInFrame_BadStartAndStop:
4052 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
4053 "Start and stop of " + key + " are out of frame with CDS codons");
4054 break;
4055 case feature::eLocationInFrame_BadStart:
4056 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
4057 "Start of " + key + " is out of frame with CDS codons");
4058 break;
4059 case feature::eLocationInFrame_BadStop:
4060 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
4061 "Stop of " + key + " is out of frame with CDS codons");
4062 break;
4063 case feature::eLocationInFrame_InFrame:
4064 break;
4065 }
4066 }
4067
4068
Validate()4069 void CExonValidator::Validate()
4070 {
4071 CSingleFeatValidator::Validate();
4072 bool feat_pseudo = s_IsPseudo(m_Feat);
4073 bool pseudo = feat_pseudo;
4074 if (!pseudo) {
4075 CConstRef<CSeq_feat> gene = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
4076 if (gene) {
4077 pseudo = s_IsPseudo(*gene);
4078 }
4079 }
4080 if (m_Imp.IsValidateExons()) {
4081 ValidateSplice(pseudo, true);
4082 }
4083 }
4084
4085
Validate()4086 void CIntronValidator::Validate()
4087 {
4088 CSingleFeatValidator::Validate();
4089 bool feat_pseudo = s_IsPseudo(m_Feat);
4090 bool pseudo = feat_pseudo;
4091 if (!pseudo) {
4092 CConstRef<CSeq_feat> gene = m_Imp.GetGeneCache().GetGeneFromCache(&m_Feat, m_Scope);
4093 if (gene) {
4094 pseudo = s_IsPseudo(*gene);
4095 }
4096 }
4097
4098 if (x_IsIntronShort(pseudo)) {
4099 PostErr (eDiag_Warning, eErr_SEQ_FEAT_ShortIntron,
4100 "Introns should be at least 10 nt long");
4101 }
4102
4103 if (m_Feat.IsSetExcept() && m_Feat.IsSetExcept_text()
4104 && NStr::FindNoCase (m_Feat.GetExcept_text(), "nonconsensus splice site") != string::npos) {
4105 return;
4106 }
4107
4108 const CSeq_loc& loc = m_Feat.GetLocation();
4109
4110 bool partial5 = loc.IsPartialStart(eExtreme_Biological);
4111 bool partial3 = loc.IsPartialStop(eExtreme_Biological);
4112 if (partial5 && partial3) {
4113 return;
4114 }
4115
4116 // suppress if contained by rRNA - different consensus splice site
4117 TFeatScores scores;
4118 GetOverlappingFeatures(loc,
4119 CSeqFeatData::e_Rna,
4120 CSeqFeatData::eSubtype_rRNA,
4121 eOverlap_Contained,
4122 scores, m_Scope);
4123 if (scores.size() > 0) {
4124 return;
4125 }
4126
4127 // suppress if contained by tRNA - different consensus splice site
4128 scores.clear();
4129 GetOverlappingFeatures(loc,
4130 CSeqFeatData::e_Rna,
4131 CSeqFeatData::eSubtype_tRNA,
4132 eOverlap_Contained,
4133 scores, m_Scope);
4134 if (scores.size() > 0) {
4135 return;
4136 }
4137
4138 // skip if more than one bioseq
4139 if (!IsOneBioseq(loc, &m_Scope)) {
4140 return;
4141 }
4142
4143 // skip if organelle
4144 if (!m_LocationBioseq || IsOrganelle(m_LocationBioseq)) {
4145 return;
4146 }
4147
4148 TSeqPos seq_len = m_LocationBioseq.GetBioseqLength();
4149 string label;
4150 m_LocationBioseq.GetId().front().GetSeqId()->GetLabel(&label);
4151 CSeqVector vec = m_LocationBioseq.GetSeqVector (CBioseq_Handle::eCoding_Iupac);
4152
4153 ENa_strand strand = loc.GetStrand();
4154
4155 if (eNa_strand_minus != strand && eNa_strand_plus != strand) {
4156 strand = eNa_strand_plus;
4157 }
4158
4159 bool donor_in_gap = false;
4160 bool acceptor_in_gap = false;
4161
4162 TSeqPos end5 = loc.GetStart (eExtreme_Biological);
4163 if (vec.IsInGap(end5)) {
4164 donor_in_gap = true;
4165 }
4166
4167 TSeqPos end3 = loc.GetStop (eExtreme_Biological);
4168 if (vec.IsInGap(end3)) {
4169 acceptor_in_gap = true;
4170 }
4171
4172 if (!partial5 && !partial3) {
4173 if (donor_in_gap && acceptor_in_gap) {
4174 return;
4175 }
4176 }
4177
4178 Char donor[2]; // donor site signature
4179 Char acceptor[2]; // acceptor site signature
4180 bool donor_good = false; // flag == "true" indicates that donor signature is in @donor
4181 bool acceptor_good = false; // flag == "true" indicates that acceptor signature is in @acceptor
4182
4183 // Read donor signature into @donor
4184 if (!partial5 && !donor_in_gap) {
4185 if (eNa_strand_minus == strand) {
4186 if (end5 > 0 && IsResidue (vec[end5 - 1]) && IsResidue (vec[end5])) {
4187 donor[0] = vec[end5 - 1];
4188 donor[1] = vec[end5];
4189 donor_good = true;
4190 }
4191 }
4192 else {
4193 if( end5 < seq_len - 1 && IsResidue (vec[end5]) && IsResidue (vec[end5 + 1])) {
4194 donor[0] = vec[end5];
4195 donor[1] = vec[end5 + 1];
4196 donor_good = true;
4197 }
4198 }
4199 }
4200
4201 // Read acceptor signature into @acceptor
4202 if (!partial3 && !acceptor_in_gap) {
4203 if (eNa_strand_minus == strand) {
4204 if (end3 < seq_len - 1 && IsResidue (vec[end3]) && IsResidue (vec[end3 + 1])) {
4205 acceptor[0] = vec[end3];
4206 acceptor[1] = vec[end3 + 1];
4207 acceptor_good = true;
4208 }
4209 }
4210 else {
4211 if (end3 > 0 && IsResidue (vec[end3 - 1]) && IsResidue (vec[end3])) {
4212 acceptor[0] = vec[end3 - 1];
4213 acceptor[1] = vec[end3];
4214 acceptor_good = true;
4215 }
4216 }
4217 }
4218
4219 // Check intron's both ends.
4220 if (!partial5 && !partial3) {
4221 if (donor_good && acceptor_good) {
4222 if (CheckIntronSpliceSites(strand, donor, acceptor)) {
4223 return;
4224 }
4225 }
4226 }
4227
4228 // Check 5'-most
4229 if (!partial5) {
4230 if (!donor_in_gap) {
4231 bool not_found = true;
4232
4233 if (donor_good) {
4234 if (CheckIntronDonor(strand, donor)) {
4235 not_found = false;
4236 }
4237 }
4238 //
4239 if (not_found) {
4240 if ((strand == eNa_strand_minus && end5 == seq_len - 1) ||
4241 (strand == eNa_strand_plus && end5 == 0)) {
4242
4243 PostErr(eDiag_Info, eErr_SEQ_FEAT_NotSpliceConsensusDonorTerminalIntron,
4244 "Splice donor consensus (GT) not found at start of terminal intron, position "
4245 + NStr::IntToString (end5 + 1) + " of " + label);
4246 }
4247 else {
4248 PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusDonor,
4249 "Splice donor consensus (GT) not found at start of intron, position "
4250 + NStr::IntToString (end5 + 1) + " of " + label);
4251 }
4252 }
4253 }
4254 }
4255
4256 // Check 3'-most
4257 if (!partial3) {
4258 if (!acceptor_in_gap) {
4259 bool not_found = true;
4260
4261 if (acceptor_good) {
4262 if (CheckIntronAcceptor(strand, acceptor)) {
4263 not_found = false;
4264 }
4265 }
4266
4267 if (not_found) {
4268 if ((strand == eNa_strand_minus && end3 == 0) ||
4269 (strand == eNa_strand_plus && end3 == seq_len - 1)) {
4270 PostErr(eDiag_Info, eErr_SEQ_FEAT_NotSpliceConsensusAcceptorTerminalIntron,
4271 "Splice acceptor consensus (AG) not found at end of terminal intron, position "
4272 + NStr::IntToString (end3 + 1) + " of " + label + ", but at end of sequence");
4273 }
4274 else {
4275 PostErr(x_SeverityForConsensusSplice(), eErr_SEQ_FEAT_NotSpliceConsensusAcceptor,
4276 "Splice acceptor consensus (AG) not found at end of intron, position "
4277 + NStr::IntToString (end3 + 1) + " of " + label);
4278 }
4279 }
4280 }
4281 }
4282
4283 }
4284
4285
x_IsIntronShort(bool pseudo)4286 bool CIntronValidator::x_IsIntronShort(bool pseudo)
4287 {
4288 if (!m_Feat.IsSetData()
4289 || m_Feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_intron
4290 || !m_Feat.IsSetLocation()
4291 || pseudo) {
4292 return false;
4293 }
4294
4295 const CSeq_loc& loc = m_Feat.GetLocation();
4296 bool is_short = false;
4297
4298 if (! m_Imp.IsIndexerVersion()) {
4299 if (!m_LocationBioseq || IsOrganelle(m_LocationBioseq)) return is_short;
4300 }
4301
4302 if (GetLength(loc, &m_Scope) < 11) {
4303 bool partial_left = loc.IsPartialStart(eExtreme_Positional);
4304 bool partial_right = loc.IsPartialStop(eExtreme_Positional);
4305
4306 CBioseq_Handle bsh;
4307 if (partial_left && loc.GetStart(eExtreme_Positional) == 0) {
4308 // partial at beginning of sequence, ok
4309 } else if (partial_right &&
4310 (m_LocationBioseq) &&
4311 loc.GetStop(eExtreme_Positional) == (
4312 m_LocationBioseq.GetBioseqLength() - 1))
4313 {
4314 // partial at end of sequence
4315 } else {
4316 is_short = true;
4317 }
4318 }
4319 return is_short;
4320 }
4321
4322
Validate()4323 void CMiscFeatValidator::Validate()
4324 {
4325 CSingleFeatValidator::Validate();
4326 if ((!m_Feat.IsSetComment() || NStr::IsBlank (m_Feat.GetComment()))
4327 && (!m_Feat.IsSetQual() || m_Feat.GetQual().empty())
4328 && (!m_Feat.IsSetDbxref() || m_Feat.GetDbxref().empty())) {
4329 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MiscFeatureNeedsNote,
4330 "A note or other qualifier is required for a misc_feature");
4331 }
4332 if (m_Feat.IsSetComment() && ! NStr::IsBlank (m_Feat.GetComment())) {
4333 if (NStr::FindWord(m_Feat.GetComment(), "cspA") != NPOS) {
4334 CConstRef<CSeq_feat> cds = GetBestOverlappingFeat(m_Feat.GetLocation(), CSeqFeatData::eSubtype_cdregion, eOverlap_Simple, m_Scope);
4335 if (cds) {
4336 string content_label;
4337 feature::GetLabel(*cds, &content_label, feature::fFGL_Content, &m_Scope);
4338 if (NStr::Equal(content_label, "cold-shock protein")) {
4339 PostErr(eDiag_Error, eErr_SEQ_FEAT_ColdShockProteinProblem,
4340 "cspA misc_feature overlapped by cold-shock protein CDS");
4341 }
4342 }
4343 }
4344 }
4345
4346 }
4347
4348
Validate()4349 void CAssemblyGapValidator::Validate()
4350 {
4351 CSingleFeatValidator::Validate();
4352
4353 bool is_far_delta = false;
4354 if ( m_LocationBioseq.IsSetInst_Repr()) {
4355 CBioseq_Handle::TInst::TRepr repr = m_LocationBioseq.GetInst_Repr();
4356 if ( repr == CSeq_inst::eRepr_delta ) {
4357 is_far_delta = true;
4358 const CBioseq& seq = *(m_LocationBioseq.GetCompleteBioseq());
4359 const CSeq_inst& inst = seq.GetInst();
4360 ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
4361 if ( !(*sg) ) continue;
4362 if (( (**sg).Which() == CDelta_seq::e_Loc )) continue;
4363 is_far_delta = false;
4364 }
4365 }
4366 }
4367 if (! is_far_delta) {
4368 PostErr(eDiag_Warning, eErr_SEQ_FEAT_AssemblyGapFeatureProblem,
4369 "An assembly_gap feature should only be on a contig record");
4370 }
4371 if (m_Feat.GetLocation().IsInt()) {
4372 TSeqPos from = m_Feat.GetLocation().GetInt().GetFrom();
4373 TSeqPos to = m_Feat.GetLocation().GetInt().GetTo();
4374 CSeqVector vec = m_LocationBioseq.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
4375 string sequence;
4376 bool is5 = false;
4377 bool is3 = false;
4378 long int count = 0;
4379 vec.GetSeqData(from - 1, from, sequence);
4380 if (NStr::Equal (sequence, "N")) {
4381 is5 = true;
4382 }
4383 vec.GetSeqData(to + 1, to + 2, sequence);
4384 if (NStr::Equal (sequence, "N")) {
4385 is3 = true;
4386 }
4387 EDiagSev sv = eDiag_Warning;
4388 if (m_Imp.IsGenomeSubmission()) {
4389 sv = eDiag_Error;
4390 }
4391 if (is5 && is3) {
4392 PostErr(sv, eErr_SEQ_FEAT_AssemblyGapAdjacentToNs,
4393 "Assembly_gap flanked by Ns on 5' and 3' sides");
4394 } else if (is5) {
4395 PostErr(sv, eErr_SEQ_FEAT_AssemblyGapAdjacentToNs,
4396 "Assembly_gap flanked by Ns on 5' side");
4397 } else if (is3) {
4398 PostErr(sv, eErr_SEQ_FEAT_AssemblyGapAdjacentToNs,
4399 "Assembly_gap flanked by Ns on 3' side");
4400 }
4401 vec.GetSeqData(from, to + 1, sequence);
4402 for (size_t i = 0; i < sequence.size(); i++) {
4403 if (sequence[i] != 'N') {
4404 count++;
4405 }
4406 }
4407 if (count > 0) {
4408 PostErr(eDiag_Error, eErr_SEQ_FEAT_AssemblyGapCoversSequence, "Assembly_gap extends into sequence");
4409 }
4410 }
4411 }
4412
4413
Validate()4414 void CGapFeatValidator::Validate()
4415 {
4416 CSingleFeatValidator::Validate();
4417 int loc_len = GetLength (m_Feat.GetLocation(), &m_Scope);
4418 // look for estimated length qualifier
4419 FOR_EACH_GBQUAL_ON_SEQFEAT (it, m_Feat) {
4420 if ((*it)->IsSetQual() && NStr::EqualNocase ((*it)->GetQual(), "estimated_length")
4421 && (*it)->IsSetVal() && !NStr::EqualNocase ((*it)->GetVal(), "unknown")) {
4422 try {
4423 int estimated_length = NStr::StringToInt ((*it)->GetVal());
4424 if (estimated_length != loc_len) {
4425 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4426 "Gap feature estimated_length " + NStr::IntToString (estimated_length)
4427 + " does not match " + NStr::IntToString (loc_len) + " feature length");
4428 return;
4429 }
4430 } catch (CException ) {
4431 } catch (std::exception ) {
4432 }
4433 }
4434 }
4435 try {
4436 string s_data = GetSequenceStringFromLoc(m_Feat.GetLocation(), m_Scope);
4437 CSeqVector vec = GetSequenceFromFeature(m_Feat, m_Scope);
4438 if ( !vec.empty() ) {
4439 string vec_data;
4440 vec.GetSeqData(0, vec.size(), vec_data);
4441 int num_n = 0;
4442 int num_real = 0;
4443 unsigned int num_gap = 0;
4444 int pos = 0;
4445 string::iterator it = vec_data.begin();
4446 while (it != vec_data.end()) {
4447 if (*it == 'N') {
4448 if (vec.IsInGap(pos)) {
4449 // gap not N
4450 num_gap++;
4451 } else {
4452 num_n++;
4453 }
4454 } else if (*it != '-') {
4455 num_real++;
4456 }
4457 ++it;
4458 ++pos;
4459 }
4460 if (num_real > 0 && num_n > 0) {
4461 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4462 "Gap feature over " + NStr::IntToString (num_real)
4463 + " real bases and " + NStr::IntToString (num_n)
4464 + " Ns");
4465 } else if (num_real > 0) {
4466 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4467 "Gap feature over " + NStr::IntToString (num_real)
4468 + " real bases");
4469 } else if (num_n > 0) {
4470 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4471 "Gap feature over " + NStr::IntToString (num_n)
4472 + " Ns");
4473 } else if (num_gap != GetLength (m_Feat.GetLocation(), &m_Scope)) {
4474 PostErr (eDiag_Error, eErr_SEQ_FEAT_GapFeatureProblem,
4475 "Gap feature estimated_length " + NStr::IntToString (loc_len)
4476 + " does not match " + NStr::IntToString (num_gap)
4477 + " gap characters");
4478 }
4479 }
4480
4481 } catch (CException ) {
4482 } catch (std::exception ) {
4483 }
4484 }
4485
4486
Validate()4487 void CImpFeatValidator::Validate()
4488 {
4489 CSingleFeatValidator::Validate();
4490 CSeqFeatData::ESubtype subtype = m_Feat.GetData().GetSubtype();
4491
4492 const string& key = m_Feat.GetData().GetImp().GetKey();
4493 if (NStr::IsBlank(key)) {
4494 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatKey,
4495 "NULL feature key");
4496 return;
4497 }
4498
4499 if (subtype == CSeqFeatData::eSubtype_imp || subtype == CSeqFeatData::eSubtype_bad) {
4500 if (NStr::Equal(key, "mRNA")) {
4501 subtype = CSeqFeatData::eSubtype_mRNA;
4502 } else if (NStr::Equal(key, "tRNA")) {
4503 subtype = CSeqFeatData::eSubtype_tRNA;
4504 } else if (NStr::Equal(key, "tRNA")) {
4505 subtype = CSeqFeatData::eSubtype_tRNA;
4506 } else if (NStr::Equal(key, "rRNA")) {
4507 subtype = CSeqFeatData::eSubtype_rRNA;
4508 } else if (NStr::Equal(key, "snRNA")) {
4509 subtype = CSeqFeatData::eSubtype_snRNA;
4510 } else if (NStr::Equal(key, "scRNA")) {
4511 subtype = CSeqFeatData::eSubtype_scRNA;
4512 } else if (NStr::Equal(key, "snoRNA")) {
4513 subtype = CSeqFeatData::eSubtype_snoRNA;
4514 } else if (NStr::Equal(key, "misc_RNA")) {
4515 subtype = CSeqFeatData::eSubtype_misc_RNA;
4516 } else if (NStr::Equal(key, "precursor_RNA")) {
4517 subtype = CSeqFeatData::eSubtype_precursor_RNA;
4518 } else if (NStr::EqualNocase (key, "mat_peptide")) {
4519 subtype = CSeqFeatData::eSubtype_mat_peptide;
4520 } else if (NStr::EqualNocase (key, "propeptide")) {
4521 subtype = CSeqFeatData::eSubtype_propeptide;
4522 } else if (NStr::EqualNocase (key, "sig_peptide")) {
4523 subtype = CSeqFeatData::eSubtype_sig_peptide;
4524 } else if (NStr::EqualNocase (key, "transit_peptide")) {
4525 subtype = CSeqFeatData::eSubtype_transit_peptide;
4526 } else if (NStr::EqualNocase (key, "preprotein")
4527 || NStr::EqualNocase(key, "proprotein")) {
4528 subtype = CSeqFeatData::eSubtype_preprotein;
4529 } else if (NStr::EqualNocase (key, "virion")) {
4530 subtype = CSeqFeatData::eSubtype_virion;
4531 } else if (NStr::EqualNocase(key, "mutation")) {
4532 subtype = CSeqFeatData::eSubtype_mutation;
4533 } else if (NStr::EqualNocase(key, "allele")) {
4534 subtype = CSeqFeatData::eSubtype_allele;
4535 } else if (NStr::EqualNocase (key, "CDS")) {
4536 subtype = CSeqFeatData::eSubtype_Imp_CDS;
4537 } else if (NStr::EqualNocase(key, "Import")) {
4538 PostErr(eDiag_Error, eErr_SEQ_FEAT_UnknownImpFeatKey,
4539 "Feature key Import is no longer legal");
4540 return;
4541 }
4542 }
4543
4544 switch ( subtype ) {
4545
4546 case CSeqFeatData::eSubtype_bad:
4547 case CSeqFeatData::eSubtype_site_ref:
4548 case CSeqFeatData::eSubtype_gene:
4549 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatKey,
4550 "Unknown feature key " + key);
4551 break;
4552
4553 case CSeqFeatData::eSubtype_virion:
4554 case CSeqFeatData::eSubtype_mutation:
4555 case CSeqFeatData::eSubtype_allele:
4556 PostErr(eDiag_Error, eErr_SEQ_FEAT_UnknownImpFeatKey,
4557 "Feature key " + key + " is no longer legal");
4558 break;
4559
4560
4561 case CSeqFeatData::eSubtype_preprotein:
4562 if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
4563 const CSeq_loc& loc = m_Feat.GetLocation();
4564 CConstRef<CSeq_feat> cds = GetOverlappingCDS(loc, m_Scope);
4565 PostErr(!cds ? eDiag_Error : eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatureLacksCDS,
4566 "Pre/pro protein feature cannot be associated with a "
4567 "protein product of a coding region feature");
4568 } else {
4569 PostErr(m_Imp.IsRefSeq() ? eDiag_Error : eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatureLacksCDS,
4570 "Peptide processing feature should be converted to the "
4571 "appropriate protein feature subtype");
4572 }
4573 break;
4574
4575 case CSeqFeatData::eSubtype_mRNA:
4576 case CSeqFeatData::eSubtype_tRNA:
4577 case CSeqFeatData::eSubtype_rRNA:
4578 case CSeqFeatData::eSubtype_snRNA:
4579 case CSeqFeatData::eSubtype_scRNA:
4580 case CSeqFeatData::eSubtype_snoRNA:
4581 case CSeqFeatData::eSubtype_misc_RNA:
4582 case CSeqFeatData::eSubtype_precursor_RNA:
4583 // !!! what about other RNA types (preRNA, otherRNA)?
4584 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidRNAFeature,
4585 "RNA feature should be converted to the appropriate RNA feature "
4586 "subtype, location should be converted manually");
4587 break;
4588
4589 case CSeqFeatData::eSubtype_Imp_CDS:
4590 {
4591 // impfeat CDS must be pseudo; fail if not
4592 bool pseudo = sequence::IsPseudo(m_Feat, m_Scope);
4593 if ( !pseudo ) {
4594 PostErr(eDiag_Info, eErr_SEQ_FEAT_ImpCDSnotPseudo,
4595 "ImpFeat CDS should be pseudo");
4596 }
4597
4598 FOR_EACH_GBQUAL_ON_FEATURE (gbqual, m_Feat) {
4599 if ( NStr::CompareNocase( (*gbqual)->GetQual(), "translation") == 0 ) {
4600 PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpCDShasTranslation,
4601 "ImpFeat CDS with /translation found");
4602 }
4603 }
4604 }
4605 break;
4606 case CSeqFeatData::eSubtype_imp:
4607 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatKey,
4608 "Unknown feature key " + key);
4609 break;
4610 case CSeqFeatData::eSubtype_repeat_region:
4611 if ((!m_Feat.IsSetComment() || NStr::IsBlank (m_Feat.GetComment()))
4612 && (!m_Feat.IsSetDbxref() || m_Feat.GetDbxref().empty())) {
4613 if (!m_Feat.IsSetQual() || m_Feat.GetQual().empty()) {
4614 PostErr(eDiag_Warning, eErr_SEQ_FEAT_RepeatRegionNeedsNote,
4615 "repeat_region has no qualifiers");
4616 } else if ( ! m_Imp.IsGenomeSubmission() ) {
4617 bool okay = false;
4618 FOR_EACH_GBQUAL_ON_FEATURE (gbqual, m_Feat) {
4619 if ( ! NStr::EqualNocase((*gbqual)->GetQual(), "rpt_type") ) {
4620 okay = true;
4621 break;
4622 }
4623 const string& val = (*gbqual)->GetVal();
4624 if ( ! NStr::Equal (val, "other") ) {
4625 okay = true;
4626 break;
4627 }
4628 }
4629 if ( ! okay ) {
4630 PostErr(eDiag_Warning, eErr_SEQ_FEAT_RepeatRegionNeedsNote,
4631 "repeat_region has no qualifiers except rpt_type other");
4632 }
4633 }
4634 }
4635 break;
4636 case CSeqFeatData::eSubtype_regulatory:
4637 {
4638 vector<string> valid_types = CSeqFeatData::GetRegulatoryClassList();
4639 FOR_EACH_GBQUAL_ON_FEATURE (gbqual, m_Feat) {
4640 if ( NStr::CompareNocase( (*gbqual)->GetQual(), "regulatory_class") != 0 ) continue;
4641 const string& val = (*gbqual)->GetVal();
4642 bool missing = true;
4643 FOR_EACH_STRING_IN_VECTOR (itr, valid_types) {
4644 string str = *itr;
4645 if ( NStr::Equal (str, val) ) {
4646 missing = false;
4647 }
4648 }
4649 if ( missing ) {
4650 if ( NStr::Equal (val, "other") && !m_Feat.IsSetComment() ) {
4651 PostErr(eDiag_Error, eErr_SEQ_FEAT_RegulatoryClassOtherNeedsNote,
4652 "The regulatory_class 'other' is missing the required /note");
4653 }
4654 }
4655 }
4656 }
4657 break;
4658 case CSeqFeatData::eSubtype_misc_recomb:
4659 {
4660 const CGb_qual::TLegalRecombinationClassSet recomb_values = CGb_qual::GetSetOfLegalRecombinationClassValues();
4661 FOR_EACH_GBQUAL_ON_FEATURE (gbqual, m_Feat) {
4662 if ( NStr::CompareNocase( (*gbqual)->GetQual(), "recombination_class") != 0 ) continue;
4663 const string& val = (*gbqual)->GetVal();
4664 if ( recomb_values.find(val.c_str()) == recomb_values.end() ) {
4665 if ( NStr::Equal (val, "other")) {
4666 if (!m_Feat.IsSetComment()) {
4667 PostErr(eDiag_Error, eErr_SEQ_FEAT_RecombinationClassOtherNeedsNote,
4668 "The recombination_class 'other' is missing the required /note");
4669 }
4670 } else {
4671 // Removed per VR-770. FlatFile will automatically
4672 // display the unrecognized recombination_class value
4673 // in the note and list the recombination_class as other
4674 // PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidQualifierValue,
4675 // "'" + val + "' is not a legal value for recombination_class", feat);
4676 }
4677 }
4678 }
4679 }
4680 break;
4681 default:
4682 break;
4683 }// end of switch statement
4684 }
4685
4686
FeatValidatorFactory(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)4687 CSingleFeatValidator* FeatValidatorFactory(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp)
4688 {
4689 if (!feat.IsSetData()) {
4690 return new CSingleFeatValidator(feat, scope, imp);
4691 } else if (feat.GetData().IsCdregion()) {
4692 return new CCdregionValidator(feat, scope, imp);
4693 } else if (feat.GetData().IsGene()) {
4694 return new CGeneValidator(feat, scope, imp);
4695 } else if (feat.GetData().IsProt()) {
4696 return new CProtValidator(feat, scope, imp);
4697 } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
4698 return new CMRNAValidator(feat, scope, imp);
4699 } else if (feat.GetData().IsRna()) {
4700 return new CRNAValidator(feat, scope, imp);
4701 } else if (feat.GetData().IsPub()) {
4702 return new CPubFeatValidator(feat, scope, imp);
4703 } else if (feat.GetData().IsBiosrc()) {
4704 return new CSrcFeatValidator(feat, scope, imp);
4705 } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_exon) {
4706 return new CExonValidator(feat, scope, imp);
4707 } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_intron) {
4708 return new CIntronValidator(feat, scope, imp);
4709 } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
4710 return new CMiscFeatValidator(feat, scope, imp);
4711 } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_assembly_gap) {
4712 return new CAssemblyGapValidator(feat, scope, imp);
4713 } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_polyA_site) {
4714 return new CPolyASiteValidator(feat, scope, imp);
4715 }
4716 else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_polyA_signal) {
4717 return new CPolyASignalValidator(feat, scope, imp);
4718 } else if (feat.GetData().IsImp()) {
4719 CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
4720 switch (subtype) {
4721 case CSeqFeatData::eSubtype_mat_peptide:
4722 case CSeqFeatData::eSubtype_propeptide:
4723 case CSeqFeatData::eSubtype_sig_peptide:
4724 case CSeqFeatData::eSubtype_transit_peptide:
4725 return new CPeptideValidator(feat, scope, imp);
4726 break;
4727 case CSeqFeatData::eSubtype_gap:
4728 return new CGapFeatValidator(feat, scope, imp);
4729 break;
4730 default:
4731 return new CImpFeatValidator(feat, scope, imp);
4732 break;
4733 }
4734 } else {
4735 return new CSingleFeatValidator(feat, scope, imp);
4736 }
4737 }
4738
4739
4740
4741
4742 END_SCOPE(validator)
4743 END_SCOPE(objects)
4744 END_NCBI_SCOPE
4745