1 /* $Id: validatorp.cpp 632625 2021-06-03 17:38:33Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat, ....
27 *
28 * File Description:
29 * Implementation of private parts of the validator
30 * .......
31 *
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <objmgr/object_manager.hpp>
38
39 #include <objtools/validator/validatorp.hpp>
40 #include <objtools/validator/validerror_desc.hpp>
41 #include <objtools/validator/validerror_descr.hpp>
42 #include <objtools/validator/validerror_annot.hpp>
43 #include <objtools/validator/validerror_bioseq.hpp>
44 #include <objtools/validator/validerror_bioseqset.hpp>
45 #include <objtools/validator/utilities.hpp>
46 #include <objtools/validator/validator_barcode.hpp>
47 #include <objtools/cleanup/cleanup.hpp>
48
49 #include <serial/iterator.hpp>
50 #include <serial/enumvalues.hpp>
51
52 #include <objects/general/Dbtag.hpp>
53 #include <objects/general/Person_id.hpp>
54 #include <objects/general/Name_std.hpp>
55
56 #include <objects/seqalign/Seq_align.hpp>
57
58 #include <objects/seqset/Bioseq_set.hpp>
59 #include <objects/seqset/Seq_entry.hpp>
60
61 #include <objects/seq/Bioseq.hpp>
62 #include <objects/seq/Seq_annot.hpp>
63 #include <objects/seq/Seqdesc.hpp>
64 #include <objects/seq/Seq_descr.hpp>
65 #include <objects/seq/Pubdesc.hpp>
66 #include <objects/seq/MolInfo.hpp>
67 #include <objects/seqfeat/BioSource.hpp>
68 #include <objects/seqfeat/OrgMod.hpp>
69 #include <objects/seqfeat/OrgName.hpp>
70 #include <objects/seqfeat/Org_ref.hpp>
71 #include <objects/seqfeat/Seq_feat.hpp>
72 #include <objects/seqfeat/SubSource.hpp>
73
74 #include <objects/seqloc/Seq_loc.hpp>
75 #include <objects/seqloc/Seq_interval.hpp>
76 #include <objects/seqloc/Seq_point.hpp>
77 #include <objects/seqloc/Textseq_id.hpp>
78
79 #include <objects/seqres/Seq_graph.hpp>
80
81 #include <objects/submit/Seq_submit.hpp>
82 #include <objects/submit/Submit_block.hpp>
83
84 #include <objmgr/bioseq_ci.hpp>
85 #include <objmgr/seqdesc_ci.hpp>
86 #include <objmgr/graph_ci.hpp>
87 #include <objmgr/seq_annot_ci.hpp>
88 #include <objmgr/util/feature.hpp>
89 #include <objmgr/util/sequence.hpp>
90
91 #include <objmgr/feat_ci.hpp>
92 #include <objmgr/align_ci.hpp>
93 #include <objmgr/seq_vector.hpp>
94 #include <objmgr/scope.hpp>
95
96 #include <objects/pub/Pub.hpp>
97 #include <objects/pub/Pub_equiv.hpp>
98
99 #include <objects/biblio/Author.hpp>
100 #include <objects/biblio/Auth_list.hpp>
101 #include <objects/biblio/Cit_art.hpp>
102 #include <objects/biblio/Cit_book.hpp>
103 #include <objects/biblio/Cit_gen.hpp>
104 #include <objects/biblio/Cit_jour.hpp>
105 #include <objects/biblio/Cit_let.hpp>
106 #include <objects/biblio/Cit_proc.hpp>
107 #include <objects/biblio/Cit_sub.hpp>
108 #include <objects/biblio/PubMedId.hpp>
109 #include <objects/biblio/PubStatus.hpp>
110 #include <objects/biblio/Title.hpp>
111 #include <objects/biblio/Imprint.hpp>
112 #include <objects/biblio/Affil.hpp>
113 #include <objects/misc/sequence_macros.hpp>
114 #include <objects/taxon3/itaxon3.hpp>
115 #include <objects/taxon3/taxon3.hpp>
116 #include <objects/taxon3/Taxon3_reply.hpp>
117
118 #include <objects/valid/Comment_set.hpp>
119 #include <objects/valid/Comment_rule.hpp>
120 #include <objects/valid/Field_set.hpp>
121 #include <objects/valid/Field_rule.hpp>
122 #include <objects/valid/Dependent_field_set.hpp>
123 #include <objects/valid/Dependent_field_rule.hpp>
124
125 #include <objtools/error_codes.hpp>
126 #include <objtools/validator/validerror_format.hpp>
127 #include <objtools/validator/utilities.hpp>
128 #include <objtools/edit/seq_entry_edit.hpp>
129 #include <util/sgml_entity.hpp>
130 #include <util/line_reader.hpp>
131 #include <util/util_misc.hpp>
132 #include <util/static_set.hpp>
133
134 #include <algorithm>
135
136
137 #include <serial/iterator.hpp>
138
139 #define NCBI_USE_ERRCODE_X Objtools_Validator
140
141 BEGIN_NCBI_SCOPE
142 BEGIN_SCOPE(objects)
143 BEGIN_SCOPE(validator)
144 using namespace sequence;
145
146 namespace {
147 // avoid creating a PQuickStringLess for every comparison
148 PQuickStringLess s_QuickStringLess;
149 };
150
151
152 // =============================================================================
153 // CValidError_imp Public
154 // =============================================================================
155
// Out-of-class definitions of CCacheImpl's static constants.

// "Any type" / "any subtype" values: one less than e_not_set and
// eSubtype_bad respectively, i.e. values that are not produced by
// subtracting from any other named enumerator here.
// NOTE(review): presumably used as wildcard keys for cache lookups --
// confirm against the CCacheImpl declaration.
const CSeqFeatData::E_Choice CCacheImpl::kAnyFeatType =
    static_cast<CSeqFeatData::E_Choice>(CSeqFeatData::e_not_set - 1);
const CSeqFeatData::ESubtype CCacheImpl::kAnyFeatSubtype =
    static_cast<CSeqFeatData::ESubtype>(CSeqFeatData::eSubtype_bad - 1);

// Default-constructed feature value.
const CCacheImpl::TFeatValue CCacheImpl::kEmptyFeatValue;

// Default-constructed handles.
const CBioseq_Handle CCacheImpl::kEmptyBioseqHandle;
const CTSE_Handle CCacheImpl::kEmptyTSEHandle;
const CBioseq_Handle CCacheImpl::kAnyBioseq;
165
166 //LCOV_EXCL_START
167 //not used by asnvalidate
168 // Constructor
CValidError_imp(CObjectManager & objmgr,CValidError * errs,Uint4 options)169 CValidError_imp::CValidError_imp
170 (CObjectManager& objmgr,
171 CValidError* errs,
172 Uint4 options) :
173 m_ObjMgr(&objmgr),
174 m_ErrRepository(errs),
175 m_taxon(NULL)
176 {
177 x_Init(options);
178 }
179 //LCOV_EXCL_STOP
180
181 // Constructor
CValidError_imp(CObjectManager & objmgr,CValidError * errs,ITaxon3 * taxon,Uint4 options)182 CValidError_imp::CValidError_imp
183 (CObjectManager& objmgr,
184 CValidError* errs,
185 ITaxon3* taxon,
186 Uint4 options) :
187 m_ObjMgr(&objmgr),
188 m_ErrRepository(errs),
189 m_taxon(taxon)
190 {
191 x_Init(options);
192 }
193
194
x_Init(Uint4 options)195 void CValidError_imp::x_Init(Uint4 options)
196 {
197 SetOptions(options);
198 Reset();
199
200 if (m_SourceQualTags.get() == 0) {
201 InitializeSourceQualTags();
202 }
203 }
204
205 // Destructor
~CValidError_imp()206 CValidError_imp::~CValidError_imp()
207 {
208 }
209
210
SetOptions(Uint4 options)211 void CValidError_imp::SetOptions(Uint4 options)
212 {
213 m_NonASCII = (options & CValidator::eVal_non_ascii) != 0;
214 m_SuppressContext = (options & CValidator::eVal_no_context) != 0;
215 m_ValidateAlignments = (options & CValidator::eVal_val_align) != 0;
216 m_ValidateExons = (options & CValidator::eVal_val_exons) != 0;
217 m_OvlPepErr = (options & CValidator::eVal_ovl_pep_err) != 0;
218 m_RequireISOJTA = (options & CValidator::eVal_need_isojta) != 0;
219 m_ValidateIdSet = (options & CValidator::eVal_validate_id_set) != 0;
220 m_RemoteFetch = (options & CValidator::eVal_remote_fetch) != 0;
221 m_FarFetchMRNAproducts = (options & CValidator::eVal_far_fetch_mrna_products) != 0;
222 m_FarFetchCDSproducts = (options & CValidator::eVal_far_fetch_cds_products) != 0;
223 m_LocusTagGeneralMatch = (options & CValidator::eVal_locus_tag_general_match) != 0;
224 m_DoRubiscoText = (options & CValidator::eVal_do_rubisco_test) != 0;
225 m_IndexerVersion = (options & CValidator::eVal_indexer_version) != 0;
226 m_UseEntrez = (options & CValidator::eVal_use_entrez) != 0;
227 m_DoTaxLookup = (options & CValidator::eVal_do_tax_lookup) != 0;
228 m_DoBarcodeTests = (options & CValidator::eVal_do_barcode_tests) != 0;
229 m_RefSeqConventions = (options & CValidator::eVal_refseq_conventions) != 0;
230 m_SeqSubmitParent = (options & CValidator::eVal_seqsubmit_parent) != 0;
231 m_ValidateInferenceAccessions = (options & CValidator::eVal_inference_accns) != 0;
232 m_IgnoreExceptions = (options & CValidator::eVal_ignore_exceptions) != 0;
233 m_ReportSpliceAsError = (options & CValidator::eVal_report_splice_as_error) != 0;
234 m_LatLonCheckState = (options & CValidator::eVal_latlon_check_state) != 0;
235 m_LatLonIgnoreWater = (options & CValidator::eVal_latlon_ignore_water) != 0;
236 m_genomeSubmission = (options & CValidator::eVal_genome_submission) != 0;
237 m_CollectLocusTags = (options & CValidator::eVal_collect_locus_tags) != 0;
238 m_GenerateGoldenFile = (options & CValidator::eVal_generate_golden_file) != 0;
239 m_CompareVDJCtoCDS = (options & CValidator::eVal_compare_vdjc_to_cds) != 0;
240 }
241
242
243 //LCOV_EXCL_START
244 //not used by asnvalidate
SetErrorRepository(CValidError * errors)245 void CValidError_imp::SetErrorRepository(CValidError* errors)
246 {
247 m_ErrRepository = errors;
248 }
249 //LCOV_EXCL_STOP
250
251
// Returns the per-validation state to its starting values: context
// members are cleared, flags are set to false (with one exception noted
// below), and counters are zeroed.  The option members assigned by
// SetOptions() are not touched here.
void CValidError_imp::Reset(void)
{
    // Validation context.
    m_Scope = 0;
    m_TSE = 0;
    m_IsStandaloneAnnot = false;
    m_SeqAnnot.Reset(NULL);

    // Content flags.
    m_NoPubs = false;
    m_NoCitSubPubs = false;
    m_NoBioSource = false;
    m_IsGPS = false;
    m_IsGED = false;
    m_IsPDB = false;
    m_IsPatent = false;
    m_IsRefSeq = false;
    m_IsEmbl = false;
    m_IsDdbj = false;
    m_IsTPE = false;
    m_IsNC = false;
    m_IsNG = false;
    m_IsNM = false;
    m_IsNP = false;
    m_IsNR = false;
    m_IsNZ = false;
    m_IsNS = false;
    m_IsNT = false;
    m_IsNW = false;
    m_IsWP = false;
    m_IsXR = false;
    m_IsGI = false;
    m_IsGB = false;
    m_IsGpipe = false;
    // Unlike the surrounding flags, this one starts out true.
    m_IsLocalGeneralOnly = true;
    m_HasGiOrAccnVer = false;
    m_IsGenomic = false;
    m_IsSeqSubmit = false;
    m_IsSmallGenomeSet = false;
    m_FeatLocHasGI = false;
    m_ProductLocHasGI = false;
    m_GeneHasLocusTag = false;
    m_ProteinHasGeneralID = false;
    m_IsINSDInSep = false;
    m_IsGeneious = false;

    // Progress callback.
    m_PrgCallback = 0;

    // Counters.
    m_NumAlign = 0;
    m_NumAnnot = 0;
    m_NumBioseq = 0;
    m_NumBioseq_set = 0;
    m_NumTopSetSiblings = 0;
    m_NumDesc = 0;
    m_NumDescr = 0;
    m_NumFeat = 0;
    m_NumGraph = 0;
    m_NumMisplacedFeatures = 0;
    m_NumSmallGenomeSetMisplaced = 0;
    m_NumMisplacedGraphs = 0;
    m_NumGenes = 0;
    m_NumGeneXrefs = 0;
    m_NumTpaWithHistory = 0;
    m_NumTpaWithoutHistory = 0;
    m_NumPseudo = 0;
    m_NumPseudogene = 0;

    // Remaining flags.
    m_FarFetchFailure = false;
    m_IsTbl2Asn = false;
}
316
317
318 // Error post methods
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSerialObject & obj)319 void CValidError_imp::PostErr
320 (EDiagSev sv,
321 EErrType et,
322 const string& msg,
323 const CSerialObject& obj)
324 {
325 const CTypeInfo* type_info = obj.GetThisTypeInfo();
326 if (type_info == CSeqdesc::GetTypeInfo()) {
327 const CSeqdesc* desc = dynamic_cast < const CSeqdesc* > (&obj);
328 ERR_POST_X(1, Warning << "Seqdesc validation error using default context.");
329 PostErr (sv, et, msg, GetTSE(), *desc);
330 } else if (type_info == CSeq_feat::GetTypeInfo()) {
331 const CSeq_feat* feat = dynamic_cast < const CSeq_feat* > (&obj);
332 PostErr (sv, et, msg, *feat);
333 } else if (type_info == CBioseq::GetTypeInfo()) {
334 const CBioseq* seq = dynamic_cast < const CBioseq* > (&obj);
335 PostErr (sv, et, msg, *seq);
336 } else if (type_info == CBioseq_set::GetTypeInfo()) {
337 const CBioseq_set* set = dynamic_cast < const CBioseq_set* > (&obj);
338 PostErr (sv, et, msg, *set);
339 } else if (type_info == CSeq_annot::GetTypeInfo()) {
340 const CSeq_annot* annot = dynamic_cast < const CSeq_annot* > (&obj);
341 PostErr (sv, et, msg, *annot);
342 } else if (type_info == CSeq_graph::GetTypeInfo()) {
343 const CSeq_graph* graph = dynamic_cast < const CSeq_graph* > (&obj);
344 PostErr (sv, et, msg, *graph);
345 } else if (type_info == CSeq_align::GetTypeInfo()) {
346 const CSeq_align* align = dynamic_cast < const CSeq_align* > (&obj);
347 PostErr (sv, et, msg, *align);
348 } else if (type_info == CSeq_entry::GetTypeInfo()) {
349 const CSeq_entry* entry = dynamic_cast < const CSeq_entry* > (&obj);
350 PostErr (sv, et, msg, *entry);
351 } else if (type_info == CBioSource::GetTypeInfo()) {
352 const CBioSource* src = dynamic_cast < const CBioSource* > (&obj);
353 PostErr (sv, et, msg, *src);
354 } else if (type_info == COrg_ref::GetTypeInfo()) {
355 const COrg_ref* org = dynamic_cast < const COrg_ref* > (&obj);
356 PostErr (sv, et, msg, *org);
357 } else if (type_info == CPubdesc::GetTypeInfo()) {
358 const CPubdesc* pd = dynamic_cast < const CPubdesc* > (&obj);
359 PostErr (sv, et, msg, *pd);
360 } else if (type_info == CSeq_submit::GetTypeInfo()) {
361 const CSeq_submit* ss = dynamic_cast < const CSeq_submit* > (&obj);
362 PostErr (sv, et, msg, *ss);
363 } else {
364 ERR_POST_X(1, Warning << "Unknown data type in PostErr.");
365 }
366 }
367
368
369 /*
370 void CValidError_imp::PostErr
371 (EDiagSev sv,
372 EErrType et,
373 const string& msg,
374 TDesc ds)
375 {
376 // Append Descriptor label
377 string desc = "DESCRIPTOR: ";
378 ds.GetLabel (&desc, CSeqdesc::eBoth);
379 desc += ", NO Descriptor Context";
380 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
381 }
382 */
383
// Error types for which RaiseGenomeSeverity() returns true regardless
// of the record's origin (unless the type is also listed in one of the
// more specific "except" tables, which RaiseGenomeSeverity() consults
// first).
// NOTE(review): the CStaticArraySet built from this array presumably
// requires the entries to stay sorted by enumerator value -- confirm
// before inserting or reordering entries.
static const EErrType sc_ValidGenomeRaise[] = {
    // SEQ_INST
    eErr_SEQ_INST_ShortSeq,
    eErr_SEQ_INST_ConflictingBiomolTech,
    eErr_SEQ_INST_DuplicateSegmentReferences,
    eErr_SEQ_INST_BadSeqIdFormat,
    eErr_SEQ_INST_TerminalNs,
    eErr_SEQ_INST_UnexpectedIdentifierChange,
    eErr_SEQ_INST_TpaAssemblyProblem,
    eErr_SEQ_INST_SeqLocLength,
    eErr_SEQ_INST_CompleteTitleProblem,
    eErr_SEQ_INST_BadHTGSeq,
    eErr_SEQ_INST_OverlappingDeltaRange,
    eErr_SEQ_INST_InternalNsInSeqRaw,
    eErr_SEQ_INST_FarFetchFailure,
    eErr_SEQ_INST_InternalGapsInSeqRaw,
    eErr_SEQ_INST_HighNContentStretch,
    eErr_SEQ_INST_UnknownLengthGapNot100,
    eErr_SEQ_INST_CompleteGenomeHasGaps,
    // SEQ_DESCR
    eErr_SEQ_DESCR_BioSourceMissing,
    eErr_SEQ_DESCR_InvalidForType,
    eErr_SEQ_DESCR_InconsistentBioSources,
    eErr_SEQ_DESCR_BadOrganelleLocation,
    eErr_SEQ_DESCR_MultipleChromosomes,
    eErr_SEQ_DESCR_BadOrgMod,
    eErr_SEQ_DESCR_Inconsistent,
    eErr_SEQ_DESCR_ObsoleteSourceLocation,
    eErr_SEQ_DESCR_ObsoleteSourceQual,
    eErr_SEQ_DESCR_UnwantedCompleteFlag,
    eErr_SEQ_DESCR_CollidingPublications,
    eErr_SEQ_DESCR_TransgenicProblem,
    eErr_SEQ_DESCR_BioSourceInconsistency,
    eErr_SEQ_DESCR_BadCollectionDate,
    eErr_SEQ_DESCR_BadPCRPrimerSequence,
    eErr_SEQ_DESCR_BioSourceOnProtein,
    eErr_SEQ_DESCR_BioSourceDbTagConflict,
    eErr_SEQ_DESCR_DuplicatePCRPrimerSequence,
    eErr_SEQ_DESCR_MultipleNames,
    eErr_SEQ_DESCR_LatLonRange,
    eErr_SEQ_DESCR_LatLonValue,
    eErr_SEQ_DESCR_LatLonCountry,
    eErr_SEQ_DESCR_BadCollectionCode,
    eErr_SEQ_DESCR_IncorrectlyFormattedVoucherID,
    eErr_SEQ_DESCR_MultipleSourceQualifiers,
    eErr_SEQ_DESCR_IdenticalInstitutionCode,
    eErr_SEQ_DESCR_WrongVoucherType,
    eErr_SEQ_DESCR_BadKeyword,
    eErr_SEQ_DESCR_BioSourceNeedsChromosome,
    eErr_SEQ_DESCR_MolInfoConflictsWithBioSource,
    eErr_SEQ_DESCR_TaxonomyIsSpeciesProblem,
    eErr_SEQ_DESCR_BadAltitude,
    eErr_SEQ_DESCR_DBLinkMissingUserObject,
    // GENERIC
    eErr_GENERIC_UnnecessaryPubEquiv,
    eErr_GENERIC_CollidingSerialNumbers,
    eErr_GENERIC_PublicationInconsistency,
    eErr_GENERIC_SgmlPresentInText,
    eErr_GENERIC_MissingPubRequirement,
    // SEQ_PKG
    eErr_SEQ_PKG_EmptySet,
    eErr_SEQ_PKG_FeaturePackagingProblem,
    eErr_SEQ_PKG_GenomicProductPackagingProblem,
    eErr_SEQ_PKG_ArchaicFeatureLocation,
    eErr_SEQ_PKG_ArchaicFeatureProduct,
    eErr_SEQ_PKG_InternalGenBankSet,
    eErr_SEQ_PKG_BioseqSetClassNotSet,
    eErr_SEQ_PKG_MissingSetTitle,
    eErr_SEQ_PKG_NucProtSetHasTitle,
    eErr_SEQ_PKG_ComponentMissingTitle,
    eErr_SEQ_PKG_SingleItemSet,
    eErr_SEQ_PKG_MisplacedMolInfo,
    eErr_SEQ_PKG_ImproperlyNestedSets,
    eErr_SEQ_PKG_SeqSubmitWithWgsSet,
    eErr_SEQ_PKG_InconsistentMoltypeSet,
    // SEQ_FEAT
    eErr_SEQ_FEAT_Range,
    eErr_SEQ_FEAT_MixedStrand,
    eErr_SEQ_FEAT_SeqLocOrder,
    eErr_SEQ_FEAT_TransLen,
    eErr_SEQ_FEAT_TranslExcept,
    eErr_SEQ_FEAT_OrfCdsHasProduct,
    eErr_SEQ_FEAT_GeneRefHasNoData,
    eErr_SEQ_FEAT_ProtRefHasNoData,
    eErr_SEQ_FEAT_RNAtype0,
    eErr_SEQ_FEAT_UnknownImpFeatKey,
    eErr_SEQ_FEAT_UnknownImpFeatQual,
    eErr_SEQ_FEAT_WrongQualOnImpFeat,
    eErr_SEQ_FEAT_MissingQualOnImpFeat,
    eErr_SEQ_FEAT_IllegalDbXref,
    eErr_SEQ_FEAT_FarLocation,
    eErr_SEQ_FEAT_TranslExceptPhase,
    eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
    eErr_SEQ_FEAT_InvalidQualifierValue,
    eErr_SEQ_FEAT_CDSproductPackagingProblem,
    eErr_SEQ_FEAT_DuplicateExonInterval,
    eErr_SEQ_FEAT_DuplicateAnticodonInterval,
    eErr_SEQ_FEAT_AbuttingIntervals,
    eErr_SEQ_FEAT_MissingCDSproduct,
    eErr_SEQ_FEAT_OnlyGeneXrefs,
    eErr_SEQ_FEAT_UTRdoesNotAbutCDS,
    eErr_SEQ_FEAT_ConflictFlagSet,
    eErr_SEQ_FEAT_LocusTagProblem,
    eErr_SEQ_FEAT_GenesInconsistent,
    eErr_SEQ_FEAT_TranslExceptAndRnaEditing,
    eErr_SEQ_FEAT_NoNameForProtein,
    eErr_SEQ_FEAT_MissingGeneXref,
    eErr_SEQ_FEAT_FeatureCitationProblem,
    eErr_SEQ_FEAT_WrongQualOnFeature,
    eErr_SEQ_FEAT_UnknownFeatureQual,
    eErr_SEQ_FEAT_BadCharInAuthorName,
    eErr_SEQ_FEAT_CDSwithMultipleMRNAs,
    eErr_SEQ_FEAT_MultipleEquivBioSources,
    eErr_SEQ_FEAT_MultipleEquivPublications,
    eErr_SEQ_FEAT_BadFullLengthFeature,
    eErr_SEQ_FEAT_RedundantFields,
    eErr_SEQ_FEAT_CDSwithNoMRNAOverlap,
    eErr_SEQ_FEAT_FeatureProductInconsistency,
    eErr_SEQ_FEAT_ImproperBondLocation,
    eErr_SEQ_FEAT_GeneXrefWithoutGene,
    eErr_SEQ_FEAT_MissingTrnaAA,
    eErr_SEQ_FEAT_OldLocusTagMismtach,
    eErr_SEQ_FEAT_InvalidInferenceValue,
    eErr_SEQ_FEAT_HypotheticalProteinMismatch,
    eErr_SEQ_FEAT_WholeLocation,
    eErr_SEQ_FEAT_BadEcNumberFormat,
    eErr_SEQ_FEAT_EcNumberProblem,
    eErr_SEQ_FEAT_VectorContamination,
    eErr_SEQ_FEAT_MinusStrandProtein,
    eErr_SEQ_FEAT_BadProteinName,
    eErr_SEQ_FEAT_GeneXrefWithoutLocus,
    eErr_SEQ_FEAT_CDShasTooManyXs,
    eErr_SEQ_FEAT_TerminalXDiscrepancy,
    eErr_SEQ_FEAT_UnnecessaryTranslExcept,
    eErr_SEQ_FEAT_FeatureInsideGap,
    eErr_SEQ_FEAT_BadAnticodonAA,
    eErr_SEQ_FEAT_BadAnticodonCodon,
    eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap,
    eErr_SEQ_FEAT_GeneOntologyTermMissingGOID,
    eErr_SEQ_FEAT_PseudoRnaHasProduct,
    eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct,
    eErr_SEQ_FEAT_BadRRNAcomponentOrder,
    eErr_SEQ_FEAT_BadRRNAcomponentOverlap,
    eErr_SEQ_FEAT_MultipleProtRefs,
    eErr_SEQ_FEAT_BadInternalCharacter,
    eErr_SEQ_FEAT_BadTrailingCharacter,
    eErr_SEQ_FEAT_BadTrailingHyphen,
    eErr_SEQ_FEAT_BadCharInAuthorLastName,
    eErr_SEQ_FEAT_GeneXrefNeeded,
    eErr_SEQ_FEAT_ProteinNameHasPMID,
    eErr_SEQ_FEAT_BadGeneOntologyFormat,
    eErr_SEQ_FEAT_InconsistentGeneOntologyTermAndId,
    eErr_SEQ_FEAT_ShortIntron,
    eErr_SEQ_FEAT_GeneXrefStrandProblem,
    eErr_SEQ_FEAT_CDSmRNAXrefLocationProblem,
    eErr_SEQ_FEAT_LocusCollidesWithLocusTag,
    eErr_SEQ_FEAT_RptUnitRangeProblem,
    eErr_SEQ_FEAT_InconsistentRRNAstrands,
    eErr_SEQ_FEAT_CDSrange,
    // SEQ_GRAPH
    eErr_SEQ_GRAPH_GraphAbove,
    eErr_SEQ_GRAPH_GraphOutOfOrder,
    eErr_SEQ_GRAPH_GraphSeqLocLen,
    eErr_SEQ_GRAPH_GraphBioseqId
};

// Lookup set over the table above; queried by RaiseGenomeSeverity().
DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<EErrType>, sc_GenomeRaiseArray, sc_ValidGenomeRaise);
545
// Error types for which RaiseGenomeSeverity() returns true unless
// IsEmbl() or IsDdbj() is true.
// NOTE(review): the CStaticArraySet built from this array presumably
// requires the entries to stay sorted by enumerator value -- confirm
// before inserting or reordering entries.
static const EErrType sc_ValidGenomeRaiseExceptEmblDdbj[] = {
    eErr_SEQ_INST_CompleteTitleProblem,
    eErr_SEQ_INST_CompleteGenomeHasGaps,
    eErr_SEQ_FEAT_MiscFeatureNeedsNote,
    eErr_SEQ_FEAT_RepeatRegionNeedsNote
};

// Lookup set over the table above; queried by RaiseGenomeSeverity().
DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<EErrType>, sc_GenomeRaiseExceptEmblDdbjArray, sc_ValidGenomeRaiseExceptEmblDdbj);
554
555
// Error types for which RaiseGenomeSeverity() returns true unless
// IsEmbl(), IsDdbj() or IsRefSeq() is true.
static const EErrType sc_ValidGenomeRaiseExceptEmblDdbjRefSeq[] = {
    eErr_SEQ_DESCR_BadInstitutionCode
};

// Lookup set over the table above; queried by RaiseGenomeSeverity().
DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<EErrType>, sc_GenomeRaiseExceptEmblDdbjRefSeqArray, sc_ValidGenomeRaiseExceptEmblDdbjRefSeq);
561
562
RaiseGenomeSeverity(EErrType et)563 bool CValidError_imp::RaiseGenomeSeverity(
564 EErrType et
565 )
566
567 {
568 if (sc_GenomeRaiseExceptEmblDdbjRefSeqArray.find(et) != sc_GenomeRaiseExceptEmblDdbjRefSeqArray.end()) {
569 if (IsEmbl() || IsDdbj() || IsRefSeq()) {
570 return false;
571 } else {
572 return true;
573 }
574 }
575 if (sc_GenomeRaiseExceptEmblDdbjArray.find(et) != sc_GenomeRaiseExceptEmblDdbjArray.end()) {
576 if (IsEmbl() || IsDdbj()) {
577 return false;
578 } else {
579 return true;
580 }
581 }
582 if (sc_GenomeRaiseArray.find (et) != sc_GenomeRaiseArray.end()) {
583 return true;
584 }
585 return false;
586 }
587
PostErr(EDiagSev sv,EErrType et,const string & msg,TFeat ft)588 void CValidError_imp::PostErr
589 (EDiagSev sv,
590 EErrType et,
591 const string& msg,
592 TFeat ft)
593 {
594 CRef<CValidErrItem> item(new CValidErrItem());
595
596 // Adjust severity
597 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
598 sv = eDiag_Error;
599 }
600
601 item->SetSev(sv);
602 item->SetErrIndex(et);
603 item->SetMsg(msg);
604 item->SetObject(ft);
605
606 if (GenerateGoldenFile()) {
607 m_ErrRepository->AddValidErrItem(item);
608 return;
609 }
610
611 string content_label = CValidErrorFormat::GetFeatureContentLabel(ft, m_Scope);
612 item->SetObj_content(content_label);
613
614 string feature_id = CValidErrorFormat::GetFeatureIdLabel(ft);
615 if (!NStr::IsBlank(feature_id)) {
616 item->SetFeatureId(feature_id);
617 }
618
619 string bioseq_label = CValidErrorFormat::GetFeatureBioseqLabel(ft, m_Scope, m_SuppressContext);
620 if (!NStr::IsBlank(bioseq_label)) {
621 item->SetBioseq(bioseq_label);
622 }
623
624 // Calculate sequence offset
625 TSeqPos offset = 0;
626 string location;
627 if (ft.IsSetLocation()) {
628 offset = ft.GetLocation().GetStart(eExtreme_Positional);
629 string loc_label = CValidErrorFormat::GetFeatureLocationLabel(ft, m_Scope, m_SuppressContext);
630 if (!NStr::IsBlank(loc_label)) {
631 item->SetLocation(loc_label);
632 }
633 item->SetSeqOffset(offset);
634 }
635
636
637 string product_label = CValidErrorFormat::GetFeatureProductLocLabel(ft, m_Scope, m_SuppressContext);
638 if (!NStr::IsBlank(product_label)) {
639 item->SetProduct_loc(product_label);
640 }
641
642 int version = 0;
643 string accession;
644 if (m_Scope) {
645 accession = GetAccessionFromObjects(&ft, NULL, *m_Scope, &version);
646 }
647 item->SetAccession(accession);
648 if (version > 0) {
649 item->SetAccnver(accession + "." + NStr::IntToString(version));
650 item->SetVersion(version);
651 } else {
652 item->SetAccnver(accession);
653 }
654
655 if (ft.IsSetData()) {
656 if (ft.GetData().IsGene()) {
657 if (ft.GetData().GetGene().IsSetLocus_tag() &&
658 !NStr::IsBlank(ft.GetData().GetGene().GetLocus_tag())) {
659 item->SetLocus_tag(ft.GetData().GetGene().GetLocus_tag());
660 }
661 } else {
662 if (m_CollectLocusTags) {
663 // TODO: this should be part of post-processing
664 CConstRef<CSeq_feat> gene = GetGeneCache().GetGeneFromCache(&ft, *m_Scope);
665 if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
666 !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
667 item->SetLocus_tag(gene->GetData().GetGene().GetLocus_tag());
668 }
669 }
670 }
671 }
672
673 item->SetFeatureObjDescFromFields();
674 m_ErrRepository->AddValidErrItem(item);
675 }
676
677
PostErr(EDiagSev sv,EErrType et,const string & msg,TBioseq sq)678 void CValidError_imp::PostErr
679 (EDiagSev sv,
680 EErrType et,
681 const string& msg,
682 TBioseq sq)
683 {
684 // Adjust severity
685 if (m_genomeSubmission && sv < eDiag_Error && RaiseGenomeSeverity(et)) {
686 sv = eDiag_Error;
687 }
688
689 if (GenerateGoldenFile()) {
690 m_ErrRepository->AddValidErrItem(sv, et, msg);
691 return;
692 }
693
694 // Append bioseq label
695 string desc;
696 AppendBioseqLabel(desc, sq, m_SuppressContext);
697 int version = 0;
698 const string& accession = GetAccessionFromObjects(&sq, NULL, *m_Scope, &version);
699 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, sq, accession, version);
700 }
701
702
PostErr(EDiagSev sv,EErrType et,const string & msg,TSet st)703 void CValidError_imp::PostErr
704 (EDiagSev sv,
705 EErrType et,
706 const string& msg,
707 TSet st)
708 {
709 // Adjust severity
710 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
711 sv = eDiag_Error;
712 }
713
714 if (GenerateGoldenFile()) {
715 m_ErrRepository->AddValidErrItem(sv, et, msg);
716 return;
717 }
718
719 // Append Bioseq_set label
720 int version = 0;
721 const string& accession = GetAccessionFromObjects(&st, NULL, *m_Scope, &version);
722 string desc = CValidErrorFormat::GetBioseqSetLabel(st, m_Scope, m_SuppressContext);
723 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, st, accession, version);
724 }
725
726
PostErr(EDiagSev sv,EErrType et,const string & msg,TEntry ctx,TDesc ds)727 void CValidError_imp::PostErr
728 (EDiagSev sv,
729 EErrType et,
730 const string& msg,
731 TEntry ctx,
732 TDesc ds)
733 {
734 // Adjust severity
735 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
736 sv = eDiag_Error;
737 }
738
739 if (GenerateGoldenFile()) {
740 m_ErrRepository->AddValidErrItem(sv, et, msg);
741 return;
742 }
743
744 // Append Descriptor label
745 string desc = CValidErrorFormat::GetDescriptorLabel(ds, ctx, m_Scope, m_SuppressContext);
746 int version = 0;
747 const string& accession = GetAccessionFromObjects(&ds, &ctx, *m_Scope, &version);
748 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, ctx, accession, version);
749 }
750
751
752 //void CValidError_imp::PostErr
753 //(EDiagSev sv,
754 // EErrType et,
755 // const string& msg,
756 // TBioseq sq,
757 // TDesc ds)
758 //{
759 // // Append Descriptor label
760 // string desc("DESCRIPTOR: ");
761 // ds.GetLabel(&desc, CSeqdesc::eBoth);
762 //
763 // s_AppendBioseqLabel(desc, sq, m_SuppressContext);
764 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
765 // //PostErr(sv, et, msg, sq);
766 //}
767
768
769 //void CValidError_imp::PostErr
770 //(EDiagSev sv,
771 // EErrType et,
772 // const string& msg,
773 // TSet st,
774 // TDesc ds)
775 //{
776 // // Append Descriptor label
777 // string desc = " DESCRIPTOR: ";
778 // ds.GetLabel(&desc, CSeqdesc::eBoth);
779 // s_AppendSetLabel(desc, st, m_SuppressContext);
780 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, st, *m_Scope);
781 //
782 //}
783
784
PostErr(EDiagSev sv,EErrType et,const string & msg,TAnnot an)785 void CValidError_imp::PostErr
786 (EDiagSev sv,
787 EErrType et,
788 const string& msg,
789 TAnnot an)
790 {
791 // Adjust severity
792 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
793 sv = eDiag_Error;
794 }
795
796 if (GenerateGoldenFile()) {
797 m_ErrRepository->AddValidErrItem(sv, et, msg);
798 return;
799 }
800
801 // Append Annotation label
802 string desc = "ANNOTATION: ";
803
804 // !!! need to decide on the message
805
806 int version = 0;
807 const string& accession = GetAccessionFromObjects(&an, NULL, *m_Scope, &version);
808 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, an, accession, version);
809 }
810
811
PostErr(EDiagSev sv,EErrType et,const string & msg,TGraph graph)812 void CValidError_imp::PostErr
813 (EDiagSev sv,
814 EErrType et,
815 const string& msg,
816 TGraph graph)
817 {
818 // Adjust severity
819 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
820 sv = eDiag_Error;
821 }
822
823 if (GenerateGoldenFile()) {
824 m_ErrRepository->AddValidErrItem(sv, et, msg);
825 return;
826 }
827
828 // Append Graph label
829 string desc = "GRAPH: ";
830 if (graph.IsSetTitle()) {
831 desc += graph.GetTitle();
832 } else {
833 desc += "<Unnamed>";
834 }
835 desc += " ";
836 graph.GetLoc().GetLabel(&desc);
837
838 int version = 0;
839 const string& accession = GetAccessionFromObjects(&graph, NULL, *m_Scope, &version);
840 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, graph, accession, version);
841 }
842
843
PostErr(EDiagSev sv,EErrType et,const string & msg,TBioseq sq,TGraph graph)844 void CValidError_imp::PostErr
845 (EDiagSev sv,
846 EErrType et,
847 const string& msg,
848 TBioseq sq,
849 TGraph graph)
850 {
851 // Adjust severity
852 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
853 sv = eDiag_Error;
854 }
855
856 if (GenerateGoldenFile()) {
857 m_ErrRepository->AddValidErrItem(sv, et, msg);
858 return;
859 }
860
861 // Append Graph label
862 string desc("GRAPH: ");
863 if ( graph.IsSetTitle() ) {
864 desc += graph.GetTitle();
865 } else {
866 desc += "<Unnamed>";
867 }
868 desc += " ";
869 graph.GetLoc().GetLabel(&desc);
870 AppendBioseqLabel(desc, sq, m_SuppressContext);
871 int version = 0;
872 const string& accession = GetAccessionFromObjects(&graph, NULL, *m_Scope, &version);
873 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, graph, accession, version);
874 }
875
876
PostErr(EDiagSev sv,EErrType et,const string & msg,TAlign align)877 void CValidError_imp::PostErr
878 (EDiagSev sv,
879 EErrType et,
880 const string& msg,
881 TAlign align)
882 {
883 // Adjust severity
884 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
885 sv = eDiag_Error;
886 }
887
888 if (GenerateGoldenFile()) {
889 m_ErrRepository->AddValidErrItem(sv, et, msg);
890 return;
891 }
892
893 CConstRef<CSeq_id> id = GetReportableSeqIdForAlignment(align, *m_Scope);
894 if (id) {
895 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(*id);
896 if (bsh) {
897 PostErr(sv, et, msg, *(bsh.GetCompleteBioseq()));
898 return;
899 }
900 }
901
902 // Can't get bioseq for reporting, use other Alignment label
903 string desc = "ALIGNMENT: ";
904 if (align.IsSetType()) {
905 desc += align.ENUM_METHOD_NAME(EType)()->FindName(align.GetType(), true);
906 }
907 try {
908 CSeq_align::TDim dim = align.GetDim();
909 desc += ", dim=" + NStr::NumericToString(dim);
910 } catch ( const CUnassignedMember ) {
911 desc += ", dim=UNASSIGNED";
912 }
913
914 if (align.IsSetSegs()) {
915 desc += " SEGS: ";
916 desc += align.GetSegs().SelectionName(align.GetSegs().Which());
917 }
918
919 int version = 0;
920 const string& accession = GetAccessionFromObjects(&align, NULL, *m_Scope, &version);
921 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, align, accession, version);
922 }
923
924
PostErr(EDiagSev sv,EErrType et,const string & msg,TEntry entry)925 void CValidError_imp::PostErr
926 (EDiagSev sv,
927 EErrType et,
928 const string& msg,
929 TEntry entry)
930 {
931 // Adjust severity
932 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
933 sv = eDiag_Error;
934 }
935
936 if (GenerateGoldenFile()) {
937 m_ErrRepository->AddValidErrItem(sv, et, msg);
938 return;
939 }
940
941 if (entry.IsSeq()) {
942 PostErr(sv, et, msg, entry.GetSeq());
943 } else if (entry.IsSet()) {
944 PostErr(sv, et, msg, entry.GetSet());
945 } else {
946 string desc = "SEQ-ENTRY: ";
947 entry.GetLabel(&desc, CSeq_entry::eContent);
948
949 int version = 0;
950 const string& accession = GetAccessionFromObjects(&entry, NULL, *m_Scope, &version);
951 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, entry, accession, version);
952 }
953 }
954
955
PostErr(EDiagSev sv,EErrType et,const string & msg,const CBioSource & src)956 void CValidError_imp::PostErr
957 (EDiagSev sv,
958 EErrType et,
959 const string& msg,
960 const CBioSource& src)
961 {
962 // Adjust severity
963 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
964 sv = eDiag_Error;
965 }
966
967 if (GenerateGoldenFile()) {
968 m_ErrRepository->AddValidErrItem(sv, et, msg);
969 return;
970 }
971
972 string desc = "BioSource: ";
973 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, src, "", 0);
974 }
975
976
PostErr(EDiagSev sv,EErrType et,const string & msg,const COrg_ref & org)977 void CValidError_imp::PostErr
978 (EDiagSev sv,
979 EErrType et,
980 const string& msg,
981 const COrg_ref& org)
982 {
983 // Adjust severity
984 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
985 sv = eDiag_Error;
986 }
987
988 if (GenerateGoldenFile()) {
989 m_ErrRepository->AddValidErrItem(sv, et, msg);
990 return;
991 }
992
993 string desc = "Org-ref: ";
994 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, org, "", 0);
995 }
996
997
PostErr(EDiagSev sv,EErrType et,const string & msg,const CPubdesc & pd)998 void CValidError_imp::PostErr
999 (EDiagSev sv,
1000 EErrType et,
1001 const string& msg,
1002 const CPubdesc& pd)
1003 {
1004 // Adjust severity
1005 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1006 sv = eDiag_Error;
1007 }
1008
1009 if (GenerateGoldenFile()) {
1010 m_ErrRepository->AddValidErrItem(sv, et, msg);
1011 return;
1012 }
1013
1014 string desc = "Pubdesc: ";
1015 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, pd, "", 0);
1016 }
1017
1018
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_submit & ss)1019 void CValidError_imp::PostErr
1020 (EDiagSev sv,
1021 EErrType et,
1022 const string& msg,
1023 const CSeq_submit& ss)
1024 {
1025 // Adjust severity
1026 if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1027 sv = eDiag_Error;
1028 }
1029
1030 if (GenerateGoldenFile()) {
1031 m_ErrRepository->AddValidErrItem(sv, et, msg);
1032 return;
1033 }
1034
1035 string desc = "Seq-submit: ";
1036 m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ss, "", 0);
1037 }
1038
1039
PostObjErr(EDiagSev sv,EErrType et,const string & msg,const CSerialObject & obj,const CSeq_entry * ctx)1040 void CValidError_imp::PostObjErr
1041 (EDiagSev sv,
1042 EErrType et,
1043 const string& msg,
1044 const CSerialObject& obj,
1045 const CSeq_entry *ctx)
1046 {
1047 if (ctx == 0) {
1048 PostErr (sv, et, msg, obj);
1049 } else if (obj.GetThisTypeInfo() == CSeqdesc::GetTypeInfo()) {
1050 PostErr(sv, et, msg, *ctx, *(dynamic_cast <const CSeqdesc*> (&obj)));
1051 } else {
1052 PostErr(sv, et, msg, obj);
1053 }
1054
1055 }
1056
1057
PostBadDateError(EDiagSev sv,const string & msg,int flags,const CSerialObject & obj,const CSeq_entry * ctx)1058 void CValidError_imp::PostBadDateError
1059 (EDiagSev sv,
1060 const string& msg,
1061 int flags,
1062 const CSerialObject& obj,
1063 const CSeq_entry *ctx)
1064 {
1065 string reasons = GetDateErrorDescription(flags);
1066
1067 NStr::TruncateSpacesInPlace (reasons);
1068 reasons = msg + " - " + reasons;
1069
1070 PostObjErr (sv, eErr_GENERIC_BadDate, reasons, obj, ctx);
1071 }
1072
1073
Validate(const CSeq_entry & se,const CCit_sub * cs,CScope * scope)1074 bool CValidError_imp::Validate
1075 (const CSeq_entry& se,
1076 const CCit_sub* cs,
1077 CScope* scope)
1078 {
1079 CSeq_entry_Handle seh;
1080 try {
1081 seh = scope->GetSeq_entryHandle(se);
1082 } catch (const CException ) { ; }
1083 if (! seh) {
1084 seh = scope->AddTopLevelSeqEntry(se);
1085 if (!seh) {
1086 return false;
1087 }
1088 }
1089
1090 return Validate(seh, cs);
1091 }
1092
s_IsPhage(const COrg_ref & org)1093 static bool s_IsPhage(const COrg_ref& org)
1094 {
1095 if (org.IsSetDivision() && NStr::Equal(org.GetDivision(), "PHG")) {
1096 return true;
1097 } else {
1098 return false;
1099 }
1100 }
1101
1102
ValidateMultipleTaxIds(const CSeq_entry_Handle & seh)1103 void CValidError_imp::ValidateMultipleTaxIds(const CSeq_entry_Handle& seh)
1104 {
1105 bool has_mult = false;
1106 int first_id = 0;
1107 int phage_id = 0;
1108
1109 for (CBioseq_CI bi(seh); bi; ++bi) {
1110 for (CSeqdesc_CI desc_ci(*bi, CSeqdesc::e_Source);
1111 desc_ci && !has_mult;
1112 ++desc_ci) {
1113 if (desc_ci->GetSource().IsSetOrg()) {
1114 const COrg_ref& org = desc_ci->GetSource().GetOrg();
1115 if (org.IsSetDb()) {
1116 ITERATE(COrg_ref::TDb, it, org.GetDb()) {
1117 if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "taxon") &&
1118 (*it)->IsSetTag() && (*it)->GetTag().IsId()) {
1119 int this_id = (*it)->GetTag().GetId();
1120 if (this_id > 0) {
1121 if (s_IsPhage(org)) {
1122 phage_id = this_id;
1123 } else if (first_id == 0) {
1124 first_id = this_id;
1125 } else if (first_id != this_id) {
1126 has_mult = true;
1127 }
1128 }
1129 }
1130 }
1131 }
1132 }
1133 }
1134 }
1135 if (has_mult || (phage_id > 0 && first_id > 0)) {
1136 PostErr(has_mult ? eDiag_Error : eDiag_Warning, eErr_SEQ_DESCR_MultipleTaxonIDs,
1137 "There are multiple taxonIDs in this RefSeq record.",
1138 *m_TSE);
1139 }
1140 }
1141
1142
Validate(const CSeq_entry_Handle & seh,const CCit_sub * cs)1143 bool CValidError_imp::Validate
1144 (const CSeq_entry_Handle& seh,
1145 const CCit_sub* cs)
1146 {
1147 _ASSERT(seh);
1148
1149 if ( m_PrgCallback ) {
1150 m_PrgInfo.m_State = CValidator::CProgressInfo::eState_Initializing;
1151 if ( m_PrgCallback(&m_PrgInfo) ) {
1152 return false;
1153 }
1154 }
1155
1156 // Check that CSeq_entry has data
1157 if (seh.Which() == CSeq_entry::e_not_set) {
1158 ERR_POST_X(2, Warning << "Seq_entry not set");
1159 return false;
1160 }
1161
1162 Setup(seh);
1163
1164 // Seq-submit has submission citationTest_Descr_LatLonValue
1165 if (cs) {
1166 m_NoPubs = false;
1167 m_IsSeqSubmit = true;
1168 }
1169
1170 // Get first CBioseq object pointer for PostErr below.
1171 CTypeConstIterator<CBioseq> seq(ConstBegin(*m_TSE));
1172 if (!seq) {
1173 PostErr(eDiag_Error, eErr_SEQ_PKG_NoBioseqFound,
1174 "No Bioseqs in this entire record.", seh.GetCompleteSeq_entry()->GetSet());
1175 return true;
1176 }
1177
1178 // If m_NonASCII is true, then this flag was set by the caller
1179 // of validate to indicate that a non ascii character had been
1180 // read from a file being used to create a CSeq_entry, that the
1181 // error had been corrected, but that the error needs to be reported
1182 // by Validate. Note, Validate is not doing anything other than
1183 // reporting an error if m_NonASCII is true;
1184 if (m_NonASCII) {
1185 PostErr(eDiag_Fatal, eErr_GENERIC_NonAsciiAsn,
1186 "Non-ascii chars in input ASN.1 strings", *seq);
1187 // Only report the error once
1188 m_NonASCII = false;
1189 }
1190
1191 // Iterate thru components of record and validate each
1192
1193 // also want to know if we have gi
1194 bool has_gi = false;
1195 // also want to know if there are any nucleotide sequences
1196 bool has_nucleotide_sequence = false;
1197
1198 for (CBioseq_CI bi(GetTSEH(), CSeq_inst::eMol_not_set, CBioseq_CI::eLevel_All);
1199 bi && (!m_IsINSDInSep || !has_gi || !has_nucleotide_sequence);
1200 ++bi) {
1201 FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1202 if ((*it)->IsGi()) {
1203 has_gi = true;
1204 }
1205 }
1206 if (bi->IsSetInst_Mol() && bi->IsNa()) {
1207 has_nucleotide_sequence = true;
1208 }
1209 }
1210
1211 if (m_IsINSDInSep && m_IsRefSeq) {
1212 // NOTE: We use m_IsRefSeq to indicate the actual presence of RefSeq IDs in
1213 // the record, rather than IsRefSeq(), which indicates *either* RefSeq IDs are
1214 // present *OR* the refseq flag has been used
1215 PostErr (eDiag_Error, eErr_SEQ_PKG_INSDRefSeqPackaging,
1216 "INSD and RefSeq records should not be present in the same set", *m_TSE);
1217 }
1218
1219 #if 0
1220 // disabled for now
1221 // look for long IDs that would collide if truncated at 30 characters
1222 vector<string> id_strings;
1223 for (CBioseq_CI bi(GetTSEH(), CSeq_inst::eMol_not_set, CBioseq_CI::eLevel_All);
1224 bi;
1225 ++bi) {
1226 FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1227 if (!IsNCBIFILESeqId(**it)) {
1228 string label;
1229 (*it)->GetLabel(&label);
1230 id_strings.push_back(label);
1231 }
1232 }
1233 }
1234 stable_sort (id_strings.begin(), id_strings.end());
1235 for (vector<string>::iterator id_str_it = id_strings.begin();
1236 id_str_it != id_strings.end();
1237 ++id_str_it) {
1238 string pattern = (*id_str_it).substr(0, 30);
1239 string first_id = *id_str_it;
1240 vector<string>::iterator cmp_it = id_str_it;
1241 ++cmp_it;
1242 while (cmp_it != id_strings.end() && NStr::StartsWith(*cmp_it, pattern)) {
1243 CRef<CSeq_id> id(new CSeq_id(*cmp_it));
1244 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(*id);
1245 PostErr (eDiag_Warning, eErr_SEQ_INST_BadSeqIdFormat,
1246 "First 30 characters of " + first_id + " and " +
1247 *cmp_it + " are identical", *(bsh.GetCompleteBioseq()));
1248 ++id_str_it;
1249 ++cmp_it;
1250 }
1251 }
1252 #endif
1253
1254 // look for colliding feature IDs
1255 vector < int > feature_ids;
1256 for (CFeat_CI fi(GetTSEH()); fi; ++fi) {
1257 const CSeq_feat& sf = fi->GetOriginalFeature();
1258 if (sf.IsSetId() && sf.GetId().IsLocal() && sf.GetId().GetLocal().IsId()) {
1259 feature_ids.push_back(sf.GetId().GetLocal().GetId());
1260 }
1261 }
1262
1263 if (feature_ids.size() > 0) {
1264 const CTSE_Handle& tse = seh.GetTSE_Handle ();
1265 stable_sort (feature_ids.begin(), feature_ids.end());
1266 vector <int>::iterator it = feature_ids.begin();
1267 int id = *it;
1268 ++it;
1269 while (it != feature_ids.end()) {
1270 if (*it == id) {
1271 vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, id);
1272 ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1273 PostErr (eDiag_Critical, eErr_SEQ_FEAT_CollidingFeatureIDs,
1274 "Colliding feature ID " + NStr::NumericToString (id), *(feat_it->GetSeq_feat()));
1275 }
1276 while (it != feature_ids.end() && *it == id) {
1277 ++it;
1278 }
1279 if (it != feature_ids.end()) {
1280 id = *it;
1281 ++it;
1282 }
1283 } else {
1284 id = *it;
1285 ++it;
1286 }
1287 }
1288 }
1289
1290 // look for mixed gps and non-gps sets
1291 bool has_nongps = false;
1292 bool has_gps = false;
1293
1294 for (CTypeConstIterator<CBioseq_set> si(*m_TSE); si && (!has_nongps || !has_gps); ++si) {
1295 if (si->IsSetClass()) {
1296 if (si->GetClass() == CBioseq_set::eClass_mut_set
1297 || si->GetClass() == CBioseq_set::eClass_pop_set
1298 || si->GetClass() == CBioseq_set::eClass_phy_set
1299 || si->GetClass() == CBioseq_set::eClass_eco_set
1300 || si->GetClass() == CBioseq_set::eClass_wgs_set
1301 || si->GetClass() == CBioseq_set::eClass_small_genome_set) {
1302 has_nongps = true;
1303 } else if (si->GetClass() == CBioseq_set::eClass_gen_prod_set) {
1304 has_gps = true;
1305 }
1306 }
1307 }
1308
1309 if (has_nongps && has_gps) {
1310 PostErr(eDiag_Error, eErr_SEQ_PKG_GPSnonGPSPackaging,
1311 "Genomic product set and mut/pop/phy/eco set records should not be present in the same set",
1312 *m_TSE);
1313 }
1314
1315 // count inference accessions - if there are too many, temporarily disable inference checking
1316 bool old_inference_acc_check = m_ValidateInferenceAccessions;
1317 if (m_ValidateInferenceAccessions) {
1318 size_t num_inferences = 0, num_accessions = 0;
1319 CFeat_CI feat_inf(seh);
1320 while (feat_inf) {
1321 FOR_EACH_GBQUAL_ON_FEATURE (qual, *feat_inf) {
1322 if ((*qual)->IsSetQual() && (*qual)->IsSetVal() && NStr::Equal((*qual)->GetQual(), "inference")) {
1323 num_inferences++;
1324 string prefix, remainder;
1325 bool same_species;
1326 vector<string> accessions = CValidError_feat::GetAccessionsFromInferenceString ((*qual)->GetVal(), prefix, remainder, same_species);
1327 for (size_t i = 0; i < accessions.size(); i++) {
1328 NStr::TruncateSpacesInPlace (accessions[i]);
1329 string acc_prefix, accession;
1330 if (CValidError_feat::GetPrefixAndAccessionFromInferenceAccession (remainder, acc_prefix, accession)) {
1331 if (NStr::EqualNocase (acc_prefix, "INSD") || NStr::EqualNocase (acc_prefix, "RefSeq")) {
1332 num_accessions++;
1333 }
1334 }
1335 }
1336 }
1337 }
1338 ++feat_inf;
1339 }
1340 if (/* num_inferences > 1000 || */ num_accessions > 1000) {
1341 // warn about too many inferences
1342 PostErr (eDiag_Info, eErr_SEQ_FEAT_TooManyInferenceAccessions,
1343 "Skipping validation of " + NStr::SizetToString (num_inferences) + " /inference qualifiers with "
1344 + NStr::SizetToString (num_accessions) + " accessions",
1345 *m_TSE);
1346
1347 // disable inference checking
1348 m_ValidateInferenceAccessions = false;
1349 }
1350 }
1351
1352 // validate the main data
1353 if (seh.IsSeq()) {
1354 const CBioseq& seq = seh.GetCompleteSeq_entry()->GetSeq();
1355 CValidError_bioseq bioseq_validator(*this);
1356 try {
1357 bioseq_validator.ValidateBioseq(seq);
1358 } catch ( const exception& e ) {
1359 PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
1360 string("Exception while validating bioseq. EXCEPTION: ") +
1361 e.what(), seq);
1362 return true;
1363 }
1364 } else if (seh.IsSet()) {
1365 const CBioseq_set& set = seh.GetCompleteSeq_entry()->GetSet();
1366 CValidError_bioseqset bioseqset_validator(*this);
1367 try {
1368 bioseqset_validator.ValidateBioseqSet(set);
1369 } catch ( const exception& e ) {
1370 PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
1371 string("Exception while validating bioseq set. EXCEPTION: ") +
1372 e.what(), set);
1373 return true;
1374 }
1375 }
1376
1377 // put flag for validating inference accessions back to original value
1378 m_ValidateInferenceAccessions = old_inference_acc_check;
1379
1380 // validation from data collected during previous step
1381
1382 if ( m_NumTpaWithHistory > 0 &&
1383 m_NumTpaWithoutHistory > 0 ) {
1384 PostErr(eDiag_Error, eErr_SEQ_INST_TpaAssemblyProblem,
1385 "There are " +
1386 NStr::SizetToString(m_NumTpaWithHistory) +
1387 " TPAs with history and " +
1388 NStr::SizetToString(m_NumTpaWithoutHistory) +
1389 " without history in this record.", *seq);
1390 }
1391 if ( m_NumTpaWithoutHistory > 0 && has_gi) {
1392 PostErr (eDiag_Warning, eErr_SEQ_INST_TpaAssemblyProblem,
1393 "There are " +
1394 NStr::SizetToString(m_NumTpaWithoutHistory) +
1395 " TPAs without history in this record, but the record has a gi number assignment.", *m_TSE);
1396 }
1397 if (IsIndexerVersion() && DoesAnyProteinHaveGeneralID() && !IsRefSeq() && has_nucleotide_sequence) {
1398 PostErr (eDiag_Info, eErr_SEQ_INST_ProteinsHaveGeneralID,
1399 "INDEXER_ONLY - Protein bioseqs have general seq-id.",
1400 *(seh.GetCompleteSeq_entry()));
1401 }
1402
1403 ReportMissingPubs(*m_TSE, cs);
1404 ReportMissingBiosource(*m_TSE);
1405
1406 if (m_NumMisplacedFeatures > 1) {
1407 PostErr (eDiag_Critical, eErr_SEQ_PKG_FeaturePackagingProblem,
1408 "There are " + NStr::SizetToString (m_NumMisplacedFeatures) + " mispackaged features in this record.",
1409 *(seh.GetCompleteSeq_entry()));
1410 } else if (m_NumMisplacedFeatures == 1) {
1411 PostErr (eDiag_Critical, eErr_SEQ_PKG_FeaturePackagingProblem,
1412 "There is 1 mispackaged feature in this record.",
1413 *(seh.GetCompleteSeq_entry()));
1414 }
1415 if (m_NumSmallGenomeSetMisplaced > 1) {
1416 PostErr (eDiag_Warning, eErr_SEQ_PKG_FeaturePackagingProblem,
1417 "There are " + NStr::SizetToString (m_NumSmallGenomeSetMisplaced) + " mispackaged features in this small genome set record.",
1418 *(seh.GetCompleteSeq_entry()));
1419 } else if (m_NumSmallGenomeSetMisplaced == 1) {
1420 PostErr (eDiag_Warning, eErr_SEQ_PKG_FeaturePackagingProblem,
1421 "There is 1 mispackaged feature in this small genome set record.",
1422 *(seh.GetCompleteSeq_entry()));
1423 }
1424 if ( m_NumGenes == 0 &&
1425 m_NumGeneXrefs > 0 ) {
1426 PostErr(eDiag_Warning, eErr_SEQ_FEAT_OnlyGeneXrefs,
1427 "There are " + NStr::SizetToString(m_NumGeneXrefs) +
1428 " gene xrefs and no gene features in this record.", *m_TSE);
1429 }
1430 ValidateCitations (seh);
1431
1432
1433 if ( m_NumMisplacedGraphs > 0 ) {
1434 string num = NStr::SizetToString(m_NumMisplacedGraphs);
1435 PostErr(eDiag_Critical, eErr_SEQ_PKG_GraphPackagingProblem,
1436 string("There ") + ((m_NumMisplacedGraphs > 1) ? "are " : "is ") + num +
1437 " mispackaged graph" + ((m_NumMisplacedGraphs > 1) ? "s" : "") + " in this record.",
1438 *m_TSE);
1439 }
1440
1441 if ( IsRefSeq() && ! IsWP() ) {
1442 ValidateMultipleTaxIds(seh);
1443 }
1444
1445
1446 FindEmbeddedScript(*(seh.GetCompleteSeq_entry()));
1447 FindNonAsciiText(*(seh.GetCompleteSeq_entry()));
1448 FindCollidingSerialNumbers(*(seh.GetCompleteSeq_entry()));
1449
1450 if (m_FarFetchFailure) {
1451 PostErr(eDiag_Warning, eErr_SEQ_INST_FarFetchFailure,
1452 "Far fetch failures caused some validator tests to be bypassed",
1453 *m_TSE);
1454 }
1455
1456 if (m_DoTaxLookup) {
1457 ValidateTaxonomy(*(seh.GetCompleteSeq_entry()));
1458 }
1459
1460 // validate cit-sub
1461 if (cs) {
1462 ValidateCitSub (*cs, *(seh.GetCompleteSeq_entry()), seh.GetCompleteSeq_entry());
1463 }
1464
1465 // optional barcode tests
1466 if (m_DoBarcodeTests) {
1467 x_DoBarcodeTests(seh);
1468 }
1469 return true;
1470 }
1471
1472
ValidateSubmitBlock(const CSubmit_block & block,const CSeq_submit & ss)1473 void CValidError_imp::ValidateSubmitBlock(const CSubmit_block& block, const CSeq_submit& ss)
1474 {
1475 if (block.IsSetHup() && block.GetHup() && block.IsSetReldate() &&
1476 IsDateInPast(block.GetReldate())) {
1477 PostErr(eDiag_Warning, eErr_GENERIC_PastReleaseDate,
1478 "Record release date has already passed", ss);
1479 }
1480 }
1481
1482
Validate(const CSeq_submit & ss,CScope * scope)1483 void CValidError_imp::Validate(
1484 const CSeq_submit& ss, CScope* scope)
1485 {
1486 // Check that ss is type e_Entrys
1487 if ( ss.GetData().Which() != CSeq_submit::C_Data::e_Entrys ) {
1488 return;
1489 }
1490
1491 m_IsSeqSubmit = true;
1492 ValidateSubmitBlock(ss.GetSub(), ss);
1493
1494 // Get CCit_sub pointer
1495 const CCit_sub* cs = &ss.GetSub().GetCit();
1496
1497 if (ss.IsSetSub() && ss.GetSub().IsSetTool() && NStr::StartsWith(ss.GetSub().GetTool(), "Geneious")) {
1498 m_IsGeneious = true;
1499 }
1500
1501 // Just loop thru CSeq_entrys
1502 FOR_EACH_SEQENTRY_ON_SEQSUBMIT (se_itr, ss) {
1503 const CSeq_entry& se = **se_itr;
1504 if(se.IsSet())
1505 {
1506 const CBioseq_set &set = se.GetSet();
1507 if(set.IsSetClass() &&
1508 set.GetClass() == CBioseq_set::eClass_wgs_set)
1509 {
1510 CSeq_entry_Handle seh;
1511 seh = scope->GetSeq_entryHandle(se);
1512 Setup(seh);
1513 PostErr(eDiag_Warning, eErr_SEQ_PKG_SeqSubmitWithWgsSet,
1514 "File was created as a wgs-set, but should be a batch submission instead.",
1515 seh.GetCompleteSeq_entry()->GetSet());
1516 }
1517 }
1518 Validate (se, cs, scope);
1519 }
1520 }
1521
1522
Validate(const CSeq_annot_Handle & sah)1523 void CValidError_imp::Validate(
1524 const CSeq_annot_Handle& sah)
1525 {
1526 Setup(sah);
1527
1528 // Iterate thru components of record and validate each
1529
1530 CValidError_annot annot_validator(*this);
1531 annot_validator.ValidateSeqAnnot(sah);
1532
1533 switch (sah.Which()) {
1534 case CSeq_annot::TData::e_Ftable :
1535 {
1536 CValidError_feat feat_validator(*this);
1537 for (CFeat_CI fi (sah); fi; ++fi) {
1538 const CSeq_feat& sf = fi->GetOriginalFeature();
1539 feat_validator.ValidateSeqFeat(sf);
1540 }
1541 }
1542 break;
1543
1544 case CSeq_annot::TData::e_Align :
1545 {
1546 if (IsValidateAlignments()) {
1547 CValidError_align align_validator(*this);
1548 int order = 1;
1549 for (CAlign_CI ai(sah); ai; ++ai) {
1550 const CSeq_align& sa = ai.GetOriginalSeq_align();
1551 align_validator.ValidateSeqAlign(sa, order++);
1552 }
1553 }
1554 }
1555 break;
1556
1557 case CSeq_annot::TData::e_Graph :
1558 {
1559 CValidError_graph graph_validator(*this);
1560 // for (CTypeConstIterator <CSeq_graph> gi (sa); gi; ++gi) {
1561 for (CGraph_CI gi(sah); gi; ++gi) {
1562 const CSeq_graph& sg = gi->GetOriginalGraph();
1563 graph_validator.ValidateSeqGraph(sg);
1564 }
1565 }
1566 break;
1567 default:
1568 break;
1569 }
1570 FindEmbeddedScript(*(sah.GetCompleteSeq_annot()));
1571 FindNonAsciiText(*(sah.GetCompleteSeq_annot()));
1572 FindCollidingSerialNumbers(*(sah.GetCompleteSeq_annot()));
1573 }
1574
1575
Validate(const CSeq_feat & feat,CScope * scope)1576 void CValidError_imp::Validate(const CSeq_feat& feat, CScope* scope)
1577 {
1578 // automatically restores m_Scope to its old value when we leave
1579 // the function
1580 CScopeRestorer scopeRestorer( m_Scope );
1581
1582 if( scope != NULL ) {
1583 m_Scope.Reset(scope);
1584 }
1585 if (!m_Scope) {
1586 // set up a temporary local scope if there is no scope set already
1587 m_Scope.Reset(new CScope(*m_ObjMgr));
1588 }
1589
1590 CValidError_feat feat_validator(*this);
1591 feat_validator.SetScope(*m_Scope);
1592 CSeq_entry_Handle empty;
1593 feat_validator.SetTSE(empty);
1594 feat_validator.ValidateSeqFeat(feat);
1595 if (feat.IsSetData() && feat.GetData().IsBiosrc()) {
1596 const CBioSource& src = feat.GetData().GetBiosrc();
1597 if (src.IsSetOrg()) {
1598 ValidateTaxonomy (src.GetOrg(), src.IsSetGenome() ? src.GetGenome() : CBioSource::eGenome_unknown);
1599 }
1600 }
1601 FindEmbeddedScript(feat);
1602 FindNonAsciiText(feat);
1603 FindCollidingSerialNumbers(feat);
1604 }
1605
1606
Validate(const CBioSource & src,CScope * scope)1607 void CValidError_imp::Validate(const CBioSource& src, CScope* scope)
1608 {
1609 // automatically restores m_Scope to its old value when we leave
1610 // the function
1611 CScopeRestorer scopeRestorer( m_Scope );
1612
1613 if( scope != NULL ) {
1614 m_Scope.Reset(scope);
1615 }
1616 if (!m_Scope) {
1617 // set up a temporary local scope if there is no scope set already
1618 m_Scope.Reset(new CScope(*m_ObjMgr));
1619 }
1620
1621 ValidateBioSource(src, src);
1622 if (src.IsSetOrg()) {
1623 ValidateTaxonomy (src.GetOrg(), src.IsSetGenome() ? src.GetGenome() : CBioSource::eGenome_unknown);
1624 }
1625 FindEmbeddedScript(src);
1626 FindNonAsciiText(src);
1627 FindCollidingSerialNumbers(src);
1628 }
1629
1630
Validate(const CPubdesc & pubdesc,CScope * scope)1631 void CValidError_imp::Validate(const CPubdesc& pubdesc, CScope* scope)
1632 {
1633 // automatically restores m_Scope to its old value when we leave
1634 // the function
1635 CScopeRestorer scopeRestorer( m_Scope );
1636
1637 if( scope != NULL ) {
1638 m_Scope.Reset(scope);
1639 }
1640 if (!m_Scope) {
1641 // set up a temporary local scope if there is no scope set already
1642 m_Scope.Reset(new CScope(*m_ObjMgr));
1643 }
1644
1645 ValidatePubdesc(pubdesc, pubdesc);
1646 FindEmbeddedScript(pubdesc);
1647 FindNonAsciiText(pubdesc);
1648 FindCollidingSerialNumbers(pubdesc);
1649 }
1650
// Validate a single Seqdesc in the context of the Seq-entry it belongs to.
//
//   desc - descriptor to validate
//   ctx  - Seq-entry providing the context; it is added as a top-level
//          entry to a newly created scope
// Unlike the other stand-alone Validate overloads in this file, this one
// does not use CScopeRestorer: m_Scope still refers to the new scope after
// the call returns.
void CValidError_imp::Validate(const CSeqdesc& desc, const CSeq_entry& ctx)
{
    // NOTE(review): the validator is constructed before m_Scope is replaced
    // below - confirm whether it captures the scope at construction time.
    CValidError_desc seqdesc_validator(*this);
    m_Scope.Reset(new CScope(*m_ObjMgr));
    m_Scope->AddTopLevelSeqEntry(ctx);
    seqdesc_validator.ValidateSeqDesc(desc,ctx);
}
1658
1659
SetProgressCallback(CValidator::TProgressCallback callback,void * user_data)1660 void CValidError_imp::SetProgressCallback
1661 (CValidator::TProgressCallback callback,
1662 void* user_data)
1663 {
1664 m_PrgCallback = callback;
1665 m_PrgInfo.m_UserData = user_data;
1666 }
1667
1668
ValidateDbxref(const CDbtag & xref,const CSerialObject & obj,bool biosource,const CSeq_entry * ctx)1669 void CValidError_imp::ValidateDbxref
1670 (const CDbtag& xref,
1671 const CSerialObject& obj,
1672 bool biosource,
1673 const CSeq_entry *ctx)
1674 {
1675 bool refseq_or_gps = IsRefSeq() || IsGPS();
1676 CValidator::TDbxrefValidFlags flags = CValidator::IsValidDbxref(xref, biosource,
1677 refseq_or_gps);
1678
1679 const string& db = xref.IsSetDb() ? xref.GetDb() : kEmptyStr;
1680
1681 if (flags & CValidator::eTagHasSgml) {
1682 PostObjErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
1683 "dbxref value " + xref.GetTag().GetStr() + " has SGML",
1684 obj, ctx);
1685 }
1686 if (flags & CValidator::eContainsSpace) {
1687 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1688 "dbxref value " + xref.GetTag().GetStr() + " contains space character",
1689 obj, ctx);
1690 }
1691 if (flags & CValidator::eDbHasSgml) {
1692 PostObjErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
1693 "dbxref database " + db + " has SGML",
1694 obj, ctx);
1695 }
1696
1697 string dbv;
1698 if (xref.IsSetTag() && xref.GetTag().IsStr()) {
1699 dbv = xref.GetTag().GetStr();
1700 } else if (xref.IsSetTag() && xref.GetTag().IsId()) {
1701 dbv = NStr::NumericToString(xref.GetTag().GetId());
1702 }
1703
1704 if (flags & CValidator::eUnrecognized) {
1705 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1706 "Illegal db_xref type " + db + " (" + dbv + ")", obj, ctx);
1707 }
1708 if (flags & CValidator::eBadCapitalization) {
1709 // capitalization is bad
1710 bool refseq_db = false, src_db = false;
1711 string correct_caps;
1712 xref.GetDBFlags(refseq_db, src_db, correct_caps);
1713 string message = "Illegal db_xref type " + db + " (" + dbv + "), legal capitalization is " + correct_caps;
1714 if (flags & CValidator::eNotForSource) {
1715 message += ", but should not be used on an OrgRef";
1716 } else if (flags & CValidator::eOnlyForSource) {
1717 message += ", but should only be used on an OrgRef";
1718 }
1719
1720 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref, message, obj, ctx);
1721 } else {
1722 if (flags & CValidator::eOnlyForRefSeq) {
1723 if (flags & CValidator::eNotForSource) {
1724 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1725 "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on a non-RefSeq OrgRef",
1726 obj, ctx);
1727 } else {
1728 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1729 "db_xref type " + db + " (" + dbv + ") is only legal for RefSeq",
1730 obj, ctx);
1731 }
1732 } else if (flags & CValidator::eNotForSource) {
1733 if (flags & CValidator::eRefSeqNotForSource) {
1734 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1735 "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
1736 obj, ctx);
1737 } else {
1738 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1739 "db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
1740 obj, ctx);
1741 }
1742 } else if (flags & CValidator::eOnlyForSource) {
1743 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1744 "db_xref type " + db + " (" + dbv + ") should only be used on an OrgRef",
1745 obj, ctx);
1746 }
1747 }
1748
1749 }
1750
1751
ValidateDbxref(TDbtags & xref_list,const CSerialObject & obj,bool biosource,const CSeq_entry * ctx)1752 void CValidError_imp::ValidateDbxref
1753 (TDbtags& xref_list,
1754 const CSerialObject& obj,
1755 bool biosource,
1756 const CSeq_entry *ctx)
1757 {
1758 string last_db;
1759
1760 ITERATE( TDbtags, xref, xref_list) {
1761 if (biosource
1762 && (*xref)->IsSetDb()) {
1763 if (!NStr::IsBlank(last_db)
1764 && NStr::EqualNocase((*xref)->GetDb(), last_db)) {
1765 PostObjErr (eDiag_Warning, eErr_SEQ_DESCR_BioSourceDbTagConflict,
1766 "BioSource uses db " + last_db + " multiple times",
1767 obj, ctx);
1768 }
1769 last_db = (*xref)->GetDb();
1770 }
1771 ValidateDbxref(**xref, obj, biosource, ctx);
1772 }
1773 }
1774
1775
x_CheckPackedInt(const CPacked_seqint & packed_int,SLocCheck & lc,const CSerialObject & obj)1776 void CValidError_imp::x_CheckPackedInt
1777 (const CPacked_seqint& packed_int,
1778 SLocCheck& lc,
1779 const CSerialObject& obj)
1780 {
1781 ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
1782 lc.int_cur = (*it);
1783 lc.chk &= x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur, obj);
1784
1785 x_CheckForStrandChange(lc);
1786
1787 lc.id_prv = lc.id_cur;
1788 lc.strand_prv = lc.strand_cur;
1789 lc.int_prv = lc.int_cur;
1790 }
1791 }
1792
1793
x_CheckSeqInt(CConstRef<CSeq_id> & id_cur,const CSeq_interval * int_cur,ENa_strand & strand_cur,const CSerialObject & obj)1794 bool CValidError_imp::x_CheckSeqInt
1795 (CConstRef<CSeq_id>& id_cur,
1796 const CSeq_interval * int_cur,
1797 ENa_strand& strand_cur,
1798 const CSerialObject& obj)
1799 {
1800 strand_cur = int_cur->IsSetStrand() ?
1801 int_cur->GetStrand() : eNa_strand_unknown;
1802 id_cur = &int_cur->GetId();
1803 bool chk = IsValid(*int_cur, m_Scope);
1804 return chk;
1805 }
1806
1807
x_ReportInvalidFuzz(const CPacked_seqint & packed_int,const CSerialObject & obj)1808 void CValidError_imp::x_ReportInvalidFuzz(const CPacked_seqint& packed_int, const CSerialObject& obj)
1809 {
1810 ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
1811 x_ReportInvalidFuzz(**it, obj);
1812 }
1813 }
1814
1815
// Messages posted with eErr_SEQ_FEAT_InvalidFuzz by x_ReportInvalidFuzz
// for fuzz at the ends of a non-circular sequence.
static const string kSpaceLeftFirst = "Should not specify 'space to left' at first position of non-circular sequence";
static const string kSpaceRightLast = "Should not specify 'space to right' at last position of non-circular sequence";

// Messages posted with eErr_SEQ_FEAT_InvalidFuzz by x_ReportInvalidFuzz
// for 'circle' fuzz away from the ends of a circular sequence.
static const string kSpaceLeftCircle = "Should not specify 'circle to left' except at first position of circular sequence";
static const string kSpaceRightCircle = "Should not specify 'circle to right' except at last position of circular sequence";
1821
x_ReportInvalidFuzz(const CSeq_interval & interval,const CSerialObject & obj)1822 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_interval& interval, const CSerialObject& obj)
1823 {
1824 CInt_fuzz::ELim fuzz_from = CInt_fuzz::eLim_unk;
1825 CInt_fuzz::ELim fuzz_to = CInt_fuzz::eLim_unk;
1826 bool has_fuzz_from = false;
1827 bool has_fuzz_to = false;
1828
1829 if (interval.IsSetFuzz_from() && interval.GetFuzz_from().IsLim()) {
1830 fuzz_from = interval.GetFuzz_from().GetLim();
1831 has_fuzz_from = true;
1832 }
1833 if (interval.IsSetFuzz_to() && interval.GetFuzz_to().IsLim()) {
1834 fuzz_to = interval.GetFuzz_to().GetLim();
1835 has_fuzz_to = true;
1836 }
1837 if (! has_fuzz_from && ! has_fuzz_to) {
1838 return;
1839 }
1840
1841 // check for invalid fuzz on both ends of Interval
1842 if (has_fuzz_from && has_fuzz_to && fuzz_from == fuzz_to) {
1843 if (fuzz_from == CInt_fuzz::eLim_tl) {
1844 PostErr(eDiag_Error,
1845 eErr_SEQ_FEAT_InvalidFuzz,
1846 "Should not specify 'space to left' for both ends of interval", obj);
1847 }
1848 else if (fuzz_from == CInt_fuzz::eLim_tr) {
1849 PostErr(eDiag_Error,
1850 eErr_SEQ_FEAT_InvalidFuzz,
1851 "Should not specify 'space to right' for both ends of interval", obj);
1852 }
1853 else if (fuzz_from == CInt_fuzz::eLim_circle) {
1854 PostErr(eDiag_Error,
1855 eErr_SEQ_FEAT_InvalidFuzz,
1856 "Should not specify 'origin of circle' for both ends of interval", obj);
1857 }
1858 }
1859
1860 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(interval.GetId());
1861 if (! bsh) {
1862 return;
1863 }
1864
1865 CSeq_inst::ETopology top = CSeq_inst::eTopology_not_set;
1866 if (bsh.IsSetInst_Topology()) {
1867 top = bsh.GetInst_Topology();
1868 }
1869
1870 if (top != CSeq_inst::eTopology_circular) {
1871
1872 // VR-15
1873 // look for space to left at beginning of sequence or space to right at end
1874 if (fuzz_from == CInt_fuzz::eLim_tl && interval.IsSetFrom() && interval.GetFrom() == 0) {
1875 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceLeftFirst, obj);
1876 }
1877 if (fuzz_to == CInt_fuzz::eLim_tr && interval.IsSetTo() && interval.GetTo() == bsh.GetBioseqLength() - 1) {
1878 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceRightLast, obj);
1879 }
1880
1881 } else if (fuzz_from == CInt_fuzz::eLim_circle || fuzz_to == CInt_fuzz::eLim_circle) {
1882
1883 if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
1884 const CSeq_feat* sfp = dynamic_cast<const CSeq_feat*>(&obj);
1885 if (sfp && sfp->IsSetExcept() && sfp->CanGetExcept_text() && NStr::FindNoCase(sfp->GetExcept_text(), "ribosomal slippage") != NPOS) {
1886 return;
1887 }
1888 }
1889
1890 // VR-832
1891 if (fuzz_from == CInt_fuzz::eLim_circle && interval.IsSetFrom() && interval.GetFrom() != 0) {
1892 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceLeftCircle, obj);
1893 }
1894 if (fuzz_to == CInt_fuzz::eLim_circle && interval.IsSetTo() && interval.GetTo() != bsh.GetBioseqLength() - 1) {
1895 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceRightCircle, obj);
1896 }
1897 }
1898 }
1899
1900
x_ReportInvalidFuzz(const CSeq_point & point,const CSerialObject & obj)1901 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_point& point, const CSerialObject& obj)
1902 {
1903 // VR-15
1904 if (!point.IsSetFuzz() || !point.GetFuzz().IsLim() ||
1905 (point.GetFuzz().GetLim() != CInt_fuzz::eLim_tl && point.GetFuzz().GetLim() != CInt_fuzz::eLim_tr) ||
1906 !point.IsSetId() || !point.IsSetPoint()) {
1907 return;
1908 }
1909 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(point.GetId());
1910 if (!bsh) {
1911 return;
1912 }
1913 if (bsh.IsSetInst_Topology() && bsh.GetInst_Topology() == CSeq_inst::eTopology_circular) {
1914 return;
1915 }
1916 if (point.GetPoint() == 0 && point.GetFuzz().GetLim() == CInt_fuzz::eLim_tl) {
1917 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceLeftFirst, obj);
1918 }
1919 if (point.GetPoint() == bsh.GetBioseqLength() - 1) {
1920 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceRightLast, obj);
1921 }
1922 }
1923
1924
x_ReportInvalidFuzz(const CSeq_loc & loc,const CSerialObject & obj)1925 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_loc& loc, const CSerialObject& obj)
1926 {
1927 CTypeConstIterator<CSeq_loc> lit = ConstBegin(loc);
1928 for (; lit; ++lit) {
1929 CSeq_loc::E_Choice loc_choice = lit->Which();
1930 switch (loc_choice) {
1931 case CSeq_loc::e_Int:
1932 x_ReportInvalidFuzz(lit->GetInt(), obj);
1933 break;
1934 case CSeq_loc::e_Packed_int:
1935 x_ReportInvalidFuzz(lit->GetPacked_int(), obj);
1936 break;
1937 case CSeq_loc::e_Pnt:
1938 x_ReportInvalidFuzz(lit->GetPnt(), obj);
1939 break;
1940 default:
1941 break;
1942 }
1943 }
1944 }
1945
1946
s_CountMix(const CSeq_loc & loc)1947 unsigned int s_CountMix(const CSeq_loc& loc)
1948 {
1949 unsigned int num_mix = 0;
1950 CTypeConstIterator<CSeq_loc> lit = ConstBegin(loc);
1951 for (; lit; ++lit) {
1952 if (lit->IsMix()) {
1953 num_mix++;
1954 }
1955 }
1956 return num_mix;
1957 }
1958
1959
x_InitLocCheck(SLocCheck & lc,const string & prefix)1960 void CValidError_imp::x_InitLocCheck(SLocCheck& lc, const string& prefix)
1961 {
1962 lc.chk = true;
1963 lc.unmarked_strand = false;
1964 lc.mixed_strand = false;
1965 lc.has_other = false;
1966 lc.has_not_other = false;
1967 lc.id_cur = 0;
1968 lc.id_prv = 0;
1969 lc.int_cur = 0;
1970 lc.int_prv = 0;
1971 lc.strand_cur = eNa_strand_unknown;
1972 lc.strand_prv = eNa_strand_unknown;
1973 lc.prefix = prefix;
1974 }
1975
x_CheckForStrandChange(SLocCheck & lc)1976 void CValidError_imp::x_CheckForStrandChange(SLocCheck& lc)
1977 {
1978 if (lc.strand_prv != eNa_strand_other &&
1979 lc.strand_cur != eNa_strand_other) {
1980 if (lc.id_cur && lc.id_prv &&
1981 IsSameBioseq(*lc.id_cur, *lc.id_prv, m_Scope)) {
1982 if (lc.strand_prv != lc.strand_cur) {
1983 if ((lc.strand_prv == eNa_strand_plus &&
1984 lc.strand_cur == eNa_strand_unknown) ||
1985 (lc.strand_prv == eNa_strand_unknown &&
1986 lc.strand_cur == eNa_strand_plus)) {
1987 lc.unmarked_strand = true;
1988 } else {
1989 lc.mixed_strand = true;
1990 }
1991 }
1992 }
1993 }
1994 if (lc.strand_cur == eNa_strand_other) {
1995 lc.has_other = true;
1996 } else if (lc.strand_cur == eNa_strand_minus || lc.strand_cur == eNa_strand_plus) {
1997 lc.has_not_other = true;
1998 }
1999
2000 }
2001
// Check one location (recursing into mixes) and update the running state in lc.
//
// For each supported choice the current id / interval / strand and the
// range-check flag lc.chk are refreshed; when lc.chk is false afterwards a
// SEQ_FEAT_Range error is posted.  Unless the location is Null, the strand
// comparison is run and the current strand and id become the "previous" ones.
//
// loc       location to check
// obj       object errors are posted against
// lc        running check state, updated in place
// lowerSev  when true, a failed range check is posted as ERROR instead of
//           CRITICAL; it is a by-value copy, so changes made here do not
//           reach the caller
//
// Any std::exception thrown while checking is caught, reported as
// INTERNAL_Exception, and lc is set to strand 'other' with no current id and
// no previous interval.
void CValidError_imp::x_CheckLoc(const CSeq_loc& loc, const CSerialObject& obj, SLocCheck& lc, bool lowerSev)
{
    try {
        switch (loc.Which()) {
        case CSeq_loc::e_Int:
            lc.int_cur = &loc.GetInt();
            // x_CheckSeqInt receives the current id, interval and strand
            // from lc; its result becomes the range-check flag
            lc.chk = x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur, obj);
            if ((!lc.chk) && lowerSev) {
                TSeqPos length = GetLength(loc.GetInt().GetId(), m_Scope);
                TSeqPos fr = loc.GetInt().GetFrom();
                TSeqPos to = loc.GetInt().GetTo();
                if (fr < length && to >= length) {
                    // RefSeq variation feature with dbSNP xref and interval flanking the length is ERROR
                    // (lowerSev stays true: the interval starts inside the sequence and ends at or past its length)
                } else {
                    // otherwise keep severity at REJECT
                    lowerSev = false;
                }
            }
            break;
        case CSeq_loc::e_Pnt:
            lc.strand_cur = loc.GetPnt().IsSetStrand() ?
                loc.GetPnt().GetStrand() : eNa_strand_unknown;
            lc.id_cur = &loc.GetPnt().GetId();
            lc.chk = IsValid(loc.GetPnt(), m_Scope);
            // a point is not an interval, so there is no previous interval afterwards
            lc.int_prv = 0;
            break;
        case CSeq_loc::e_Packed_pnt:
            lc.strand_cur = loc.GetPacked_pnt().IsSetStrand() ?
                loc.GetPacked_pnt().GetStrand() : eNa_strand_unknown;
            lc.id_cur = &loc.GetPacked_pnt().GetId();
            lc.chk = IsValid(loc.GetPacked_pnt(), m_Scope);
            lc.int_prv = 0;
            break;
        case CSeq_loc::e_Packed_int:
            x_CheckPackedInt(loc.GetPacked_int(), lc, obj);
            break;
        case CSeq_loc::e_Null:
            // nothing to check; lc is left untouched
            break;
        case CSeq_loc::e_Mix:
            // check every element of the mix in order, comparing strands
            // after each one
            for (auto l : loc.GetMix().Get()) {
                x_CheckLoc(*l, obj, lc, lowerSev);
                x_CheckForStrandChange(lc);
            }
            break;
        default:
            // any other choice: strand 'other', no current id, no previous interval
            lc.strand_cur = eNa_strand_other;
            lc.id_cur = 0;
            lc.int_prv = 0;
            break;
        }
        // lc.chk is not reset at the top of this function, so it reflects
        // the most recent check that assigned it
        if (!lc.chk) {
            string lbl = GetValidatorLocationLabel (loc, *m_Scope);
            EDiagSev sev = eDiag_Critical;
            if (lowerSev) {
                sev = eDiag_Error;
            }
            PostErr(sev, eErr_SEQ_FEAT_Range,
                lc.prefix + ": SeqLoc [" + lbl + "] out of range", obj);
        }

        if (loc.Which() != CSeq_loc::e_Null) {
            x_CheckForStrandChange(lc);

            // the current part becomes the previous one for the next call
            lc.strand_prv = lc.strand_cur;
            lc.id_prv = lc.id_cur;
        }
    } catch( const exception& e ) {
        string label = GetValidatorLocationLabel(loc, *m_Scope);
        PostErr(eDiag_Error, eErr_INTERNAL_Exception,
            "Exception caught while validating location " +
            label + ". Exception: " + e.what(), obj);

        // same state as for an unsupported location choice
        lc.strand_cur = eNa_strand_other;
        lc.id_cur = 0;
        lc.int_prv = 0;
    }

}
2080
ValidateSeqLoc(const CSeq_loc & loc,const CBioseq_Handle & seq,bool report_abutting,const string & prefix,const CSerialObject & obj,bool lowerSev)2081 void CValidError_imp::ValidateSeqLoc
2082 (const CSeq_loc& loc,
2083 const CBioseq_Handle& seq,
2084 bool report_abutting,
2085 const string& prefix,
2086 const CSerialObject& obj,
2087 bool lowerSev)
2088 {
2089 SLocCheck lc;
2090
2091 x_InitLocCheck(lc, prefix);
2092
2093 x_CheckLoc(loc, obj, lc, lowerSev);
2094
2095 if (lc.has_other && lc.has_not_other) {
2096 string label = GetValidatorLocationLabel(loc, *m_Scope);
2097 PostErr(IsSmallGenomeSet() ? eDiag_Warning : eDiag_Error, eErr_SEQ_FEAT_MixedStrand,
2098 prefix + ": Inconsistent use of other strand SeqLoc [" + label + "]", obj);
2099 }
2100
2101 x_ReportInvalidFuzz(loc, obj);
2102
2103 if (m_Scope && CValidator::DoesSeqLocContainDuplicateIntervals(loc, *m_Scope)) {
2104 PostErr(eDiag_Error,
2105 eErr_SEQ_FEAT_DuplicateExonInterval,
2106 "Duplicate exons in location", obj);
2107 }
2108
2109 if (s_CountMix(loc) > 1) {
2110 string label;
2111 loc.GetLabel(&label);
2112 PostErr (eDiag_Error, eErr_SEQ_FEAT_NestedSeqLocMix,
2113 prefix + ": SeqLoc [" + label + "] has nested SEQLOC_MIX elements",
2114 obj);
2115 }
2116
2117 // Warn if different parts of a seq-loc refer to the same bioseq using
2118 // differnt id types (i.e. gi and accession)
2119 ValidateSeqLocIds(loc, obj);
2120
2121 bool trans_splice = false;
2122 bool exception = false;
2123 const CSeq_feat* sfp = NULL;
2124 if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2125 sfp = dynamic_cast<const CSeq_feat*>(&obj);
2126 }
2127 if (sfp != 0) {
2128
2129 // primer_bind intervals MAY be in on opposite strands
2130 if ( sfp->GetData().GetSubtype() == CSeqFeatData::eSubtype_primer_bind ) {
2131 lc.mixed_strand = false;
2132 lc.unmarked_strand = false;
2133 }
2134
2135 exception = sfp->IsSetExcept() ? sfp->GetExcept() : false;
2136 if (exception && sfp->CanGetExcept_text()) {
2137 // trans splicing exception turns off both mixed_strand and
2138 // out_of_order messages
2139 if (NStr::FindNoCase(sfp->GetExcept_text(), "trans-splicing") != NPOS) {
2140 trans_splice = true;
2141 }
2142 }
2143 }
2144
2145 string loc_lbl;
2146 if (report_abutting && (!sfp || !CSeqFeatData::AllowAdjacentIntervals(sfp->GetData().GetSubtype())) &&
2147 (m_Scope && CValidator::DoesSeqLocContainAdjacentIntervals(loc, *m_Scope))) {
2148 loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2149
2150 EDiagSev sev = exception ? eDiag_Warning : eDiag_Error;
2151 PostErr(sev, eErr_SEQ_FEAT_AbuttingIntervals,
2152 prefix + ": Adjacent intervals in SeqLoc [" +
2153 loc_lbl + "]", obj);
2154 }
2155
2156 if (trans_splice && !NStr::Equal(prefix, "Product")) {
2157 CSeq_loc_CI li(loc);
2158 ++li;
2159 if (!li) {
2160 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadTranssplicedInterval, "Trans-spliced feature should have multiple intervals", obj);
2161 }
2162 return;
2163 }
2164
2165 bool ordered = true;
2166 bool circular = false;
2167 if ( seq &&
2168 seq.IsSetInst() && seq.GetInst().IsSetTopology() &&
2169 seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular ) {
2170 circular = true;
2171 }
2172 try {
2173 if (m_Scope && (!sfp || CSeqFeatData::RequireLocationIntervalsInBiologicalOrder(sfp->GetData().GetSubtype())) && !circular) {
2174 ordered = CValidator::IsSeqLocCorrectlyOrdered(loc, *m_Scope);
2175 }
2176 } catch ( const CException& ex) {
2177 string label;
2178 loc.GetLabel(&label);
2179 PostErr(eDiag_Error, eErr_INTERNAL_Exception,
2180 "Exception caught while validating location " +
2181 label + ". Exception: " + ex.what(), obj);
2182 }
2183
2184 if (lc.mixed_strand || lc.unmarked_strand || !ordered) {
2185 if (loc_lbl.empty()) {
2186 loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2187 }
2188 if (lc.mixed_strand) {
2189 if (IsSmallGenomeSet()) {
2190 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenomeSetMixedStrand,
2191 prefix + ": Mixed strands in SeqLoc ["
2192 + loc_lbl + "] in small genome set - set trans-splicing exception if appropriate", obj);
2193 } else {
2194 EDiagSev sev = eDiag_Error;
2195 if (IsGeneious() || (sfp && sequence::IsPseudo(*sfp, *m_Scope))) {
2196 sev = eDiag_Warning;
2197 }
2198 PostErr(sev, eErr_SEQ_FEAT_MixedStrand,
2199 prefix + ": Mixed strands in SeqLoc ["
2200 + loc_lbl + "]", obj);
2201 }
2202 } else if (lc.unmarked_strand) {
2203 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MixedStrand,
2204 prefix + ": Mixed plus and unknown strands in SeqLoc ["
2205 + loc_lbl + "]", obj);
2206 }
2207 if (!ordered) {
2208 if (IsSmallGenomeSet()) {
2209 PostErr(eDiag_Warning, eErr_SEQ_FEAT_SeqLocOrder,
2210 prefix + ": Intervals out of order in SeqLoc [" +
2211 loc_lbl + "]", obj);
2212 } else {
2213 PostErr(eDiag_Error, eErr_SEQ_FEAT_SeqLocOrder,
2214 prefix + ": Intervals out of order in SeqLoc [" +
2215 loc_lbl + "]", obj);
2216 }
2217 }
2218 return;
2219 }
2220
2221 if ( seq &&
2222 seq.IsSetInst_Repr() &&
2223 seq.GetInst_Repr() != CSeq_inst::eRepr_seg ) {
2224 return;
2225 }
2226
2227 // Check for intervals out of order on segmented Bioseq
2228 if ( seq && BadSeqLocSortOrder(seq, loc) ) {
2229 if (loc_lbl.empty()) {
2230 loc.GetLabel(&loc_lbl);
2231 }
2232 PostErr(eDiag_Error, eErr_SEQ_FEAT_SeqLocOrder,
2233 prefix + "Intervals out of order in SeqLoc [" +
2234 loc_lbl + "]", obj);
2235 }
2236
2237 // Check for mixed strand on segmented Bioseq
2238 if ( IsMixedStrands(loc) ) {
2239 if (loc_lbl.empty()) {
2240 loc.GetLabel(&loc_lbl);
2241 }
2242 PostErr(eDiag_Error, eErr_SEQ_FEAT_MixedStrand,
2243 prefix + ": Mixed strands in SeqLoc [" +
2244 loc_lbl + "]", obj);
2245 }
2246 }
2247
2248
AddBioseqWithNoBiosource(const CBioseq & seq)2249 void CValidError_imp::AddBioseqWithNoBiosource(const CBioseq& seq)
2250 {
2251 if (!SeqIsPatent(seq)) {
2252 m_BioseqWithNoSource.push_back(CConstRef<CBioseq>(&seq));
2253 }
2254 }
2255
2256
AddProtWithoutFullRef(const CBioseq_Handle & seq)2257 void CValidError_imp::AddProtWithoutFullRef(const CBioseq_Handle& seq)
2258 {
2259 if (!SeqIsPatent (seq)) {
2260 PostErr (eDiag_Error, eErr_SEQ_FEAT_MissingProteinName,
2261 "The product name is missing from this protein.", *(seq.GetCompleteBioseq()));
2262 }
2263 }
2264
2265
IsWGSIntermediate(const CBioseq & seq)2266 bool CValidError_imp::IsWGSIntermediate(const CBioseq& seq)
2267 {
2268 bool wgs = false;
2269
2270 FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2271 if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2272 && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2273 wgs = true;
2274 break;
2275 }
2276 }
2277 if (!wgs) {
2278 return false;
2279 }
2280
2281 bool is_other = false;
2282 bool has_gi = false;
2283
2284 FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2285 if ((*it)->IsOther()) {
2286 is_other = true;
2287 break;
2288 } else if ((*it)->IsGi()) {
2289 has_gi = true;
2290 break;
2291 }
2292 }
2293 if (!is_other || has_gi) {
2294 return false;
2295 }
2296
2297 return true;
2298 }
2299
2300
IsTSAIntermediate(const CBioseq & seq)2301 bool CValidError_imp::IsTSAIntermediate(const CBioseq& seq)
2302 {
2303 bool tsa = false;
2304
2305 FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2306 if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2307 && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
2308 tsa = true;
2309 break;
2310 }
2311 }
2312 if (!tsa) {
2313 return false;
2314 }
2315
2316 bool is_other = false;
2317 bool has_gi = false;
2318
2319 FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2320 if ((*it)->IsOther()) {
2321 is_other = true;
2322 break;
2323 } else if ((*it)->IsGi()) {
2324 has_gi = true;
2325 break;
2326 }
2327 }
2328 if (!is_other || has_gi) {
2329 return false;
2330 }
2331
2332 return true;
2333 }
2334
2335
ReportMissingBiosource(const CSeq_entry & se)2336 void CValidError_imp::ReportMissingBiosource(const CSeq_entry& se)
2337 {
2338 if(m_NoBioSource && !m_IsPatent && !m_IsPDB) {
2339 PostErr(eDiag_Error, eErr_SEQ_DESCR_NoSourceDescriptor,
2340 "No source information included on this record.", se);
2341 return;
2342 }
2343
2344 size_t num_no_source = m_BioseqWithNoSource.size();
2345
2346 for ( size_t i = 0; i < num_no_source; ++i ) {
2347 PostErr(eDiag_Fatal, eErr_SEQ_DESCR_NoOrgFound,
2348 "No organism name included in the source. Other qualifiers may exist.",
2349 *(m_BioseqWithNoSource[i]));
2350 }
2351 }
2352
2353
GetCDSGivenProduct(const CBioseq & seq)2354 CConstRef<CSeq_feat> CValidError_imp::GetCDSGivenProduct(const CBioseq& seq)
2355 {
2356 CConstRef<CSeq_feat> feat;
2357
2358 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
2359
2360 if ( bsh ) {
2361 if ( IsNT() && m_TSE ) {
2362 // In case of a NT bioseq limit the search to features packaged on the
2363 // NT (we assume features have been pulled from the segments to the NT).
2364 SAnnotSelector sel(CSeqFeatData::e_Cdregion);
2365 sel.SetByProduct()
2366 .SetLimitTSE(m_Scope->GetSeq_entryHandle(*m_TSE));
2367 CFeat_CI fi(bsh, sel);
2368 if ( fi ) {
2369 // return the first one (should be the one packaged on the
2370 // nuc-prot set).
2371 feat.Reset(&(fi->GetOriginalFeature()));
2372 }
2373 } else {
2374 SAnnotSelector sel(CSeqFeatData::e_Cdregion);
2375 sel.SetByProduct();
2376 CFeat_CI fi(bsh, sel);
2377 if ( fi ) {
2378 // return the first one (should be the one packaged on the
2379 // nuc-prot set).
2380 feat.Reset(&(fi->GetOriginalFeature()));
2381 }
2382 }
2383 }
2384
2385 return feat;
2386 }
2387
2388
GetmRNAGivenProduct(const CBioseq & seq)2389 CConstRef<CSeq_feat> CValidError_imp::GetmRNAGivenProduct(const CBioseq& seq)
2390 {
2391 CConstRef<CSeq_feat> feat;
2392
2393 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
2394
2395
2396 if ( bsh ) {
2397 // In case of a NT bioseq limit the search to features packaged on the
2398 // NT (we assume features have been pulled from the segments to the NT).
2399 CSeq_entry_Handle limit;
2400 if ( IsNT() && m_TSE ) {
2401 limit = m_Scope->GetSeq_entryHandle(*m_TSE);
2402 }
2403
2404 if (limit) {
2405 SAnnotSelector sel(CSeqFeatData::eSubtype_mRNA);
2406 sel.SetByProduct() .SetLimitTSE(limit);
2407 CFeat_CI fi(bsh, sel);
2408 if ( fi ) {
2409 // return the first one (should be the one packaged on the
2410 // nuc-prot set).
2411 feat.Reset(&(fi->GetOriginalFeature()));
2412 }
2413 } else {
2414 SAnnotSelector sel(CSeqFeatData::eSubtype_mRNA);
2415 sel.SetByProduct();
2416 CFeat_CI fi(bsh, sel);
2417 if ( fi ) {
2418 // return the first one (should be the one packaged on the
2419 // nuc-prot set).
2420 feat.Reset(&(fi->GetOriginalFeature()));
2421 }
2422 }
2423 }
2424
2425 return feat;
2426 }
2427
2428
GetAncestor(const CBioseq & seq,CBioseq_set::EClass clss)2429 const CSeq_entry* CValidError_imp::GetAncestor
2430 (const CBioseq& seq,
2431 CBioseq_set::EClass clss)
2432 {
2433 const CSeq_entry* parent = 0;
2434 for ( parent = seq.GetParentEntry();
2435 parent != 0;
2436 parent = parent->GetParentEntry() ) {
2437 if ( parent->IsSet() ) {
2438 const CBioseq_set& set = parent->GetSet();
2439 if ( set.IsSetClass() && set.GetClass() == clss ) {
2440 break;
2441 }
2442 }
2443 }
2444 return parent;
2445 }
2446
2447
IsSerialNumberInComment(const string & comment)2448 bool CValidError_imp::IsSerialNumberInComment(const string& comment)
2449 {
2450 size_t pos = comment.find('[', 0);
2451 while ( pos != string::npos ) {
2452 ++pos;
2453 bool okay = true;
2454 if ( isdigit((unsigned char) comment[pos]) ) {
2455 // skip if first character after bracket is 0
2456 if (comment[pos] == '0') {
2457 okay = false;
2458 }
2459 while ( isdigit((unsigned char) comment[pos]) ) {
2460 ++pos;
2461 }
2462 if ( comment[pos] == ']' && okay ) {
2463 return true;
2464 }
2465 }
2466
2467 pos = comment.find('[', pos);
2468 }
2469 return false;
2470 }
2471
2472
RequireLocalProduct(const CSeq_id * sid) const2473 bool CValidError_imp::RequireLocalProduct(const CSeq_id* sid) const
2474 {
2475 // okay to have far RefSeq product, but only if genomic product set
2476 if ( sid != 0 && sid->IsOther() ) {
2477 if ( IsGPS() ) {
2478 return false;
2479 }
2480 }
2481 // or just a bioseq
2482 if ( GetTSE().IsSeq() ) {
2483 return false;
2484 }
2485
2486 // or in a standalone Seq-annot
2487 if (IsStandaloneAnnot() ) {
2488 return false;
2489 }
2490 return true;
2491 }
2492
2493
s_CollectPubDescriptorLabels(const CSeq_entry & se,vector<TEntrezId> & pmids,vector<TEntrezId> & muids,vector<int> & serials,vector<string> & published_labels,vector<string> & unpublished_labels)2494 static void s_CollectPubDescriptorLabels (const CSeq_entry& se,
2495 vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
2496 vector<string>& published_labels, vector<string>& unpublished_labels)
2497 {
2498 FOR_EACH_SEQDESC_ON_SEQENTRY (it, se) {
2499 if ((*it)->IsPub()) {
2500 CCleanup::GetPubdescLabels ((*it)->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2501 }
2502 }
2503
2504 if (se.IsSet()) {
2505 FOR_EACH_SEQENTRY_ON_SEQSET (it, se.GetSet()) {
2506 s_CollectPubDescriptorLabels (**it, pmids, muids, serials, published_labels, unpublished_labels);
2507 }
2508 }
2509 }
2510
2511
ValidateCitations(const CSeq_entry_Handle & seh)2512 void CValidError_imp::ValidateCitations (const CSeq_entry_Handle& seh)
2513 {
2514 vector<TEntrezId> pmids;
2515 vector<TEntrezId> muids;
2516 vector<int> serials;
2517 vector<string> published_labels;
2518 vector<string> unpublished_labels;
2519
2520 // collect labels for pubs on record
2521 s_CollectPubDescriptorLabels (*(seh.GetCompleteSeq_entry()), pmids, muids, serials, published_labels, unpublished_labels);
2522
2523 CFeat_CI feat (seh, SAnnotSelector(CSeqFeatData::e_Pub));
2524 while (feat) {
2525 CCleanup::GetPubdescLabels (feat->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2526 ++feat;
2527 }
2528
2529 // now examine citations to determine whether they match a pub on the record
2530 CFeat_CI f (seh);
2531 while (f) {
2532 if (f->IsSetCit() && f->GetCit().IsPub()) {
2533 ITERATE (CPub_set::TPub, cit_it, f->GetCit().GetPub()) {
2534 bool found = false;
2535
2536 if ((*cit_it)->IsPmid()) {
2537 vector<TEntrezId>::iterator it = pmids.begin();
2538 while (it != pmids.end() && !found) {
2539 if (*it == (*cit_it)->GetPmid()) {
2540 found = true;
2541 }
2542 ++it;
2543 }
2544 if (!found) {
2545 PostErr (eDiag_Warning, eErr_SEQ_FEAT_FeatureCitationProblem,
2546 "Citation on feature refers to uid ["
2547 + NStr::NumericToString((*cit_it)->GetPmid().Get())
2548 + "] not on a publication in the record",
2549 f->GetOriginalFeature());
2550 }
2551 } else if ((*cit_it)->IsMuid()) {
2552 vector<TEntrezId>::iterator it = muids.begin();
2553 while (it != muids.end() && !found) {
2554 if (*it == (*cit_it)->GetMuid()) {
2555 found = true;
2556 }
2557 ++it;
2558 }
2559 if (!found) {
2560 PostErr (eDiag_Warning, eErr_SEQ_FEAT_FeatureCitationProblem,
2561 "Citation on feature refers to uid ["
2562 + NStr::NumericToString((*cit_it)->GetMuid())
2563 + "] not on a publication in the record",
2564 f->GetOriginalFeature());
2565 }
2566 } else if ((*cit_it)->IsEquiv()) {
2567 continue;
2568 } else {
2569 string label;
2570 (*cit_it)->GetLabel(&label, CPub::eContent, true);
2571
2572 if (NStr::EndsWith (label, ">")) {
2573 label = label.substr(0, label.length() - 2);
2574 }
2575 if(NStr::EndsWith (label, "|")) {
2576 label = label.substr(0, label.length() - 1);
2577 }
2578 if (NStr::EndsWith (label, " ")) {
2579 label = label.substr(0, label.length() - 1);
2580 }
2581 size_t len = label.length();
2582 vector<string>::iterator unpub_it = unpublished_labels.begin();
2583 while (unpub_it != unpublished_labels.end() && !found) {
2584 size_t it_len =(*unpub_it).length();
2585 if (NStr::EqualNocase (*unpub_it, 0, it_len > len ? len : it_len, label)) {
2586 found = true;
2587 }
2588 ++unpub_it;
2589 }
2590 vector<string>::iterator pub_it = published_labels.begin();
2591
2592 while (pub_it != published_labels.end() && !found) {
2593 size_t it_len =(*pub_it).length();
2594 if (NStr::EqualNocase (*pub_it, 0, it_len > len ? len : it_len, label)) {
2595 PostErr (eDiag_Warning, eErr_SEQ_FEAT_FeatureCitationProblem,
2596 "Citation on feature needs to be updated to published uid",
2597 f->GetOriginalFeature());
2598 found = true;
2599 }
2600 ++pub_it;
2601 }
2602 if (!found) {
2603 PostErr (eDiag_Warning, eErr_SEQ_FEAT_FeatureCitationProblem,
2604 "Citation on feature refers to a publication not in the record",
2605 f->GetOriginalFeature());
2606 }
2607 }
2608 }
2609 }
2610 ++f;
2611 }
2612 }
2613
2614
2615 // =============================================================================
2616 // Private
2617 // =============================================================================
2618
2619
2620
FindNonAsciiText(const CSerialObject & obj)2621 void CValidError_imp::FindNonAsciiText (const CSerialObject& obj)
2622 {
2623 CStdTypeConstIterator<string> it(obj);
2624 for( ; it; ++it) {
2625 const string& str = *it;
2626 FOR_EACH_CHAR_IN_STRING(c_it, str) {
2627 const char& ch = *c_it;
2628 unsigned char chu = ch;
2629 if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
2630 PostErr (eDiag_Fatal, eErr_GENERIC_NonAsciiAsn,
2631 "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in item", obj);
2632 break;
2633 }
2634 }
2635 }
2636 }
2637
2638
FindEmbeddedScript(const CSerialObject & obj)2639 void CValidError_imp::FindEmbeddedScript (const CSerialObject& obj)
2640 {
2641 class CScriptTagTextFsm : public CTextFsm<int>
2642 {
2643 public:
2644 CScriptTagTextFsm() {
2645 const char * script_tags[] = {
2646 "<script", "<object", "<applet", "<embed", "<form",
2647 "javascript:", "vbscript:"};
2648 ITERATE_0_IDX(idx, ArraySize(script_tags)) {
2649 AddWord(script_tags[idx], true);
2650 }
2651 Prime();
2652 }
2653
2654 // Returns true if the given string matches any of the strings
2655 // in the fsm anywhere.
2656 bool DoesStrHaveFsmHits(const string &str) {
2657 int state = GetInitialState();
2658 ITERATE(string, str_it, str) {
2659 state = GetNextState(state, *str_it);
2660 if( IsMatchFound(state) ) {
2661 return true;
2662 }
2663 }
2664
2665 return false;
2666 }
2667 };
2668 static CScriptTagTextFsm s_ScriptTagFsm;
2669
2670
2671 CStdTypeConstIterator<string> it(obj);
2672 for( ; it; ++it) {
2673 if (s_ScriptTagFsm.DoesStrHaveFsmHits(*it)) {
2674 PostErr (eDiag_Error, eErr_GENERIC_EmbeddedScript,
2675 "Script tag found in item", obj);
2676 return;
2677 }
2678 }
2679 }
2680
2681
IsMixedStrands(const CSeq_loc & loc)2682 bool CValidError_imp::IsMixedStrands(const CSeq_loc& loc)
2683 {
2684 if ( SeqLocCheck(loc, m_Scope) == eSeqLocCheck_warning ) {
2685 return false;
2686 }
2687
2688 CSeq_loc_CI curr(loc);
2689 if ( !curr ) {
2690 return false;
2691 }
2692 CSeq_loc_CI prev = curr;
2693 ++curr;
2694
2695 while ( curr ) {
2696 ENa_strand curr_strand = curr.GetStrand();
2697 ENa_strand prev_strand = prev.GetStrand();
2698
2699 if ( (prev_strand == eNa_strand_minus &&
2700 curr_strand != eNa_strand_minus) ||
2701 (prev_strand != eNa_strand_minus &&
2702 curr_strand == eNa_strand_minus) ) {
2703 return true;
2704 }
2705
2706 prev = curr;
2707 ++curr;
2708 }
2709
2710 return false;
2711 }
2712
2713
s_SeqLocHasGI(const CSeq_loc & loc)2714 static bool s_SeqLocHasGI (const CSeq_loc& loc)
2715 {
2716 bool rval = false;
2717
2718 for ( CSeq_loc_CI it(loc); it && !rval; ++it ) {
2719 if (it.GetSeq_id().IsGi()) {
2720 rval = true;
2721 }
2722 }
2723 return rval;
2724 }
2725
2726
// Make seh the top-level Seq-entry being validated: store the handle, cache
// the complete Seq-entry obtained from it, and clear the gene cache.
void CValidError_imp::SetTSE(const CSeq_entry_Handle& seh)
{
    m_TSEH = seh;
    m_TSE = m_TSEH.GetCompleteSeq_entry();
    // cached gene data is cleared whenever the top-level entry changes
    m_GeneCache.Clear();
}
2733
2734
s_IsGoodTopSetClass(CBioseq_set::EClass set_class)2735 bool s_IsGoodTopSetClass(CBioseq_set::EClass set_class)
2736 {
2737 if (set_class == CBioseq_set::eClass_gen_prod_set || set_class == CBioseq_set::eClass_small_genome_set) {
2738 return true;
2739 } else {
2740 return false;
2741 }
2742 }
2743
2744
s_CountTopSetSiblings(const CSeq_entry & se)2745 size_t s_CountTopSetSiblings(const CSeq_entry& se)
2746 {
2747 if (se.IsSeq()) {
2748 return 1;
2749 } else if (!se.IsSet()) {
2750 return 0;
2751 }
2752 if (se.GetSet().IsSetClass()) {
2753 if (se.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot ||
2754 s_IsGoodTopSetClass(se.GetSet().GetClass())) {
2755 return 1;
2756 }
2757 }
2758 size_t count = 0;
2759 if (se.GetSet().IsSetSeq_set()) {
2760 for (auto it = se.GetSet().GetSeq_set().begin(); it != se.GetSet().GetSeq_set().end(); it++) {
2761 count += s_CountTopSetSiblings(**it);
2762 }
2763 }
2764 return count;
2765 }
2766
2767
Setup(const CSeq_entry_Handle & seh)2768 void CValidError_imp::Setup(const CSeq_entry_Handle& seh)
2769 {
2770 // "Save" the Seq-entry
2771 SetTSE(seh);
2772
2773 m_NumTopSetSiblings = s_CountTopSetSiblings(*(seh.GetCompleteSeq_entry()));
2774 m_Scope.Reset(&m_TSEH.GetScope());
2775
2776 // If no Pubs/BioSource in CSeq_entry, post only one error
2777 CTypeConstIterator<CPub> pub(ConstBegin(*m_TSE));
2778 m_NoPubs = !pub;
2779 while (pub && !pub->IsSub()) {
2780 ++pub;
2781 }
2782 m_NoCitSubPubs = !pub;
2783
2784 CTypeConstIterator<CBioSource> src(ConstBegin(*m_TSE));
2785 m_NoBioSource = !src;
2786
2787 // Look for genomic product set
2788 for (CTypeConstIterator <CBioseq_set> si (*m_TSE); si; ++si) {
2789 if (si->IsSetClass ()) {
2790 if (si->GetClass () == CBioseq_set::eClass_gen_prod_set) {
2791 m_IsGPS = true;
2792 }
2793 if (si->GetClass () == CBioseq_set::eClass_small_genome_set) {
2794 m_IsSmallGenomeSet = true;
2795 }
2796 }
2797 }
2798
2799 // Examine all Seq-ids on Bioseqs
2800 for (CTypeConstIterator <CBioseq> bi (*m_TSE); bi; ++bi) {
2801 FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, *bi) {
2802 const CSeq_id& sid = **sid_itr;
2803 const CTextseq_id* tsid = sid.GetTextseq_Id();
2804 CSeq_id::E_Choice typ = sid.Which();
2805 switch (typ) {
2806 case CSeq_id::e_not_set:
2807 break;
2808 case CSeq_id::e_Local:
2809 break;
2810 case CSeq_id::e_Gibbsq:
2811 break;
2812 case CSeq_id::e_Gibbmt:
2813 break;
2814 case CSeq_id::e_Giim:
2815 break;
2816 case CSeq_id::e_Genbank:
2817 m_IsINSDInSep = true;
2818 m_IsGB = true;
2819 m_IsGED = true;
2820 break;
2821 case CSeq_id::e_Embl:
2822 m_IsINSDInSep = true;
2823 m_IsGED = true;
2824 m_IsEmbl = true;
2825 break;
2826 case CSeq_id::e_Pir:
2827 break;
2828 case CSeq_id::e_Swissprot:
2829 break;
2830 case CSeq_id::e_Patent:
2831 m_IsPatent = true;
2832 break;
2833 case CSeq_id::e_Other:
2834 m_IsRefSeq = true;
2835 // and do RefSeq subclasses up front as well
2836 if (sid.GetOther().IsSetAccession()) {
2837 string acc = sid.GetOther().GetAccession().substr(0, 3);
2838 if (acc == "NC_") {
2839 m_IsNC = true;
2840 } else if (acc == "NG_") {
2841 m_IsNG = true;
2842 } else if (acc == "NM_") {
2843 m_IsNM = true;
2844 } else if (acc == "NP_") {
2845 m_IsNP = true;
2846 } else if (acc == "NR_") {
2847 m_IsNR = true;
2848 } else if (acc == "NZ_") {
2849 m_IsNZ = true;
2850 } else if (acc == "NS_") {
2851 m_IsNS = true;
2852 } else if (acc == "NT_") {
2853 m_IsNT = true;
2854 } else if (acc == "NW_") {
2855 m_IsNW = true;
2856 } else if (acc == "WP_") {
2857 m_IsWP = true;
2858 } else if (acc == "XR_") {
2859 m_IsXR = true;
2860 }
2861 }
2862 break;
2863 case CSeq_id::e_General:
2864 if ((*bi).IsAa() && !sid.GetGeneral().IsSkippable()) {
2865 m_ProteinHasGeneralID = true;
2866 }
2867 break;
2868 case CSeq_id::e_Gi:
2869 m_IsGI = true;
2870 m_HasGiOrAccnVer = true;
2871 break;
2872 case CSeq_id::e_Ddbj:
2873 m_IsINSDInSep = true;
2874 m_IsGED = true;
2875 m_IsDdbj = true;
2876 break;
2877 case CSeq_id::e_Prf:
2878 break;
2879 case CSeq_id::e_Pdb:
2880 m_IsPDB = true;
2881 break;
2882 case CSeq_id::e_Tpg:
2883 m_IsINSDInSep = true;
2884 break;
2885 case CSeq_id::e_Tpe:
2886 m_IsTPE = true;
2887 m_IsINSDInSep = true;
2888 break;
2889 case CSeq_id::e_Tpd:
2890 m_IsINSDInSep = true;
2891 break;
2892 case CSeq_id::e_Gpipe:
2893 m_IsGpipe = true;
2894 break;
2895 default:
2896 break;
2897 }
2898 if ( tsid && tsid->IsSetAccession() && tsid->IsSetVersion() && tsid->GetVersion() >= 1 ) {
2899 m_HasGiOrAccnVer = true;
2900 }
2901 if (typ != CSeq_id::e_Local && typ != CSeq_id::e_General) {
2902 m_IsLocalGeneralOnly = false;
2903 }
2904 }
2905 }
2906
2907 // search all source descriptors for genomic source
2908 for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_Source);
2909 desc_ci && !m_IsGenomic;
2910 ++desc_ci) {
2911 if (desc_ci->GetSource().IsSetGenome()
2912 && desc_ci->GetSource().GetGenome() == CBioSource::eGenome_genomic) {
2913 m_IsGenomic = true;
2914 }
2915 }
2916
2917 // search genome build and annotation pipeline user object descriptors
2918 for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_User);
2919 desc_ci && !m_IsGpipe;
2920 ++desc_ci) {
2921 if ( desc_ci->GetUser().IsSetType() ) {
2922 const CUser_object& obj = desc_ci->GetUser();
2923 const CObject_id& oi = obj.GetType();
2924 if ( ! oi.IsStr() ) continue;
2925 if ( NStr::CompareNocase(oi.GetStr(), "GenomeBuild") == 0 ) {
2926 m_IsGpipe = true;
2927 } else if ( NStr::CompareNocase(oi.GetStr(), "StructuredComment") == 0 ) {
2928 ITERATE (CUser_object::TData, field, obj.GetData()) {
2929 if ((*field)->IsSetLabel() && (*field)->GetLabel().IsStr()) {
2930 if (NStr::EqualNocase((*field)->GetLabel().GetStr(), "Annotation Pipeline")) {
2931 if (NStr::EqualNocase((*field)->GetData().GetStr(), "NCBI eukaryotic genome annotation pipeline")) {
2932 m_IsGpipe = true;
2933 }
2934 }
2935 }
2936 }
2937 }
2938 }
2939 }
2940
2941 // examine features for location gi, product gi, and locus tag
2942 for (CFeat_CI feat_ci (seh);
2943 feat_ci && (!m_FeatLocHasGI || !m_ProductLocHasGI || !m_GeneHasLocusTag);
2944 ++feat_ci) {
2945 if (s_SeqLocHasGI(feat_ci->GetLocation())) {
2946 m_FeatLocHasGI = true;
2947 }
2948 if (feat_ci->IsSetProduct() && s_SeqLocHasGI(feat_ci->GetProduct())) {
2949 m_ProductLocHasGI = true;
2950 }
2951 if (feat_ci->IsSetData() && feat_ci->GetData().IsGene()
2952 && feat_ci->GetData().GetGene().IsSetLocus_tag()
2953 && !NStr::IsBlank (feat_ci->GetData().GetGene().GetLocus_tag())) {
2954 m_GeneHasLocusTag = true;
2955 }
2956 }
2957
2958 if ( m_PrgCallback ) {
2959 m_NumAlign = 0;
2960 for (CTypeConstIterator<CSeq_align> i(*m_TSE); i; ++i) {
2961 m_NumAlign++;
2962 }
2963 m_NumAnnot = 0;
2964 for (CTypeConstIterator<CSeq_annot> i(*m_TSE); i; ++i) {
2965 m_NumAnnot++;
2966 }
2967 m_NumBioseq = 0;
2968 for (CTypeConstIterator<CBioseq> i(*m_TSE); i; ++i) {
2969 m_NumBioseq++;
2970 }
2971 m_NumBioseq_set = 0;
2972 for (CTypeConstIterator<CBioseq_set> i(*m_TSE); i; ++i) {
2973 m_NumBioseq_set++;
2974 }
2975 m_NumDesc = 0;
2976 for (CTypeConstIterator<CSeqdesc> i(*m_TSE); i; ++i) {
2977 m_NumDesc++;
2978 }
2979 m_NumDescr = 0;
2980 for (CTypeConstIterator<CSeq_descr> i(*m_TSE); i; ++i) {
2981 m_NumDescr++;
2982 }
2983 m_NumFeat = 0;
2984 for (CTypeConstIterator<CSeq_feat> i(*m_TSE); i; ++i) {
2985 m_NumFeat++;
2986 }
2987 m_NumGraph = 0;
2988 for (CTypeConstIterator<CSeq_graph> i(*m_TSE); i; ++i) {
2989 m_NumGraph++;
2990 }
2991 m_PrgInfo.m_Total = m_NumAlign + m_NumAnnot + m_NumBioseq +
2992 m_NumBioseq_set + m_NumDesc + m_NumDescr + m_NumFeat +
2993 m_NumGraph;
2994 }
2995
2996 if (CNcbiApplication::Instance()->GetProgramDisplayName() == "table2asn") {
2997 m_IsTbl2Asn = true;
2998 }
2999 }
3000
3001
SetScope(const CSeq_entry & se)3002 void CValidError_imp::SetScope(const CSeq_entry& se)
3003 {
3004 m_Scope.Reset(new CScope(*m_ObjMgr));
3005 m_Scope->AddTopLevelSeqEntry(*const_cast<CSeq_entry*>(&se));
3006 m_Scope->AddDefaults();
3007 }
3008
3009
Setup(const CSeq_annot_Handle & sah)3010 void CValidError_imp::Setup(const CSeq_annot_Handle& sah)
3011 {
3012 m_IsStandaloneAnnot = true;
3013 if (! m_Scope) {
3014 m_Scope.Reset(& sah.GetScope());
3015 }
3016 m_SeqAnnot = sah.GetCompleteSeq_annot();
3017 m_TSE.Reset(new CSeq_entry); // set a dummy Seq-entry
3018 m_TSEH = m_Scope->AddTopLevelSeqEntry(*m_TSE);
3019 }
3020
3021
Setup(const CBioseq & seq)3022 CSeq_entry_Handle CValidError_imp::Setup(const CBioseq& seq)
3023 {
3024 m_Scope.Reset(new CScope(*m_ObjMgr));
3025 CRef<CSeq_entry> tmp_entry(new CSeq_entry());
3026 tmp_entry->SetSeq().Assign(seq);
3027 m_TSE.Reset(tmp_entry);
3028 m_TSEH = m_Scope->AddTopLevelSeqEntry(*m_TSE);
3029 Setup(m_TSEH);
3030 return m_TSEH;
3031 }
3032
3033
ValidateSeqLocIds(const CSeq_loc & loc,const CSerialObject & obj)3034 void CValidError_imp::ValidateSeqLocIds
3035 (const CSeq_loc& loc,
3036 const CSerialObject& obj)
3037 {
3038 for ( CSeq_loc_CI lit(loc); lit; ++lit ) {
3039 const CSeq_id& id1 = lit.GetSeq_id();
3040 CSeq_loc_CI lit2 = lit;
3041 for ( ++lit2; lit2; ++lit2 ) {
3042 const CSeq_id& id2 = lit2.GetSeq_id();
3043 if ( IsSameBioseq(id1, id2, m_Scope) && !id1.Match(id2) ) {
3044 PostErr(eDiag_Warning,
3045 eErr_SEQ_FEAT_DifferntIdTypesInSeqLoc,
3046 "Two ids refer to the same bioseq but are of "
3047 "different type", obj);
3048 }
3049 }
3050 if (IsTemporary(id1)) {
3051 PostErr(eDiag_Critical, eErr_SEQ_INST_BadSeqIdFormat,
3052 "Feature locations should not use Seq-ids that will be stripped during ID load", obj);
3053 }
3054 }
3055 if (BadMultipleSequenceLocation(loc, *m_Scope)) {
3056 PostErr(eDiag_Error, eErr_SEQ_FEAT_BadLocation,
3057 "Feature location intervals should all be on the same sequence", obj);
3058 }
3059 }
3060
3061
IsInOrganelleSmallGenomeSet(const CSeq_id & id,CScope & scope)3062 bool CValidError_imp::IsInOrganelleSmallGenomeSet(const CSeq_id& id, CScope& scope)
3063 {
3064 CBioseq_Handle bsh = scope.GetBioseqHandle(id);
3065 if (!bsh) {
3066 // can't fetch bioseq, can't tell, assume not
3067 return false;
3068 }
3069 CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
3070 if (!src || !src->GetSource().IsSetGenome() || !IsOrganelle(src->GetSource().GetGenome())) {
3071 // not an organelle location
3072 return false;
3073 }
3074 CBioseq_set_Handle set = bsh.GetParentBioseq_set();
3075 while (set) {
3076 if (!set.IsSetClass()) {
3077 // class not set - quit
3078 break;
3079 } else if (set.GetClass() == CBioseq_set::eClass_small_genome_set) {
3080 return true;
3081 } else if (set.GetClass() == CBioseq_set::eClass_nuc_prot) {
3082 // look at parent
3083 set = set.GetParentBioseq_set();
3084 } else {
3085 break;
3086 }
3087 }
3088 return false;
3089 }
3090
3091 // all ids in a location should point to the same sequence, unless the sequences are
3092 // in an organelle small genome set
BadMultipleSequenceLocation(const CSeq_loc & loc,CScope & scope)3093 bool CValidError_imp::BadMultipleSequenceLocation(const CSeq_loc& loc, CScope& scope)
3094 {
3095 CSeq_loc_CI lit(loc);
3096 const CSeq_id& id1 = lit.GetSeq_id();
3097
3098 bool in_organelle_small_genome_set = IsInOrganelleSmallGenomeSet(id1, scope);
3099
3100 ++lit;
3101 while (lit) {
3102 const CSeq_id& id2 = lit.GetSeq_id();
3103 if (in_organelle_small_genome_set && !IsInOrganelleSmallGenomeSet(id2, scope)) {
3104 // if one sequence in small genome set and other not, this is bad
3105 return true;
3106 }
3107 if (!id2.Match(id1) && !IsSameBioseq(id1, id2, &scope) && !in_organelle_small_genome_set) {
3108 return true;
3109 }
3110 ++lit;
3111 }
3112 return false;
3113 }
3114
3115
x_IsFarFetchFailure(const CSeq_loc & loc)3116 bool CValidError_imp::x_IsFarFetchFailure (const CSeq_loc& loc)
3117 {
3118 if (!IsFarFetchMRNAproducts() && !IsFarFetchCDSproducts()
3119 && IsFarLocation(loc, GetTSEH())) {
3120 return true;
3121 } else {
3122 return false;
3123 }
3124 }
3125
3126
3127 //LCOV_EXCL_START
3128 // not used by asnvalidate, used by external programs
GetTSANStretchErrors(const CSeq_entry_Handle & se)3129 bool CValidError_imp::GetTSANStretchErrors(const CSeq_entry_Handle& se)
3130 {
3131 bool rval = false;
3132 Setup(se);
3133 CValidError_bioseq bioseq_validator(*this);
3134 CBioseq_CI bi(se, CSeq_inst::eMol_na);
3135 while (bi) {
3136 rval |= bioseq_validator.GetTSANStretchErrors(*(bi->GetCompleteBioseq()));
3137 ++bi;
3138 }
3139 return rval;
3140 }
3141
3142
GetTSANStretchErrors(const CBioseq & seq)3143 bool CValidError_imp::GetTSANStretchErrors(const CBioseq& seq)
3144 {
3145 CSeq_entry_Handle seh = Setup(seq);
3146 CValidError_bioseq bioseq_validator(*this);
3147 return bioseq_validator.GetTSANStretchErrors(*(seh.GetSeq().GetCompleteBioseq()));
3148 }
3149
3150
GetTSACDSOnMinusStrandErrors(const CSeq_entry_Handle & se)3151 bool CValidError_imp::GetTSACDSOnMinusStrandErrors (const CSeq_entry_Handle& se)
3152 {
3153 bool rval = false;
3154 Setup(se);
3155 CValidError_feat feat_validator(*this);
3156 CFeat_CI fi(se);
3157 while (fi) {
3158 CBioseq_Handle bsh = se.GetScope().GetBioseqHandle(fi->GetLocation());
3159 if (bsh) {
3160 rval |= feat_validator.GetTSACDSOnMinusStrandErrors(*(fi->GetSeq_feat()), *(bsh.GetCompleteBioseq()));
3161 }
3162 ++fi;
3163 }
3164
3165 return rval;
3166 }
3167
3168
GetTSACDSOnMinusStrandErrors(const CSeq_feat & f,const CBioseq & seq)3169 bool CValidError_imp::GetTSACDSOnMinusStrandErrors (const CSeq_feat& f, const CBioseq& seq)
3170 {
3171 CSeq_entry_Handle seh = Setup(seq);
3172 CValidError_feat feat_validator(*this);
3173 return feat_validator.GetTSACDSOnMinusStrandErrors(f, *(seh.GetSeq().GetCompleteBioseq()));
3174 }
3175
3176
GetTSAConflictingBiomolTechErrors(const CSeq_entry_Handle & se)3177 bool CValidError_imp::GetTSAConflictingBiomolTechErrors (const CSeq_entry_Handle& se)
3178 {
3179 bool rval = false;
3180 Setup(se);
3181 CValidError_bioseq bioseq_validator(*this);
3182 CBioseq_CI bi(se, CSeq_inst::eMol_na);
3183 while (bi) {
3184 rval |= bioseq_validator.GetTSAConflictingBiomolTechErrors(*(bi->GetCompleteBioseq()));
3185 ++bi;
3186 }
3187 return rval;
3188 }
3189
3190
GetTSAConflictingBiomolTechErrors(const CBioseq & seq)3191 bool CValidError_imp::GetTSAConflictingBiomolTechErrors (const CBioseq& seq)
3192 {
3193 CSeq_entry_Handle seh = Setup(seq);
3194 CValidError_bioseq bioseq_validator(*this);
3195 return bioseq_validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
3196 }
3197 //LCOV_EXCL_STOP
3198
3199
x_GetTaxonService()3200 ITaxon3* CValidError_imp::x_GetTaxonService()
3201 {
3202 if (m_taxon == NULL) {
3203 //Impossible to reach code, as c'tor requires non-null taxon service
3204 throw runtime_error("Taxon service not defined by CValidator");
3205 }
3206 return m_taxon;
3207 }
3208
3209
// Message texts for the individual barcode checks.  ADD_BARCODE_ERR pastes
// the test name onto "k" to pick one of these, so each constant name must be
// "k" + the suffix of the matching eErr_GENERIC_Barcode<TestName> code.
const string kTooShort = "Too Short";
const string kMissingPrimers = "Missing Primers";
const string kMissingCountry = "Missing Country";
const string kMissingVoucher = "Missing Voucher";
const string kBadCollectionDate = "Bad Collection Date";
const string kTooManyNs = "Too Many Ns";
const string kMissingOrderAssignment = "Missing Order Assignment";
const string kLowTrace = "Low Trace";
const string kFrameShift = "Frame Shift";
const string kStructuredVoucher = "Structured Voucher";

// ADD_BARCODE_ERR(TestName):
//   posts warning eErr_GENERIC_Barcode<TestName> with message k<TestName> on
//   `sq`, then appends k<TestName> to `msg`, preceded by a comma when `msg`
//   is not empty.
// The expansion is several statements (it is not wrapped in do { } while (0))
// and it expects variables named `sq` and `msg` to be in scope at the point
// of use; call it inside braces and without a trailing semicolon.
#define ADD_BARCODE_ERR(TestName) \
    PostErr(eDiag_Warning, eErr_GENERIC_Barcode##TestName, k##TestName, sq); \
    if (!msg.empty()) { \
        msg += ","; \
    } \
    msg += k##TestName;
3227
x_DoBarcodeTests(CSeq_entry_Handle seh)3228 void CValidError_imp::x_DoBarcodeTests(CSeq_entry_Handle seh)
3229 {
3230 TBarcodeResults results = GetBarcodeValues(seh);
3231 for (auto r : results) {
3232 const CBioseq& sq = *(r.bsh.GetCompleteBioseq());
3233 if (BarcodeTestFails(r)){
3234 string msg;
3235 if (r.length) {
3236 ADD_BARCODE_ERR(TooShort)
3237 }
3238 if (r.primers) {
3239 ADD_BARCODE_ERR(MissingPrimers)
3240 }
3241 if (r.country) {
3242 ADD_BARCODE_ERR(MissingCountry)
3243 }
3244 if (r.voucher) {
3245 ADD_BARCODE_ERR(MissingVoucher)
3246 }
3247 if (!r.percent_n.empty()) {
3248 PostErr(eDiag_Warning, eErr_GENERIC_BarcodeTooManyNs, kTooManyNs + ":" + r.percent_n, sq);
3249 if (!msg.empty()) {
3250 msg += ",";
3251 }
3252 msg += kTooManyNs + ":" + r.percent_n;
3253 }
3254 if (r.collection_date) {
3255 ADD_BARCODE_ERR(BadCollectionDate)
3256 }
3257 if (r.order_assignment) {
3258 ADD_BARCODE_ERR(MissingOrderAssignment)
3259 }
3260 if (r.low_trace) {
3261 ADD_BARCODE_ERR(LowTrace)
3262 }
3263 if (r.frame_shift) {
3264 ADD_BARCODE_ERR(FrameShift)
3265 }
3266 if (!r.structured_voucher) {
3267 ADD_BARCODE_ERR(StructuredVoucher)
3268 }
3269 PostErr(eDiag_Info, eErr_GENERIC_BarcodeTestFails, "FAIL (" + msg + ")", sq);
3270 } else {
3271 PostErr(eDiag_Info, eErr_GENERIC_BarcodeTestPasses, "PASS", sq);
3272 }
3273 }
3274 }
3275
3276
3277 // =============================================================================
3278 // CValidError_base Implementation
3279 // =============================================================================
3280
3281
CValidError_base(CValidError_imp & imp)3282 CValidError_base::CValidError_base(CValidError_imp& imp) :
3283 m_Imp(imp), m_Scope(imp.GetScope())
3284 {
3285 }
3286
3287
~CValidError_base()3288 CValidError_base::~CValidError_base()
3289 {
3290 }
3291
3292
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSerialObject & obj)3293 void CValidError_base::PostErr
3294 (EDiagSev sv,
3295 EErrType et,
3296 const string& msg,
3297 const CSerialObject& obj)
3298 {
3299 m_Imp.PostErr(sv, et, msg, obj);
3300 }
3301
3302
3303 //void CValidError_base::PostErr
3304 //(EDiagSev sv,
3305 // EErrType et,
3306 // const string& msg,
3307 // TDesc ds)
3308 //{
3309 // m_Imp.PostErr(sv, et, msg, ds);
3310 //}
3311
3312
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_feat & ft)3313 void CValidError_base::PostErr
3314 (EDiagSev sv,
3315 EErrType et,
3316 const string& msg,
3317 const CSeq_feat& ft)
3318 {
3319 m_Imp.PostErr(sv, et, msg, ft);
3320 }
3321
3322
PostErr(EDiagSev sv,EErrType et,const string & msg,const CBioseq & sq)3323 void CValidError_base::PostErr
3324 (EDiagSev sv,
3325 EErrType et,
3326 const string& msg,
3327 const CBioseq& sq)
3328 {
3329 m_Imp.PostErr(sv, et, msg, sq);
3330 }
3331
3332
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_entry & ctx,const CSeqdesc & ds)3333 void CValidError_base::PostErr
3334 (EDiagSev sv,
3335 EErrType et,
3336 const string& msg,
3337 const CSeq_entry& ctx,
3338 const CSeqdesc& ds)
3339 {
3340 m_Imp.PostErr(sv, et, msg, ctx, ds);
3341 }
3342
3343
PostErr(EDiagSev sv,EErrType et,const string & msg,const CBioseq_set & set)3344 void CValidError_base::PostErr
3345 (EDiagSev sv,
3346 EErrType et,
3347 const string& msg,
3348 const CBioseq_set& set)
3349 {
3350 m_Imp.PostErr(sv, et, msg, set);
3351 }
3352
3353
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_annot & annot)3354 void CValidError_base::PostErr
3355 (EDiagSev sv,
3356 EErrType et,
3357 const string& msg,
3358 const CSeq_annot& annot)
3359 {
3360 m_Imp.PostErr(sv, et, msg, annot);
3361 }
3362
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_graph & graph)3363 void CValidError_base::PostErr
3364 (EDiagSev sv,
3365 EErrType et,
3366 const string& msg,
3367 const CSeq_graph& graph)
3368 {
3369 m_Imp.PostErr(sv, et, msg, graph);
3370 }
3371
3372
PostErr(EDiagSev sv,EErrType et,const string & msg,const CBioseq & sq,const CSeq_graph & graph)3373 void CValidError_base::PostErr
3374 (EDiagSev sv,
3375 EErrType et,
3376 const string& msg,
3377 const CBioseq& sq,
3378 const CSeq_graph& graph)
3379 {
3380 m_Imp.PostErr(sv, et, msg, sq, graph);
3381 }
3382
3383
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_align & align)3384 void CValidError_base::PostErr
3385 (EDiagSev sv,
3386 EErrType et,
3387 const string& msg,
3388 const CSeq_align& align)
3389 {
3390 m_Imp.PostErr(sv, et, msg, align);
3391 }
3392
3393
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_entry & entry)3394 void CValidError_base::PostErr
3395 (EDiagSev sv,
3396 EErrType et,
3397 const string& msg,
3398 const CSeq_entry& entry)
3399 {
3400 m_Imp.PostErr(sv, et, msg, entry);
3401 }
3402
3403 CCacheImpl &
GetCache(void)3404 CValidError_base::GetCache(void)
3405 {
3406 return m_Imp.GetCache();
3407 }
3408
3409
s_HasTopSetSiblings(CSeq_entry_Handle seh)3410 bool s_HasTopSetSiblings(CSeq_entry_Handle seh)
3411 {
3412 CSeq_entry_Handle parent = seh.GetParentEntry();
3413 if (!parent || !parent.IsSet()) {
3414 return false;
3415 }
3416 CConstRef<CBioseq_set> pset = parent.GetSet().GetCompleteBioseq_set();
3417 if (!pset) {
3418 return false;
3419 }
3420 if (pset->IsSetSeq_set() && pset->GetSeq_set().size() > 10) {
3421 return true;
3422 } else {
3423 return s_HasTopSetSiblings(parent);
3424 }
3425 }
3426
3427
GetAppropriateXrefParent(CSeq_entry_Handle seh)3428 CSeq_entry_Handle CValidError_base::GetAppropriateXrefParent(CSeq_entry_Handle seh)
3429 {
3430 CSeq_entry_Handle appropriate_parent;
3431
3432 CSeq_entry_Handle np;
3433 CSeq_entry_Handle gps;
3434 if (seh.IsSet() && seh.GetSet().IsSetClass()) {
3435 if (seh.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
3436 np = seh;
3437 } else if (s_IsGoodTopSetClass(seh.GetSet().GetClass())) {
3438 gps = seh;
3439 }
3440 } else if (seh.IsSeq()) {
3441 CSeq_entry_Handle p = seh.GetParentEntry();
3442 if (p && p.IsSet() && p.GetSet().IsSetClass()) {
3443 if (p.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
3444 np = p;
3445 } else if (s_IsGoodTopSetClass(p.GetSet().GetClass())) {
3446 gps = p;
3447 }
3448 }
3449 }
3450 if (gps) {
3451 appropriate_parent = gps;
3452 } else if (np) {
3453 CSeq_entry_Handle gp = np.GetParentEntry();
3454 if (gp && gp.IsSet() && gp.GetSet().IsSetClass() &&
3455 s_IsGoodTopSetClass(gp.GetSet().GetClass())) {
3456 appropriate_parent = gp;
3457 } else {
3458 appropriate_parent = np;
3459 }
3460 } else {
3461 appropriate_parent = seh;
3462 }
3463 return appropriate_parent;
3464 }
3465
3466
3467 const CCacheImpl::CPubdescInfo &
GetPubdescToInfo(CConstRef<CPubdesc> pub)3468 CCacheImpl::GetPubdescToInfo(
3469 CConstRef<CPubdesc> pub)
3470 {
3471 // first, try to receive from cache
3472 CCacheImpl::TPubdescCache::const_iterator find_iter =
3473 m_pubdescCache.find(pub);
3474 if( find_iter != m_pubdescCache.end() ) {
3475 return *find_iter->second;
3476 }
3477
3478 CRef<CPubdescInfo> pInfo(new CPubdescInfo);
3479 CCleanup::GetPubdescLabels(
3480 *pub, pInfo->m_pmids, pInfo->m_muids,
3481 pInfo->m_serials, pInfo->m_published_labels,
3482 pInfo->m_unpublished_labels);
3483 m_pubdescCache[pub] = pInfo;
3484 return *pInfo;
3485 }
3486
3487 bool
operator <(const SFeatKey & rhs) const3488 CCacheImpl::SFeatKey::operator<(
3489 const SFeatKey & rhs) const
3490 {
3491 if( feat_type != rhs.feat_type ) {
3492 return feat_type < rhs.feat_type;
3493 } else if( feat_subtype != rhs.feat_subtype ) {
3494 return feat_subtype < rhs.feat_subtype;
3495 } else {
3496 return bioseq_h < rhs.bioseq_h;
3497 }
3498 }
3499
3500 bool
operator ==(const SFeatKey & rhs) const3501 CCacheImpl::SFeatKey::operator==(
3502 const SFeatKey & rhs) const
3503 {
3504 return (feat_type == rhs.feat_type) &&
3505 (feat_subtype == rhs.feat_subtype) && (bioseq_h == rhs.bioseq_h);
3506 }
3507
3508 const CCacheImpl::TFeatValue &
GetFeatFromCache(const CCacheImpl::SFeatKey & featKey)3509 CCacheImpl::GetFeatFromCache(
3510 const CCacheImpl::SFeatKey & featKey)
3511 {
3512 // check common case where already in the cache
3513 TFeatCache::iterator find_iter = m_featCache.find(featKey);
3514 if( find_iter != m_featCache.end() ) {
3515 return find_iter->second;
3516 }
3517
3518 // check if bioseq already processed, but had no entry requested above
3519 SFeatKey bioseq_check_key(
3520 kAnyFeatType, kAnyFeatSubtype, featKey.bioseq_h );
3521 TFeatCache::const_iterator bioseq_find_iter =
3522 m_featCache.find(bioseq_check_key);
3523 if( bioseq_find_iter != m_featCache.end() ) {
3524 const static TFeatValue kEmptyFeatValue;
3525 // bioseq was already processed,
3526 // it just happened to not have an entry here
3527 return kEmptyFeatValue;
3528 }
3529
3530 // bioseq never added to cache, so calculate that now
3531
3532 // to avoid expensive constructions of CFeat_CI's,
3533 // we iterate through all the seqs on
3534 // the bioseq and load them into the cache.
3535 CFeat_CI feat_ci(featKey.bioseq_h);
3536 for( ; feat_ci; ++feat_ci ) {
3537 SFeatKey inner_feat_key(
3538 feat_ci->GetFeatType(), feat_ci->GetFeatSubtype(), featKey.bioseq_h);
3539
3540 m_featCache[inner_feat_key].push_back(*feat_ci);
3541
3542 // also add "don't care" entries for partial searches
3543 // (e.g. if caller just wants to search on type but not on
3544 // subtype they can set subtype to kAnyFeatSubtype)
3545 SFeatKey any_type_key = inner_feat_key;
3546 any_type_key.feat_type = kAnyFeatType;
3547 m_featCache[any_type_key].push_back(*feat_ci);
3548
3549 SFeatKey any_subtype_key = inner_feat_key;
3550 any_subtype_key.feat_subtype = kAnyFeatSubtype;
3551 m_featCache[any_subtype_key].push_back(*feat_ci);
3552
3553 // for when the caller wants all feats on a bioseq
3554 SFeatKey any_type_or_subtype_key = inner_feat_key;
3555 any_type_or_subtype_key.feat_type = kAnyFeatType;
3556 any_type_or_subtype_key.feat_subtype = kAnyFeatSubtype;
3557 m_featCache[any_type_or_subtype_key].push_back(*feat_ci);
3558 }
3559
3560 // in case a bioseq has no features, we add a dummy key just to
3561 // remember that so we don't use CFeat_CI again on the same bioseq
3562 m_featCache[bioseq_check_key]; // gets default val
3563
3564 return m_featCache[featKey];
3565 }
3566
3567 AutoPtr<CCacheImpl::TFeatValue>
GetFeatFromCacheMulti(const vector<SFeatKey> & featKeys)3568 CCacheImpl::GetFeatFromCacheMulti(
3569 const vector<SFeatKey> &featKeys)
3570 {
3571 if( featKeys.empty() ) {
3572 return new TFeatValue;
3573 }
3574
3575 // all featKeys must have the same bioseq
3576 const CBioseq_Handle & bioseq_h = featKeys[0].bioseq_h;
3577 ITERATE(vector<SFeatKey>, feat_it, featKeys) {
3578 if( feat_it->bioseq_h != bioseq_h ) {
3579 throw runtime_error("GetFeatFromCacheMulti must be called with only 1 bioseq in its args");
3580 }
3581 }
3582
3583 // set prevents dups
3584 set<TFeatValue::value_type> set_of_feats;
3585
3586 // combine the answers from every key into the set
3587 ITERATE(vector<SFeatKey>, key_it, featKeys ) {
3588 const TFeatValue & feat_value = GetFeatFromCache(*key_it);
3589 copy(BEGIN_COMMA_END(feat_value), inserter(
3590 set_of_feats, set_of_feats.begin()));
3591 }
3592
3593 // go through every feature on the bioseq and remember any that match what's in the set
3594 // (The purpose of this step is to return the feats in the same
3595 // order they were on the original bioseq. In the future, we may
3596 // consider adding a flag to avoid sorting for time purposes).
3597 AutoPtr<TFeatValue> answer(new TFeatValue);
3598 SFeatKey all_feats_key(
3599 kAnyFeatType, kAnyFeatSubtype, bioseq_h);
3600 const TFeatValue & all_feats_vec = GetFeatFromCache(all_feats_key);
3601 ITERATE(TFeatValue, feat_it, all_feats_vec) {
3602 if( set_of_feats.find(*feat_it) != set_of_feats.end() ) {
3603 answer->push_back(*feat_it);
3604 }
3605 }
3606
3607 return answer;
3608 }
3609
3610
3611 //LCOV_EXCL_START
3612 //not used
3613 bool
operator <(const SFeatStrKey & rhs) const3614 CCacheImpl::SFeatStrKey::operator<(const SFeatStrKey & rhs) const
3615 {
3616 if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3617 return m_eFeatKeyStr < rhs.m_eFeatKeyStr;
3618 }
3619 if( m_bioseq != rhs.m_bioseq ) {
3620 return m_bioseq < rhs.m_bioseq;
3621 }
3622 return s_QuickStringLess(m_feat_str, rhs.m_feat_str);
3623 }
3624
3625
3626 bool
operator ==(const SFeatStrKey & rhs) const3627 CCacheImpl::SFeatStrKey::operator==(const SFeatStrKey & rhs) const
3628 {
3629 if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3630 return false;
3631 }
3632 if( m_bioseq != rhs.m_bioseq ) {
3633 return false;
3634 }
3635 return (m_feat_str == rhs.m_feat_str);
3636 }
3637
3638
// Returns the features cached under `feat_str_key`, or kEmptyFeatValue when
// there are none.
//
// If m_featStrKeyToFeatsCache is empty it is filled first: every gene feature
// found on the TSE (or, when no TSE is available, on the key's Bioseq) is
// filed under its label and under its locus tag, each both for the Bioseq its
// location resolves to and, when that Bioseq is valid, for kAnyBioseq.
//
// feat_str_key  key to look up (kind of string, Bioseq, string).
// tse_arg       TSE to index; when null, the TSE of the key's Bioseq is used.
//
// Note: the fill only runs while the cache is empty, so a fill that finds no
// genes is repeated on the next call.
const CCacheImpl::TFeatValue &
CCacheImpl::GetFeatStrKeyToFeats(
    const SFeatStrKey & feat_str_key, const CTSE_Handle & tse_arg)
{
    const CBioseq_Handle & search_bsh = feat_str_key.m_bioseq;

    // caller must give us something to work with
    _ASSERT(search_bsh || tse_arg);

    // prefer the TSE passed in; otherwise fall back to the Bioseq's TSE
    const CTSE_Handle & tse = (tse_arg ? tse_arg : search_bsh.GetTSE_Handle());

    // load cache if empty
    if( m_featStrKeyToFeatsCache.empty() ) {
        // (for now just indexes genes, but more may be added in the future)
        SAnnotSelector sel(CSeqFeatData::e_Gene);
        AutoPtr<CFeat_CI> p_gene_ci;
        // if we have TSE, get all features on it; otherwise, just get
        // the features from the bioseq
        if( tse ) {
            p_gene_ci.reset(new CFeat_CI(tse, sel));
        } else {
            p_gene_ci.reset(new CFeat_CI(search_bsh, sel));
        }
        CFeat_CI & gene_ci = *p_gene_ci; // for convenience

        for( ; gene_ci; ++gene_ci ) {
            // NOTE(review): tse.GetScope() is called here even on the path
            // where `tse` tested false above - confirm that path can never
            // yield features, or take the scope from elsewhere.
            CBioseq_Handle bsh = tse.GetScope().GetBioseqHandle(gene_ci->GetLocation());
            string label;
            const CGene_ref & gene_ref = gene_ci->GetData().GetGene();

            // for each one, add an entry for using given Bioseq and the
            // kAnyBioseq (so users can search on any bioseq)
            gene_ref.GetLabel(&label);
            SFeatStrKey label_key(eFeatKeyStr_Label, bsh, label);
            m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
            if( bsh ) {
                label_key.m_bioseq = kAnyBioseq;
                m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
            }

            // a gene without a locus tag is filed under the empty string
            const string & locus_tag = (
                gene_ref.IsSetLocus_tag() ? gene_ref.GetLocus_tag() :
                kEmptyStr);
            SFeatStrKey locus_tag_key(eFeatKeyStr_LocusTag, bsh, locus_tag);
            m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
            if( bsh ) {
                locus_tag_key.m_bioseq = kAnyBioseq;
                m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
            }
        }
    }

    // get from cache, if possible
    TFeatStrKeyToFeatsCache::const_iterator find_iter =
        m_featStrKeyToFeatsCache.find(feat_str_key);
    if( find_iter != m_featStrKeyToFeatsCache.end() ) {
        return find_iter->second;
    } else {
        // nothing found
        return kEmptyFeatValue;
    }
}
3701
3702
3703 const CCacheImpl::TFeatToBioseqValue &
GetBioseqsOfFeatCache(const CCacheImpl::TFeatToBioseqKey & feat_to_bioseq_key,const CTSE_Handle & tse)3704 CCacheImpl::GetBioseqsOfFeatCache(
3705 const CCacheImpl::TFeatToBioseqKey & feat_to_bioseq_key,
3706 const CTSE_Handle & tse)
3707 {
3708 // load cache if empty
3709 if( m_featToBioseqCache.empty() ) {
3710 CBioseq_CI bioseq_ci(tse);
3711 for( ; bioseq_ci; ++bioseq_ci ) {
3712 CFeat_CI feat_ci(*bioseq_ci);
3713 for( ; feat_ci; ++feat_ci ) {
3714 m_featToBioseqCache[*feat_ci].insert(*bioseq_ci);
3715 }
3716 }
3717 }
3718
3719 // we're being given the map to a feature, so we should've loaded
3720 // at least one feature when we loaded the cache
3721 _ASSERT( ! m_featToBioseqCache.empty() );
3722
3723 // load from the cache
3724 TFeatToBioseqCache::const_iterator find_iter =
3725 m_featToBioseqCache.find(feat_to_bioseq_key);
3726 if( find_iter != m_featToBioseqCache.end() ) {
3727 return find_iter->second;
3728 } else {
3729 const static TFeatToBioseqValue kEmptyFeatToBioseqCache;
3730 return kEmptyFeatToBioseqCache;
3731 }
3732 }
3733 //LCOV_EXCL_STOP
3734
3735 const CCacheImpl::TIdToBioseqValue &
GetIdToBioseq(const CCacheImpl::TIdToBioseqKey & key,const CTSE_Handle & tse)3736 CCacheImpl::GetIdToBioseq(
3737 const CCacheImpl::TIdToBioseqKey & key,
3738 const CTSE_Handle & tse)
3739 {
3740 _ASSERT(tse);
3741
3742 // load cache if empty
3743 if( m_IdToBioseqCache.empty() ) {
3744 CBioseq_CI bioseq_ci(tse);
3745 for( ; bioseq_ci; ++bioseq_ci ) {
3746 const CBioseq_Handle::TId & ids = bioseq_ci->GetId();
3747 ITERATE(CBioseq_Handle::TId, id_it, ids) {
3748 m_IdToBioseqCache[id_it->GetSeqId()] = *bioseq_ci;
3749 }
3750 }
3751 }
3752
3753 // there should be at least one Bioseq otherwise there wouldn't
3754 // be anything to validate.
3755 _ASSERT(! m_IdToBioseqCache.empty());
3756
3757 TIdToBioseqCache::const_iterator find_iter = m_IdToBioseqCache.find(key);
3758 if( find_iter != m_IdToBioseqCache.end() ) {
3759 return find_iter->second;
3760 } else {
3761 static const TIdToBioseqValue s_EmptyResult;
3762 return s_EmptyResult;
3763 }
3764 }
3765
3766 CBioseq_Handle
GetBioseqHandleFromLocation(CScope * scope,const CSeq_loc & loc,const CTSE_Handle & tse)3767 CCacheImpl::GetBioseqHandleFromLocation(
3768 CScope *scope, const CSeq_loc& loc, const CTSE_Handle & tse)
3769 {
3770 _ASSERT(scope || tse);
3771 if( ! tse || (!tse.GetTopLevelEntry().IsSet() && !tse.GetTopLevelEntry().IsSeq())) {
3772 // fall back on old style
3773 return BioseqHandleFromLocation(scope, loc);
3774 }
3775
3776
3777 for ( CSeq_loc_CI citer (loc); citer; ++citer) {
3778 CConstRef<CSeq_id> id(&citer.GetSeq_id());
3779 const TIdToBioseqValue & bioseq = GetIdToBioseq(id, tse);
3780 if( bioseq ) {
3781 return bioseq;
3782 }
3783 }
3784
3785 // nothing found, so fall back on old style if possible
3786 if( scope ) {
3787 return BioseqHandleFromLocation(scope, loc);
3788 } else {
3789 return kEmptyBioseqHandle;
3790 }
3791 }
3792
3793
// Empties every cache held by this object: pubdesc info, features per key,
// features per string key, feature-to-Bioseq, and id-to-Bioseq.
// The lookup functions that test for an empty cache will therefore reload
// their cache on the next call.
void CCacheImpl::Clear()
{
    m_pubdescCache.clear();
    m_featCache.clear();
    m_featStrKeyToFeatsCache.clear();
    m_featToBioseqCache.clear();
    m_IdToBioseqCache.clear();
}
3802
3803
3804 END_SCOPE(validator)
3805 END_SCOPE(objects)
3806 END_NCBI_SCOPE
3807