1 /*  $Id: validatorp.cpp 632625 2021-06-03 17:38:33Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat, ....
27  *
28  * File Description:
29  *   Implementation of private parts of the validator
30  *   .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <objmgr/object_manager.hpp>
38 
39 #include <objtools/validator/validatorp.hpp>
40 #include <objtools/validator/validerror_desc.hpp>
41 #include <objtools/validator/validerror_descr.hpp>
42 #include <objtools/validator/validerror_annot.hpp>
43 #include <objtools/validator/validerror_bioseq.hpp>
44 #include <objtools/validator/validerror_bioseqset.hpp>
45 #include <objtools/validator/utilities.hpp>
46 #include <objtools/validator/validator_barcode.hpp>
47 #include <objtools/cleanup/cleanup.hpp>
48 
49 #include <serial/iterator.hpp>
50 #include <serial/enumvalues.hpp>
51 
52 #include <objects/general/Dbtag.hpp>
53 #include <objects/general/Person_id.hpp>
54 #include <objects/general/Name_std.hpp>
55 
56 #include <objects/seqalign/Seq_align.hpp>
57 
58 #include <objects/seqset/Bioseq_set.hpp>
59 #include <objects/seqset/Seq_entry.hpp>
60 
61 #include <objects/seq/Bioseq.hpp>
62 #include <objects/seq/Seq_annot.hpp>
63 #include <objects/seq/Seqdesc.hpp>
64 #include <objects/seq/Seq_descr.hpp>
65 #include <objects/seq/Pubdesc.hpp>
66 #include <objects/seq/MolInfo.hpp>
67 #include <objects/seqfeat/BioSource.hpp>
68 #include <objects/seqfeat/OrgMod.hpp>
69 #include <objects/seqfeat/OrgName.hpp>
70 #include <objects/seqfeat/Org_ref.hpp>
71 #include <objects/seqfeat/Seq_feat.hpp>
72 #include <objects/seqfeat/SubSource.hpp>
73 
74 #include <objects/seqloc/Seq_loc.hpp>
75 #include <objects/seqloc/Seq_interval.hpp>
76 #include <objects/seqloc/Seq_point.hpp>
77 #include <objects/seqloc/Textseq_id.hpp>
78 
79 #include <objects/seqres/Seq_graph.hpp>
80 
81 #include <objects/submit/Seq_submit.hpp>
82 #include <objects/submit/Submit_block.hpp>
83 
84 #include <objmgr/bioseq_ci.hpp>
85 #include <objmgr/seqdesc_ci.hpp>
86 #include <objmgr/graph_ci.hpp>
87 #include <objmgr/seq_annot_ci.hpp>
88 #include <objmgr/util/feature.hpp>
89 #include <objmgr/util/sequence.hpp>
90 
91 #include <objmgr/feat_ci.hpp>
92 #include <objmgr/align_ci.hpp>
93 #include <objmgr/seq_vector.hpp>
94 #include <objmgr/scope.hpp>
95 
96 #include <objects/pub/Pub.hpp>
97 #include <objects/pub/Pub_equiv.hpp>
98 
99 #include <objects/biblio/Author.hpp>
100 #include <objects/biblio/Auth_list.hpp>
101 #include <objects/biblio/Cit_art.hpp>
102 #include <objects/biblio/Cit_book.hpp>
103 #include <objects/biblio/Cit_gen.hpp>
104 #include <objects/biblio/Cit_jour.hpp>
105 #include <objects/biblio/Cit_let.hpp>
106 #include <objects/biblio/Cit_proc.hpp>
107 #include <objects/biblio/Cit_sub.hpp>
108 #include <objects/biblio/PubMedId.hpp>
109 #include <objects/biblio/PubStatus.hpp>
110 #include <objects/biblio/Title.hpp>
111 #include <objects/biblio/Imprint.hpp>
112 #include <objects/biblio/Affil.hpp>
113 #include <objects/misc/sequence_macros.hpp>
114 #include <objects/taxon3/itaxon3.hpp>
115 #include <objects/taxon3/taxon3.hpp>
116 #include <objects/taxon3/Taxon3_reply.hpp>
117 
118 #include <objects/valid/Comment_set.hpp>
119 #include <objects/valid/Comment_rule.hpp>
120 #include <objects/valid/Field_set.hpp>
121 #include <objects/valid/Field_rule.hpp>
122 #include <objects/valid/Dependent_field_set.hpp>
123 #include <objects/valid/Dependent_field_rule.hpp>
124 
125 #include <objtools/error_codes.hpp>
126 #include <objtools/validator/validerror_format.hpp>
127 #include <objtools/validator/utilities.hpp>
128 #include <objtools/edit/seq_entry_edit.hpp>
129 #include <util/sgml_entity.hpp>
130 #include <util/line_reader.hpp>
131 #include <util/util_misc.hpp>
132 #include <util/static_set.hpp>
133 
134 #include <algorithm>
135 
136 
137 #include <serial/iterator.hpp>
138 
139 #define NCBI_USE_ERRCODE_X   Objtools_Validator
140 
141 BEGIN_NCBI_SCOPE
142 BEGIN_SCOPE(objects)
143 BEGIN_SCOPE(validator)
144 using namespace sequence;
145 
146 namespace {
147     // avoid creating a PQuickStringLess for every comparison
148     PQuickStringLess s_QuickStringLess;
149 };
150 
151 
152 // =============================================================================
153 //                            CValidError_imp Public
154 // =============================================================================
155 
156 const CSeqFeatData::E_Choice CCacheImpl::kAnyFeatType =
157     static_cast<CSeqFeatData::E_Choice>(CSeqFeatData::e_not_set - 1);
158 const CSeqFeatData::ESubtype CCacheImpl::kAnyFeatSubtype =
159     static_cast<CSeqFeatData::ESubtype>(CSeqFeatData::eSubtype_bad - 1);
160 const CCacheImpl::TFeatValue CCacheImpl::kEmptyFeatValue;
161 
162 const CBioseq_Handle CCacheImpl::kEmptyBioseqHandle;
163 const CTSE_Handle CCacheImpl::kEmptyTSEHandle;
164 const CBioseq_Handle CCacheImpl::kAnyBioseq;
165 
166 //LCOV_EXCL_START
167 //not used by asnvalidate
168 // Constructor
CValidError_imp(CObjectManager & objmgr,CValidError * errs,Uint4 options)169 CValidError_imp::CValidError_imp
170 (CObjectManager& objmgr,
171 CValidError*     errs,
172 Uint4            options) :
173 m_ObjMgr(&objmgr),
174 m_ErrRepository(errs),
175 m_taxon(NULL)
176 {
177     x_Init(options);
178 }
179 //LCOV_EXCL_STOP
180 
181 // Constructor
CValidError_imp(CObjectManager & objmgr,CValidError * errs,ITaxon3 * taxon,Uint4 options)182 CValidError_imp::CValidError_imp
183 (CObjectManager& objmgr,
184 CValidError*     errs,
185 ITaxon3*         taxon,
186 Uint4            options) :
187 m_ObjMgr(&objmgr),
188 m_ErrRepository(errs),
189 m_taxon(taxon)
190 {
191     x_Init(options);
192 }
193 
194 
x_Init(Uint4 options)195 void CValidError_imp::x_Init(Uint4 options)
196 {
197     SetOptions(options);
198     Reset();
199 
200     if (m_SourceQualTags.get() == 0) {
201         InitializeSourceQualTags();
202     }
203 }
204 
205 // Destructor
~CValidError_imp()206 CValidError_imp::~CValidError_imp()
207 {
208 }
209 
210 
SetOptions(Uint4 options)211 void CValidError_imp::SetOptions(Uint4 options)
212 {
213     m_NonASCII = (options & CValidator::eVal_non_ascii) != 0;
214     m_SuppressContext = (options & CValidator::eVal_no_context) != 0;
215     m_ValidateAlignments = (options & CValidator::eVal_val_align) != 0;
216     m_ValidateExons = (options & CValidator::eVal_val_exons) != 0;
217     m_OvlPepErr = (options & CValidator::eVal_ovl_pep_err) != 0;
218     m_RequireISOJTA = (options & CValidator::eVal_need_isojta) != 0;
219     m_ValidateIdSet = (options & CValidator::eVal_validate_id_set) != 0;
220     m_RemoteFetch = (options & CValidator::eVal_remote_fetch) != 0;
221     m_FarFetchMRNAproducts = (options & CValidator::eVal_far_fetch_mrna_products) != 0;
222     m_FarFetchCDSproducts = (options & CValidator::eVal_far_fetch_cds_products) != 0;
223     m_LocusTagGeneralMatch = (options & CValidator::eVal_locus_tag_general_match) != 0;
224     m_DoRubiscoText = (options & CValidator::eVal_do_rubisco_test) != 0;
225     m_IndexerVersion = (options & CValidator::eVal_indexer_version) != 0;
226     m_UseEntrez = (options & CValidator::eVal_use_entrez) != 0;
227     m_DoTaxLookup = (options & CValidator::eVal_do_tax_lookup) != 0;
228     m_DoBarcodeTests = (options & CValidator::eVal_do_barcode_tests) != 0;
229     m_RefSeqConventions = (options & CValidator::eVal_refseq_conventions) != 0;
230     m_SeqSubmitParent = (options & CValidator::eVal_seqsubmit_parent) != 0;
231     m_ValidateInferenceAccessions = (options & CValidator::eVal_inference_accns) != 0;
232     m_IgnoreExceptions = (options & CValidator::eVal_ignore_exceptions) != 0;
233     m_ReportSpliceAsError = (options & CValidator::eVal_report_splice_as_error) != 0;
234     m_LatLonCheckState = (options & CValidator::eVal_latlon_check_state) != 0;
235     m_LatLonIgnoreWater = (options & CValidator::eVal_latlon_ignore_water) != 0;
236     m_genomeSubmission = (options & CValidator::eVal_genome_submission) != 0;
237     m_CollectLocusTags = (options & CValidator::eVal_collect_locus_tags) != 0;
238     m_GenerateGoldenFile = (options & CValidator::eVal_generate_golden_file) != 0;
239     m_CompareVDJCtoCDS = (options & CValidator::eVal_compare_vdjc_to_cds) != 0;
240 }
241 
242 
243 //LCOV_EXCL_START
244 //not used by asnvalidate
SetErrorRepository(CValidError * errors)245 void CValidError_imp::SetErrorRepository(CValidError* errors)
246 {
247     m_ErrRepository = errors;
248 }
249 //LCOV_EXCL_STOP
250 
251 
Reset(void)252 void CValidError_imp::Reset(void)
253 {
254     m_Scope = 0;
255     m_TSE = 0;
256     m_IsStandaloneAnnot = false;
257     m_SeqAnnot.Reset(NULL);
258     m_NoPubs = false;
259     m_NoCitSubPubs = false;
260     m_NoBioSource = false;
261     m_IsGPS = false;
262     m_IsGED = false;
263     m_IsPDB = false;
264     m_IsPatent = false;
265     m_IsRefSeq = false;
266     m_IsEmbl = false;
267     m_IsDdbj = false;
268     m_IsTPE = false;
269     m_IsNC = false;
270     m_IsNG = false;
271     m_IsNM = false;
272     m_IsNP = false;
273     m_IsNR = false;
274     m_IsNZ = false;
275     m_IsNS  = false;
276     m_IsNT = false;
277     m_IsNW = false;
278     m_IsWP = false;
279     m_IsXR = false;
280     m_IsGI = false;
281     m_IsGB = false;
282     m_IsGpipe = false;
283     m_IsLocalGeneralOnly = true;
284     m_HasGiOrAccnVer = false;
285     m_IsGenomic = false;
286     m_IsSeqSubmit = false;
287     m_IsSmallGenomeSet = false;
288     m_FeatLocHasGI = false;
289     m_ProductLocHasGI = false;
290     m_GeneHasLocusTag = false;
291     m_ProteinHasGeneralID = false;
292     m_IsINSDInSep = false;
293     m_IsGeneious = false;
294     m_PrgCallback = 0;
295     m_NumAlign = 0;
296     m_NumAnnot = 0;
297     m_NumBioseq = 0;
298     m_NumBioseq_set = 0;
299     m_NumTopSetSiblings = 0;
300     m_NumDesc = 0;
301     m_NumDescr = 0;
302     m_NumFeat = 0;
303     m_NumGraph = 0;
304     m_NumMisplacedFeatures = 0;
305     m_NumSmallGenomeSetMisplaced = 0;
306     m_NumMisplacedGraphs = 0;
307     m_NumGenes = 0;
308     m_NumGeneXrefs = 0;
309     m_NumTpaWithHistory = 0;
310     m_NumTpaWithoutHistory = 0;
311     m_NumPseudo = 0;
312     m_NumPseudogene = 0;
313     m_FarFetchFailure = false;
314     m_IsTbl2Asn = false;
315 }
316 
317 
318 // Error post methods
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSerialObject & obj)319 void CValidError_imp::PostErr
320 (EDiagSev sv,
321  EErrType et,
322  const string&  msg,
323  const CSerialObject& obj)
324 {
325     const CTypeInfo* type_info = obj.GetThisTypeInfo();
326     if (type_info == CSeqdesc::GetTypeInfo()) {
327         const CSeqdesc* desc = dynamic_cast < const CSeqdesc* > (&obj);
328         ERR_POST_X(1, Warning << "Seqdesc validation error using default context.");
329         PostErr (sv, et, msg, GetTSE(), *desc);
330     } else if (type_info == CSeq_feat::GetTypeInfo()) {
331         const CSeq_feat* feat = dynamic_cast < const CSeq_feat* > (&obj);
332         PostErr (sv, et, msg, *feat);
333     } else if (type_info == CBioseq::GetTypeInfo()) {
334         const CBioseq* seq = dynamic_cast < const CBioseq* > (&obj);
335         PostErr (sv, et, msg, *seq);
336     } else if (type_info == CBioseq_set::GetTypeInfo()) {
337         const CBioseq_set* set = dynamic_cast < const CBioseq_set* > (&obj);
338         PostErr (sv, et, msg, *set);
339     } else if (type_info == CSeq_annot::GetTypeInfo()) {
340         const CSeq_annot* annot = dynamic_cast < const CSeq_annot* > (&obj);
341         PostErr (sv, et, msg, *annot);
342     } else if (type_info == CSeq_graph::GetTypeInfo()) {
343         const CSeq_graph* graph = dynamic_cast < const CSeq_graph* > (&obj);
344         PostErr (sv, et, msg, *graph);
345     } else if (type_info == CSeq_align::GetTypeInfo()) {
346         const CSeq_align* align = dynamic_cast < const CSeq_align* > (&obj);
347         PostErr (sv, et, msg, *align);
348     } else if (type_info == CSeq_entry::GetTypeInfo()) {
349         const CSeq_entry* entry = dynamic_cast < const CSeq_entry* > (&obj);
350         PostErr (sv, et, msg, *entry);
351     } else if (type_info == CBioSource::GetTypeInfo()) {
352         const CBioSource* src = dynamic_cast < const CBioSource* > (&obj);
353         PostErr (sv, et, msg, *src);
354     } else if (type_info == COrg_ref::GetTypeInfo()) {
355         const COrg_ref* org = dynamic_cast < const COrg_ref* > (&obj);
356         PostErr (sv, et, msg, *org);
357     } else if (type_info == CPubdesc::GetTypeInfo()) {
358         const CPubdesc* pd = dynamic_cast < const CPubdesc* > (&obj);
359         PostErr (sv, et, msg, *pd);
360     } else if (type_info == CSeq_submit::GetTypeInfo()) {
361         const CSeq_submit* ss = dynamic_cast < const CSeq_submit* > (&obj);
362         PostErr (sv, et, msg, *ss);
363     } else {
364         ERR_POST_X(1, Warning << "Unknown data type in PostErr.");
365     }
366 }
367 
368 
369 /*
370 void CValidError_imp::PostErr
371 (EDiagSev       sv,
372  EErrType       et,
373  const string&  msg,
374  TDesc          ds)
375 {
376     // Append Descriptor label
377     string desc = "DESCRIPTOR: ";
378     ds.GetLabel (&desc, CSeqdesc::eBoth);
379     desc += ", NO Descriptor Context";
380     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
381 }
382 */
383 
384 static const EErrType sc_ValidGenomeRaise[] = {
385     eErr_SEQ_INST_ShortSeq,
386     eErr_SEQ_INST_ConflictingBiomolTech,
387     eErr_SEQ_INST_DuplicateSegmentReferences,
388     eErr_SEQ_INST_BadSeqIdFormat,
389     eErr_SEQ_INST_TerminalNs,
390     eErr_SEQ_INST_UnexpectedIdentifierChange,
391     eErr_SEQ_INST_TpaAssemblyProblem,
392     eErr_SEQ_INST_SeqLocLength,
393     eErr_SEQ_INST_CompleteTitleProblem,
394     eErr_SEQ_INST_BadHTGSeq,
395     eErr_SEQ_INST_OverlappingDeltaRange,
396     eErr_SEQ_INST_InternalNsInSeqRaw,
397     eErr_SEQ_INST_FarFetchFailure,
398     eErr_SEQ_INST_InternalGapsInSeqRaw,
399     eErr_SEQ_INST_HighNContentStretch,
400     eErr_SEQ_INST_UnknownLengthGapNot100,
401     eErr_SEQ_INST_CompleteGenomeHasGaps,
402     eErr_SEQ_DESCR_BioSourceMissing,
403     eErr_SEQ_DESCR_InvalidForType,
404     eErr_SEQ_DESCR_InconsistentBioSources,
405     eErr_SEQ_DESCR_BadOrganelleLocation,
406     eErr_SEQ_DESCR_MultipleChromosomes,
407     eErr_SEQ_DESCR_BadOrgMod,
408     eErr_SEQ_DESCR_Inconsistent,
409     eErr_SEQ_DESCR_ObsoleteSourceLocation,
410     eErr_SEQ_DESCR_ObsoleteSourceQual,
411     eErr_SEQ_DESCR_UnwantedCompleteFlag,
412     eErr_SEQ_DESCR_CollidingPublications,
413     eErr_SEQ_DESCR_TransgenicProblem,
414     eErr_SEQ_DESCR_BioSourceInconsistency,
415     eErr_SEQ_DESCR_BadCollectionDate,
416     eErr_SEQ_DESCR_BadPCRPrimerSequence,
417     eErr_SEQ_DESCR_BioSourceOnProtein,
418     eErr_SEQ_DESCR_BioSourceDbTagConflict,
419     eErr_SEQ_DESCR_DuplicatePCRPrimerSequence,
420     eErr_SEQ_DESCR_MultipleNames,
421     eErr_SEQ_DESCR_LatLonRange,
422     eErr_SEQ_DESCR_LatLonValue,
423     eErr_SEQ_DESCR_LatLonCountry,
424     eErr_SEQ_DESCR_BadCollectionCode,
425     eErr_SEQ_DESCR_IncorrectlyFormattedVoucherID,
426     eErr_SEQ_DESCR_MultipleSourceQualifiers,
427     eErr_SEQ_DESCR_IdenticalInstitutionCode,
428     eErr_SEQ_DESCR_WrongVoucherType,
429     eErr_SEQ_DESCR_BadKeyword,
430     eErr_SEQ_DESCR_BioSourceNeedsChromosome,
431     eErr_SEQ_DESCR_MolInfoConflictsWithBioSource,
432     eErr_SEQ_DESCR_TaxonomyIsSpeciesProblem,
433     eErr_SEQ_DESCR_BadAltitude,
434     eErr_SEQ_DESCR_DBLinkMissingUserObject,
435     eErr_GENERIC_UnnecessaryPubEquiv,
436     eErr_GENERIC_CollidingSerialNumbers,
437     eErr_GENERIC_PublicationInconsistency,
438     eErr_GENERIC_SgmlPresentInText,
439     eErr_GENERIC_MissingPubRequirement,
440     eErr_SEQ_PKG_EmptySet,
441     eErr_SEQ_PKG_FeaturePackagingProblem,
442     eErr_SEQ_PKG_GenomicProductPackagingProblem,
443     eErr_SEQ_PKG_ArchaicFeatureLocation,
444     eErr_SEQ_PKG_ArchaicFeatureProduct,
445     eErr_SEQ_PKG_InternalGenBankSet,
446     eErr_SEQ_PKG_BioseqSetClassNotSet,
447     eErr_SEQ_PKG_MissingSetTitle,
448     eErr_SEQ_PKG_NucProtSetHasTitle,
449     eErr_SEQ_PKG_ComponentMissingTitle,
450     eErr_SEQ_PKG_SingleItemSet,
451     eErr_SEQ_PKG_MisplacedMolInfo,
452     eErr_SEQ_PKG_ImproperlyNestedSets,
453     eErr_SEQ_PKG_SeqSubmitWithWgsSet,
454     eErr_SEQ_PKG_InconsistentMoltypeSet,
455     eErr_SEQ_FEAT_Range,
456     eErr_SEQ_FEAT_MixedStrand,
457     eErr_SEQ_FEAT_SeqLocOrder,
458     eErr_SEQ_FEAT_TransLen,
459     eErr_SEQ_FEAT_TranslExcept,
460     eErr_SEQ_FEAT_OrfCdsHasProduct,
461     eErr_SEQ_FEAT_GeneRefHasNoData,
462     eErr_SEQ_FEAT_ProtRefHasNoData,
463     eErr_SEQ_FEAT_RNAtype0,
464     eErr_SEQ_FEAT_UnknownImpFeatKey,
465     eErr_SEQ_FEAT_UnknownImpFeatQual,
466     eErr_SEQ_FEAT_WrongQualOnImpFeat,
467     eErr_SEQ_FEAT_MissingQualOnImpFeat,
468     eErr_SEQ_FEAT_IllegalDbXref,
469     eErr_SEQ_FEAT_FarLocation,
470     eErr_SEQ_FEAT_TranslExceptPhase,
471     eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
472     eErr_SEQ_FEAT_InvalidQualifierValue,
473     eErr_SEQ_FEAT_CDSproductPackagingProblem,
474     eErr_SEQ_FEAT_DuplicateExonInterval,
475     eErr_SEQ_FEAT_DuplicateAnticodonInterval,
476     eErr_SEQ_FEAT_AbuttingIntervals,
477     eErr_SEQ_FEAT_MissingCDSproduct,
478     eErr_SEQ_FEAT_OnlyGeneXrefs,
479     eErr_SEQ_FEAT_UTRdoesNotAbutCDS,
480     eErr_SEQ_FEAT_ConflictFlagSet,
481     eErr_SEQ_FEAT_LocusTagProblem,
482     eErr_SEQ_FEAT_GenesInconsistent,
483     eErr_SEQ_FEAT_TranslExceptAndRnaEditing,
484     eErr_SEQ_FEAT_NoNameForProtein,
485     eErr_SEQ_FEAT_MissingGeneXref,
486     eErr_SEQ_FEAT_FeatureCitationProblem,
487     eErr_SEQ_FEAT_WrongQualOnFeature,
488     eErr_SEQ_FEAT_UnknownFeatureQual,
489     eErr_SEQ_FEAT_BadCharInAuthorName,
490     eErr_SEQ_FEAT_CDSwithMultipleMRNAs,
491     eErr_SEQ_FEAT_MultipleEquivBioSources,
492     eErr_SEQ_FEAT_MultipleEquivPublications,
493     eErr_SEQ_FEAT_BadFullLengthFeature,
494     eErr_SEQ_FEAT_RedundantFields,
495     eErr_SEQ_FEAT_CDSwithNoMRNAOverlap,
496     eErr_SEQ_FEAT_FeatureProductInconsistency,
497     eErr_SEQ_FEAT_ImproperBondLocation,
498     eErr_SEQ_FEAT_GeneXrefWithoutGene,
499     eErr_SEQ_FEAT_MissingTrnaAA,
500     eErr_SEQ_FEAT_OldLocusTagMismtach,
501     eErr_SEQ_FEAT_InvalidInferenceValue,
502     eErr_SEQ_FEAT_HypotheticalProteinMismatch,
503     eErr_SEQ_FEAT_WholeLocation,
504     eErr_SEQ_FEAT_BadEcNumberFormat,
505     eErr_SEQ_FEAT_EcNumberProblem,
506     eErr_SEQ_FEAT_VectorContamination,
507     eErr_SEQ_FEAT_MinusStrandProtein,
508     eErr_SEQ_FEAT_BadProteinName,
509     eErr_SEQ_FEAT_GeneXrefWithoutLocus,
510     eErr_SEQ_FEAT_CDShasTooManyXs,
511     eErr_SEQ_FEAT_TerminalXDiscrepancy,
512     eErr_SEQ_FEAT_UnnecessaryTranslExcept,
513     eErr_SEQ_FEAT_FeatureInsideGap,
514     eErr_SEQ_FEAT_BadAnticodonAA,
515     eErr_SEQ_FEAT_BadAnticodonCodon,
516     eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap,
517     eErr_SEQ_FEAT_GeneOntologyTermMissingGOID,
518     eErr_SEQ_FEAT_PseudoRnaHasProduct,
519     eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct,
520     eErr_SEQ_FEAT_BadRRNAcomponentOrder,
521     eErr_SEQ_FEAT_BadRRNAcomponentOverlap,
522     eErr_SEQ_FEAT_MultipleProtRefs,
523     eErr_SEQ_FEAT_BadInternalCharacter,
524     eErr_SEQ_FEAT_BadTrailingCharacter,
525     eErr_SEQ_FEAT_BadTrailingHyphen,
526     eErr_SEQ_FEAT_BadCharInAuthorLastName,
527     eErr_SEQ_FEAT_GeneXrefNeeded,
528     eErr_SEQ_FEAT_ProteinNameHasPMID,
529     eErr_SEQ_FEAT_BadGeneOntologyFormat,
530     eErr_SEQ_FEAT_InconsistentGeneOntologyTermAndId,
531     eErr_SEQ_FEAT_ShortIntron,
532     eErr_SEQ_FEAT_GeneXrefStrandProblem,
533     eErr_SEQ_FEAT_CDSmRNAXrefLocationProblem,
534     eErr_SEQ_FEAT_LocusCollidesWithLocusTag,
535     eErr_SEQ_FEAT_RptUnitRangeProblem,
536     eErr_SEQ_FEAT_InconsistentRRNAstrands,
537     eErr_SEQ_FEAT_CDSrange,
538     eErr_SEQ_GRAPH_GraphAbove,
539     eErr_SEQ_GRAPH_GraphOutOfOrder,
540     eErr_SEQ_GRAPH_GraphSeqLocLen,
541     eErr_SEQ_GRAPH_GraphBioseqId
542 };
543 
544 DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<EErrType>, sc_GenomeRaiseArray, sc_ValidGenomeRaise);
545 
546 static const EErrType sc_ValidGenomeRaiseExceptEmblDdbj[] = {
547     eErr_SEQ_INST_CompleteTitleProblem,
548     eErr_SEQ_INST_CompleteGenomeHasGaps,
549     eErr_SEQ_FEAT_MiscFeatureNeedsNote,
550     eErr_SEQ_FEAT_RepeatRegionNeedsNote
551 };
552 
553 DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<EErrType>, sc_GenomeRaiseExceptEmblDdbjArray, sc_ValidGenomeRaiseExceptEmblDdbj);
554 
555 
556 static const EErrType sc_ValidGenomeRaiseExceptEmblDdbjRefSeq[] = {
557     eErr_SEQ_DESCR_BadInstitutionCode
558 };
559 
560 DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<EErrType>, sc_GenomeRaiseExceptEmblDdbjRefSeqArray, sc_ValidGenomeRaiseExceptEmblDdbjRefSeq);
561 
562 
RaiseGenomeSeverity(EErrType et)563 bool CValidError_imp::RaiseGenomeSeverity(
564     EErrType et
565 )
566 
567 {
568     if (sc_GenomeRaiseExceptEmblDdbjRefSeqArray.find(et) != sc_GenomeRaiseExceptEmblDdbjRefSeqArray.end()) {
569         if (IsEmbl() || IsDdbj() || IsRefSeq()) {
570             return false;
571         } else {
572             return true;
573         }
574     }
575     if (sc_GenomeRaiseExceptEmblDdbjArray.find(et) != sc_GenomeRaiseExceptEmblDdbjArray.end()) {
576         if (IsEmbl() || IsDdbj()) {
577             return false;
578         } else {
579             return true;
580         }
581     }
582     if (sc_GenomeRaiseArray.find (et) != sc_GenomeRaiseArray.end()) {
583         return true;
584     }
585     return false;
586 }
587 
PostErr(EDiagSev sv,EErrType et,const string & msg,TFeat ft)588 void CValidError_imp::PostErr
589 (EDiagSev       sv,
590  EErrType       et,
591  const string&  msg,
592  TFeat          ft)
593 {
594     CRef<CValidErrItem> item(new CValidErrItem());
595 
596     // Adjust severity
597     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
598         sv = eDiag_Error;
599     }
600 
601     item->SetSev(sv);
602     item->SetErrIndex(et);
603     item->SetMsg(msg);
604     item->SetObject(ft);
605 
606     if (GenerateGoldenFile()) {
607         m_ErrRepository->AddValidErrItem(item);
608         return;
609     }
610 
611     string content_label = CValidErrorFormat::GetFeatureContentLabel(ft, m_Scope);
612     item->SetObj_content(content_label);
613 
614     string feature_id = CValidErrorFormat::GetFeatureIdLabel(ft);
615     if (!NStr::IsBlank(feature_id)) {
616         item->SetFeatureId(feature_id);
617     }
618 
619     string bioseq_label = CValidErrorFormat::GetFeatureBioseqLabel(ft, m_Scope, m_SuppressContext);
620     if (!NStr::IsBlank(bioseq_label)) {
621         item->SetBioseq(bioseq_label);
622     }
623 
624     // Calculate sequence offset
625     TSeqPos offset = 0;
626     string location;
627     if (ft.IsSetLocation()) {
628         offset = ft.GetLocation().GetStart(eExtreme_Positional);
629         string loc_label = CValidErrorFormat::GetFeatureLocationLabel(ft, m_Scope, m_SuppressContext);
630         if (!NStr::IsBlank(loc_label)) {
631             item->SetLocation(loc_label);
632         }
633         item->SetSeqOffset(offset);
634     }
635 
636 
637     string product_label = CValidErrorFormat::GetFeatureProductLocLabel(ft, m_Scope, m_SuppressContext);
638     if (!NStr::IsBlank(product_label)) {
639         item->SetProduct_loc(product_label);
640     }
641 
642     int version = 0;
643     string accession;
644     if (m_Scope) {
645         accession = GetAccessionFromObjects(&ft, NULL, *m_Scope, &version);
646     }
647     item->SetAccession(accession);
648     if (version > 0) {
649         item->SetAccnver(accession + "." + NStr::IntToString(version));
650         item->SetVersion(version);
651     } else {
652         item->SetAccnver(accession);
653     }
654 
655     if (ft.IsSetData()) {
656         if (ft.GetData().IsGene()) {
657             if (ft.GetData().GetGene().IsSetLocus_tag() &&
658                 !NStr::IsBlank(ft.GetData().GetGene().GetLocus_tag())) {
659                 item->SetLocus_tag(ft.GetData().GetGene().GetLocus_tag());
660             }
661         } else {
662             if (m_CollectLocusTags) {
663                 // TODO: this should be part of post-processing
664                 CConstRef<CSeq_feat> gene = GetGeneCache().GetGeneFromCache(&ft, *m_Scope);
665                 if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
666                     !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
667                     item->SetLocus_tag(gene->GetData().GetGene().GetLocus_tag());
668                 }
669             }
670         }
671     }
672 
673     item->SetFeatureObjDescFromFields();
674     m_ErrRepository->AddValidErrItem(item);
675 }
676 
677 
PostErr(EDiagSev sv,EErrType et,const string & msg,TBioseq sq)678 void CValidError_imp::PostErr
679 (EDiagSev       sv,
680  EErrType       et,
681  const string&  msg,
682  TBioseq        sq)
683 {
684     // Adjust severity
685     if (m_genomeSubmission && sv < eDiag_Error && RaiseGenomeSeverity(et)) {
686         sv = eDiag_Error;
687     }
688 
689     if (GenerateGoldenFile()) {
690         m_ErrRepository->AddValidErrItem(sv, et, msg);
691         return;
692     }
693 
694     // Append bioseq label
695     string desc;
696     AppendBioseqLabel(desc, sq, m_SuppressContext);
697     int version = 0;
698     const string& accession = GetAccessionFromObjects(&sq, NULL, *m_Scope, &version);
699     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, sq, accession, version);
700 }
701 
702 
PostErr(EDiagSev sv,EErrType et,const string & msg,TSet st)703 void CValidError_imp::PostErr
704 (EDiagSev      sv,
705  EErrType      et,
706  const string& msg,
707  TSet          st)
708 {
709     // Adjust severity
710     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
711         sv = eDiag_Error;
712     }
713 
714     if (GenerateGoldenFile()) {
715         m_ErrRepository->AddValidErrItem(sv, et, msg);
716         return;
717     }
718 
719     // Append Bioseq_set label
720     int version = 0;
721     const string& accession = GetAccessionFromObjects(&st, NULL, *m_Scope, &version);
722     string desc = CValidErrorFormat::GetBioseqSetLabel(st, m_Scope, m_SuppressContext);
723     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, st, accession, version);
724 }
725 
726 
PostErr(EDiagSev sv,EErrType et,const string & msg,TEntry ctx,TDesc ds)727 void CValidError_imp::PostErr
728 (EDiagSev       sv,
729  EErrType       et,
730  const string&  msg,
731  TEntry         ctx,
732  TDesc          ds)
733 {
734     // Adjust severity
735     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
736         sv = eDiag_Error;
737     }
738 
739     if (GenerateGoldenFile()) {
740         m_ErrRepository->AddValidErrItem(sv, et, msg);
741         return;
742     }
743 
744     // Append Descriptor label
745     string desc = CValidErrorFormat::GetDescriptorLabel(ds, ctx, m_Scope, m_SuppressContext);
746     int version = 0;
747     const string& accession = GetAccessionFromObjects(&ds, &ctx, *m_Scope, &version);
748     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, ctx, accession, version);
749 }
750 
751 
752 //void CValidError_imp::PostErr
753 //(EDiagSev       sv,
754 // EErrType       et,
755 // const string&  msg,
756 // TBioseq        sq,
757 // TDesc          ds)
758 //{
759 //    // Append Descriptor label
760 //    string desc("DESCRIPTOR: ");
761 //    ds.GetLabel(&desc, CSeqdesc::eBoth);
762 //
763 //    s_AppendBioseqLabel(desc, sq, m_SuppressContext);
764 //    m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
765 //    //PostErr(sv, et, msg, sq);
766 //}
767 
768 
769 //void CValidError_imp::PostErr
770 //(EDiagSev        sv,
771 // EErrType        et,
772 // const string&   msg,
773 // TSet            st,
774 // TDesc           ds)
775 //{
776 //    // Append Descriptor label
777 //    string desc =  " DESCRIPTOR: ";
778 //    ds.GetLabel(&desc, CSeqdesc::eBoth);
779 //    s_AppendSetLabel(desc, st, m_SuppressContext);
780 //    m_ErrRepository->AddValidErrItem(sv, et, msg, desc, st, *m_Scope);
781 //
782 //}
783 
784 
PostErr(EDiagSev sv,EErrType et,const string & msg,TAnnot an)785 void CValidError_imp::PostErr
786 (EDiagSev       sv,
787  EErrType       et,
788  const string&  msg,
789  TAnnot         an)
790 {
791     // Adjust severity
792     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
793         sv = eDiag_Error;
794     }
795 
796     if (GenerateGoldenFile()) {
797         m_ErrRepository->AddValidErrItem(sv, et, msg);
798         return;
799     }
800 
801     // Append Annotation label
802     string desc = "ANNOTATION: ";
803 
804     // !!! need to decide on the message
805 
806     int version = 0;
807     const string& accession = GetAccessionFromObjects(&an, NULL, *m_Scope, &version);
808     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, an, accession, version);
809 }
810 
811 
PostErr(EDiagSev sv,EErrType et,const string & msg,TGraph graph)812 void CValidError_imp::PostErr
813 (EDiagSev       sv,
814  EErrType       et,
815  const string&  msg,
816  TGraph         graph)
817 {
818     // Adjust severity
819     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
820         sv = eDiag_Error;
821     }
822 
823     if (GenerateGoldenFile()) {
824         m_ErrRepository->AddValidErrItem(sv, et, msg);
825         return;
826     }
827 
828     // Append Graph label
829     string desc = "GRAPH: ";
830     if (graph.IsSetTitle()) {
831         desc += graph.GetTitle();
832     } else {
833         desc += "<Unnamed>";
834     }
835     desc += " ";
836     graph.GetLoc().GetLabel(&desc);
837 
838     int version = 0;
839     const string& accession = GetAccessionFromObjects(&graph, NULL, *m_Scope, &version);
840     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, graph, accession, version);
841 }
842 
843 
PostErr(EDiagSev sv,EErrType et,const string & msg,TBioseq sq,TGraph graph)844 void CValidError_imp::PostErr
845 (EDiagSev       sv,
846  EErrType       et,
847  const string&  msg,
848  TBioseq        sq,
849  TGraph         graph)
850 {
851     // Adjust severity
852     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
853         sv = eDiag_Error;
854     }
855 
856     if (GenerateGoldenFile()) {
857         m_ErrRepository->AddValidErrItem(sv, et, msg);
858         return;
859     }
860 
861     // Append Graph label
862     string desc("GRAPH: ");
863     if ( graph.IsSetTitle() ) {
864         desc += graph.GetTitle();
865     } else {
866         desc += "<Unnamed>";
867     }
868     desc += " ";
869     graph.GetLoc().GetLabel(&desc);
870     AppendBioseqLabel(desc, sq, m_SuppressContext);
871     int version = 0;
872     const string& accession = GetAccessionFromObjects(&graph, NULL, *m_Scope, &version);
873     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, graph, accession, version);
874 }
875 
876 
PostErr(EDiagSev sv,EErrType et,const string & msg,TAlign align)877 void CValidError_imp::PostErr
878 (EDiagSev      sv,
879  EErrType      et,
880  const string& msg,
881  TAlign        align)
882 {
883     // Adjust severity
884     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
885         sv = eDiag_Error;
886     }
887 
888     if (GenerateGoldenFile()) {
889         m_ErrRepository->AddValidErrItem(sv, et, msg);
890         return;
891     }
892 
893     CConstRef<CSeq_id> id = GetReportableSeqIdForAlignment(align, *m_Scope);
894     if (id) {
895         CBioseq_Handle bsh = m_Scope->GetBioseqHandle(*id);
896         if (bsh) {
897             PostErr(sv, et, msg, *(bsh.GetCompleteBioseq()));
898             return;
899         }
900     }
901 
902     // Can't get bioseq for reporting, use other Alignment label
903     string desc = "ALIGNMENT: ";
904     if (align.IsSetType()) {
905         desc += align.ENUM_METHOD_NAME(EType)()->FindName(align.GetType(), true);
906     }
907     try {
908         CSeq_align::TDim dim = align.GetDim();
909         desc += ", dim=" + NStr::NumericToString(dim);
910     } catch ( const CUnassignedMember ) {
911         desc += ", dim=UNASSIGNED";
912     }
913 
914     if (align.IsSetSegs()) {
915         desc += " SEGS: ";
916         desc += align.GetSegs().SelectionName(align.GetSegs().Which());
917     }
918 
919     int version = 0;
920     const string& accession = GetAccessionFromObjects(&align, NULL, *m_Scope, &version);
921     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, align, accession, version);
922 }
923 
924 
PostErr(EDiagSev sv,EErrType et,const string & msg,TEntry entry)925 void CValidError_imp::PostErr
926 (EDiagSev      sv,
927  EErrType      et,
928  const string& msg,
929  TEntry        entry)
930 {
931     // Adjust severity
932     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
933         sv = eDiag_Error;
934     }
935 
936     if (GenerateGoldenFile()) {
937         m_ErrRepository->AddValidErrItem(sv, et, msg);
938         return;
939     }
940 
941     if (entry.IsSeq()) {
942         PostErr(sv, et, msg, entry.GetSeq());
943     } else if (entry.IsSet()) {
944         PostErr(sv, et, msg, entry.GetSet());
945     } else {
946         string desc = "SEQ-ENTRY: ";
947         entry.GetLabel(&desc, CSeq_entry::eContent);
948 
949         int version = 0;
950         const string& accession = GetAccessionFromObjects(&entry, NULL, *m_Scope, &version);
951         m_ErrRepository->AddValidErrItem(sv, et, msg, desc, entry, accession, version);
952     }
953 }
954 
955 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CBioSource & src)956 void CValidError_imp::PostErr
957 (EDiagSev      sv,
958  EErrType      et,
959  const string& msg,
960  const CBioSource& src)
961 {
962     // Adjust severity
963     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
964         sv = eDiag_Error;
965     }
966 
967     if (GenerateGoldenFile()) {
968         m_ErrRepository->AddValidErrItem(sv, et, msg);
969         return;
970     }
971 
972     string desc = "BioSource: ";
973     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, src, "", 0);
974 }
975 
976 
PostErr(EDiagSev sv,EErrType et,const string & msg,const COrg_ref & org)977 void CValidError_imp::PostErr
978 (EDiagSev      sv,
979  EErrType      et,
980  const string& msg,
981  const COrg_ref& org)
982 {
983     // Adjust severity
984     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
985         sv = eDiag_Error;
986     }
987 
988     if (GenerateGoldenFile()) {
989         m_ErrRepository->AddValidErrItem(sv, et, msg);
990         return;
991     }
992 
993     string desc = "Org-ref: ";
994     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, org, "", 0);
995 }
996 
997 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CPubdesc & pd)998 void CValidError_imp::PostErr
999 (EDiagSev      sv,
1000  EErrType      et,
1001  const string& msg,
1002  const CPubdesc& pd)
1003 {
1004     // Adjust severity
1005     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1006         sv = eDiag_Error;
1007     }
1008 
1009     if (GenerateGoldenFile()) {
1010         m_ErrRepository->AddValidErrItem(sv, et, msg);
1011         return;
1012     }
1013 
1014     string desc = "Pubdesc: ";
1015     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, pd, "", 0);
1016 }
1017 
1018 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_submit & ss)1019 void CValidError_imp::PostErr
1020 (EDiagSev      sv,
1021  EErrType      et,
1022  const string& msg,
1023  const CSeq_submit& ss)
1024 {
1025     // Adjust severity
1026     if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1027         sv = eDiag_Error;
1028     }
1029 
1030     if (GenerateGoldenFile()) {
1031         m_ErrRepository->AddValidErrItem(sv, et, msg);
1032         return;
1033     }
1034 
1035     string desc = "Seq-submit: ";
1036     m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ss, "", 0);
1037 }
1038 
1039 
PostObjErr(EDiagSev sv,EErrType et,const string & msg,const CSerialObject & obj,const CSeq_entry * ctx)1040 void CValidError_imp::PostObjErr
1041 (EDiagSev sv,
1042  EErrType et,
1043  const string&  msg,
1044  const CSerialObject& obj,
1045  const CSeq_entry *ctx)
1046 {
1047     if (ctx == 0) {
1048         PostErr (sv, et, msg, obj);
1049     } else if (obj.GetThisTypeInfo() == CSeqdesc::GetTypeInfo()) {
1050         PostErr(sv, et, msg, *ctx, *(dynamic_cast <const CSeqdesc*> (&obj)));
1051     } else {
1052         PostErr(sv, et, msg, obj);
1053     }
1054 
1055 }
1056 
1057 
PostBadDateError(EDiagSev sv,const string & msg,int flags,const CSerialObject & obj,const CSeq_entry * ctx)1058 void CValidError_imp::PostBadDateError
1059 (EDiagSev             sv,
1060  const string&        msg,
1061  int                  flags,
1062  const CSerialObject& obj,
1063  const CSeq_entry *ctx)
1064 {
1065     string reasons = GetDateErrorDescription(flags);
1066 
1067     NStr::TruncateSpacesInPlace (reasons);
1068     reasons = msg + " - " + reasons;
1069 
1070     PostObjErr (sv, eErr_GENERIC_BadDate, reasons, obj, ctx);
1071 }
1072 
1073 
Validate(const CSeq_entry & se,const CCit_sub * cs,CScope * scope)1074 bool CValidError_imp::Validate
1075 (const CSeq_entry& se,
1076  const CCit_sub* cs,
1077  CScope* scope)
1078 {
1079     CSeq_entry_Handle seh;
1080     try {
1081         seh = scope->GetSeq_entryHandle(se);
1082     } catch (const CException ) { ; }
1083     if (! seh) {
1084         seh = scope->AddTopLevelSeqEntry(se);
1085         if (!seh) {
1086             return false;
1087         }
1088     }
1089 
1090     return Validate(seh, cs);
1091 }
1092 
s_IsPhage(const COrg_ref & org)1093 static bool s_IsPhage(const COrg_ref& org)
1094 {
1095     if (org.IsSetDivision() && NStr::Equal(org.GetDivision(), "PHG")) {
1096         return true;
1097     } else {
1098         return false;
1099     }
1100 }
1101 
1102 
ValidateMultipleTaxIds(const CSeq_entry_Handle & seh)1103 void CValidError_imp::ValidateMultipleTaxIds(const CSeq_entry_Handle& seh)
1104 {
1105     bool has_mult = false;
1106     int first_id = 0;
1107     int phage_id = 0;
1108 
1109     for (CBioseq_CI bi(seh); bi; ++bi) {
1110         for (CSeqdesc_CI desc_ci(*bi, CSeqdesc::e_Source);
1111             desc_ci && !has_mult;
1112             ++desc_ci) {
1113             if (desc_ci->GetSource().IsSetOrg()) {
1114                 const COrg_ref& org = desc_ci->GetSource().GetOrg();
1115                 if (org.IsSetDb()) {
1116                     ITERATE(COrg_ref::TDb, it, org.GetDb()) {
1117                         if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "taxon") &&
1118                             (*it)->IsSetTag() && (*it)->GetTag().IsId()) {
1119                             int this_id = (*it)->GetTag().GetId();
1120                             if (this_id > 0) {
1121                                 if (s_IsPhage(org)) {
1122                                     phage_id = this_id;
1123                                 } else if (first_id == 0) {
1124                                     first_id = this_id;
1125                                 } else if (first_id != this_id) {
1126                                     has_mult = true;
1127                                 }
1128                             }
1129                         }
1130                     }
1131                 }
1132             }
1133         }
1134     }
1135     if (has_mult || (phage_id > 0 && first_id > 0)) {
1136         PostErr(has_mult ? eDiag_Error : eDiag_Warning, eErr_SEQ_DESCR_MultipleTaxonIDs,
1137             "There are multiple taxonIDs in this RefSeq record.",
1138             *m_TSE);
1139     }
1140 }
1141 
1142 
Validate(const CSeq_entry_Handle & seh,const CCit_sub * cs)1143 bool CValidError_imp::Validate
1144 (const CSeq_entry_Handle& seh,
1145  const CCit_sub* cs)
1146 {
1147     _ASSERT(seh);
1148 
1149     if ( m_PrgCallback ) {
1150         m_PrgInfo.m_State = CValidator::CProgressInfo::eState_Initializing;
1151         if ( m_PrgCallback(&m_PrgInfo) ) {
1152             return false;
1153         }
1154     }
1155 
1156     // Check that CSeq_entry has data
1157     if (seh.Which() == CSeq_entry::e_not_set) {
1158         ERR_POST_X(2, Warning << "Seq_entry not set");
1159         return false;
1160     }
1161 
1162     Setup(seh);
1163 
1164     // Seq-submit has submission citationTest_Descr_LatLonValue
1165     if (cs) {
1166         m_NoPubs = false;
1167         m_IsSeqSubmit = true;
1168     }
1169 
1170     // Get first CBioseq object pointer for PostErr below.
1171     CTypeConstIterator<CBioseq> seq(ConstBegin(*m_TSE));
1172     if (!seq) {
1173         PostErr(eDiag_Error, eErr_SEQ_PKG_NoBioseqFound,
1174                   "No Bioseqs in this entire record.", seh.GetCompleteSeq_entry()->GetSet());
1175         return true;
1176     }
1177 
1178     // If m_NonASCII is true, then this flag was set by the caller
1179     // of validate to indicate that a non ascii character had been
1180     // read from a file being used to create a CSeq_entry, that the
1181     // error had been corrected, but that the error needs to be reported
1182     // by Validate. Note, Validate is not doing anything other than
1183     // reporting an error if m_NonASCII is true;
1184     if (m_NonASCII) {
1185         PostErr(eDiag_Fatal, eErr_GENERIC_NonAsciiAsn,
1186                   "Non-ascii chars in input ASN.1 strings", *seq);
1187         // Only report the error once
1188         m_NonASCII = false;
1189     }
1190 
1191     // Iterate thru components of record and validate each
1192 
1193     // also want to know if we have gi
1194     bool has_gi = false;
1195     // also want to know if there are any nucleotide sequences
1196     bool has_nucleotide_sequence = false;
1197 
1198     for (CBioseq_CI bi(GetTSEH(), CSeq_inst::eMol_not_set, CBioseq_CI::eLevel_All);
1199          bi && (!m_IsINSDInSep || !has_gi || !has_nucleotide_sequence);
1200          ++bi) {
1201         FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1202             if ((*it)->IsGi()) {
1203                 has_gi = true;
1204             }
1205         }
1206         if (bi->IsSetInst_Mol() && bi->IsNa()) {
1207             has_nucleotide_sequence = true;
1208         }
1209     }
1210 
1211     if (m_IsINSDInSep && m_IsRefSeq) {
1212         // NOTE: We use m_IsRefSeq to indicate the actual presence of RefSeq IDs in
1213         // the record, rather than IsRefSeq(), which indicates *either* RefSeq IDs are
1214         // present *OR* the refseq flag has been used
1215         PostErr (eDiag_Error, eErr_SEQ_PKG_INSDRefSeqPackaging,
1216                  "INSD and RefSeq records should not be present in the same set", *m_TSE);
1217     }
1218 
1219 #if 0
1220     // disabled for now
1221     // look for long IDs that would collide if truncated at 30 characters
1222     vector<string> id_strings;
1223     for (CBioseq_CI bi(GetTSEH(), CSeq_inst::eMol_not_set, CBioseq_CI::eLevel_All);
1224          bi;
1225          ++bi) {
1226         FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1227             if (!IsNCBIFILESeqId(**it)) {
1228                 string label;
1229                 (*it)->GetLabel(&label);
1230                 id_strings.push_back(label);
1231             }
1232         }
1233     }
1234     stable_sort (id_strings.begin(), id_strings.end());
1235     for (vector<string>::iterator id_str_it = id_strings.begin();
1236          id_str_it != id_strings.end();
1237          ++id_str_it) {
1238         string pattern = (*id_str_it).substr(0, 30);
1239         string first_id = *id_str_it;
1240         vector<string>::iterator cmp_it = id_str_it;
1241         ++cmp_it;
1242         while (cmp_it != id_strings.end() && NStr::StartsWith(*cmp_it, pattern)) {
1243             CRef<CSeq_id> id(new CSeq_id(*cmp_it));
1244             CBioseq_Handle bsh = m_Scope->GetBioseqHandle(*id);
1245             PostErr (eDiag_Warning, eErr_SEQ_INST_BadSeqIdFormat,
1246                      "First 30 characters of " + first_id + " and " +
1247                      *cmp_it + " are identical", *(bsh.GetCompleteBioseq()));
1248             ++id_str_it;
1249             ++cmp_it;
1250         }
1251     }
1252 #endif
1253 
1254     // look for colliding feature IDs
1255     vector < int > feature_ids;
1256     for (CFeat_CI fi(GetTSEH()); fi; ++fi) {
1257         const CSeq_feat& sf = fi->GetOriginalFeature();
1258         if (sf.IsSetId() && sf.GetId().IsLocal() && sf.GetId().GetLocal().IsId()) {
1259             feature_ids.push_back(sf.GetId().GetLocal().GetId());
1260         }
1261     }
1262 
1263     if (feature_ids.size() > 0) {
1264         const CTSE_Handle& tse = seh.GetTSE_Handle ();
1265         stable_sort (feature_ids.begin(), feature_ids.end());
1266         vector <int>::iterator it = feature_ids.begin();
1267         int id = *it;
1268         ++it;
1269         while (it != feature_ids.end()) {
1270             if (*it == id) {
1271                 vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, id);
1272                 ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1273                     PostErr (eDiag_Critical, eErr_SEQ_FEAT_CollidingFeatureIDs,
1274                              "Colliding feature ID " + NStr::NumericToString (id), *(feat_it->GetSeq_feat()));
1275                 }
1276                 while (it != feature_ids.end() && *it == id) {
1277                     ++it;
1278                 }
1279                 if (it != feature_ids.end()) {
1280                     id = *it;
1281                     ++it;
1282                 }
1283             } else {
1284                 id = *it;
1285                 ++it;
1286             }
1287         }
1288     }
1289 
1290     // look for mixed gps and non-gps sets
1291     bool has_nongps = false;
1292     bool has_gps = false;
1293 
1294     for (CTypeConstIterator<CBioseq_set> si(*m_TSE); si && (!has_nongps || !has_gps); ++si) {
1295         if (si->IsSetClass()) {
1296             if (si->GetClass() == CBioseq_set::eClass_mut_set
1297                 || si->GetClass() == CBioseq_set::eClass_pop_set
1298                 || si->GetClass() == CBioseq_set::eClass_phy_set
1299                 || si->GetClass() == CBioseq_set::eClass_eco_set
1300                 || si->GetClass() == CBioseq_set::eClass_wgs_set
1301                 || si->GetClass() == CBioseq_set::eClass_small_genome_set) {
1302                 has_nongps = true;
1303             } else if (si->GetClass() == CBioseq_set::eClass_gen_prod_set) {
1304                 has_gps = true;
1305             }
1306         }
1307     }
1308 
1309     if (has_nongps && has_gps) {
1310         PostErr(eDiag_Error, eErr_SEQ_PKG_GPSnonGPSPackaging,
1311             "Genomic product set and mut/pop/phy/eco set records should not be present in the same set",
1312             *m_TSE);
1313     }
1314 
1315     // count inference accessions - if there are too many, temporarily disable inference checking
1316     bool old_inference_acc_check = m_ValidateInferenceAccessions;
1317     if (m_ValidateInferenceAccessions) {
1318         size_t num_inferences = 0, num_accessions = 0;
1319         CFeat_CI feat_inf(seh);
1320         while (feat_inf) {
1321             FOR_EACH_GBQUAL_ON_FEATURE (qual, *feat_inf) {
1322                 if ((*qual)->IsSetQual() && (*qual)->IsSetVal() && NStr::Equal((*qual)->GetQual(), "inference")) {
1323                     num_inferences++;
1324                     string prefix, remainder;
1325                     bool same_species;
1326                     vector<string> accessions = CValidError_feat::GetAccessionsFromInferenceString ((*qual)->GetVal(), prefix, remainder, same_species);
1327                     for (size_t i = 0; i < accessions.size(); i++) {
1328                         NStr::TruncateSpacesInPlace (accessions[i]);
1329                         string acc_prefix, accession;
1330                         if (CValidError_feat::GetPrefixAndAccessionFromInferenceAccession (remainder, acc_prefix, accession)) {
1331                             if (NStr::EqualNocase (acc_prefix, "INSD") || NStr::EqualNocase (acc_prefix, "RefSeq")) {
1332                                 num_accessions++;
1333                             }
1334                         }
1335                     }
1336                 }
1337             }
1338             ++feat_inf;
1339         }
1340         if (/* num_inferences > 1000 || */ num_accessions > 1000) {
1341             // warn about too many inferences
1342             PostErr (eDiag_Info, eErr_SEQ_FEAT_TooManyInferenceAccessions,
1343                      "Skipping validation of " + NStr::SizetToString (num_inferences) + " /inference qualifiers with "
1344                      + NStr::SizetToString (num_accessions) + " accessions",
1345                      *m_TSE);
1346 
1347             // disable inference checking
1348             m_ValidateInferenceAccessions = false;
1349         }
1350     }
1351 
1352     // validate the main data
1353     if (seh.IsSeq()) {
1354         const CBioseq& seq = seh.GetCompleteSeq_entry()->GetSeq();
1355         CValidError_bioseq bioseq_validator(*this);
1356         try {
1357             bioseq_validator.ValidateBioseq(seq);
1358         } catch ( const exception& e ) {
1359             PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
1360                 string("Exception while validating bioseq. EXCEPTION: ") +
1361                 e.what(), seq);
1362             return true;
1363         }
1364     } else if (seh.IsSet()) {
1365         const CBioseq_set& set = seh.GetCompleteSeq_entry()->GetSet();
1366         CValidError_bioseqset bioseqset_validator(*this);
1367         try {
1368             bioseqset_validator.ValidateBioseqSet(set);
1369         } catch ( const exception& e ) {
1370             PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
1371                 string("Exception while validating bioseq set. EXCEPTION: ") +
1372                 e.what(), set);
1373             return true;
1374         }
1375     }
1376 
1377     // put flag for validating inference accessions back to original value
1378     m_ValidateInferenceAccessions = old_inference_acc_check;
1379 
1380     // validation from data collected during previous step
1381 
1382     if ( m_NumTpaWithHistory > 0  &&
1383          m_NumTpaWithoutHistory > 0 ) {
1384         PostErr(eDiag_Error, eErr_SEQ_INST_TpaAssemblyProblem,
1385             "There are " +
1386             NStr::SizetToString(m_NumTpaWithHistory) +
1387             " TPAs with history and " +
1388             NStr::SizetToString(m_NumTpaWithoutHistory) +
1389             " without history in this record.", *seq);
1390     }
1391     if ( m_NumTpaWithoutHistory > 0 && has_gi) {
1392         PostErr (eDiag_Warning, eErr_SEQ_INST_TpaAssemblyProblem,
1393             "There are " +
1394             NStr::SizetToString(m_NumTpaWithoutHistory) +
1395             " TPAs without history in this record, but the record has a gi number assignment.", *m_TSE);
1396     }
1397     if (IsIndexerVersion() && DoesAnyProteinHaveGeneralID() && !IsRefSeq() && has_nucleotide_sequence) {
1398         PostErr (eDiag_Info, eErr_SEQ_INST_ProteinsHaveGeneralID,
1399                  "INDEXER_ONLY - Protein bioseqs have general seq-id.",
1400                  *(seh.GetCompleteSeq_entry()));
1401     }
1402 
1403     ReportMissingPubs(*m_TSE, cs);
1404     ReportMissingBiosource(*m_TSE);
1405 
1406     if (m_NumMisplacedFeatures > 1) {
1407         PostErr (eDiag_Critical, eErr_SEQ_PKG_FeaturePackagingProblem,
1408                  "There are " + NStr::SizetToString (m_NumMisplacedFeatures) + " mispackaged features in this record.",
1409                  *(seh.GetCompleteSeq_entry()));
1410     } else if (m_NumMisplacedFeatures == 1) {
1411         PostErr (eDiag_Critical, eErr_SEQ_PKG_FeaturePackagingProblem,
1412                  "There is 1 mispackaged feature in this record.",
1413                  *(seh.GetCompleteSeq_entry()));
1414     }
1415     if (m_NumSmallGenomeSetMisplaced > 1) {
1416         PostErr (eDiag_Warning, eErr_SEQ_PKG_FeaturePackagingProblem,
1417                  "There are " + NStr::SizetToString (m_NumSmallGenomeSetMisplaced) + " mispackaged features in this small genome set record.",
1418                  *(seh.GetCompleteSeq_entry()));
1419     } else if (m_NumSmallGenomeSetMisplaced == 1) {
1420         PostErr (eDiag_Warning, eErr_SEQ_PKG_FeaturePackagingProblem,
1421                  "There is 1 mispackaged feature in this small genome set record.",
1422                  *(seh.GetCompleteSeq_entry()));
1423     }
1424     if ( m_NumGenes == 0  &&
1425          m_NumGeneXrefs > 0 ) {
1426         PostErr(eDiag_Warning, eErr_SEQ_FEAT_OnlyGeneXrefs,
1427             "There are " + NStr::SizetToString(m_NumGeneXrefs) +
1428             " gene xrefs and no gene features in this record.", *m_TSE);
1429     }
1430     ValidateCitations (seh);
1431 
1432 
1433     if ( m_NumMisplacedGraphs > 0 ) {
1434         string num = NStr::SizetToString(m_NumMisplacedGraphs);
1435         PostErr(eDiag_Critical, eErr_SEQ_PKG_GraphPackagingProblem,
1436             string("There ") + ((m_NumMisplacedGraphs > 1) ? "are " : "is ") + num +
1437             " mispackaged graph" + ((m_NumMisplacedGraphs > 1) ? "s" : "") + " in this record.",
1438             *m_TSE);
1439     }
1440 
1441     if ( IsRefSeq() && ! IsWP() ) {
1442         ValidateMultipleTaxIds(seh);
1443     }
1444 
1445 
1446     FindEmbeddedScript(*(seh.GetCompleteSeq_entry()));
1447     FindNonAsciiText(*(seh.GetCompleteSeq_entry()));
1448     FindCollidingSerialNumbers(*(seh.GetCompleteSeq_entry()));
1449 
1450     if (m_FarFetchFailure) {
1451         PostErr(eDiag_Warning, eErr_SEQ_INST_FarFetchFailure,
1452                 "Far fetch failures caused some validator tests to be bypassed",
1453                 *m_TSE);
1454     }
1455 
1456     if (m_DoTaxLookup) {
1457         ValidateTaxonomy(*(seh.GetCompleteSeq_entry()));
1458     }
1459 
1460     // validate cit-sub
1461     if (cs) {
1462         ValidateCitSub (*cs, *(seh.GetCompleteSeq_entry()), seh.GetCompleteSeq_entry());
1463     }
1464 
1465     // optional barcode tests
1466     if (m_DoBarcodeTests) {
1467         x_DoBarcodeTests(seh);
1468     }
1469     return true;
1470 }
1471 
1472 
ValidateSubmitBlock(const CSubmit_block & block,const CSeq_submit & ss)1473 void CValidError_imp::ValidateSubmitBlock(const CSubmit_block& block, const CSeq_submit& ss)
1474 {
1475     if (block.IsSetHup() && block.GetHup() && block.IsSetReldate() &&
1476         IsDateInPast(block.GetReldate())) {
1477         PostErr(eDiag_Warning, eErr_GENERIC_PastReleaseDate,
1478             "Record release date has already passed", ss);
1479     }
1480 }
1481 
1482 
Validate(const CSeq_submit & ss,CScope * scope)1483 void CValidError_imp::Validate(
1484     const CSeq_submit& ss, CScope* scope)
1485 {
1486     // Check that ss is type e_Entrys
1487     if ( ss.GetData().Which() != CSeq_submit::C_Data::e_Entrys ) {
1488         return;
1489     }
1490 
1491     m_IsSeqSubmit = true;
1492     ValidateSubmitBlock(ss.GetSub(), ss);
1493 
1494     // Get CCit_sub pointer
1495     const CCit_sub* cs = &ss.GetSub().GetCit();
1496 
1497     if (ss.IsSetSub() && ss.GetSub().IsSetTool() && NStr::StartsWith(ss.GetSub().GetTool(), "Geneious")) {
1498         m_IsGeneious = true;
1499     }
1500 
1501     // Just loop thru CSeq_entrys
1502     FOR_EACH_SEQENTRY_ON_SEQSUBMIT (se_itr, ss) {
1503         const CSeq_entry& se = **se_itr;
1504         if(se.IsSet())
1505         {
1506             const CBioseq_set &set = se.GetSet();
1507             if(set.IsSetClass() &&
1508                set.GetClass() == CBioseq_set::eClass_wgs_set)
1509             {
1510                 CSeq_entry_Handle seh;
1511                 seh = scope->GetSeq_entryHandle(se);
1512                 Setup(seh);
1513                 PostErr(eDiag_Warning, eErr_SEQ_PKG_SeqSubmitWithWgsSet,
1514                         "File was created as a wgs-set, but should be a batch submission instead.",
1515                         seh.GetCompleteSeq_entry()->GetSet());
1516             }
1517         }
1518         Validate (se, cs, scope);
1519     }
1520 }
1521 
1522 
Validate(const CSeq_annot_Handle & sah)1523 void CValidError_imp::Validate(
1524     const CSeq_annot_Handle& sah)
1525 {
1526     Setup(sah);
1527 
1528     // Iterate thru components of record and validate each
1529 
1530     CValidError_annot annot_validator(*this);
1531     annot_validator.ValidateSeqAnnot(sah);
1532 
1533     switch (sah.Which()) {
1534     case CSeq_annot::TData::e_Ftable :
1535         {
1536             CValidError_feat feat_validator(*this);
1537             for (CFeat_CI fi (sah); fi; ++fi) {
1538                 const CSeq_feat& sf = fi->GetOriginalFeature();
1539                 feat_validator.ValidateSeqFeat(sf);
1540             }
1541         }
1542         break;
1543 
1544     case CSeq_annot::TData::e_Align :
1545         {
1546             if (IsValidateAlignments()) {
1547                 CValidError_align align_validator(*this);
1548                 int order = 1;
1549                 for (CAlign_CI ai(sah); ai; ++ai) {
1550                     const CSeq_align& sa = ai.GetOriginalSeq_align();
1551                     align_validator.ValidateSeqAlign(sa, order++);
1552                 }
1553             }
1554         }
1555         break;
1556 
1557     case CSeq_annot::TData::e_Graph :
1558         {
1559             CValidError_graph graph_validator(*this);
1560             // for (CTypeConstIterator <CSeq_graph> gi (sa); gi; ++gi) {
1561             for (CGraph_CI gi(sah); gi; ++gi) {
1562                 const CSeq_graph& sg = gi->GetOriginalGraph();
1563                 graph_validator.ValidateSeqGraph(sg);
1564             }
1565         }
1566         break;
1567     default:
1568         break;
1569     }
1570     FindEmbeddedScript(*(sah.GetCompleteSeq_annot()));
1571     FindNonAsciiText(*(sah.GetCompleteSeq_annot()));
1572     FindCollidingSerialNumbers(*(sah.GetCompleteSeq_annot()));
1573 }
1574 
1575 
Validate(const CSeq_feat & feat,CScope * scope)1576 void CValidError_imp::Validate(const CSeq_feat& feat, CScope* scope)
1577 {
1578     // automatically restores m_Scope to its old value when we leave
1579     // the function
1580     CScopeRestorer scopeRestorer( m_Scope );
1581 
1582     if( scope != NULL ) {
1583         m_Scope.Reset(scope);
1584     }
1585     if (!m_Scope) {
1586         // set up a temporary local scope if there is no scope set already
1587         m_Scope.Reset(new CScope(*m_ObjMgr));
1588     }
1589 
1590     CValidError_feat feat_validator(*this);
1591     feat_validator.SetScope(*m_Scope);
1592     CSeq_entry_Handle empty;
1593     feat_validator.SetTSE(empty);
1594     feat_validator.ValidateSeqFeat(feat);
1595     if (feat.IsSetData() && feat.GetData().IsBiosrc()) {
1596         const CBioSource& src = feat.GetData().GetBiosrc();
1597         if (src.IsSetOrg()) {
1598             ValidateTaxonomy (src.GetOrg(), src.IsSetGenome() ? src.GetGenome() : CBioSource::eGenome_unknown);
1599         }
1600     }
1601     FindEmbeddedScript(feat);
1602     FindNonAsciiText(feat);
1603     FindCollidingSerialNumbers(feat);
1604 }
1605 
1606 
Validate(const CBioSource & src,CScope * scope)1607 void CValidError_imp::Validate(const CBioSource& src, CScope* scope)
1608 {
1609     // automatically restores m_Scope to its old value when we leave
1610     // the function
1611     CScopeRestorer scopeRestorer( m_Scope );
1612 
1613     if( scope != NULL ) {
1614         m_Scope.Reset(scope);
1615     }
1616     if (!m_Scope) {
1617         // set up a temporary local scope if there is no scope set already
1618         m_Scope.Reset(new CScope(*m_ObjMgr));
1619     }
1620 
1621     ValidateBioSource(src, src);
1622     if (src.IsSetOrg()) {
1623         ValidateTaxonomy (src.GetOrg(), src.IsSetGenome() ? src.GetGenome() : CBioSource::eGenome_unknown);
1624     }
1625     FindEmbeddedScript(src);
1626     FindNonAsciiText(src);
1627     FindCollidingSerialNumbers(src);
1628 }
1629 
1630 
Validate(const CPubdesc & pubdesc,CScope * scope)1631 void CValidError_imp::Validate(const CPubdesc& pubdesc, CScope* scope)
1632 {
1633     // automatically restores m_Scope to its old value when we leave
1634     // the function
1635     CScopeRestorer scopeRestorer( m_Scope );
1636 
1637     if( scope != NULL ) {
1638         m_Scope.Reset(scope);
1639     }
1640     if (!m_Scope) {
1641         // set up a temporary local scope if there is no scope set already
1642         m_Scope.Reset(new CScope(*m_ObjMgr));
1643     }
1644 
1645     ValidatePubdesc(pubdesc, pubdesc);
1646     FindEmbeddedScript(pubdesc);
1647     FindNonAsciiText(pubdesc);
1648     FindCollidingSerialNumbers(pubdesc);
1649 }
1650 
// Validate a single descriptor in the context of a Seq-entry.
//
// desc - descriptor to validate
// ctx  - Seq-entry the descriptor is validated against; it is added as a
//        top-level entry to a newly created scope
//
// NOTE(review): unlike the Validate overloads that take a CScope*, this one
// has no CScopeRestorer, so m_Scope still refers to the new scope after the
// call returns - confirm this is intended.
void CValidError_imp::Validate(const CSeqdesc& desc, const CSeq_entry& ctx)
{
    // NOTE(review): the validator is constructed before m_Scope is replaced;
    // keep this order unless it is confirmed not to matter.
    CValidError_desc seqdesc_validator(*this);
    // always start from a fresh scope that contains only ctx
    m_Scope.Reset(new CScope(*m_ObjMgr));
    m_Scope->AddTopLevelSeqEntry(ctx);
    seqdesc_validator.ValidateSeqDesc(desc,ctx);
}
1658 
1659 
SetProgressCallback(CValidator::TProgressCallback callback,void * user_data)1660 void CValidError_imp::SetProgressCallback
1661 (CValidator::TProgressCallback callback,
1662  void* user_data)
1663 {
1664     m_PrgCallback = callback;
1665     m_PrgInfo.m_UserData = user_data;
1666 }
1667 
1668 
ValidateDbxref(const CDbtag & xref,const CSerialObject & obj,bool biosource,const CSeq_entry * ctx)1669 void CValidError_imp::ValidateDbxref
1670 (const CDbtag& xref,
1671  const CSerialObject& obj,
1672  bool biosource,
1673  const CSeq_entry *ctx)
1674 {
1675     bool refseq_or_gps = IsRefSeq() || IsGPS();
1676     CValidator::TDbxrefValidFlags flags = CValidator::IsValidDbxref(xref, biosource,
1677         refseq_or_gps);
1678 
1679     const string& db = xref.IsSetDb() ? xref.GetDb() : kEmptyStr;
1680 
1681     if (flags & CValidator::eTagHasSgml) {
1682         PostObjErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
1683             "dbxref value " + xref.GetTag().GetStr() + " has SGML",
1684             obj, ctx);
1685     }
1686     if (flags & CValidator::eContainsSpace) {
1687         PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1688                    "dbxref value " + xref.GetTag().GetStr() + " contains space character",
1689                    obj, ctx);
1690     }
1691     if (flags & CValidator::eDbHasSgml) {
1692         PostObjErr(eDiag_Warning, eErr_GENERIC_SgmlPresentInText,
1693             "dbxref database " + db + " has SGML",
1694             obj, ctx);
1695     }
1696 
1697     string dbv;
1698     if (xref.IsSetTag() && xref.GetTag().IsStr()) {
1699         dbv = xref.GetTag().GetStr();
1700     } else if (xref.IsSetTag() && xref.GetTag().IsId()) {
1701         dbv = NStr::NumericToString(xref.GetTag().GetId());
1702     }
1703 
1704     if (flags & CValidator::eUnrecognized) {
1705         PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1706             "Illegal db_xref type " + db + " (" + dbv + ")", obj, ctx);
1707     }
1708     if (flags & CValidator::eBadCapitalization) {
1709         // capitalization is bad
1710         bool refseq_db = false, src_db = false;
1711         string correct_caps;
1712         xref.GetDBFlags(refseq_db, src_db, correct_caps);
1713         string message = "Illegal db_xref type " + db + " (" + dbv + "), legal capitalization is " + correct_caps;
1714         if (flags & CValidator::eNotForSource) {
1715             message += ", but should not be used on an OrgRef";
1716         } else if (flags & CValidator::eOnlyForSource) {
1717             message += ", but should only be used on an OrgRef";
1718         }
1719 
1720         PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref, message, obj, ctx);
1721     } else {
1722         if (flags & CValidator::eOnlyForRefSeq) {
1723             if (flags & CValidator::eNotForSource) {
1724                 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1725                     "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on a non-RefSeq OrgRef",
1726                     obj, ctx);
1727             } else {
1728                 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1729                     "db_xref type " + db + " (" + dbv + ") is only legal for RefSeq",
1730                     obj, ctx);
1731             }
1732         } else if (flags & CValidator::eNotForSource) {
1733             if (flags & CValidator::eRefSeqNotForSource) {
1734                 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1735                     "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
1736                     obj, ctx);
1737             } else {
1738                 PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1739                     "db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
1740                     obj, ctx);
1741             }
1742         } else if (flags & CValidator::eOnlyForSource) {
1743             PostObjErr(eDiag_Warning, eErr_SEQ_FEAT_IllegalDbXref,
1744                 "db_xref type " + db + " (" + dbv + ") should only be used on an OrgRef",
1745                 obj, ctx);
1746         }
1747     }
1748 
1749 }
1750 
1751 
ValidateDbxref(TDbtags & xref_list,const CSerialObject & obj,bool biosource,const CSeq_entry * ctx)1752 void CValidError_imp::ValidateDbxref
1753 (TDbtags& xref_list,
1754  const CSerialObject& obj,
1755  bool biosource,
1756  const CSeq_entry *ctx)
1757 {
1758     string last_db;
1759 
1760     ITERATE( TDbtags, xref, xref_list) {
1761         if (biosource
1762             && (*xref)->IsSetDb()) {
1763             if (!NStr::IsBlank(last_db)
1764                 && NStr::EqualNocase((*xref)->GetDb(), last_db)) {
1765                 PostObjErr (eDiag_Warning, eErr_SEQ_DESCR_BioSourceDbTagConflict,
1766                             "BioSource uses db " + last_db + " multiple times",
1767                             obj, ctx);
1768             }
1769             last_db = (*xref)->GetDb();
1770         }
1771         ValidateDbxref(**xref, obj, biosource, ctx);
1772     }
1773 }
1774 
1775 
x_CheckPackedInt(const CPacked_seqint & packed_int,SLocCheck & lc,const CSerialObject & obj)1776 void CValidError_imp::x_CheckPackedInt
1777 (const CPacked_seqint& packed_int,
1778   SLocCheck& lc,
1779  const CSerialObject& obj)
1780 {
1781     ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
1782         lc.int_cur = (*it);
1783         lc.chk &= x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur, obj);
1784 
1785         x_CheckForStrandChange(lc);
1786 
1787         lc.id_prv = lc.id_cur;
1788         lc.strand_prv = lc.strand_cur;
1789         lc.int_prv = lc.int_cur;
1790     }
1791 }
1792 
1793 
x_CheckSeqInt(CConstRef<CSeq_id> & id_cur,const CSeq_interval * int_cur,ENa_strand & strand_cur,const CSerialObject & obj)1794 bool CValidError_imp::x_CheckSeqInt
1795 (CConstRef<CSeq_id>& id_cur,
1796  const CSeq_interval * int_cur,
1797  ENa_strand& strand_cur,
1798  const CSerialObject& obj)
1799 {
1800     strand_cur = int_cur->IsSetStrand() ?
1801         int_cur->GetStrand() : eNa_strand_unknown;
1802     id_cur = &int_cur->GetId();
1803     bool chk = IsValid(*int_cur, m_Scope);
1804     return chk;
1805 }
1806 
1807 
x_ReportInvalidFuzz(const CPacked_seqint & packed_int,const CSerialObject & obj)1808 void CValidError_imp::x_ReportInvalidFuzz(const CPacked_seqint& packed_int, const CSerialObject& obj)
1809 {
1810     ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
1811         x_ReportInvalidFuzz(**it, obj);
1812     }
1813 }
1814 
1815 
// Message texts posted with eErr_SEQ_FEAT_InvalidFuzz by the
// x_ReportInvalidFuzz overloads.

// Non-circular sequences: 'space to left' fuzz at position 0 and
// 'space to right' fuzz at the last position.
static const string kSpaceLeftFirst = "Should not specify 'space to left' at first position of non-circular sequence";
static const string kSpaceRightLast = "Should not specify 'space to right' at last position of non-circular sequence";

// Circular sequences: circle fuzz on the from/to end when that end is not at
// the first/last position of the sequence.
static const string kSpaceLeftCircle = "Should not specify 'circle to left' except at first position of circular sequence";
static const string kSpaceRightCircle = "Should not specify 'circle to right' except at last position of circular sequence";
1821 
x_ReportInvalidFuzz(const CSeq_interval & interval,const CSerialObject & obj)1822 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_interval& interval, const CSerialObject& obj)
1823 {
1824     CInt_fuzz::ELim fuzz_from = CInt_fuzz::eLim_unk;
1825     CInt_fuzz::ELim fuzz_to = CInt_fuzz::eLim_unk;
1826     bool has_fuzz_from = false;
1827     bool has_fuzz_to = false;
1828 
1829     if (interval.IsSetFuzz_from() && interval.GetFuzz_from().IsLim()) {
1830         fuzz_from = interval.GetFuzz_from().GetLim();
1831         has_fuzz_from = true;
1832     }
1833     if (interval.IsSetFuzz_to() && interval.GetFuzz_to().IsLim()) {
1834         fuzz_to = interval.GetFuzz_to().GetLim();
1835         has_fuzz_to = true;
1836     }
1837     if (! has_fuzz_from && ! has_fuzz_to) {
1838         return;
1839     }
1840 
1841     // check for invalid fuzz on both ends of Interval
1842     if (has_fuzz_from && has_fuzz_to && fuzz_from == fuzz_to) {
1843         if (fuzz_from == CInt_fuzz::eLim_tl) {
1844             PostErr(eDiag_Error,
1845                 eErr_SEQ_FEAT_InvalidFuzz,
1846                 "Should not specify 'space to left' for both ends of interval", obj);
1847         }
1848         else if (fuzz_from == CInt_fuzz::eLim_tr) {
1849             PostErr(eDiag_Error,
1850                 eErr_SEQ_FEAT_InvalidFuzz,
1851                 "Should not specify 'space to right' for both ends of interval", obj);
1852         }
1853         else if (fuzz_from == CInt_fuzz::eLim_circle) {
1854             PostErr(eDiag_Error,
1855                 eErr_SEQ_FEAT_InvalidFuzz,
1856                 "Should not specify 'origin of circle' for both ends of interval", obj);
1857         }
1858     }
1859 
1860     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(interval.GetId());
1861     if (! bsh) {
1862         return;
1863     }
1864 
1865     CSeq_inst::ETopology top = CSeq_inst::eTopology_not_set;
1866     if (bsh.IsSetInst_Topology()) {
1867         top = bsh.GetInst_Topology();
1868     }
1869 
1870     if (top != CSeq_inst::eTopology_circular) {
1871 
1872         // VR-15
1873         // look for space to left at beginning of sequence or space to right at end
1874         if (fuzz_from == CInt_fuzz::eLim_tl && interval.IsSetFrom() && interval.GetFrom() == 0) {
1875             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceLeftFirst, obj);
1876         }
1877         if (fuzz_to == CInt_fuzz::eLim_tr && interval.IsSetTo() && interval.GetTo() == bsh.GetBioseqLength() - 1) {
1878             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceRightLast, obj);
1879         }
1880 
1881     } else if (fuzz_from == CInt_fuzz::eLim_circle || fuzz_to == CInt_fuzz::eLim_circle) {
1882 
1883         if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
1884             const CSeq_feat* sfp = dynamic_cast<const CSeq_feat*>(&obj);
1885             if (sfp && sfp->IsSetExcept() && sfp->CanGetExcept_text() && NStr::FindNoCase(sfp->GetExcept_text(), "ribosomal slippage") != NPOS) {
1886                 return;
1887             }
1888         }
1889 
1890         // VR-832
1891         if (fuzz_from == CInt_fuzz::eLim_circle && interval.IsSetFrom() && interval.GetFrom() != 0) {
1892             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceLeftCircle, obj);
1893         }
1894         if (fuzz_to == CInt_fuzz::eLim_circle && interval.IsSetTo() && interval.GetTo() != bsh.GetBioseqLength() - 1) {
1895             PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceRightCircle, obj);
1896         }
1897     }
1898 }
1899 
1900 
x_ReportInvalidFuzz(const CSeq_point & point,const CSerialObject & obj)1901 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_point& point, const CSerialObject& obj)
1902 {
1903     // VR-15
1904     if (!point.IsSetFuzz() || !point.GetFuzz().IsLim() ||
1905         (point.GetFuzz().GetLim() != CInt_fuzz::eLim_tl && point.GetFuzz().GetLim() != CInt_fuzz::eLim_tr) ||
1906         !point.IsSetId() || !point.IsSetPoint()) {
1907         return;
1908     }
1909     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(point.GetId());
1910     if (!bsh) {
1911         return;
1912     }
1913     if (bsh.IsSetInst_Topology() && bsh.GetInst_Topology() == CSeq_inst::eTopology_circular) {
1914         return;
1915     }
1916     if (point.GetPoint() == 0 && point.GetFuzz().GetLim() == CInt_fuzz::eLim_tl) {
1917         PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceLeftFirst, obj);
1918     }
1919     if (point.GetPoint() == bsh.GetBioseqLength() - 1) {
1920         PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidFuzz, kSpaceRightLast, obj);
1921     }
1922 }
1923 
1924 
x_ReportInvalidFuzz(const CSeq_loc & loc,const CSerialObject & obj)1925 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_loc& loc, const CSerialObject& obj)
1926 {
1927     CTypeConstIterator<CSeq_loc> lit = ConstBegin(loc);
1928     for (; lit; ++lit) {
1929         CSeq_loc::E_Choice loc_choice = lit->Which();
1930         switch (loc_choice) {
1931         case CSeq_loc::e_Int:
1932             x_ReportInvalidFuzz(lit->GetInt(), obj);
1933             break;
1934         case CSeq_loc::e_Packed_int:
1935             x_ReportInvalidFuzz(lit->GetPacked_int(), obj);
1936             break;
1937         case CSeq_loc::e_Pnt:
1938             x_ReportInvalidFuzz(lit->GetPnt(), obj);
1939             break;
1940         default:
1941             break;
1942         }
1943     }
1944 }
1945 
1946 
s_CountMix(const CSeq_loc & loc)1947 unsigned int s_CountMix(const CSeq_loc& loc)
1948 {
1949     unsigned int num_mix = 0;
1950     CTypeConstIterator<CSeq_loc> lit = ConstBegin(loc);
1951     for (; lit; ++lit) {
1952         if (lit->IsMix()) {
1953             num_mix++;
1954         }
1955     }
1956     return num_mix;
1957 }
1958 
1959 
x_InitLocCheck(SLocCheck & lc,const string & prefix)1960 void CValidError_imp::x_InitLocCheck(SLocCheck& lc, const string& prefix)
1961 {
1962     lc.chk = true;
1963     lc.unmarked_strand = false;
1964     lc.mixed_strand = false;
1965     lc.has_other = false;
1966     lc.has_not_other = false;
1967     lc.id_cur = 0;
1968     lc.id_prv = 0;
1969     lc.int_cur = 0;
1970     lc.int_prv = 0;
1971     lc.strand_cur = eNa_strand_unknown;
1972     lc.strand_prv = eNa_strand_unknown;
1973     lc.prefix = prefix;
1974 }
1975 
x_CheckForStrandChange(SLocCheck & lc)1976 void CValidError_imp::x_CheckForStrandChange(SLocCheck& lc)
1977 {
1978     if (lc.strand_prv != eNa_strand_other  &&
1979         lc.strand_cur != eNa_strand_other) {
1980         if (lc.id_cur  &&  lc.id_prv  &&
1981             IsSameBioseq(*lc.id_cur, *lc.id_prv, m_Scope)) {
1982             if (lc.strand_prv != lc.strand_cur) {
1983                 if ((lc.strand_prv == eNa_strand_plus  &&
1984                     lc.strand_cur == eNa_strand_unknown)  ||
1985                     (lc.strand_prv == eNa_strand_unknown  &&
1986                     lc.strand_cur == eNa_strand_plus)) {
1987                     lc.unmarked_strand = true;
1988                 } else {
1989                     lc.mixed_strand = true;
1990                 }
1991             }
1992         }
1993     }
1994     if (lc.strand_cur == eNa_strand_other) {
1995         lc.has_other = true;
1996     } else if (lc.strand_cur == eNa_strand_minus || lc.strand_cur == eNa_strand_plus) {
1997         lc.has_not_other = true;
1998     }
1999 
2000 }
2001 
// Checks one Seq-loc (recursing into Mix locations) and updates the running
// state in `lc`.
//
//   loc      - location to check
//   obj      - object errors are reported against
//   lc       - running state: current/previous id, interval and strand, the
//              range-check result (chk) and the strand flags maintained by
//              x_CheckForStrandChange
//   lowerSev - when true, an out-of-range location is reported as
//              eDiag_Error instead of eDiag_Critical.  It is passed by
//              value; for an Int location it is cleared again unless the
//              interval starts inside the sequence and ends at or beyond
//              its length.
//
// After the per-kind handling, an out-of-range error is posted whenever
// lc.chk is false, and (for everything except Null) the strand change is
// evaluated and current id/strand become the previous ones.
// Any std::exception thrown along the way is reported as
// eErr_INTERNAL_Exception and the current state is reset to
// strand 'other' / no id.
void CValidError_imp::x_CheckLoc(const CSeq_loc& loc, const CSerialObject& obj, SLocCheck& lc, bool lowerSev)
{
    try {
        switch (loc.Which()) {
            case CSeq_loc::e_Int:
                lc.int_cur = &loc.GetInt();
                lc.chk = x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur, obj);
                if ((!lc.chk) && lowerSev) {
                    TSeqPos length = GetLength(loc.GetInt().GetId(), m_Scope);
                    TSeqPos fr = loc.GetInt().GetFrom();
                    TSeqPos to = loc.GetInt().GetTo();
                    if (fr < length && to >= length) {
                        // RefSeq variation feature with dbSNP xref and interval flanking the length is ERROR
                    } else {
                        // otherwise keep severity at REJECT
                        lowerSev = false;
                    }
                }
                break;
            case CSeq_loc::e_Pnt:
                lc.strand_cur = loc.GetPnt().IsSetStrand() ?
                    loc.GetPnt().GetStrand() : eNa_strand_unknown;
                lc.id_cur = &loc.GetPnt().GetId();
                lc.chk = IsValid(loc.GetPnt(), m_Scope);
                // a point is not an interval: forget the previous interval
                lc.int_prv = 0;
                break;
            case CSeq_loc::e_Packed_pnt:
                lc.strand_cur = loc.GetPacked_pnt().IsSetStrand() ?
                    loc.GetPacked_pnt().GetStrand() : eNa_strand_unknown;
                lc.id_cur = &loc.GetPacked_pnt().GetId();
                lc.chk = IsValid(loc.GetPacked_pnt(), m_Scope);
                lc.int_prv = 0;
                break;
            case CSeq_loc::e_Packed_int:
                // updates lc for every contained interval
                x_CheckPackedInt(loc.GetPacked_int(), lc, obj);
                break;
            case CSeq_loc::e_Null:
                // nothing to check; lc is left untouched
                break;
            case CSeq_loc::e_Mix:
                // check each sub-location in order, carrying lc across them
                for (auto l : loc.GetMix().Get()) {
                    x_CheckLoc(*l, obj, lc, lowerSev);
                    x_CheckForStrandChange(lc);
                }
                break;
            default:
                // every other kind is treated as strand 'other' with no id
                lc.strand_cur = eNa_strand_other;
                lc.id_cur = 0;
                lc.int_prv = 0;
                break;
        }
        if (!lc.chk) {
            // range check failed (for this location or, for Mix/Packed-int,
            // one of its parts)
            string lbl = GetValidatorLocationLabel (loc, *m_Scope);
            EDiagSev sev = eDiag_Critical;
            if (lowerSev) {
                sev = eDiag_Error;
            }
            PostErr(sev, eErr_SEQ_FEAT_Range,
                lc.prefix + ": SeqLoc [" + lbl + "] out of range", obj);
        }

        if (loc.Which() != CSeq_loc::e_Null) {
            x_CheckForStrandChange(lc);

            // current becomes previous for the next location
            lc.strand_prv = lc.strand_cur;
            lc.id_prv = lc.id_cur;
        }
    } catch( const exception& e ) {
        string label = GetValidatorLocationLabel(loc, *m_Scope);
        PostErr(eDiag_Error, eErr_INTERNAL_Exception,
            "Exception caught while validating location " +
            label + ". Exception: " + e.what(), obj);

        // same reset as for an unsupported location kind
        lc.strand_cur = eNa_strand_other;
        lc.id_cur = 0;
        lc.int_prv = 0;
    }

}
2080 
ValidateSeqLoc(const CSeq_loc & loc,const CBioseq_Handle & seq,bool report_abutting,const string & prefix,const CSerialObject & obj,bool lowerSev)2081 void CValidError_imp::ValidateSeqLoc
2082 (const CSeq_loc& loc,
2083  const CBioseq_Handle&  seq,
2084  bool  report_abutting,
2085  const string&   prefix,
2086  const CSerialObject& obj,
2087  bool lowerSev)
2088 {
2089     SLocCheck lc;
2090 
2091     x_InitLocCheck(lc, prefix);
2092 
2093     x_CheckLoc(loc, obj, lc, lowerSev);
2094 
2095     if (lc.has_other && lc.has_not_other) {
2096         string label = GetValidatorLocationLabel(loc, *m_Scope);
2097         PostErr(IsSmallGenomeSet() ? eDiag_Warning : eDiag_Error, eErr_SEQ_FEAT_MixedStrand,
2098             prefix + ": Inconsistent use of other strand SeqLoc [" + label + "]", obj);
2099     }
2100 
2101     x_ReportInvalidFuzz(loc, obj);
2102 
2103     if (m_Scope && CValidator::DoesSeqLocContainDuplicateIntervals(loc, *m_Scope)) {
2104         PostErr(eDiag_Error,
2105             eErr_SEQ_FEAT_DuplicateExonInterval,
2106             "Duplicate exons in location", obj);
2107     }
2108 
2109     if (s_CountMix(loc) > 1) {
2110         string label;
2111         loc.GetLabel(&label);
2112         PostErr (eDiag_Error, eErr_SEQ_FEAT_NestedSeqLocMix,
2113             prefix + ": SeqLoc [" + label + "] has nested SEQLOC_MIX elements",
2114                  obj);
2115     }
2116 
2117     // Warn if different parts of a seq-loc refer to the same bioseq using
2118     // differnt id types (i.e. gi and accession)
2119     ValidateSeqLocIds(loc, obj);
2120 
2121     bool trans_splice = false;
2122     bool exception = false;
2123     const CSeq_feat* sfp = NULL;
2124     if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2125         sfp = dynamic_cast<const CSeq_feat*>(&obj);
2126     }
2127     if (sfp != 0) {
2128 
2129         // primer_bind intervals MAY be in on opposite strands
2130         if ( sfp->GetData().GetSubtype() == CSeqFeatData::eSubtype_primer_bind ) {
2131             lc.mixed_strand = false;
2132             lc.unmarked_strand = false;
2133         }
2134 
2135         exception = sfp->IsSetExcept() ?  sfp->GetExcept() : false;
2136         if (exception  &&  sfp->CanGetExcept_text()) {
2137             // trans splicing exception turns off both mixed_strand and
2138             // out_of_order messages
2139             if (NStr::FindNoCase(sfp->GetExcept_text(), "trans-splicing") != NPOS) {
2140                 trans_splice = true;
2141             }
2142         }
2143     }
2144 
2145     string loc_lbl;
2146     if (report_abutting && (!sfp || !CSeqFeatData::AllowAdjacentIntervals(sfp->GetData().GetSubtype())) &&
2147          (m_Scope && CValidator::DoesSeqLocContainAdjacentIntervals(loc, *m_Scope))) {
2148         loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2149 
2150         EDiagSev sev = exception ? eDiag_Warning : eDiag_Error;
2151         PostErr(sev, eErr_SEQ_FEAT_AbuttingIntervals,
2152             prefix + ": Adjacent intervals in SeqLoc [" +
2153             loc_lbl + "]", obj);
2154     }
2155 
2156     if (trans_splice && !NStr::Equal(prefix, "Product")) {
2157         CSeq_loc_CI li(loc);
2158         ++li;
2159         if (!li) {
2160             PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadTranssplicedInterval, "Trans-spliced feature should have multiple intervals", obj);
2161         }
2162         return;
2163     }
2164 
2165     bool ordered = true;
2166     bool circular = false;
2167     if ( seq  &&
2168          seq.IsSetInst() && seq.GetInst().IsSetTopology() &&
2169          seq.GetInst().GetTopology() == CSeq_inst::eTopology_circular ) {
2170         circular = true;
2171     }
2172     try {
2173         if (m_Scope && (!sfp || CSeqFeatData::RequireLocationIntervalsInBiologicalOrder(sfp->GetData().GetSubtype())) && !circular) {
2174             ordered = CValidator::IsSeqLocCorrectlyOrdered(loc, *m_Scope);
2175         }
2176     } catch ( const CException& ex) {
2177         string label;
2178         loc.GetLabel(&label);
2179         PostErr(eDiag_Error, eErr_INTERNAL_Exception,
2180             "Exception caught while validating location " +
2181             label + ". Exception: " + ex.what(), obj);
2182     }
2183 
2184     if (lc.mixed_strand || lc.unmarked_strand || !ordered) {
2185         if (loc_lbl.empty()) {
2186             loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2187         }
2188         if (lc.mixed_strand) {
2189             if (IsSmallGenomeSet()) {
2190                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenomeSetMixedStrand,
2191                     prefix + ": Mixed strands in SeqLoc ["
2192                     + loc_lbl + "] in small genome set - set trans-splicing exception if appropriate", obj);
2193             } else {
2194                 EDiagSev sev = eDiag_Error;
2195                 if (IsGeneious() || (sfp && sequence::IsPseudo(*sfp, *m_Scope))) {
2196                     sev = eDiag_Warning;
2197                 }
2198                 PostErr(sev, eErr_SEQ_FEAT_MixedStrand,
2199                     prefix + ": Mixed strands in SeqLoc ["
2200                     + loc_lbl + "]", obj);
2201             }
2202         } else if (lc.unmarked_strand) {
2203             PostErr(eDiag_Warning, eErr_SEQ_FEAT_MixedStrand,
2204                 prefix + ": Mixed plus and unknown strands in SeqLoc ["
2205                 + loc_lbl + "]", obj);
2206         }
2207         if (!ordered) {
2208             if (IsSmallGenomeSet()) {
2209                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_SeqLocOrder,
2210                     prefix + ": Intervals out of order in SeqLoc [" +
2211                     loc_lbl + "]", obj);
2212             } else {
2213                 PostErr(eDiag_Error, eErr_SEQ_FEAT_SeqLocOrder,
2214                     prefix + ": Intervals out of order in SeqLoc [" +
2215                     loc_lbl + "]", obj);
2216             }
2217         }
2218         return;
2219     }
2220 
2221     if ( seq  &&
2222          seq.IsSetInst_Repr()  &&
2223          seq.GetInst_Repr() != CSeq_inst::eRepr_seg ) {
2224         return;
2225     }
2226 
2227     // Check for intervals out of order on segmented Bioseq
2228     if ( seq  &&  BadSeqLocSortOrder(seq, loc) ) {
2229         if (loc_lbl.empty()) {
2230             loc.GetLabel(&loc_lbl);
2231         }
2232         PostErr(eDiag_Error, eErr_SEQ_FEAT_SeqLocOrder,
2233             prefix + "Intervals out of order in SeqLoc [" +
2234             loc_lbl + "]", obj);
2235     }
2236 
2237     // Check for mixed strand on segmented Bioseq
2238     if ( IsMixedStrands(loc) ) {
2239         if (loc_lbl.empty()) {
2240             loc.GetLabel(&loc_lbl);
2241         }
2242         PostErr(eDiag_Error, eErr_SEQ_FEAT_MixedStrand,
2243             prefix + ": Mixed strands in SeqLoc [" +
2244             loc_lbl + "]", obj);
2245     }
2246 }
2247 
2248 
AddBioseqWithNoBiosource(const CBioseq & seq)2249 void CValidError_imp::AddBioseqWithNoBiosource(const CBioseq& seq)
2250 {
2251     if (!SeqIsPatent(seq)) {
2252         m_BioseqWithNoSource.push_back(CConstRef<CBioseq>(&seq));
2253     }
2254 }
2255 
2256 
AddProtWithoutFullRef(const CBioseq_Handle & seq)2257 void CValidError_imp::AddProtWithoutFullRef(const CBioseq_Handle& seq)
2258 {
2259     if (!SeqIsPatent (seq)) {
2260         PostErr (eDiag_Error, eErr_SEQ_FEAT_MissingProteinName,
2261                  "The product name is missing from this protein.", *(seq.GetCompleteBioseq()));
2262     }
2263 }
2264 
2265 
IsWGSIntermediate(const CBioseq & seq)2266 bool CValidError_imp::IsWGSIntermediate(const CBioseq& seq)
2267 {
2268     bool wgs = false;
2269 
2270     FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2271         if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2272             && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2273             wgs = true;
2274             break;
2275         }
2276     }
2277     if (!wgs) {
2278         return false;
2279     }
2280 
2281     bool is_other = false;
2282     bool has_gi = false;
2283 
2284     FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2285         if ((*it)->IsOther()) {
2286             is_other = true;
2287             break;
2288         } else if ((*it)->IsGi()) {
2289             has_gi = true;
2290             break;
2291         }
2292     }
2293     if (!is_other || has_gi) {
2294         return false;
2295     }
2296 
2297     return true;
2298 }
2299 
2300 
IsTSAIntermediate(const CBioseq & seq)2301 bool CValidError_imp::IsTSAIntermediate(const CBioseq& seq)
2302 {
2303     bool tsa = false;
2304 
2305     FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2306         if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2307             && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
2308             tsa = true;
2309             break;
2310         }
2311     }
2312     if (!tsa) {
2313         return false;
2314     }
2315 
2316     bool is_other = false;
2317     bool has_gi = false;
2318 
2319     FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2320         if ((*it)->IsOther()) {
2321             is_other = true;
2322             break;
2323         } else if ((*it)->IsGi()) {
2324             has_gi = true;
2325             break;
2326         }
2327     }
2328     if (!is_other || has_gi) {
2329         return false;
2330     }
2331 
2332     return true;
2333 }
2334 
2335 
ReportMissingBiosource(const CSeq_entry & se)2336 void CValidError_imp::ReportMissingBiosource(const CSeq_entry& se)
2337 {
2338     if(m_NoBioSource  &&  !m_IsPatent  &&  !m_IsPDB) {
2339         PostErr(eDiag_Error, eErr_SEQ_DESCR_NoSourceDescriptor,
2340             "No source information included on this record.", se);
2341         return;
2342     }
2343 
2344     size_t num_no_source = m_BioseqWithNoSource.size();
2345 
2346     for ( size_t i = 0; i < num_no_source; ++i ) {
2347         PostErr(eDiag_Fatal, eErr_SEQ_DESCR_NoOrgFound,
2348                 "No organism name included in the source. Other qualifiers may exist.",
2349                 *(m_BioseqWithNoSource[i]));
2350     }
2351 }
2352 
2353 
GetCDSGivenProduct(const CBioseq & seq)2354 CConstRef<CSeq_feat> CValidError_imp::GetCDSGivenProduct(const CBioseq& seq)
2355 {
2356     CConstRef<CSeq_feat> feat;
2357 
2358     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
2359 
2360     if ( bsh ) {
2361         if ( IsNT()  &&  m_TSE ) {
2362             // In case of a NT bioseq limit the search to features packaged on the
2363             // NT (we assume features have been pulled from the segments to the NT).
2364             SAnnotSelector sel(CSeqFeatData::e_Cdregion);
2365             sel.SetByProduct()
2366                 .SetLimitTSE(m_Scope->GetSeq_entryHandle(*m_TSE));
2367             CFeat_CI fi(bsh, sel);
2368             if ( fi ) {
2369                 // return the first one (should be the one packaged on the
2370                 // nuc-prot set).
2371                 feat.Reset(&(fi->GetOriginalFeature()));
2372             }
2373         } else {
2374             SAnnotSelector sel(CSeqFeatData::e_Cdregion);
2375             sel.SetByProduct();
2376             CFeat_CI fi(bsh, sel);
2377             if ( fi ) {
2378                 // return the first one (should be the one packaged on the
2379                 // nuc-prot set).
2380                 feat.Reset(&(fi->GetOriginalFeature()));
2381             }
2382         }
2383     }
2384 
2385     return feat;
2386 }
2387 
2388 
GetmRNAGivenProduct(const CBioseq & seq)2389 CConstRef<CSeq_feat> CValidError_imp::GetmRNAGivenProduct(const CBioseq& seq)
2390 {
2391     CConstRef<CSeq_feat> feat;
2392 
2393     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
2394 
2395 
2396     if ( bsh ) {
2397         // In case of a NT bioseq limit the search to features packaged on the
2398         // NT (we assume features have been pulled from the segments to the NT).
2399         CSeq_entry_Handle limit;
2400         if ( IsNT()  &&  m_TSE ) {
2401             limit = m_Scope->GetSeq_entryHandle(*m_TSE);
2402         }
2403 
2404         if (limit) {
2405             SAnnotSelector sel(CSeqFeatData::eSubtype_mRNA);
2406             sel.SetByProduct() .SetLimitTSE(limit);
2407             CFeat_CI fi(bsh, sel);
2408             if ( fi ) {
2409                 // return the first one (should be the one packaged on the
2410                 // nuc-prot set).
2411                 feat.Reset(&(fi->GetOriginalFeature()));
2412             }
2413         } else {
2414             SAnnotSelector sel(CSeqFeatData::eSubtype_mRNA);
2415             sel.SetByProduct();
2416             CFeat_CI fi(bsh, sel);
2417             if ( fi ) {
2418                 // return the first one (should be the one packaged on the
2419                 // nuc-prot set).
2420                 feat.Reset(&(fi->GetOriginalFeature()));
2421             }
2422         }
2423     }
2424 
2425     return feat;
2426 }
2427 
2428 
GetAncestor(const CBioseq & seq,CBioseq_set::EClass clss)2429 const CSeq_entry* CValidError_imp::GetAncestor
2430 (const CBioseq& seq,
2431  CBioseq_set::EClass clss)
2432 {
2433     const CSeq_entry* parent = 0;
2434     for ( parent = seq.GetParentEntry();
2435           parent != 0;
2436           parent = parent->GetParentEntry() ) {
2437         if ( parent->IsSet() ) {
2438             const CBioseq_set& set = parent->GetSet();
2439             if ( set.IsSetClass()  &&  set.GetClass() == clss ) {
2440                 break;
2441             }
2442         }
2443     }
2444     return parent;
2445 }
2446 
2447 
IsSerialNumberInComment(const string & comment)2448 bool CValidError_imp::IsSerialNumberInComment(const string& comment)
2449 {
2450     size_t pos = comment.find('[', 0);
2451     while ( pos != string::npos ) {
2452         ++pos;
2453         bool okay = true;
2454         if ( isdigit((unsigned char) comment[pos]) ) {
2455             // skip if first character after bracket is 0
2456             if (comment[pos] == '0') {
2457                 okay = false;
2458             }
2459             while ( isdigit((unsigned char) comment[pos]) ) {
2460                 ++pos;
2461             }
2462             if ( comment[pos] == ']' && okay ) {
2463                 return true;
2464             }
2465         }
2466 
2467         pos = comment.find('[', pos);
2468     }
2469     return false;
2470 }
2471 
2472 
RequireLocalProduct(const CSeq_id * sid) const2473 bool CValidError_imp::RequireLocalProduct(const CSeq_id* sid) const
2474 {
2475         // okay to have far RefSeq product, but only if genomic product set
2476     if ( sid != 0  &&  sid->IsOther() ) {
2477         if ( IsGPS() ) {
2478             return false;
2479         }
2480     }
2481     // or just a bioseq
2482     if ( GetTSE().IsSeq() ) {
2483         return false;
2484     }
2485 
2486     // or in a standalone Seq-annot
2487     if (IsStandaloneAnnot() ) {
2488         return false;
2489     }
2490     return true;
2491 }
2492 
2493 
s_CollectPubDescriptorLabels(const CSeq_entry & se,vector<TEntrezId> & pmids,vector<TEntrezId> & muids,vector<int> & serials,vector<string> & published_labels,vector<string> & unpublished_labels)2494 static void s_CollectPubDescriptorLabels (const CSeq_entry& se,
2495                                           vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
2496                                           vector<string>& published_labels, vector<string>& unpublished_labels)
2497 {
2498     FOR_EACH_SEQDESC_ON_SEQENTRY (it, se) {
2499         if ((*it)->IsPub()) {
2500             CCleanup::GetPubdescLabels ((*it)->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2501         }
2502     }
2503 
2504     if (se.IsSet()) {
2505         FOR_EACH_SEQENTRY_ON_SEQSET (it, se.GetSet()) {
2506             s_CollectPubDescriptorLabels (**it, pmids, muids, serials, published_labels, unpublished_labels);
2507         }
2508     }
2509 }
2510 
2511 
ValidateCitations(const CSeq_entry_Handle & seh)2512 void CValidError_imp::ValidateCitations (const CSeq_entry_Handle& seh)
2513 {
2514     vector<TEntrezId> pmids;
2515     vector<TEntrezId> muids;
2516     vector<int> serials;
2517     vector<string> published_labels;
2518     vector<string> unpublished_labels;
2519 
2520     // collect labels for pubs on record
2521     s_CollectPubDescriptorLabels (*(seh.GetCompleteSeq_entry()), pmids, muids, serials, published_labels, unpublished_labels);
2522 
2523     CFeat_CI feat (seh, SAnnotSelector(CSeqFeatData::e_Pub));
2524     while (feat) {
2525         CCleanup::GetPubdescLabels (feat->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2526         ++feat;
2527     }
2528 
2529     // now examine citations to determine whether they match a pub on the record
2530     CFeat_CI f (seh);
2531     while (f) {
2532         if (f->IsSetCit() && f->GetCit().IsPub()) {
2533             ITERATE (CPub_set::TPub, cit_it, f->GetCit().GetPub()) {
2534                 bool found = false;
2535 
2536                 if ((*cit_it)->IsPmid()) {
2537                     vector<TEntrezId>::iterator it = pmids.begin();
2538                     while (it != pmids.end() && !found) {
2539                         if (*it == (*cit_it)->GetPmid()) {
2540                             found = true;
2541                         }
2542                         ++it;
2543                     }
2544                     if (!found) {
2545                         PostErr (eDiag_Warning, eErr_SEQ_FEAT_FeatureCitationProblem,
2546                                  "Citation on feature refers to uid ["
2547                                  + NStr::NumericToString((*cit_it)->GetPmid().Get())
2548                                  + "] not on a publication in the record",
2549                                  f->GetOriginalFeature());
2550                     }
2551                 } else if ((*cit_it)->IsMuid()) {
2552                     vector<TEntrezId>::iterator it = muids.begin();
2553                     while (it != muids.end() && !found) {
2554                         if (*it == (*cit_it)->GetMuid()) {
2555                             found = true;
2556                         }
2557                         ++it;
2558                     }
2559                     if (!found) {
2560                         PostErr (eDiag_Warning, eErr_SEQ_FEAT_FeatureCitationProblem,
2561                                  "Citation on feature refers to uid ["
2562                                  + NStr::NumericToString((*cit_it)->GetMuid())
2563                                  + "] not on a publication in the record",
2564                                  f->GetOriginalFeature());
2565                     }
2566                 } else if ((*cit_it)->IsEquiv()) {
2567                     continue;
2568                 } else {
2569                     string label;
2570                     (*cit_it)->GetLabel(&label, CPub::eContent, true);
2571 
2572                     if (NStr::EndsWith (label, ">")) {
2573                         label = label.substr(0, label.length() - 2);
2574                     }
2575                     if(NStr::EndsWith (label, "|")) {
2576                         label = label.substr(0, label.length() - 1);
2577                     }
2578                     if (NStr::EndsWith (label, "  ")) {
2579                         label = label.substr(0, label.length() - 1);
2580                     }
2581                     size_t len = label.length();
2582                     vector<string>::iterator unpub_it = unpublished_labels.begin();
2583                     while (unpub_it != unpublished_labels.end() && !found) {
2584                         size_t it_len =(*unpub_it).length();
2585                         if (NStr::EqualNocase (*unpub_it, 0, it_len > len ? len : it_len, label)) {
2586                             found = true;
2587                         }
2588                         ++unpub_it;
2589                     }
2590                     vector<string>::iterator pub_it = published_labels.begin();
2591 
2592                     while (pub_it != published_labels.end() && !found) {
2593                         size_t it_len =(*pub_it).length();
2594                         if (NStr::EqualNocase (*pub_it, 0, it_len > len ? len : it_len, label)) {
2595                             PostErr (eDiag_Warning, eErr_SEQ_FEAT_FeatureCitationProblem,
2596                                      "Citation on feature needs to be updated to published uid",
2597                                      f->GetOriginalFeature());
2598                             found = true;
2599                         }
2600                         ++pub_it;
2601                     }
2602                     if (!found) {
2603                         PostErr (eDiag_Warning, eErr_SEQ_FEAT_FeatureCitationProblem,
2604                                  "Citation on feature refers to a publication not in the record",
2605                                  f->GetOriginalFeature());
2606                     }
2607                 }
2608             }
2609         }
2610         ++f;
2611     }
2612 }
2613 
2614 
2615 // =============================================================================
2616 //                                  Private
2617 // =============================================================================
2618 
2619 
2620 
FindNonAsciiText(const CSerialObject & obj)2621 void CValidError_imp::FindNonAsciiText (const CSerialObject& obj)
2622 {
2623     CStdTypeConstIterator<string> it(obj);
2624     for( ; it; ++it) {
2625         const string& str = *it;
2626         FOR_EACH_CHAR_IN_STRING(c_it, str) {
2627             const char& ch = *c_it;
2628             unsigned char chu = ch;
2629             if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
2630                 PostErr (eDiag_Fatal, eErr_GENERIC_NonAsciiAsn,
2631                          "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in item", obj);
2632                 break;
2633             }
2634         }
2635     }
2636 }
2637 
2638 
FindEmbeddedScript(const CSerialObject & obj)2639 void CValidError_imp::FindEmbeddedScript (const CSerialObject& obj)
2640 {
2641     class CScriptTagTextFsm : public CTextFsm<int>
2642     {
2643     public:
2644         CScriptTagTextFsm() {
2645             const char * script_tags[] = {
2646                 "<script", "<object", "<applet", "<embed", "<form",
2647                 "javascript:", "vbscript:"};
2648             ITERATE_0_IDX(idx, ArraySize(script_tags)) {
2649                 AddWord(script_tags[idx], true);
2650             }
2651             Prime();
2652         }
2653 
2654         // Returns true if the given string matches any of the strings
2655         // in the fsm anywhere.
2656         bool DoesStrHaveFsmHits(const string &str) {
2657             int state = GetInitialState();
2658             ITERATE(string, str_it, str) {
2659                 state = GetNextState(state, *str_it);
2660                 if( IsMatchFound(state) ) {
2661                     return true;
2662             }
2663         }
2664 
2665             return false;
2666     }
2667     };
2668     static CScriptTagTextFsm s_ScriptTagFsm;
2669 
2670 
2671     CStdTypeConstIterator<string> it(obj);
2672     for( ; it; ++it) {
2673         if (s_ScriptTagFsm.DoesStrHaveFsmHits(*it)) {
2674             PostErr (eDiag_Error, eErr_GENERIC_EmbeddedScript,
2675                      "Script tag found in item", obj);
2676             return;
2677     }
2678 }
2679 }
2680 
2681 
IsMixedStrands(const CSeq_loc & loc)2682 bool CValidError_imp::IsMixedStrands(const CSeq_loc& loc)
2683 {
2684     if ( SeqLocCheck(loc, m_Scope) == eSeqLocCheck_warning ) {
2685         return false;
2686     }
2687 
2688     CSeq_loc_CI curr(loc);
2689     if ( !curr ) {
2690         return false;
2691     }
2692     CSeq_loc_CI prev = curr;
2693     ++curr;
2694 
2695     while ( curr ) {
2696         ENa_strand curr_strand = curr.GetStrand();
2697         ENa_strand prev_strand = prev.GetStrand();
2698 
2699         if ( (prev_strand == eNa_strand_minus  &&
2700               curr_strand != eNa_strand_minus)   ||
2701              (prev_strand != eNa_strand_minus  &&
2702               curr_strand == eNa_strand_minus) ) {
2703             return true;
2704         }
2705 
2706         prev = curr;
2707         ++curr;
2708     }
2709 
2710     return false;
2711 }
2712 
2713 
s_SeqLocHasGI(const CSeq_loc & loc)2714 static bool s_SeqLocHasGI (const CSeq_loc& loc)
2715 {
2716     bool rval = false;
2717 
2718     for ( CSeq_loc_CI it(loc); it && !rval; ++it ) {
2719         if (it.GetSeq_id().IsGi()) {
2720             rval = true;
2721         }
2722     }
2723     return rval;
2724 }
2725 
2726 
SetTSE(const CSeq_entry_Handle & seh)2727 void CValidError_imp::SetTSE(const CSeq_entry_Handle& seh)
2728 {
2729     m_TSEH = seh;
2730     m_TSE = m_TSEH.GetCompleteSeq_entry();
2731     m_GeneCache.Clear();
2732 }
2733 
2734 
s_IsGoodTopSetClass(CBioseq_set::EClass set_class)2735 bool s_IsGoodTopSetClass(CBioseq_set::EClass set_class)
2736 {
2737     if (set_class == CBioseq_set::eClass_gen_prod_set || set_class == CBioseq_set::eClass_small_genome_set) {
2738         return true;
2739     } else {
2740         return false;
2741     }
2742 }
2743 
2744 
s_CountTopSetSiblings(const CSeq_entry & se)2745 size_t s_CountTopSetSiblings(const CSeq_entry& se)
2746 {
2747     if (se.IsSeq()) {
2748         return 1;
2749     } else if (!se.IsSet()) {
2750         return 0;
2751     }
2752     if (se.GetSet().IsSetClass()) {
2753         if (se.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot ||
2754             s_IsGoodTopSetClass(se.GetSet().GetClass())) {
2755             return 1;
2756         }
2757     }
2758     size_t count = 0;
2759     if (se.GetSet().IsSetSeq_set()) {
2760         for (auto it = se.GetSet().GetSeq_set().begin(); it != se.GetSet().GetSeq_set().end(); it++) {
2761             count += s_CountTopSetSiblings(**it);
2762         }
2763     }
2764     return count;
2765 }
2766 
2767 
Setup(const CSeq_entry_Handle & seh)2768 void CValidError_imp::Setup(const CSeq_entry_Handle& seh)
2769 {
2770     // "Save" the Seq-entry
2771     SetTSE(seh);
2772 
2773     m_NumTopSetSiblings = s_CountTopSetSiblings(*(seh.GetCompleteSeq_entry()));
2774     m_Scope.Reset(&m_TSEH.GetScope());
2775 
2776     // If no Pubs/BioSource in CSeq_entry, post only one error
2777     CTypeConstIterator<CPub> pub(ConstBegin(*m_TSE));
2778     m_NoPubs = !pub;
2779     while (pub && !pub->IsSub()) {
2780         ++pub;
2781     }
2782     m_NoCitSubPubs = !pub;
2783 
2784     CTypeConstIterator<CBioSource> src(ConstBegin(*m_TSE));
2785     m_NoBioSource = !src;
2786 
2787     // Look for genomic product set
2788     for (CTypeConstIterator <CBioseq_set> si (*m_TSE); si; ++si) {
2789         if (si->IsSetClass ()) {
2790             if (si->GetClass () == CBioseq_set::eClass_gen_prod_set) {
2791                 m_IsGPS = true;
2792             }
2793             if (si->GetClass () == CBioseq_set::eClass_small_genome_set) {
2794                 m_IsSmallGenomeSet = true;
2795             }
2796         }
2797     }
2798 
2799     // Examine all Seq-ids on Bioseqs
2800     for (CTypeConstIterator <CBioseq> bi (*m_TSE); bi; ++bi) {
2801         FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, *bi) {
2802             const CSeq_id& sid = **sid_itr;
2803             const CTextseq_id* tsid = sid.GetTextseq_Id();
2804             CSeq_id::E_Choice typ = sid.Which();
2805             switch (typ) {
2806                 case CSeq_id::e_not_set:
2807                     break;
2808                 case CSeq_id::e_Local:
2809                     break;
2810                 case CSeq_id::e_Gibbsq:
2811                     break;
2812                 case CSeq_id::e_Gibbmt:
2813                     break;
2814                 case CSeq_id::e_Giim:
2815                     break;
2816                 case CSeq_id::e_Genbank:
2817                     m_IsINSDInSep = true;
2818                     m_IsGB = true;
2819                     m_IsGED = true;
2820                     break;
2821                 case CSeq_id::e_Embl:
2822                     m_IsINSDInSep = true;
2823                     m_IsGED = true;
2824                     m_IsEmbl = true;
2825                     break;
2826                 case CSeq_id::e_Pir:
2827                     break;
2828                 case CSeq_id::e_Swissprot:
2829                     break;
2830                 case CSeq_id::e_Patent:
2831                     m_IsPatent = true;
2832                     break;
2833                 case CSeq_id::e_Other:
2834                     m_IsRefSeq = true;
2835                     // and do RefSeq subclasses up front as well
2836                     if (sid.GetOther().IsSetAccession()) {
2837                         string acc = sid.GetOther().GetAccession().substr(0, 3);
2838                         if (acc == "NC_") {
2839                             m_IsNC = true;
2840                         } else if (acc == "NG_") {
2841                             m_IsNG = true;
2842                         } else if (acc == "NM_") {
2843                             m_IsNM = true;
2844                         } else if (acc == "NP_") {
2845                             m_IsNP = true;
2846                         } else if (acc == "NR_") {
2847                             m_IsNR = true;
2848                           } else if (acc == "NZ_") {
2849                               m_IsNZ = true;
2850                         } else if (acc == "NS_") {
2851                             m_IsNS = true;
2852                         } else if (acc == "NT_") {
2853                             m_IsNT = true;
2854                         } else if (acc == "NW_") {
2855                             m_IsNW = true;
2856                         } else if (acc == "WP_") {
2857                             m_IsWP = true;
2858                         } else if (acc == "XR_") {
2859                             m_IsXR = true;
2860                         }
2861                     }
2862                     break;
2863                 case CSeq_id::e_General:
2864                     if ((*bi).IsAa() && !sid.GetGeneral().IsSkippable()) {
2865                         m_ProteinHasGeneralID = true;
2866                     }
2867                     break;
2868                 case CSeq_id::e_Gi:
2869                     m_IsGI = true;
2870                     m_HasGiOrAccnVer = true;
2871                     break;
2872                 case CSeq_id::e_Ddbj:
2873                     m_IsINSDInSep = true;
2874                     m_IsGED = true;
2875                     m_IsDdbj = true;
2876                     break;
2877                 case CSeq_id::e_Prf:
2878                     break;
2879                 case CSeq_id::e_Pdb:
2880                     m_IsPDB = true;
2881                     break;
2882                 case CSeq_id::e_Tpg:
2883                     m_IsINSDInSep = true;
2884                     break;
2885                 case CSeq_id::e_Tpe:
2886                     m_IsTPE = true;
2887                     m_IsINSDInSep = true;
2888                     break;
2889                 case CSeq_id::e_Tpd:
2890                     m_IsINSDInSep = true;
2891                     break;
2892                 case CSeq_id::e_Gpipe:
2893                     m_IsGpipe = true;
2894                     break;
2895                 default:
2896                     break;
2897             }
2898             if ( tsid && tsid->IsSetAccession() && tsid->IsSetVersion() && tsid->GetVersion() >= 1 ) {
2899                 m_HasGiOrAccnVer = true;
2900             }
2901             if (typ != CSeq_id::e_Local && typ != CSeq_id::e_General) {
2902                 m_IsLocalGeneralOnly = false;
2903             }
2904         }
2905     }
2906 
2907     // search all source descriptors for genomic source
2908     for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_Source);
2909          desc_ci && !m_IsGenomic;
2910          ++desc_ci) {
2911          if (desc_ci->GetSource().IsSetGenome()
2912              && desc_ci->GetSource().GetGenome() == CBioSource::eGenome_genomic) {
2913              m_IsGenomic = true;
2914          }
2915     }
2916 
2917     // search genome build and annotation pipeline user object descriptors
2918     for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_User);
2919          desc_ci && !m_IsGpipe;
2920          ++desc_ci) {
2921          if ( desc_ci->GetUser().IsSetType() ) {
2922              const CUser_object& obj = desc_ci->GetUser();
2923              const CObject_id& oi = obj.GetType();
2924              if ( ! oi.IsStr() ) continue;
2925              if ( NStr::CompareNocase(oi.GetStr(), "GenomeBuild") == 0 ) {
2926                  m_IsGpipe = true;
2927              } else if ( NStr::CompareNocase(oi.GetStr(), "StructuredComment") == 0 ) {
2928                  ITERATE (CUser_object::TData, field, obj.GetData()) {
2929                      if ((*field)->IsSetLabel() && (*field)->GetLabel().IsStr()) {
2930                          if (NStr::EqualNocase((*field)->GetLabel().GetStr(), "Annotation Pipeline")) {
2931                              if (NStr::EqualNocase((*field)->GetData().GetStr(), "NCBI eukaryotic genome annotation pipeline")) {
2932                                  m_IsGpipe = true;
2933                              }
2934                          }
2935                      }
2936                  }
2937              }
2938          }
2939     }
2940 
2941     // examine features for location gi, product gi, and locus tag
2942     for (CFeat_CI feat_ci (seh);
2943          feat_ci && (!m_FeatLocHasGI || !m_ProductLocHasGI || !m_GeneHasLocusTag);
2944          ++feat_ci) {
2945         if (s_SeqLocHasGI(feat_ci->GetLocation())) {
2946             m_FeatLocHasGI = true;
2947         }
2948         if (feat_ci->IsSetProduct() && s_SeqLocHasGI(feat_ci->GetProduct())) {
2949             m_ProductLocHasGI = true;
2950         }
2951         if (feat_ci->IsSetData() && feat_ci->GetData().IsGene()
2952             && feat_ci->GetData().GetGene().IsSetLocus_tag()
2953             && !NStr::IsBlank (feat_ci->GetData().GetGene().GetLocus_tag())) {
2954             m_GeneHasLocusTag = true;
2955         }
2956     }
2957 
2958     if ( m_PrgCallback ) {
2959         m_NumAlign = 0;
2960         for (CTypeConstIterator<CSeq_align> i(*m_TSE); i; ++i) {
2961             m_NumAlign++;
2962         }
2963         m_NumAnnot = 0;
2964         for (CTypeConstIterator<CSeq_annot> i(*m_TSE); i; ++i) {
2965             m_NumAnnot++;
2966         }
2967         m_NumBioseq = 0;
2968         for (CTypeConstIterator<CBioseq> i(*m_TSE); i; ++i) {
2969             m_NumBioseq++;
2970         }
2971         m_NumBioseq_set = 0;
2972         for (CTypeConstIterator<CBioseq_set> i(*m_TSE); i; ++i) {
2973             m_NumBioseq_set++;
2974         }
2975         m_NumDesc = 0;
2976         for (CTypeConstIterator<CSeqdesc> i(*m_TSE); i; ++i) {
2977             m_NumDesc++;
2978         }
2979         m_NumDescr = 0;
2980         for (CTypeConstIterator<CSeq_descr> i(*m_TSE); i; ++i) {
2981             m_NumDescr++;
2982         }
2983         m_NumFeat = 0;
2984         for (CTypeConstIterator<CSeq_feat> i(*m_TSE); i; ++i) {
2985             m_NumFeat++;
2986         }
2987         m_NumGraph = 0;
2988         for (CTypeConstIterator<CSeq_graph> i(*m_TSE); i; ++i) {
2989             m_NumGraph++;
2990         }
2991         m_PrgInfo.m_Total = m_NumAlign + m_NumAnnot + m_NumBioseq +
2992             m_NumBioseq_set + m_NumDesc + m_NumDescr + m_NumFeat +
2993             m_NumGraph;
2994     }
2995 
2996     if (CNcbiApplication::Instance()->GetProgramDisplayName() == "table2asn") {
2997         m_IsTbl2Asn = true;
2998     }
2999 }
3000 
3001 
SetScope(const CSeq_entry & se)3002 void CValidError_imp::SetScope(const CSeq_entry& se)
3003 {
3004     m_Scope.Reset(new CScope(*m_ObjMgr));
3005     m_Scope->AddTopLevelSeqEntry(*const_cast<CSeq_entry*>(&se));
3006     m_Scope->AddDefaults();
3007 }
3008 
3009 
Setup(const CSeq_annot_Handle & sah)3010 void CValidError_imp::Setup(const CSeq_annot_Handle& sah)
3011 {
3012     m_IsStandaloneAnnot = true;
3013     if (! m_Scope) {
3014         m_Scope.Reset(& sah.GetScope());
3015     }
3016     m_SeqAnnot = sah.GetCompleteSeq_annot();
3017     m_TSE.Reset(new CSeq_entry); // set a dummy Seq-entry
3018     m_TSEH = m_Scope->AddTopLevelSeqEntry(*m_TSE);
3019 }
3020 
3021 
Setup(const CBioseq & seq)3022 CSeq_entry_Handle CValidError_imp::Setup(const CBioseq& seq)
3023 {
3024     m_Scope.Reset(new CScope(*m_ObjMgr));
3025     CRef<CSeq_entry> tmp_entry(new CSeq_entry());
3026     tmp_entry->SetSeq().Assign(seq);
3027     m_TSE.Reset(tmp_entry);
3028     m_TSEH = m_Scope->AddTopLevelSeqEntry(*m_TSE);
3029     Setup(m_TSEH);
3030     return m_TSEH;
3031 }
3032 
3033 
ValidateSeqLocIds(const CSeq_loc & loc,const CSerialObject & obj)3034 void CValidError_imp::ValidateSeqLocIds
3035 (const CSeq_loc& loc,
3036  const CSerialObject& obj)
3037 {
3038     for ( CSeq_loc_CI lit(loc); lit; ++lit ) {
3039         const CSeq_id& id1 = lit.GetSeq_id();
3040         CSeq_loc_CI  lit2 = lit;
3041         for ( ++lit2; lit2; ++lit2 ) {
3042             const CSeq_id& id2 = lit2.GetSeq_id();
3043             if ( IsSameBioseq(id1, id2, m_Scope)  &&  !id1.Match(id2) ) {
3044                 PostErr(eDiag_Warning,
3045                     eErr_SEQ_FEAT_DifferntIdTypesInSeqLoc,
3046                     "Two ids refer to the same bioseq but are of "
3047                     "different type", obj);
3048             }
3049         }
3050         if (IsTemporary(id1)) {
3051             PostErr(eDiag_Critical, eErr_SEQ_INST_BadSeqIdFormat,
3052                 "Feature locations should not use Seq-ids that will be stripped during ID load", obj);
3053         }
3054     }
3055     if (BadMultipleSequenceLocation(loc, *m_Scope)) {
3056         PostErr(eDiag_Error, eErr_SEQ_FEAT_BadLocation,
3057             "Feature location intervals should all be on the same sequence", obj);
3058     }
3059 }
3060 
3061 
IsInOrganelleSmallGenomeSet(const CSeq_id & id,CScope & scope)3062 bool CValidError_imp::IsInOrganelleSmallGenomeSet(const CSeq_id& id, CScope& scope)
3063 {
3064     CBioseq_Handle bsh = scope.GetBioseqHandle(id);
3065     if (!bsh) {
3066         // can't fetch bioseq, can't tell, assume not
3067         return false;
3068     }
3069     CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
3070     if (!src || !src->GetSource().IsSetGenome() || !IsOrganelle(src->GetSource().GetGenome())) {
3071         // not an organelle location
3072         return false;
3073     }
3074     CBioseq_set_Handle set = bsh.GetParentBioseq_set();
3075     while (set) {
3076         if (!set.IsSetClass()) {
3077             // class not set - quit
3078             break;
3079         } else if (set.GetClass() == CBioseq_set::eClass_small_genome_set) {
3080             return true;
3081         } else if (set.GetClass() == CBioseq_set::eClass_nuc_prot) {
3082             // look at parent
3083             set = set.GetParentBioseq_set();
3084         } else {
3085             break;
3086         }
3087     }
3088     return false;
3089 }
3090 
3091 // all ids in a location should point to the same sequence, unless the sequences are
3092 // in an organelle small genome set
BadMultipleSequenceLocation(const CSeq_loc & loc,CScope & scope)3093 bool CValidError_imp::BadMultipleSequenceLocation(const CSeq_loc& loc, CScope& scope)
3094 {
3095     CSeq_loc_CI lit(loc);
3096     const CSeq_id& id1 = lit.GetSeq_id();
3097 
3098     bool in_organelle_small_genome_set = IsInOrganelleSmallGenomeSet(id1, scope);
3099 
3100     ++lit;
3101     while (lit) {
3102         const CSeq_id& id2 = lit.GetSeq_id();
3103         if (in_organelle_small_genome_set && !IsInOrganelleSmallGenomeSet(id2, scope)) {
3104             // if one sequence in small genome set and other not, this is bad
3105             return true;
3106         }
3107         if (!id2.Match(id1) && !IsSameBioseq(id1, id2, &scope) && !in_organelle_small_genome_set) {
3108             return true;
3109         }
3110         ++lit;
3111     }
3112     return false;
3113 }
3114 
3115 
x_IsFarFetchFailure(const CSeq_loc & loc)3116 bool CValidError_imp::x_IsFarFetchFailure (const CSeq_loc& loc)
3117 {
3118     if (!IsFarFetchMRNAproducts() && !IsFarFetchCDSproducts()
3119         && IsFarLocation(loc, GetTSEH())) {
3120         return true;
3121     } else {
3122         return false;
3123     }
3124 }
3125 
3126 
3127 //LCOV_EXCL_START
3128 // not used by asnvalidate, used by external programs
GetTSANStretchErrors(const CSeq_entry_Handle & se)3129 bool CValidError_imp::GetTSANStretchErrors(const CSeq_entry_Handle& se)
3130 {
3131     bool rval = false;
3132     Setup(se);
3133     CValidError_bioseq bioseq_validator(*this);
3134     CBioseq_CI bi(se, CSeq_inst::eMol_na);
3135     while (bi) {
3136         rval |= bioseq_validator.GetTSANStretchErrors(*(bi->GetCompleteBioseq()));
3137         ++bi;
3138     }
3139     return rval;
3140 }
3141 
3142 
GetTSANStretchErrors(const CBioseq & seq)3143 bool CValidError_imp::GetTSANStretchErrors(const CBioseq& seq)
3144 {
3145     CSeq_entry_Handle seh = Setup(seq);
3146     CValidError_bioseq bioseq_validator(*this);
3147     return bioseq_validator.GetTSANStretchErrors(*(seh.GetSeq().GetCompleteBioseq()));
3148 }
3149 
3150 
GetTSACDSOnMinusStrandErrors(const CSeq_entry_Handle & se)3151 bool CValidError_imp::GetTSACDSOnMinusStrandErrors (const CSeq_entry_Handle& se)
3152 {
3153     bool rval = false;
3154     Setup(se);
3155     CValidError_feat feat_validator(*this);
3156     CFeat_CI fi(se);
3157     while (fi) {
3158         CBioseq_Handle bsh = se.GetScope().GetBioseqHandle(fi->GetLocation());
3159         if (bsh) {
3160             rval |= feat_validator.GetTSACDSOnMinusStrandErrors(*(fi->GetSeq_feat()), *(bsh.GetCompleteBioseq()));
3161         }
3162         ++fi;
3163     }
3164 
3165     return rval;
3166 }
3167 
3168 
GetTSACDSOnMinusStrandErrors(const CSeq_feat & f,const CBioseq & seq)3169 bool CValidError_imp::GetTSACDSOnMinusStrandErrors (const CSeq_feat& f, const CBioseq& seq)
3170 {
3171     CSeq_entry_Handle seh = Setup(seq);
3172     CValidError_feat feat_validator(*this);
3173     return feat_validator.GetTSACDSOnMinusStrandErrors(f, *(seh.GetSeq().GetCompleteBioseq()));
3174 }
3175 
3176 
GetTSAConflictingBiomolTechErrors(const CSeq_entry_Handle & se)3177 bool CValidError_imp::GetTSAConflictingBiomolTechErrors (const CSeq_entry_Handle& se)
3178 {
3179     bool rval = false;
3180     Setup(se);
3181     CValidError_bioseq bioseq_validator(*this);
3182     CBioseq_CI bi(se, CSeq_inst::eMol_na);
3183     while (bi) {
3184         rval |= bioseq_validator.GetTSAConflictingBiomolTechErrors(*(bi->GetCompleteBioseq()));
3185         ++bi;
3186     }
3187     return rval;
3188 }
3189 
3190 
GetTSAConflictingBiomolTechErrors(const CBioseq & seq)3191 bool CValidError_imp::GetTSAConflictingBiomolTechErrors (const CBioseq& seq)
3192 {
3193     CSeq_entry_Handle seh = Setup(seq);
3194     CValidError_bioseq bioseq_validator(*this);
3195     return bioseq_validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
3196 }
3197 //LCOV_EXCL_STOP
3198 
3199 
x_GetTaxonService()3200 ITaxon3* CValidError_imp::x_GetTaxonService()
3201 {
3202     if (m_taxon == NULL) {
3203         //Impossible to reach code, as c'tor requires non-null taxon service
3204         throw runtime_error("Taxon service not defined by CValidator");
3205     }
3206     return m_taxon;
3207 }
3208 
3209 
// Message texts for the individual barcode test failures.  Each name is
// formed as k<TestName> because ADD_BARCODE_ERR builds the identifier by
// token pasting; kTooManyNs is used directly by x_DoBarcodeTests.
const string kTooShort = "Too Short";
const string kMissingPrimers = "Missing Primers";
const string kMissingCountry = "Missing Country";
const string kMissingVoucher = "Missing Voucher";
const string kBadCollectionDate = "Bad Collection Date";
const string kTooManyNs = "Too Many Ns";
const string kMissingOrderAssignment = "Missing Order Assignment";
const string kLowTrace = "Low Trace";
const string kFrameShift = "Frame Shift";
const string kStructuredVoucher = "Structured Voucher";
3220 
3221 #define ADD_BARCODE_ERR(TestName) \
3222     PostErr(eDiag_Warning, eErr_GENERIC_Barcode##TestName, k##TestName, sq); \
3223     if (!msg.empty()) { \
3224         msg += ","; \
3225     } \
3226     msg += k##TestName;
3227 
x_DoBarcodeTests(CSeq_entry_Handle seh)3228 void CValidError_imp::x_DoBarcodeTests(CSeq_entry_Handle seh)
3229 {
3230     TBarcodeResults results = GetBarcodeValues(seh);
3231     for (auto r : results) {
3232         const CBioseq& sq = *(r.bsh.GetCompleteBioseq());
3233         if (BarcodeTestFails(r)){
3234             string msg;
3235             if (r.length) {
3236                 ADD_BARCODE_ERR(TooShort)
3237             }
3238             if (r.primers) {
3239                 ADD_BARCODE_ERR(MissingPrimers)
3240             }
3241             if (r.country) {
3242                 ADD_BARCODE_ERR(MissingCountry)
3243             }
3244             if (r.voucher) {
3245                 ADD_BARCODE_ERR(MissingVoucher)
3246             }
3247             if (!r.percent_n.empty()) {
3248                 PostErr(eDiag_Warning, eErr_GENERIC_BarcodeTooManyNs, kTooManyNs + ":" + r.percent_n, sq);
3249                 if (!msg.empty()) {
3250                     msg += ",";
3251                 }
3252                 msg += kTooManyNs + ":" + r.percent_n;
3253             }
3254             if (r.collection_date) {
3255                 ADD_BARCODE_ERR(BadCollectionDate)
3256             }
3257             if (r.order_assignment) {
3258                 ADD_BARCODE_ERR(MissingOrderAssignment)
3259             }
3260             if (r.low_trace) {
3261                 ADD_BARCODE_ERR(LowTrace)
3262             }
3263             if (r.frame_shift) {
3264                 ADD_BARCODE_ERR(FrameShift)
3265             }
3266             if (!r.structured_voucher) {
3267                 ADD_BARCODE_ERR(StructuredVoucher)
3268             }
3269             PostErr(eDiag_Info, eErr_GENERIC_BarcodeTestFails, "FAIL (" + msg + ")", sq);
3270         } else {
3271             PostErr(eDiag_Info, eErr_GENERIC_BarcodeTestPasses, "PASS", sq);
3272         }
3273     }
3274 }
3275 
3276 
3277 // =============================================================================
3278 //                         CValidError_base Implementation
3279 // =============================================================================
3280 
3281 
CValidError_base(CValidError_imp & imp)3282 CValidError_base::CValidError_base(CValidError_imp& imp) :
3283     m_Imp(imp), m_Scope(imp.GetScope())
3284 {
3285 }
3286 
3287 
~CValidError_base()3288 CValidError_base::~CValidError_base()
3289 {
3290 }
3291 
3292 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSerialObject & obj)3293 void CValidError_base::PostErr
3294 (EDiagSev sv,
3295  EErrType et,
3296  const string& msg,
3297  const CSerialObject& obj)
3298 {
3299     m_Imp.PostErr(sv, et, msg, obj);
3300 }
3301 
3302 
3303 //void CValidError_base::PostErr
3304 //(EDiagSev sv,
3305 // EErrType et,
3306 // const string& msg,
3307 // TDesc ds)
3308 //{
3309 //    m_Imp.PostErr(sv, et, msg, ds);
3310 //}
3311 
3312 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_feat & ft)3313 void CValidError_base::PostErr
3314 (EDiagSev sv,
3315  EErrType et,
3316  const string& msg,
3317  const CSeq_feat& ft)
3318 {
3319     m_Imp.PostErr(sv, et, msg, ft);
3320 }
3321 
3322 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CBioseq & sq)3323 void CValidError_base::PostErr
3324 (EDiagSev sv,
3325  EErrType et,
3326  const string& msg,
3327  const CBioseq& sq)
3328 {
3329     m_Imp.PostErr(sv, et, msg, sq);
3330 }
3331 
3332 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_entry & ctx,const CSeqdesc & ds)3333 void CValidError_base::PostErr
3334 (EDiagSev sv,
3335  EErrType et,
3336  const string& msg,
3337  const CSeq_entry& ctx,
3338  const CSeqdesc& ds)
3339 {
3340     m_Imp.PostErr(sv, et, msg, ctx, ds);
3341 }
3342 
3343 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CBioseq_set & set)3344 void CValidError_base::PostErr
3345 (EDiagSev sv,
3346  EErrType et,
3347  const string& msg,
3348  const CBioseq_set& set)
3349 {
3350     m_Imp.PostErr(sv, et, msg, set);
3351 }
3352 
3353 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_annot & annot)3354 void CValidError_base::PostErr
3355 (EDiagSev sv,
3356  EErrType et,
3357  const string& msg,
3358  const CSeq_annot& annot)
3359 {
3360     m_Imp.PostErr(sv, et, msg, annot);
3361 }
3362 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_graph & graph)3363 void CValidError_base::PostErr
3364 (EDiagSev sv,
3365  EErrType et,
3366  const string& msg,
3367  const CSeq_graph& graph)
3368 {
3369     m_Imp.PostErr(sv, et, msg, graph);
3370 }
3371 
3372 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CBioseq & sq,const CSeq_graph & graph)3373 void CValidError_base::PostErr
3374 (EDiagSev sv,
3375  EErrType et,
3376  const string& msg,
3377  const CBioseq& sq,
3378  const CSeq_graph& graph)
3379 {
3380     m_Imp.PostErr(sv, et, msg, sq, graph);
3381 }
3382 
3383 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_align & align)3384 void CValidError_base::PostErr
3385 (EDiagSev sv,
3386  EErrType et,
3387  const string& msg,
3388  const CSeq_align& align)
3389 {
3390     m_Imp.PostErr(sv, et, msg, align);
3391 }
3392 
3393 
PostErr(EDiagSev sv,EErrType et,const string & msg,const CSeq_entry & entry)3394 void CValidError_base::PostErr
3395 (EDiagSev sv,
3396  EErrType et,
3397  const string& msg,
3398  const CSeq_entry& entry)
3399 {
3400     m_Imp.PostErr(sv, et, msg, entry);
3401 }
3402 
3403 CCacheImpl &
GetCache(void)3404 CValidError_base::GetCache(void)
3405 {
3406     return m_Imp.GetCache();
3407 }
3408 
3409 
s_HasTopSetSiblings(CSeq_entry_Handle seh)3410 bool s_HasTopSetSiblings(CSeq_entry_Handle seh)
3411 {
3412     CSeq_entry_Handle parent = seh.GetParentEntry();
3413     if (!parent || !parent.IsSet()) {
3414         return false;
3415     }
3416     CConstRef<CBioseq_set> pset = parent.GetSet().GetCompleteBioseq_set();
3417     if (!pset) {
3418         return false;
3419     }
3420     if (pset->IsSetSeq_set() && pset->GetSeq_set().size() > 10) {
3421         return true;
3422     } else {
3423         return s_HasTopSetSiblings(parent);
3424     }
3425 }
3426 
3427 
GetAppropriateXrefParent(CSeq_entry_Handle seh)3428 CSeq_entry_Handle CValidError_base::GetAppropriateXrefParent(CSeq_entry_Handle seh)
3429 {
3430     CSeq_entry_Handle appropriate_parent;
3431 
3432     CSeq_entry_Handle np;
3433     CSeq_entry_Handle gps;
3434     if (seh.IsSet() && seh.GetSet().IsSetClass()) {
3435         if (seh.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
3436             np = seh;
3437         } else if (s_IsGoodTopSetClass(seh.GetSet().GetClass())) {
3438             gps = seh;
3439         }
3440     } else if (seh.IsSeq()) {
3441         CSeq_entry_Handle p = seh.GetParentEntry();
3442         if (p && p.IsSet() && p.GetSet().IsSetClass()) {
3443             if (p.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
3444                 np = p;
3445             } else if (s_IsGoodTopSetClass(p.GetSet().GetClass())) {
3446                 gps = p;
3447             }
3448         }
3449     }
3450     if (gps) {
3451         appropriate_parent = gps;
3452     } else if (np) {
3453         CSeq_entry_Handle gp = np.GetParentEntry();
3454         if (gp && gp.IsSet() && gp.GetSet().IsSetClass() &&
3455             s_IsGoodTopSetClass(gp.GetSet().GetClass())) {
3456             appropriate_parent = gp;
3457         } else {
3458             appropriate_parent = np;
3459         }
3460     } else {
3461         appropriate_parent = seh;
3462     }
3463     return appropriate_parent;
3464 }
3465 
3466 
3467 const CCacheImpl::CPubdescInfo &
GetPubdescToInfo(CConstRef<CPubdesc> pub)3468 CCacheImpl::GetPubdescToInfo(
3469     CConstRef<CPubdesc> pub)
3470 {
3471     // first, try to receive from cache
3472     CCacheImpl::TPubdescCache::const_iterator find_iter =
3473         m_pubdescCache.find(pub);
3474     if( find_iter != m_pubdescCache.end() ) {
3475         return *find_iter->second;
3476     }
3477 
3478     CRef<CPubdescInfo> pInfo(new CPubdescInfo);
3479     CCleanup::GetPubdescLabels(
3480         *pub, pInfo->m_pmids, pInfo->m_muids,
3481         pInfo->m_serials, pInfo->m_published_labels,
3482         pInfo->m_unpublished_labels);
3483     m_pubdescCache[pub] = pInfo;
3484     return *pInfo;
3485 }
3486 
3487 bool
operator <(const SFeatKey & rhs) const3488 CCacheImpl::SFeatKey::operator<(
3489     const SFeatKey & rhs) const
3490 {
3491     if( feat_type != rhs.feat_type ) {
3492         return feat_type < rhs.feat_type;
3493     } else if( feat_subtype != rhs.feat_subtype ) {
3494         return feat_subtype < rhs.feat_subtype;
3495      } else {
3496         return bioseq_h < rhs.bioseq_h;
3497     }
3498 }
3499 
3500 bool
operator ==(const SFeatKey & rhs) const3501 CCacheImpl::SFeatKey::operator==(
3502     const SFeatKey & rhs) const
3503 {
3504     return (feat_type == rhs.feat_type) &&
3505         (feat_subtype == rhs.feat_subtype) && (bioseq_h == rhs.bioseq_h);
3506 }
3507 
3508 const CCacheImpl::TFeatValue &
GetFeatFromCache(const CCacheImpl::SFeatKey & featKey)3509 CCacheImpl::GetFeatFromCache(
3510     const CCacheImpl::SFeatKey & featKey)
3511 {
3512     // check common case where already in the cache
3513     TFeatCache::iterator find_iter = m_featCache.find(featKey);
3514     if( find_iter != m_featCache.end() ) {
3515         return find_iter->second;
3516     }
3517 
3518     // check if bioseq already processed, but had no entry requested above
3519     SFeatKey bioseq_check_key(
3520         kAnyFeatType, kAnyFeatSubtype, featKey.bioseq_h );
3521     TFeatCache::const_iterator bioseq_find_iter =
3522         m_featCache.find(bioseq_check_key);
3523     if( bioseq_find_iter != m_featCache.end() ) {
3524         const static TFeatValue kEmptyFeatValue;
3525         // bioseq was already processed,
3526         // it just happened to not have an entry here
3527         return kEmptyFeatValue;
3528     }
3529 
3530     // bioseq never added to cache, so calculate that now
3531 
3532     // to avoid expensive constructions of CFeat_CI's,
3533     // we iterate through all the seqs on
3534     // the bioseq and load them into the cache.
3535     CFeat_CI feat_ci(featKey.bioseq_h);
3536     for( ; feat_ci; ++feat_ci ) {
3537         SFeatKey inner_feat_key(
3538             feat_ci->GetFeatType(), feat_ci->GetFeatSubtype(), featKey.bioseq_h);
3539 
3540         m_featCache[inner_feat_key].push_back(*feat_ci);
3541 
3542         // also add "don't care" entries for partial searches
3543         // (e.g. if caller just wants to search on type but not on
3544         // subtype they can set subtype to kAnyFeatSubtype)
3545         SFeatKey any_type_key = inner_feat_key;
3546         any_type_key.feat_type = kAnyFeatType;
3547         m_featCache[any_type_key].push_back(*feat_ci);
3548 
3549         SFeatKey any_subtype_key = inner_feat_key;
3550         any_subtype_key.feat_subtype = kAnyFeatSubtype;
3551         m_featCache[any_subtype_key].push_back(*feat_ci);
3552 
3553         // for when the caller wants all feats on a bioseq
3554         SFeatKey any_type_or_subtype_key = inner_feat_key;
3555         any_type_or_subtype_key.feat_type = kAnyFeatType;
3556         any_type_or_subtype_key.feat_subtype = kAnyFeatSubtype;
3557         m_featCache[any_type_or_subtype_key].push_back(*feat_ci);
3558     }
3559 
3560     // in case a bioseq has no features, we add a dummy key just to
3561     // remember that so we don't use CFeat_CI again on the same bioseq
3562     m_featCache[bioseq_check_key]; // gets default val
3563 
3564     return m_featCache[featKey];
3565 }
3566 
3567 AutoPtr<CCacheImpl::TFeatValue>
GetFeatFromCacheMulti(const vector<SFeatKey> & featKeys)3568 CCacheImpl::GetFeatFromCacheMulti(
3569         const vector<SFeatKey> &featKeys)
3570 {
3571     if( featKeys.empty() ) {
3572         return new TFeatValue;
3573     }
3574 
3575     // all featKeys must have the same bioseq
3576     const CBioseq_Handle & bioseq_h = featKeys[0].bioseq_h;
3577     ITERATE(vector<SFeatKey>, feat_it, featKeys) {
3578         if( feat_it->bioseq_h != bioseq_h ) {
3579             throw runtime_error("GetFeatFromCacheMulti must be called with only 1 bioseq in its args");
3580         }
3581     }
3582 
3583     // set prevents dups
3584     set<TFeatValue::value_type> set_of_feats;
3585 
3586     // combine the answers from every key into the set
3587     ITERATE(vector<SFeatKey>, key_it, featKeys  ) {
3588         const TFeatValue & feat_value = GetFeatFromCache(*key_it);
3589         copy(BEGIN_COMMA_END(feat_value), inserter(
3590                  set_of_feats, set_of_feats.begin()));
3591     }
3592 
3593     // go through every feature on the bioseq and remember any that match what's in the set
3594     // (The purpose of this step is to return the feats in the same
3595     // order they were on the original bioseq.  In the future, we may
3596     // consider adding a flag to avoid sorting for time purposes).
3597     AutoPtr<TFeatValue> answer(new TFeatValue);
3598     SFeatKey all_feats_key(
3599         kAnyFeatType, kAnyFeatSubtype, bioseq_h);
3600     const TFeatValue & all_feats_vec = GetFeatFromCache(all_feats_key);
3601     ITERATE(TFeatValue, feat_it, all_feats_vec) {
3602         if( set_of_feats.find(*feat_it) != set_of_feats.end() ) {
3603             answer->push_back(*feat_it);
3604         }
3605     }
3606 
3607     return answer;
3608 }
3609 
3610 
3611 //LCOV_EXCL_START
3612 //not used
3613 bool
operator <(const SFeatStrKey & rhs) const3614 CCacheImpl::SFeatStrKey::operator<(const SFeatStrKey & rhs) const
3615 {
3616     if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3617         return m_eFeatKeyStr < rhs.m_eFeatKeyStr;
3618     }
3619     if( m_bioseq != rhs.m_bioseq ) {
3620         return m_bioseq < rhs.m_bioseq;
3621     }
3622     return s_QuickStringLess(m_feat_str, rhs.m_feat_str);
3623 }
3624 
3625 
3626 bool
operator ==(const SFeatStrKey & rhs) const3627 CCacheImpl::SFeatStrKey::operator==(const SFeatStrKey & rhs) const
3628 {
3629     if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3630         return false;
3631     }
3632     if( m_bioseq != rhs.m_bioseq ) {
3633         return false;
3634     }
3635     return (m_feat_str == rhs.m_feat_str);
3636 }
3637 
3638 
// Looks up features by a (key kind, Bioseq, string) key; currently only
// genes are indexed, by label and by locus tag.
//
// feat_str_key  what to look for; its m_bioseq may be a specific Bioseq or
//               kAnyBioseq.
// tse_arg       TSE to index; when it is empty the TSE of
//               feat_str_key.m_bioseq is used instead.
//
// Returns a reference to the cached feature list, or kEmptyFeatValue when
// the key is not present.
//
// NOTE: the index is built only while m_featStrKeyToFeatsCache is empty, so
// it reflects whatever TSE/Bioseq was supplied on the call that filled it.
// If a scan adds no entries the map stays empty and the scan is repeated on
// the next call.
const CCacheImpl::TFeatValue &
CCacheImpl::GetFeatStrKeyToFeats(
    const SFeatStrKey & feat_str_key, const CTSE_Handle & tse_arg)
{
    const CBioseq_Handle & search_bsh = feat_str_key.m_bioseq;

    // caller must give us something to work with
    _ASSERT(search_bsh || tse_arg);

    // prefer the explicitly supplied TSE; otherwise take the Bioseq's TSE
    const CTSE_Handle & tse = (tse_arg ? tse_arg : search_bsh.GetTSE_Handle());

    // load cache if empty
    if( m_featStrKeyToFeatsCache.empty() ) {
        // (for now just indexes genes, but more may be added in the future)
        SAnnotSelector sel(CSeqFeatData::e_Gene);
        AutoPtr<CFeat_CI> p_gene_ci;
        // if we have TSE, get all features on it; otherwise, just get
        // the features from the bioseq
        if( tse ) {
            p_gene_ci.reset(new CFeat_CI(tse, sel));
        } else {
            p_gene_ci.reset(new CFeat_CI(search_bsh, sel));
        }
        CFeat_CI & gene_ci = *p_gene_ci; // for convenience

        for( ; gene_ci; ++gene_ci ) {
            // NOTE(review): tse is dereferenced here even on the path where
            // the "no TSE" branch above was taken -- confirm that branch is
            // never reached with genes to iterate.
            CBioseq_Handle bsh = tse.GetScope().GetBioseqHandle(gene_ci->GetLocation());
            string label;
            const CGene_ref & gene_ref = gene_ci->GetData().GetGene();

            // for each one, add an entry for using given Bioseq and the
            // kAnyBioseq (so users can search on any bioseq)
            gene_ref.GetLabel(&label);
            SFeatStrKey label_key(eFeatKeyStr_Label, bsh, label);
            m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
            // the second (kAnyBioseq) entry is added only when the gene's
            // Bioseq was resolved
            if( bsh ) {
                label_key.m_bioseq = kAnyBioseq;
                m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
            }

            // a gene without a locus tag is indexed under the empty string
            const string & locus_tag = (
                gene_ref.IsSetLocus_tag() ? gene_ref.GetLocus_tag() :
                kEmptyStr);
            SFeatStrKey locus_tag_key(eFeatKeyStr_LocusTag, bsh, locus_tag);
            m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
            if( bsh ) {
                locus_tag_key.m_bioseq = kAnyBioseq;
                m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
            }
        }
    }

    // get from cache, if possible
    TFeatStrKeyToFeatsCache::const_iterator find_iter =
        m_featStrKeyToFeatsCache.find(feat_str_key);
    if( find_iter != m_featStrKeyToFeatsCache.end() ) {
        return find_iter->second;
    } else {
        // nothing found
        return kEmptyFeatValue;
    }
}
3701 
3702 
3703 const CCacheImpl::TFeatToBioseqValue &
GetBioseqsOfFeatCache(const CCacheImpl::TFeatToBioseqKey & feat_to_bioseq_key,const CTSE_Handle & tse)3704 CCacheImpl::GetBioseqsOfFeatCache(
3705     const CCacheImpl::TFeatToBioseqKey & feat_to_bioseq_key,
3706     const CTSE_Handle & tse)
3707 {
3708     // load cache if empty
3709     if( m_featToBioseqCache.empty() ) {
3710         CBioseq_CI bioseq_ci(tse);
3711         for( ; bioseq_ci; ++bioseq_ci ) {
3712             CFeat_CI feat_ci(*bioseq_ci);
3713             for( ;  feat_ci; ++feat_ci ) {
3714                 m_featToBioseqCache[*feat_ci].insert(*bioseq_ci);
3715             }
3716         }
3717     }
3718 
3719     // we're being given the map to a feature, so we should've loaded
3720     // at least one feature when we loaded the cache
3721     _ASSERT( ! m_featToBioseqCache.empty() );
3722 
3723     // load from the cache
3724     TFeatToBioseqCache::const_iterator find_iter =
3725         m_featToBioseqCache.find(feat_to_bioseq_key);
3726     if( find_iter != m_featToBioseqCache.end() ) {
3727         return find_iter->second;
3728     } else {
3729         const static  TFeatToBioseqValue kEmptyFeatToBioseqCache;
3730         return kEmptyFeatToBioseqCache;
3731     }
3732 }
3733 //LCOV_EXCL_STOP
3734 
3735 const CCacheImpl::TIdToBioseqValue &
GetIdToBioseq(const CCacheImpl::TIdToBioseqKey & key,const CTSE_Handle & tse)3736 CCacheImpl::GetIdToBioseq(
3737     const CCacheImpl::TIdToBioseqKey & key,
3738     const CTSE_Handle & tse)
3739 {
3740     _ASSERT(tse);
3741 
3742     // load cache if empty
3743     if( m_IdToBioseqCache.empty() ) {
3744         CBioseq_CI bioseq_ci(tse);
3745         for( ; bioseq_ci; ++bioseq_ci ) {
3746             const CBioseq_Handle::TId & ids = bioseq_ci->GetId();
3747             ITERATE(CBioseq_Handle::TId, id_it, ids) {
3748                 m_IdToBioseqCache[id_it->GetSeqId()] = *bioseq_ci;
3749             }
3750         }
3751     }
3752 
3753     // there should be at least one Bioseq otherwise there wouldn't
3754     // be anything to validate.
3755     _ASSERT(! m_IdToBioseqCache.empty());
3756 
3757     TIdToBioseqCache::const_iterator find_iter = m_IdToBioseqCache.find(key);
3758     if( find_iter != m_IdToBioseqCache.end() ) {
3759         return find_iter->second;
3760     } else {
3761         static const TIdToBioseqValue s_EmptyResult;
3762         return s_EmptyResult;
3763     }
3764 }
3765 
3766 CBioseq_Handle
GetBioseqHandleFromLocation(CScope * scope,const CSeq_loc & loc,const CTSE_Handle & tse)3767 CCacheImpl::GetBioseqHandleFromLocation(
3768     CScope *scope, const CSeq_loc& loc, const CTSE_Handle & tse)
3769 {
3770     _ASSERT(scope || tse);
3771     if( ! tse  || (!tse.GetTopLevelEntry().IsSet() && !tse.GetTopLevelEntry().IsSeq())) {
3772         // fall back on old style
3773         return BioseqHandleFromLocation(scope, loc);
3774     }
3775 
3776 
3777     for ( CSeq_loc_CI citer (loc); citer; ++citer) {
3778         CConstRef<CSeq_id> id(&citer.GetSeq_id());
3779         const TIdToBioseqValue & bioseq = GetIdToBioseq(id, tse);
3780         if( bioseq ) {
3781             return bioseq;
3782         }
3783     }
3784 
3785     // nothing found, so fall back on old style if possible
3786     if( scope ) {
3787         return BioseqHandleFromLocation(scope, loc);
3788     } else {
3789         return kEmptyBioseqHandle;
3790     }
3791 }
3792 
3793 
// Empties every cache held by this object, so that the next lookup of each
// kind rebuilds its data from scratch.
void CCacheImpl::Clear()
{
    m_pubdescCache.clear();            // Pubdesc -> label info
    m_featCache.clear();               // (type, subtype, Bioseq) -> features
    m_featStrKeyToFeatsCache.clear();  // (kind, Bioseq, string) -> features
    m_featToBioseqCache.clear();       // feature -> Bioseqs
    m_IdToBioseqCache.clear();         // Seq-id -> Bioseq
}
3802 
3803 
3804 END_SCOPE(validator)
3805 END_SCOPE(objects)
3806 END_NCBI_SCOPE
3807