1 /* $Id: validerror_bioseqset.cpp 632834 2021-06-08 16:47:06Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27 *
28 * File Description:
29 * validation of bioseq_set
30 * .......
31 *
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <objtools/validator/validerror_desc.hpp>
36 #include <objtools/validator/validerror_descr.hpp>
37 #include <objtools/validator/validerror_annot.hpp>
38 #include <objtools/validator/validerror_bioseq.hpp>
39 #include <objtools/validator/validerror_bioseqset.hpp>
40 #include <objtools/validator/validerror_base.hpp>
41 #include <objmgr/util/sequence.hpp>
42 #include <objects/seqset/Seq_entry.hpp>
43 #include <objects/seqset/Bioseq_set.hpp>
44 #include <objects/misc/sequence_macros.hpp>
45 #include <objmgr/seqdesc_ci.hpp>
46 #include <objmgr/seq_annot_ci.hpp>
47
48
49 BEGIN_NCBI_SCOPE
50 BEGIN_SCOPE(objects)
51 BEGIN_SCOPE(validator)
52 using namespace sequence;
53
54
55 // =============================================================================
56 // Public
57 // =============================================================================
58
59
CValidError_bioseqset(CValidError_imp & imp)60 CValidError_bioseqset::CValidError_bioseqset(CValidError_imp& imp) :
61 CValidError_base(imp) , m_AnnotValidator(imp) , m_DescrValidator(imp) , m_BioseqValidator(imp)
62 {
63 }
64
65
~CValidError_bioseqset(void)66 CValidError_bioseqset::~CValidError_bioseqset(void)
67 {
68 }
69
70
ValidateBioseqSet(const CBioseq_set & seqset)71 void CValidError_bioseqset::ValidateBioseqSet(
72 const CBioseq_set& seqset)
73 {
74 int protcnt = 0;
75 int nuccnt = 0;
76 int segcnt = 0;
77
78 // Validate Set Contents
79 FOR_EACH_SEQENTRY_ON_SEQSET (se_list_it, seqset) {
80 const CSeq_entry& se = **se_list_it;
81 if ( se.IsSet() ) {
82 const CBioseq_set& set = se.GetSet();
83
84 // validate member set
85 ValidateBioseqSet (set);
86 } else if (se.IsSeq()) {
87 const CBioseq& seq = se.GetSeq();
88 // Validate Member Seq
89 m_BioseqValidator.ValidateBioseq(seq);
90 }
91 }
92 // note - need to do this with an iterator, so that we count sequences in subsets
93 CTypeConstIterator<CBioseq> seqit(ConstBegin(seqset));
94 for (; seqit; ++seqit) {
95
96 if ( seqit->IsAa() ) {
97 protcnt++;
98 } else if ( seqit->IsNa() ) {
99 nuccnt++;
100 }
101
102 if (seqit->GetInst().GetRepr() == CSeq_inst::eRepr_seg) {
103 segcnt++;
104 }
105 }
106
107 switch ( seqset.GetClass() ) {
108 case CBioseq_set::eClass_not_set:
109 PostErr(eDiag_Warning, eErr_SEQ_PKG_BioseqSetClassNotSet,
110 "Bioseq_set class not set", seqset);
111 break;
112 case CBioseq_set::eClass_nuc_prot:
113 ValidateNucProtSet(seqset, nuccnt, protcnt, segcnt);
114 break;
115 case CBioseq_set::eClass_segset:
116 ValidateSegSet(seqset, segcnt);
117 break;
118 case CBioseq_set::eClass_parts:
119 ValidatePartsSet(seqset);
120 break;
121 case CBioseq_set::eClass_genbank:
122 ValidateGenbankSet(seqset);
123 break;
124 case CBioseq_set::eClass_pop_set:
125 ValidatePopSet(seqset);
126 break;
127 case CBioseq_set::eClass_mut_set:
128 case CBioseq_set::eClass_phy_set:
129 case CBioseq_set::eClass_eco_set:
130 case CBioseq_set::eClass_wgs_set:
131 case CBioseq_set::eClass_small_genome_set:
132 ValidatePhyMutEcoWgsSet(seqset);
133 break;
134 case CBioseq_set::eClass_gen_prod_set:
135 ValidateGenProdSet(seqset);
136 break;
137 case CBioseq_set::eClass_conset:
138 if (!m_Imp.IsRefSeq()) {
139 PostErr (eDiag_Error, eErr_SEQ_PKG_ConSetProblem,
140 "Set class should not be conset", seqset);
141 }
142 break;
143 /*
144 case CBioseq_set::eClass_other:
145 PostErr(eDiag_Critical, eErr_SEQ_PKG_GenomicProductPackagingProblem,
146 "Genomic product set class incorrectly set to other", seqset);
147 break;
148 */
149 default:
150 if ( nuccnt == 0 && protcnt == 0 ) {
151 PostErr(eDiag_Warning, eErr_SEQ_PKG_EmptySet,
152 "No Bioseqs in this set", seqset);
153 }
154 break;
155 }
156
157 SetShouldNotHaveMolInfo(seqset);
158 ValidateSetTitle(seqset);
159 ValidateSetElements(seqset);
160
161 if (seqset.IsSetClass()
162 && (seqset.GetClass() == CBioseq_set::eClass_pop_set
163 || seqset.GetClass() == CBioseq_set::eClass_mut_set
164 || seqset.GetClass() == CBioseq_set::eClass_phy_set
165 || seqset.GetClass() == CBioseq_set::eClass_eco_set
166 || seqset.GetClass() == CBioseq_set::eClass_wgs_set
167 || seqset.GetClass() == CBioseq_set::eClass_small_genome_set)) {
168 CheckForImproperlyNestedSets(seqset);
169 }
170
171 if (seqset.IsSetClass()
172 && (seqset.GetClass() == CBioseq_set::eClass_genbank
173 || seqset.GetClass() == CBioseq_set::eClass_pop_set
174 || seqset.GetClass() == CBioseq_set::eClass_mut_set
175 || seqset.GetClass() == CBioseq_set::eClass_phy_set
176 || seqset.GetClass() == CBioseq_set::eClass_eco_set
177 || seqset.GetClass() == CBioseq_set::eClass_wgs_set
178 || seqset.GetClass() == CBioseq_set::eClass_small_genome_set)) {
179 ShouldHaveNoDblink(seqset);
180 }
181
182 // validate annots
183 FOR_EACH_SEQANNOT_ON_SEQSET (annot_it, seqset) {
184 m_AnnotValidator.ValidateSeqAnnot (**annot_it);
185 m_AnnotValidator.ValidateSeqAnnotContext (**annot_it, seqset);
186 }
187 if (seqset.IsSetDescr()) {
188 CBioseq_set_Handle bsh = m_Scope->GetBioseq_setHandle(seqset);
189 if (bsh) {
190 CSeq_entry_Handle ctx = bsh.GetParentEntry();
191 if (ctx) {
192 m_DescrValidator.ValidateSeqDescr (seqset.GetDescr(), *(ctx.GetCompleteSeq_entry()));
193 }
194 }
195 }
196 }
197
198
199 // =============================================================================
200 // Private
201 // =============================================================================
202
203
IsMrnaProductInGPS(const CBioseq & seq)204 bool CValidError_bioseqset::IsMrnaProductInGPS(const CBioseq& seq)
205 {
206 if ( m_Imp.IsGPS() ) {
207 CFeat_CI mrna(
208 m_Scope->GetBioseqHandle(seq),
209 SAnnotSelector(CSeqFeatData::e_Rna)
210 .SetByProduct());
211 return (bool)mrna;
212 }
213 return true;
214 }
215
216
IsCDSProductInGPS(const CBioseq & seq,const CBioseq_set & gps)217 bool CValidError_bioseqset::IsCDSProductInGPS(const CBioseq& seq, const CBioseq_set& gps)
218 {
219 // there should be a coding region on the contig whose product is seq
220 if (gps.IsSetSeq_set() && gps.GetSeq_set().size() > 0
221 && gps.GetSeq_set().front()->IsSeq()) {
222 CBioseq_Handle contig = m_Scope->GetBioseqHandle(gps.GetSeq_set().front()->GetSeq());
223 CBioseq_Handle prot = m_Scope->GetBioseqHandle(seq);
224 SAnnotSelector sel;
225 sel.SetByProduct(true);
226 CFeat_CI cds(prot, sel);
227 while (cds) {
228 CBioseq_Handle cds_seq = m_Scope->GetBioseqHandle(cds->GetLocation());
229 if (cds_seq == contig) {
230 return true;
231 }
232 ++cds;
233 }
234 }
235
236 return false;
237 }
238
239
ValidateNucProtSet(const CBioseq_set & seqset,int nuccnt,int protcnt,int segcnt)240 void CValidError_bioseqset::ValidateNucProtSet
241 (const CBioseq_set& seqset,
242 int nuccnt,
243 int protcnt,
244 int segcnt)
245 {
246 if ( nuccnt == 0 ) {
247 PostErr(eDiag_Error, eErr_SEQ_PKG_NucProtProblem,
248 "No nucleotides in nuc-prot set", seqset);
249 } else if ( nuccnt > 1 && segcnt != 1) {
250 PostErr(eDiag_Critical, eErr_SEQ_PKG_NucProtProblem,
251 "Multiple unsegmented nucleotides in nuc-prot set", seqset);
252 }
253 if ( protcnt == 0 ) {
254 PostErr(eDiag_Error, eErr_SEQ_PKG_NucProtProblem,
255 "No proteins in nuc-prot set", seqset);
256 }
257
258 int prot_biosource = 0;
259 bool is_nm = false;
260
261 sequence::CDeflineGenerator defline_generator;
262
263 FOR_EACH_SEQENTRY_ON_SEQSET (se_list_it, seqset) {
264 if ( (*se_list_it)->IsSeq() ) {
265 const CBioseq& seq = (*se_list_it)->GetSeq();
266
267
268 bool hasMetaGenomeSource = false;
269 CConstRef<CSeqdesc> closest_biosource = seq.GetClosestDescriptor(CSeqdesc::e_Source);
270 if (closest_biosource) {
271 const CBioSource& src = closest_biosource->GetSource();
272 FOR_EACH_ORGMOD_ON_BIOSOURCE (omd_itr, src) {
273 const COrgMod& omd = **omd_itr;
274 if (omd.IsSetSubname() && omd.IsSetSubtype() && omd.GetSubtype() == COrgMod::eSubtype_metagenome_source) {
275 hasMetaGenomeSource = true;
276 break;
277 }
278 }
279 }
280
281 FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
282 const CSeqdesc& desc = **it;
283 if (desc.Which() == CSeqdesc::e_User && desc.GetUser().IsSetType()) {
284 const CUser_object& usr = desc.GetUser();
285 const CObject_id& oi = usr.GetType();
286 if (oi.IsStr() && NStr::EqualCase(oi.GetStr(), "DBLink")) {
287 PostErr(eDiag_Critical, eErr_SEQ_DESCR_DBLinkProblem, "DBLink user object should not be on a Bioseq", seq);
288 }
289 }
290 }
291
292 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
293 CBioseq_set_Handle gps = GetGenProdSetParent(bsh);
294 if (seq.IsNa()) {
295 if (gps && !IsMrnaProductInGPS(seq) ) {
296 PostErr(eDiag_Warning,
297 eErr_SEQ_PKG_GenomicProductPackagingProblem,
298 "Nucleotide bioseq should be product of mRNA "
299 "feature on contig, but is not",
300 seq);
301 }
302 FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
303 if ((*id_it)->IsOther() && (*id_it)->GetOther().IsSetAccession()) {
304 const string& acc = (*id_it)->GetOther().GetAccession();
305 if (NStr::StartsWith(acc, "NM_")) {
306 is_nm = true;
307 }
308 }
309 }
310 } else if ( seq.IsAa() ) {
311 if (gps && !IsCDSProductInGPS(seq, *(gps.GetCompleteBioseq_set())) ) {
312 PostErr(eDiag_Warning,
313 eErr_SEQ_PKG_GenomicProductPackagingProblem,
314 "Protein bioseq should be product of CDS "
315 "feature on contig, but is not",
316 seq);
317 }
318 string instantiated;
319 FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
320 if ((*it)->IsSource()) {
321 prot_biosource++;
322 }
323 if ((*it)->IsTitle()) {
324 instantiated = (*it)->GetTitle();
325 }
326 }
327 // look for instantiated protein titles that don't match
328
329 if (!NStr::IsBlank(instantiated)) {
330 string generated = defline_generator.GenerateDefline(seq, *m_Scope, sequence::CDeflineGenerator::fIgnoreExisting);
331 if (!NStr::EqualNocase(instantiated, generated)) {
332 generated = defline_generator.GenerateDefline(seq, *m_Scope,
333 sequence::CDeflineGenerator::fIgnoreExisting | sequence::CDeflineGenerator::fAllProteinNames);
334 if (NStr::StartsWith (instantiated, "PREDICTED: ", NStr::eNocase)) {
335 instantiated.erase (0, 11);
336 } else if (NStr::StartsWith (instantiated, "UNVERIFIED: ", NStr::eNocase)) {
337 instantiated.erase (0, 12);
338 } else if (NStr::StartsWith (instantiated, "PUTATIVE PSEUDOGENE: ", NStr::eNocase)) {
339 instantiated.erase (0, 21);
340 }
341 if (NStr::StartsWith (generated, "PREDICTED: ", NStr::eNocase)) {
342 generated.erase (0, 11);
343 } else if (NStr::StartsWith (generated, "UNVERIFIED: ", NStr::eNocase)) {
344 generated.erase (0, 12);
345 } else if (NStr::StartsWith (generated, "PUTATIVE PSEUDOGENE: ", NStr::eNocase)) {
346 generated.erase (0, 21);
347 }
348 //okay if instantiated title has single trailing period
349 if (instantiated.length() == generated.length() + 1 && NStr::EndsWith(instantiated, ".")
350 && !NStr::EndsWith(instantiated, "..")) {
351 generated += ".";
352 }
353 if (!NStr::EqualNocase(instantiated, generated) && !NStr::EqualNocase("MAG " + instantiated, generated)) {
354 if (hasMetaGenomeSource && NStr::EqualNocase("MAG: " + instantiated, generated)) {
355 // allow missing MAG with no other prefix
356 } else if (hasMetaGenomeSource && NStr::EqualNocase("MAG " + instantiated, generated)) {
357 // allow missing MAG followed by another prefix
358 } else {
359 PostErr(eDiag_Warning, eErr_SEQ_DESCR_InconsistentProteinTitle,
360 "Instantiated protein title does not match automatically "
361 "generated title", seq);
362 }
363 }
364 }
365 }
366 }
367 }
368
369 if ( !(*se_list_it)->IsSet() )
370 continue;
371
372 const CBioseq_set& set = (*se_list_it)->GetSet();
373 if ( set.GetClass() != CBioseq_set::eClass_segset ) {
374
375 const CEnumeratedTypeValues* tv =
376 CBioseq_set::GetTypeInfo_enum_EClass();
377 const string& set_class = tv->FindName(set.GetClass(), true);
378
379 PostErr(eDiag_Critical, eErr_SEQ_PKG_NucProtNotSegSet,
380 "Nuc-prot Bioseq-set contains wrong Bioseq-set, "
381 "its class is \"" + set_class + "\".", set);
382 break;
383 }
384 }
385 if (prot_biosource > 1) {
386 PostErr (eDiag_Warning, eErr_SEQ_DESCR_BioSourceOnProtein,
387 "Nuc-prot set has " + NStr::IntToString (prot_biosource)
388 + " proteins with a BioSource descriptor", seqset);
389 } else if (prot_biosource > 0) {
390 PostErr (eDiag_Warning, eErr_SEQ_DESCR_BioSourceOnProtein,
391 "Nuc-prot set has 1 protein with a BioSource descriptor", seqset);
392 }
393
394 bool has_source = false;
395 bool has_title = false;
396 bool has_refgenetracking = false;
397 FOR_EACH_DESCRIPTOR_ON_SEQSET (it, seqset) {
398 if ((*it)->IsSource()
399 && (*it)->GetSource().IsSetOrg()
400 && (*it)->GetSource().GetOrg().IsSetTaxname()
401 && !NStr::IsBlank ((*it)->GetSource().GetOrg().GetTaxname())) {
402 has_source = true;
403 } else if ((*it)->IsTitle()) {
404 has_title = true;
405 } else if ((*it)->IsUser()
406 && (*it)->GetUser().IsRefGeneTracking()) {
407 has_refgenetracking = true;
408 }
409 /*
410 if (has_title && has_source) {
411 break;
412 }
413 */
414 }
415
416 if (!has_source) {
417 // error if does not have source and is not genprodset
418 CBioseq_set_Handle gps = GetGenProdSetParent (m_Scope->GetBioseq_setHandle (seqset));
419 if (!gps) {
420 PostErr (eDiag_Warning, eErr_SEQ_DESCR_BioSourceMissing,
421 "Nuc-prot set does not contain expected BioSource descriptor", seqset);
422 }
423 }
424
425 if (has_title) {
426 PostErr (eDiag_Warning, eErr_SEQ_PKG_NucProtSetHasTitle,
427 "Nuc-prot set should not have title descriptor", seqset);
428 }
429
430 if (has_refgenetracking && (! is_nm)) {
431 PostErr (eDiag_Error, eErr_SEQ_DESCR_RefGeneTrackingOnNucProtSet,
432 "Nuc-prot set should not have RefGeneTracking user object", seqset);
433 }
434 }
435
436
CheckForInconsistentBiomols(const CBioseq_set & seqset)437 void CValidError_bioseqset::CheckForInconsistentBiomols (const CBioseq_set& seqset)
438 {
439 if (!seqset.IsSetClass()) {
440 return;
441 }
442
443 CTypeConstIterator<CMolInfo> miit(ConstBegin(seqset));
444 const CMolInfo* mol_info = 0;
445
446 for (; miit; ++miit) {
447 if (!miit->IsSetBiomol() || miit->GetBiomol() == CMolInfo::eBiomol_peptide) {
448 continue;
449 }
450 if (mol_info == 0) {
451 mol_info = &(*miit);
452 } else if (mol_info->GetBiomol() != miit->GetBiomol() ) {
453 if (seqset.GetClass() == CBioseq_set::eClass_pop_set
454 || seqset.GetClass() == CBioseq_set::eClass_eco_set
455 || seqset.GetClass() == CBioseq_set::eClass_mut_set
456 || seqset.GetClass() == CBioseq_set::eClass_phy_set
457 || seqset.GetClass() == CBioseq_set::eClass_wgs_set
458 || seqset.GetClass() == CBioseq_set::eClass_small_genome_set) {
459 PostErr(eDiag_Warning, eErr_SEQ_PKG_InconsistentMoltypeSet,
460 "Pop/phy/mut/eco set contains inconsistent moltype",
461 seqset);
462 }
463 break;
464 }
465 } // for
466
467 }
468
469
ValidateSegSet(const CBioseq_set & seqset,int segcnt)470 void CValidError_bioseqset::ValidateSegSet(const CBioseq_set& seqset, int segcnt)
471 {
472 if ( segcnt == 0 ) {
473 PostErr(eDiag_Error, eErr_SEQ_PKG_SegSetProblem,
474 "No segmented Bioseq in segset", seqset);
475 }
476
477 CSeq_inst::EMol mol = CSeq_inst::eMol_not_set;
478 CSeq_inst::EMol seq_inst_mol;
479
480 FOR_EACH_SEQENTRY_ON_SEQSET (se_list_it, seqset) {
481 if ( (*se_list_it)->IsSeq() ) {
482 const CSeq_inst& seq_inst = (*se_list_it)->GetSeq().GetInst();
483
484 if ( mol == CSeq_inst::eMol_not_set ||
485 mol == CSeq_inst::eMol_other ) {
486 mol = seq_inst.GetMol();
487 } else if ( (seq_inst_mol = seq_inst.GetMol()) != CSeq_inst::eMol_other) {
488 if ( seq_inst.IsNa() != CSeq_inst::IsNa(mol) ) {
489 PostErr(eDiag_Critical, eErr_SEQ_PKG_SegSetMixedBioseqs,
490 "Segmented set contains mixture of nucleotides"
491 " and proteins", seqset);
492 break;
493 }
494 }
495 } else if ( (*se_list_it)->IsSet() ) {
496 const CBioseq_set& set = (*se_list_it)->GetSet();
497
498 if ( set.IsSetClass() &&
499 set.GetClass() != CBioseq_set::eClass_parts ) {
500 const CEnumeratedTypeValues* tv =
501 CBioseq_set::GetTypeInfo_enum_EClass();
502 const string& set_class_str =
503 tv->FindName(set.GetClass(), true);
504
505 PostErr(eDiag_Critical, eErr_SEQ_PKG_SegSetNotParts,
506 "Segmented set contains wrong Bioseq-set, "
507 "its class is \"" + set_class_str + "\".", set);
508 break;
509 }
510 } // else if
511 } // iterate
512
513 CheckForInconsistentBiomols (seqset);
514 }
515
516
ValidatePartsSet(const CBioseq_set & seqset)517 void CValidError_bioseqset::ValidatePartsSet(const CBioseq_set& seqset)
518 {
519 CSeq_inst::EMol mol = CSeq_inst::eMol_not_set;
520 CSeq_inst::EMol seq_inst_mol;
521
522 FOR_EACH_SEQENTRY_ON_SEQSET (se_list_it, seqset) {
523 if ( (*se_list_it)->IsSeq() ) {
524 const CSeq_inst& seq_inst = (*se_list_it)->GetSeq().GetInst();
525
526 if ( mol == CSeq_inst::eMol_not_set ||
527 mol == CSeq_inst::eMol_other ) {
528 mol = seq_inst.GetMol();
529 } else {
530 seq_inst_mol = seq_inst.GetMol();
531 if ( seq_inst_mol != CSeq_inst::eMol_other) {
532 if ( seq_inst.IsNa() != CSeq_inst::IsNa(mol) ) {
533 PostErr(eDiag_Critical, eErr_SEQ_PKG_PartsSetMixedBioseqs,
534 "Parts set contains mixture of nucleotides "
535 "and proteins", seqset);
536 }
537 }
538 }
539 } else if ( (*se_list_it)->IsSet() ) {
540 const CBioseq_set& set = (*se_list_it)->GetSet();
541 const CEnumeratedTypeValues* tv =
542 CBioseq_set::GetTypeInfo_enum_EClass();
543 const string& set_class_str =
544 tv->FindName(set.GetClass(), true);
545
546 PostErr(eDiag_Critical, eErr_SEQ_PKG_PartsSetHasSets,
547 "Parts set contains unwanted Bioseq-set, "
548 "its class is \"" + set_class_str + "\".", set);
549 } // else if
550 } // for
551 }
552
553
ValidateGenbankSet(const CBioseq_set & seqset)554 void CValidError_bioseqset::ValidateGenbankSet(const CBioseq_set& seqset)
555 {
556 }
557
558
ValidateSetTitle(const CBioseq_set & seqset)559 void CValidError_bioseqset::ValidateSetTitle(const CBioseq_set& seqset)
560 {
561 bool has_title = false;
562 bool needs_title = seqset.NeedsDocsumTitle();
563 if (seqset.IsSetDescr()) {
564 for (auto it : seqset.GetDescr().Get()) {
565 if (it->IsTitle()) {
566 if (!needs_title) {
567 CSeq_entry* parent = seqset.GetParentEntry();
568 if (parent) {
569 PostErr(eDiag_Error, eErr_SEQ_DESCR_TitleNotAppropriateForSet,
570 "Only Pop/Phy/Mut/Eco sets should have titles",
571 *parent, *it);
572 } else {
573 PostErr(eDiag_Error, eErr_SEQ_DESCR_TitleNotAppropriateForSet,
574 "Only Pop/Phy/Mut/Eco sets should have titles",
575 seqset);
576 }
577 }
578 has_title = true;
579 }
580 }
581 }
582
583
584 if (needs_title && !has_title && (m_Imp.IsRefSeq() || m_Imp.IsEmbl() || m_Imp.IsDdbj() || m_Imp.IsGenbank())) {
585 PostErr(eDiag_Warning, eErr_SEQ_PKG_MissingSetTitle,
586 "Pop/Phy/Mut/Eco set does not have title",
587 seqset);
588 }
589 }
590
591
ValidateSetElements(const CBioseq_set & seqset)592 void CValidError_bioseqset::ValidateSetElements(const CBioseq_set& seqset)
593 {
594 if (!seqset.IsSetClass()) {
595 return;
596 }
597 if (seqset.GetClass() == CBioseq_set::eClass_eco_set ||
598 seqset.GetClass() == CBioseq_set::eClass_phy_set ||
599 seqset.GetClass() == CBioseq_set::eClass_pop_set ||
600 seqset.GetClass() == CBioseq_set::eClass_mut_set) {
601
602 if (!seqset.IsSetSeq_set() || seqset.GetSeq_set().size() == 0) {
603 PostErr(eDiag_Warning, eErr_SEQ_PKG_EmptySet,
604 "Pop/Phy/Mut/Eco set has no components",
605 seqset);
606 } else if (seqset.GetSeq_set().size() == 1) {
607 bool has_alignment = false;
608 CSeq_annot_CI annot_it (m_Scope->GetBioseq_setHandle(seqset));
609 while (annot_it && !has_alignment) {
610 if (annot_it->IsAlign()) {
611 has_alignment = true;
612 }
613 ++annot_it;
614 }
615 if (!has_alignment) {
616 PostErr(eDiag_Warning, eErr_SEQ_PKG_SingleItemSet,
617 "Pop/Phy/Mut/Eco set has only one component and no alignments",
618 seqset);
619 }
620 }
621 }
622 if (m_Imp.IsIndexerVersion()) {
623 if (seqset.GetClass() == CBioseq_set::eClass_eco_set ||
624 seqset.GetClass() == CBioseq_set::eClass_phy_set ||
625 seqset.GetClass() == CBioseq_set::eClass_pop_set ||
626 seqset.GetClass() == CBioseq_set::eClass_mut_set) {
627 CBioseq_CI b_i(m_Scope->GetBioseq_setHandle(seqset));
628 while (b_i) {
629 if (b_i->IsNa()) {
630 const CBioseq& seq = *(b_i->GetCompleteBioseq());
631 bool has_title = false;
632 FOR_EACH_DESCRIPTOR_ON_BIOSEQ (d_i, seq) {
633 if ((*d_i)->IsTitle()) {
634 has_title = true;
635 break;
636 }
637 }
638 if (!has_title && (m_Imp.IsRefSeq() || m_Imp.IsEmbl() || m_Imp.IsDdbj() || m_Imp.IsGenbank())) {
639 PostErr(eDiag_Warning, eErr_SEQ_PKG_ComponentMissingTitle,
640 "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title",
641 seq);
642 }
643 }
644 ++b_i;
645 }
646 }
647 }
648 }
649
650
SetShouldNotHaveMolInfo(const CBioseq_set & seqset)651 void CValidError_bioseqset::SetShouldNotHaveMolInfo(const CBioseq_set& seqset)
652 {
653 string class_name;
654 switch (seqset.GetClass()) {
655 case CBioseq_set::eClass_pop_set:
656 class_name = "Pop set";
657 break;
658 case CBioseq_set::eClass_mut_set:
659 class_name = "Mut set";
660 break;
661 case CBioseq_set::eClass_genbank:
662 class_name = "Genbank set";
663 break;
664 case CBioseq_set::eClass_phy_set:
665 case CBioseq_set::eClass_wgs_set:
666 case CBioseq_set::eClass_eco_set:
667 class_name = "Phy/eco/wgs set";
668 break;
669 case CBioseq_set::eClass_gen_prod_set:
670 class_name = "GenProd set";
671 break;
672 case CBioseq_set::eClass_small_genome_set:
673 class_name = "Small genome set";
674 break;
675 case CBioseq_set::eClass_nuc_prot:
676 class_name = "Nuc-prot set";
677 break;
678 default:
679 return;
680 break;
681 }
682
683 FOR_EACH_DESCRIPTOR_ON_SEQSET (it, seqset) {
684 if ((*it)->IsMolinfo()) {
685 PostErr(eDiag_Warning, eErr_SEQ_PKG_MisplacedMolInfo,
686 class_name + " has MolInfo on set", seqset);
687 return;
688 }
689 }
690 }
691
692
ValidatePopSet(const CBioseq_set & seqset)693 void CValidError_bioseqset::ValidatePopSet(const CBioseq_set& seqset)
694 {
695 static const string sp = " sp. ";
696
697 if (m_Imp.IsRefSeq()) {
698 PostErr (eDiag_Critical, eErr_SEQ_PKG_RefSeqPopSet,
699 "RefSeq record should not be a Pop-set", seqset);
700 }
701
702 CTypeConstIterator<CBioseq> seqit(ConstBegin(seqset));
703 string first_taxname;
704 bool is_first = true;
705 for (; seqit; ++seqit) {
706 string taxname;
707 CBioseq_Handle bsh = m_Scope->GetBioseqHandle (*seqit);
708 // Will get the first biosource either from the descriptor
709 // or feature.
710 CSeqdesc_CI d(bsh, CSeqdesc::e_Source);
711 if (d) {
712 if (d->GetSource().IsSetOrg() && d->GetSource().GetOrg().IsSetTaxname()) {
713 taxname = d->GetSource().GetOrg().GetTaxname();
714 }
715 } else {
716 CFeat_CI f(bsh, CSeqFeatData::e_Biosrc);
717 if (f && f->GetData().GetBiosrc().IsSetOrg() && f->GetData().GetBiosrc().GetOrg().IsSetTaxname()) {
718 taxname = f->GetData().GetBiosrc().GetOrg().GetTaxname();
719 }
720 }
721
722 if (is_first) {
723 first_taxname = taxname;
724 is_first = false;
725 continue;
726 }
727
728 // Make sure all the taxnames in the set are the same.
729 if ( NStr::CompareNocase(first_taxname, taxname) == 0 ) {
730 continue;
731 }
732
733 // drops severity if first mismatch is same up to sp.
734 EDiagSev sev = eDiag_Error;
735 SIZE_TYPE pos = NStr::Find(taxname, sp);
736 if ( pos != NPOS ) {
737 SIZE_TYPE len = pos + sp.length();
738 if ( NStr::strncasecmp(first_taxname.c_str(),
739 taxname.c_str(),
740 len) == 0 ) {
741 sev = eDiag_Warning;
742 }
743 }
744 // drops severity if one name is subset of the other
745 SIZE_TYPE comp_len = min (taxname.length(), first_taxname.length());
746 if (NStr::EqualCase(taxname, 0, comp_len, first_taxname)) {
747 sev = eDiag_Warning;
748 }
749
750 PostErr(sev, eErr_SEQ_DESCR_InconsistentTaxNameSet,
751 "Population set contains inconsistent organism names.",
752 seqset);
753 break;
754 }
755 CheckForInconsistentBiomols (seqset);
756 }
757
758
ValidatePhyMutEcoWgsSet(const CBioseq_set & seqset)759 void CValidError_bioseqset::ValidatePhyMutEcoWgsSet(const CBioseq_set& seqset)
760 {
761 CheckForInconsistentBiomols (seqset);
762 }
763
764
ValidateGenProdSet(const CBioseq_set & seqset)765 void CValidError_bioseqset::ValidateGenProdSet(
766 const CBioseq_set& seqset)
767 {
768 bool id_no_good = false;
769 CSeq_id::E_Choice id_type = CSeq_id::e_not_set;
770
771 // genprodset should not have annotations directly on set
772 if (seqset.IsSetAnnot()) {
773 PostErr(eDiag_Critical,
774 eErr_SEQ_PKG_GenomicProductPackagingProblem,
775 "Seq-annot packaged directly on genomic product set", seqset);
776 }
777
778 CBioseq_set::TSeq_set::const_iterator se_list_it =
779 seqset.GetSeq_set().begin();
780
781 if ( !(**se_list_it).IsSeq() ) {
782 return;
783 }
784
785 const CBioseq& seq = (*se_list_it)->GetSeq();
786 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
787
788 CFeat_CI fi(bsh, CSeqFeatData::e_Rna);
789 for (; fi; ++fi) {
790 if ( fi->GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA ) {
791 if ( fi->IsSetProduct() ) {
792 CBioseq_Handle cdna = GetCache().GetBioseqHandleFromLocation(
793 m_Scope, fi->GetProduct(), bsh.GetTSE_Handle());
794 if ( !cdna ) {
795 try {
796 const CSeq_id& id = GetId(fi->GetProduct(), m_Scope);
797 id_type = id.Which();
798 } catch (CException ) {
799 id_no_good = true;
800 } catch (std::exception ) {
801 id_no_good = true;
802 }
803
804 // okay to have far RefSeq product
805 if ( id_no_good || (id_type != CSeq_id::e_Other) ) {
806 string loc_label;
807 fi->GetProduct().GetLabel(&loc_label);
808
809 if (loc_label.empty()) {
810 loc_label = "?";
811 }
812
813 PostErr(eDiag_Warning,
814 eErr_SEQ_PKG_GenomicProductPackagingProblem,
815 "Product of mRNA feature (" + loc_label +
816 ") not packaged in genomic product set", seq);
817 }
818 } // if (cdna == 0)
819 } else if (!sequence::IsPseudo(*(fi->GetSeq_feat()), *m_Scope)) {
820 PostErr(eDiag_Warning,
821 eErr_SEQ_PKG_GenomicProductPackagingProblem,
822 "Product of mRNA feature (?) not packaged in "
823 "genomic product set", seq);
824 }
825 }
826 } // for
827 }
828
829
CheckForImproperlyNestedSets(const CBioseq_set & seqset)830 void CValidError_bioseqset::CheckForImproperlyNestedSets (const CBioseq_set& seqset)
831 {
832 FOR_EACH_SEQENTRY_ON_SEQSET (it, seqset) {
833 if ((*it)->IsSet()) {
834 if (!(*it)->GetSet().IsSetClass()
835 || ((*it)->GetSet().GetClass() != CBioseq_set::eClass_nuc_prot
836 && (*it)->GetSet().GetClass() != CBioseq_set::eClass_segset
837 && (*it)->GetSet().GetClass() != CBioseq_set::eClass_parts)) {
838 PostErr(eDiag_Warning,
839 eErr_SEQ_PKG_ImproperlyNestedSets,
840 "Nested sets within Pop/Phy/Mut/Eco/Wgs set", (*it)->GetSet());
841 }
842 CheckForImproperlyNestedSets((*it)->GetSet());
843 }
844 }
845 }
846
ShouldHaveNoDblink(const CBioseq_set & seqset)847 void CValidError_bioseqset::ShouldHaveNoDblink (const CBioseq_set& seqset)
848 {
849 if (!seqset.IsSetDescr()) return;
850 for (auto it : seqset.GetDescr().Get()) {
851 const CSeqdesc& desc = *it;
852 if (desc.IsUser() && desc.GetUser().GetObjectType() == CUser_object::eObjectType_DBLink) {
853 PostErr(eDiag_Error,
854 eErr_SEQ_DESCR_DBLinkOnSet,
855 "DBLink user object should not be on this set", seqset);
856 }
857 }
858 }
859
860
861 END_SCOPE(validator)
862 END_SCOPE(objects)
863 END_NCBI_SCOPE
864