1 /*  $Id: validerror_bioseqset.cpp 632834 2021-06-08 16:47:06Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  *   validation of bioseq_set
30  *   .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <objtools/validator/validerror_desc.hpp>
36 #include <objtools/validator/validerror_descr.hpp>
37 #include <objtools/validator/validerror_annot.hpp>
38 #include <objtools/validator/validerror_bioseq.hpp>
39 #include <objtools/validator/validerror_bioseqset.hpp>
40 #include <objtools/validator/validerror_base.hpp>
41 #include <objmgr/util/sequence.hpp>
42 #include <objects/seqset/Seq_entry.hpp>
43 #include <objects/seqset/Bioseq_set.hpp>
44 #include <objects/misc/sequence_macros.hpp>
45 #include <objmgr/seqdesc_ci.hpp>
46 #include <objmgr/seq_annot_ci.hpp>
47 
48 
49 BEGIN_NCBI_SCOPE
50 BEGIN_SCOPE(objects)
51 BEGIN_SCOPE(validator)
52 using namespace sequence;
53 
54 
55 // =============================================================================
56 //                                     Public
57 // =============================================================================
58 
59 
CValidError_bioseqset(CValidError_imp & imp)60 CValidError_bioseqset::CValidError_bioseqset(CValidError_imp& imp) :
61     CValidError_base(imp) , m_AnnotValidator(imp) , m_DescrValidator(imp) , m_BioseqValidator(imp)
62 {
63 }
64 
65 
~CValidError_bioseqset(void)66 CValidError_bioseqset::~CValidError_bioseqset(void)
67 {
68 }
69 
70 
ValidateBioseqSet(const CBioseq_set & seqset)71 void CValidError_bioseqset::ValidateBioseqSet(
72     const CBioseq_set& seqset)
73 {
74     int protcnt = 0;
75     int nuccnt  = 0;
76     int segcnt  = 0;
77 
78     // Validate Set Contents
79     FOR_EACH_SEQENTRY_ON_SEQSET (se_list_it, seqset) {
80         const CSeq_entry& se = **se_list_it;
81         if ( se.IsSet() ) {
82             const CBioseq_set& set = se.GetSet();
83 
84             // validate member set
85             ValidateBioseqSet (set);
86         } else if (se.IsSeq()) {
87             const CBioseq& seq = se.GetSeq();
88             // Validate Member Seq
89             m_BioseqValidator.ValidateBioseq(seq);
90         }
91     }
92     // note - need to do this with an iterator, so that we count sequences in subsets
93     CTypeConstIterator<CBioseq> seqit(ConstBegin(seqset));
94     for (; seqit; ++seqit) {
95 
96         if ( seqit->IsAa() ) {
97             protcnt++;
98         } else if ( seqit->IsNa() ) {
99             nuccnt++;
100         }
101 
102         if (seqit->GetInst().GetRepr() == CSeq_inst::eRepr_seg) {
103             segcnt++;
104         }
105     }
106 
107     switch ( seqset.GetClass() ) {
108     case CBioseq_set::eClass_not_set:
109         PostErr(eDiag_Warning, eErr_SEQ_PKG_BioseqSetClassNotSet,
110                 "Bioseq_set class not set", seqset);
111         break;
112     case CBioseq_set::eClass_nuc_prot:
113         ValidateNucProtSet(seqset, nuccnt, protcnt, segcnt);
114         break;
115     case CBioseq_set::eClass_segset:
116         ValidateSegSet(seqset, segcnt);
117         break;
118     case CBioseq_set::eClass_parts:
119         ValidatePartsSet(seqset);
120         break;
121     case CBioseq_set::eClass_genbank:
122         ValidateGenbankSet(seqset);
123         break;
124     case CBioseq_set::eClass_pop_set:
125         ValidatePopSet(seqset);
126         break;
127     case CBioseq_set::eClass_mut_set:
128     case CBioseq_set::eClass_phy_set:
129     case CBioseq_set::eClass_eco_set:
130     case CBioseq_set::eClass_wgs_set:
131     case CBioseq_set::eClass_small_genome_set:
132         ValidatePhyMutEcoWgsSet(seqset);
133         break;
134     case CBioseq_set::eClass_gen_prod_set:
135         ValidateGenProdSet(seqset);
136         break;
137     case CBioseq_set::eClass_conset:
138         if (!m_Imp.IsRefSeq()) {
139             PostErr (eDiag_Error, eErr_SEQ_PKG_ConSetProblem,
140                      "Set class should not be conset", seqset);
141         }
142         break;
143     /*
144     case CBioseq_set::eClass_other:
145         PostErr(eDiag_Critical, eErr_SEQ_PKG_GenomicProductPackagingProblem,
146             "Genomic product set class incorrectly set to other", seqset);
147         break;
148     */
149     default:
150         if ( nuccnt == 0  &&  protcnt == 0 )  {
151             PostErr(eDiag_Warning, eErr_SEQ_PKG_EmptySet,
152                 "No Bioseqs in this set", seqset);
153         }
154         break;
155     }
156 
157     SetShouldNotHaveMolInfo(seqset);
158     ValidateSetTitle(seqset);
159     ValidateSetElements(seqset);
160 
161     if (seqset.IsSetClass()
162         && (seqset.GetClass() == CBioseq_set::eClass_pop_set
163             || seqset.GetClass() == CBioseq_set::eClass_mut_set
164             || seqset.GetClass() == CBioseq_set::eClass_phy_set
165             || seqset.GetClass() == CBioseq_set::eClass_eco_set
166             || seqset.GetClass() == CBioseq_set::eClass_wgs_set
167             || seqset.GetClass() == CBioseq_set::eClass_small_genome_set)) {
168         CheckForImproperlyNestedSets(seqset);
169     }
170 
171     if (seqset.IsSetClass()
172         && (seqset.GetClass() == CBioseq_set::eClass_genbank
173             || seqset.GetClass() == CBioseq_set::eClass_pop_set
174             || seqset.GetClass() == CBioseq_set::eClass_mut_set
175             || seqset.GetClass() == CBioseq_set::eClass_phy_set
176             || seqset.GetClass() == CBioseq_set::eClass_eco_set
177             || seqset.GetClass() == CBioseq_set::eClass_wgs_set
178             || seqset.GetClass() == CBioseq_set::eClass_small_genome_set)) {
179         ShouldHaveNoDblink(seqset);
180     }
181 
182     // validate annots
183     FOR_EACH_SEQANNOT_ON_SEQSET (annot_it, seqset) {
184         m_AnnotValidator.ValidateSeqAnnot (**annot_it);
185         m_AnnotValidator.ValidateSeqAnnotContext (**annot_it, seqset);
186     }
187     if (seqset.IsSetDescr()) {
188         CBioseq_set_Handle bsh = m_Scope->GetBioseq_setHandle(seqset);
189         if (bsh) {
190             CSeq_entry_Handle ctx = bsh.GetParentEntry();
191             if (ctx) {
192                 m_DescrValidator.ValidateSeqDescr (seqset.GetDescr(), *(ctx.GetCompleteSeq_entry()));
193             }
194         }
195     }
196 }
197 
198 
199 // =============================================================================
200 //                                     Private
201 // =============================================================================
202 
203 
IsMrnaProductInGPS(const CBioseq & seq)204 bool CValidError_bioseqset::IsMrnaProductInGPS(const CBioseq& seq)
205 {
206     if ( m_Imp.IsGPS() ) {
207         CFeat_CI mrna(
208             m_Scope->GetBioseqHandle(seq),
209             SAnnotSelector(CSeqFeatData::e_Rna)
210             .SetByProduct());
211         return (bool)mrna;
212     }
213     return true;
214 }
215 
216 
IsCDSProductInGPS(const CBioseq & seq,const CBioseq_set & gps)217 bool CValidError_bioseqset::IsCDSProductInGPS(const CBioseq& seq, const CBioseq_set& gps)
218 {
219     // there should be a coding region on the contig whose product is seq
220     if (gps.IsSetSeq_set() && gps.GetSeq_set().size() > 0
221         && gps.GetSeq_set().front()->IsSeq()) {
222         CBioseq_Handle contig = m_Scope->GetBioseqHandle(gps.GetSeq_set().front()->GetSeq());
223         CBioseq_Handle prot = m_Scope->GetBioseqHandle(seq);
224         SAnnotSelector sel;
225         sel.SetByProduct(true);
226         CFeat_CI cds(prot, sel);
227         while (cds) {
228             CBioseq_Handle cds_seq = m_Scope->GetBioseqHandle(cds->GetLocation());
229             if (cds_seq == contig) {
230                 return true;
231             }
232             ++cds;
233         }
234     }
235 
236     return false;
237 }
238 
239 
ValidateNucProtSet(const CBioseq_set & seqset,int nuccnt,int protcnt,int segcnt)240 void CValidError_bioseqset::ValidateNucProtSet
241 (const CBioseq_set& seqset,
242  int nuccnt,
243  int protcnt,
244  int segcnt)
245 {
246     if ( nuccnt == 0 ) {
247         PostErr(eDiag_Error, eErr_SEQ_PKG_NucProtProblem,
248                  "No nucleotides in nuc-prot set", seqset);
249     } else if ( nuccnt > 1 && segcnt != 1) {
250         PostErr(eDiag_Critical, eErr_SEQ_PKG_NucProtProblem,
251                  "Multiple unsegmented nucleotides in nuc-prot set", seqset);
252     }
253     if ( protcnt == 0 ) {
254         PostErr(eDiag_Error, eErr_SEQ_PKG_NucProtProblem,
255                  "No proteins in nuc-prot set", seqset);
256     }
257 
258     int prot_biosource = 0;
259     bool is_nm = false;
260 
261     sequence::CDeflineGenerator defline_generator;
262 
263     FOR_EACH_SEQENTRY_ON_SEQSET (se_list_it, seqset) {
264         if ( (*se_list_it)->IsSeq() ) {
265             const CBioseq& seq = (*se_list_it)->GetSeq();
266 
267 
268             bool hasMetaGenomeSource = false;
269             CConstRef<CSeqdesc> closest_biosource = seq.GetClosestDescriptor(CSeqdesc::e_Source);
270             if (closest_biosource) {
271                 const CBioSource& src = closest_biosource->GetSource();
272                 FOR_EACH_ORGMOD_ON_BIOSOURCE (omd_itr, src) {
273                     const COrgMod& omd = **omd_itr;
274                     if (omd.IsSetSubname() && omd.IsSetSubtype() && omd.GetSubtype() == COrgMod::eSubtype_metagenome_source) {
275                         hasMetaGenomeSource = true;
276                         break;
277                     }
278                 }
279             }
280 
281             FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
282                 const CSeqdesc& desc = **it;
283                 if (desc.Which() == CSeqdesc::e_User && desc.GetUser().IsSetType()) {
284                     const CUser_object& usr = desc.GetUser();
285                     const CObject_id& oi = usr.GetType();
286                     if (oi.IsStr() && NStr::EqualCase(oi.GetStr(), "DBLink")) {
287                         PostErr(eDiag_Critical, eErr_SEQ_DESCR_DBLinkProblem, "DBLink user object should not be on a Bioseq", seq);
288                     }
289                 }
290             }
291 
292             CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
293             CBioseq_set_Handle gps = GetGenProdSetParent(bsh);
294             if (seq.IsNa()) {
295                 if (gps  &&  !IsMrnaProductInGPS(seq) ) {
296                     PostErr(eDiag_Warning,
297                         eErr_SEQ_PKG_GenomicProductPackagingProblem,
298                         "Nucleotide bioseq should be product of mRNA "
299                         "feature on contig, but is not",
300                         seq);
301                 }
302                 FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
303                     if ((*id_it)->IsOther() && (*id_it)->GetOther().IsSetAccession()) {
304                         const string& acc = (*id_it)->GetOther().GetAccession();
305                         if (NStr::StartsWith(acc, "NM_")) {
306                             is_nm = true;
307                         }
308                     }
309                 }
310             } else if ( seq.IsAa() ) {
311                 if (gps && !IsCDSProductInGPS(seq, *(gps.GetCompleteBioseq_set())) ) {
312                     PostErr(eDiag_Warning,
313                         eErr_SEQ_PKG_GenomicProductPackagingProblem,
314                         "Protein bioseq should be product of CDS "
315                         "feature on contig, but is not",
316                         seq);
317                 }
318                 string instantiated;
319                 FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
320                     if ((*it)->IsSource()) {
321                         prot_biosource++;
322                     }
323                     if ((*it)->IsTitle()) {
324                         instantiated = (*it)->GetTitle();
325                     }
326                 }
327                 // look for instantiated protein titles that don't match
328 
329                 if (!NStr::IsBlank(instantiated)) {
330                     string generated = defline_generator.GenerateDefline(seq, *m_Scope, sequence::CDeflineGenerator::fIgnoreExisting);
331                     if (!NStr::EqualNocase(instantiated, generated)) {
332                         generated = defline_generator.GenerateDefline(seq, *m_Scope,
333                             sequence::CDeflineGenerator::fIgnoreExisting | sequence::CDeflineGenerator::fAllProteinNames);
334                         if (NStr::StartsWith (instantiated, "PREDICTED: ", NStr::eNocase)) {
335                             instantiated.erase (0, 11);
336                         } else if (NStr::StartsWith (instantiated, "UNVERIFIED: ", NStr::eNocase)) {
337                             instantiated.erase (0, 12);
338                         } else if (NStr::StartsWith (instantiated, "PUTATIVE PSEUDOGENE: ", NStr::eNocase)) {
339                             instantiated.erase (0, 21);
340                         }
341                         if (NStr::StartsWith (generated, "PREDICTED: ", NStr::eNocase)) {
342                             generated.erase (0, 11);
343                         } else if (NStr::StartsWith (generated, "UNVERIFIED: ", NStr::eNocase)) {
344                             generated.erase (0, 12);
345                          } else if (NStr::StartsWith (generated, "PUTATIVE PSEUDOGENE: ", NStr::eNocase)) {
346                             generated.erase (0, 21);
347                        }
348                         //okay if instantiated title has single trailing period
349                         if (instantiated.length() == generated.length() + 1 && NStr::EndsWith(instantiated, ".")
350                             && !NStr::EndsWith(instantiated, "..")) {
351                             generated += ".";
352                         }
353                         if (!NStr::EqualNocase(instantiated, generated) && !NStr::EqualNocase("MAG " + instantiated, generated)) {
354                             if (hasMetaGenomeSource && NStr::EqualNocase("MAG: " + instantiated, generated)) {
355                                 // allow missing MAG with no other prefix
356                             } else if (hasMetaGenomeSource && NStr::EqualNocase("MAG " + instantiated, generated)) {
357                                 // allow missing MAG followed by another prefix
358                             } else {
359                                 PostErr(eDiag_Warning, eErr_SEQ_DESCR_InconsistentProteinTitle,
360                                         "Instantiated protein title does not match automatically "
361                                         "generated title", seq);
362                             }
363                         }
364                     }
365                 }
366             }
367         }
368 
369         if ( !(*se_list_it)->IsSet() )
370             continue;
371 
372         const CBioseq_set& set = (*se_list_it)->GetSet();
373         if ( set.GetClass() != CBioseq_set::eClass_segset ) {
374 
375             const CEnumeratedTypeValues* tv =
376                 CBioseq_set::GetTypeInfo_enum_EClass();
377             const string& set_class = tv->FindName(set.GetClass(), true);
378 
379             PostErr(eDiag_Critical, eErr_SEQ_PKG_NucProtNotSegSet,
380                      "Nuc-prot Bioseq-set contains wrong Bioseq-set, "
381                      "its class is \"" + set_class + "\".", set);
382             break;
383         }
384     }
385     if (prot_biosource > 1) {
386         PostErr (eDiag_Warning, eErr_SEQ_DESCR_BioSourceOnProtein,
387                  "Nuc-prot set has " + NStr::IntToString (prot_biosource)
388                  + " proteins with a BioSource descriptor", seqset);
389     } else if (prot_biosource > 0) {
390         PostErr (eDiag_Warning, eErr_SEQ_DESCR_BioSourceOnProtein,
391                  "Nuc-prot set has 1 protein with a BioSource descriptor", seqset);
392     }
393 
394     bool has_source = false;
395     bool has_title = false;
396     bool has_refgenetracking = false;
397     FOR_EACH_DESCRIPTOR_ON_SEQSET (it, seqset) {
398         if ((*it)->IsSource()
399             && (*it)->GetSource().IsSetOrg()
400             && (*it)->GetSource().GetOrg().IsSetTaxname()
401             && !NStr::IsBlank ((*it)->GetSource().GetOrg().GetTaxname())) {
402             has_source = true;
403         } else if ((*it)->IsTitle()) {
404             has_title = true;
405         } else if ((*it)->IsUser()
406             && (*it)->GetUser().IsRefGeneTracking()) {
407             has_refgenetracking = true;
408         }
409         /*
410         if (has_title && has_source) {
411             break;
412         }
413         */
414     }
415 
416     if (!has_source) {
417         // error if does not have source and is not genprodset
418         CBioseq_set_Handle gps = GetGenProdSetParent (m_Scope->GetBioseq_setHandle (seqset));
419         if (!gps) {
420             PostErr (eDiag_Warning, eErr_SEQ_DESCR_BioSourceMissing,
421                      "Nuc-prot set does not contain expected BioSource descriptor", seqset);
422         }
423     }
424 
425     if (has_title) {
426         PostErr (eDiag_Warning, eErr_SEQ_PKG_NucProtSetHasTitle,
427                  "Nuc-prot set should not have title descriptor", seqset);
428     }
429 
430     if (has_refgenetracking && (! is_nm)) {
431         PostErr (eDiag_Error, eErr_SEQ_DESCR_RefGeneTrackingOnNucProtSet,
432                  "Nuc-prot set should not have RefGeneTracking user object", seqset);
433     }
434 }
435 
436 
CheckForInconsistentBiomols(const CBioseq_set & seqset)437 void CValidError_bioseqset::CheckForInconsistentBiomols (const CBioseq_set& seqset)
438 {
439     if (!seqset.IsSetClass()) {
440         return;
441     }
442 
443     CTypeConstIterator<CMolInfo> miit(ConstBegin(seqset));
444     const CMolInfo* mol_info = 0;
445 
446     for (; miit; ++miit) {
447         if (!miit->IsSetBiomol() || miit->GetBiomol() == CMolInfo::eBiomol_peptide) {
448             continue;
449         }
450         if (mol_info == 0) {
451             mol_info = &(*miit);
452         } else if (mol_info->GetBiomol() != miit->GetBiomol() ) {
453             if (seqset.GetClass() == CBioseq_set::eClass_pop_set
454                        || seqset.GetClass() == CBioseq_set::eClass_eco_set
455                        || seqset.GetClass() == CBioseq_set::eClass_mut_set
456                        || seqset.GetClass() == CBioseq_set::eClass_phy_set
457                        || seqset.GetClass() == CBioseq_set::eClass_wgs_set
458                        || seqset.GetClass() == CBioseq_set::eClass_small_genome_set) {
459                 PostErr(eDiag_Warning, eErr_SEQ_PKG_InconsistentMoltypeSet,
460                     "Pop/phy/mut/eco set contains inconsistent moltype",
461                     seqset);
462             }
463             break;
464         }
465     } // for
466 
467 }
468 
469 
ValidateSegSet(const CBioseq_set & seqset,int segcnt)470 void CValidError_bioseqset::ValidateSegSet(const CBioseq_set& seqset, int segcnt)
471 {
472     if ( segcnt == 0 ) {
473         PostErr(eDiag_Error, eErr_SEQ_PKG_SegSetProblem,
474             "No segmented Bioseq in segset", seqset);
475     }
476 
477     CSeq_inst::EMol     mol = CSeq_inst::eMol_not_set;
478     CSeq_inst::EMol     seq_inst_mol;
479 
480     FOR_EACH_SEQENTRY_ON_SEQSET (se_list_it, seqset) {
481         if ( (*se_list_it)->IsSeq() ) {
482             const CSeq_inst& seq_inst = (*se_list_it)->GetSeq().GetInst();
483 
484             if ( mol == CSeq_inst::eMol_not_set ||
485                  mol == CSeq_inst::eMol_other ) {
486                 mol = seq_inst.GetMol();
487             } else if ( (seq_inst_mol = seq_inst.GetMol()) != CSeq_inst::eMol_other) {
488                 if ( seq_inst.IsNa() != CSeq_inst::IsNa(mol) ) {
489                     PostErr(eDiag_Critical, eErr_SEQ_PKG_SegSetMixedBioseqs,
490                         "Segmented set contains mixture of nucleotides"
491                         " and proteins", seqset);
492                     break;
493                 }
494             }
495         } else if ( (*se_list_it)->IsSet() ) {
496             const CBioseq_set& set = (*se_list_it)->GetSet();
497 
498             if ( set.IsSetClass()  &&
499                  set.GetClass() != CBioseq_set::eClass_parts ) {
500                 const CEnumeratedTypeValues* tv =
501                     CBioseq_set::GetTypeInfo_enum_EClass();
502                 const string& set_class_str =
503                     tv->FindName(set.GetClass(), true);
504 
505                 PostErr(eDiag_Critical, eErr_SEQ_PKG_SegSetNotParts,
506                     "Segmented set contains wrong Bioseq-set, "
507                     "its class is \"" + set_class_str + "\".", set);
508                 break;
509             }
510         } // else if
511     } // iterate
512 
513     CheckForInconsistentBiomols (seqset);
514 }
515 
516 
ValidatePartsSet(const CBioseq_set & seqset)517 void CValidError_bioseqset::ValidatePartsSet(const CBioseq_set& seqset)
518 {
519     CSeq_inst::EMol     mol = CSeq_inst::eMol_not_set;
520     CSeq_inst::EMol     seq_inst_mol;
521 
522     FOR_EACH_SEQENTRY_ON_SEQSET (se_list_it, seqset) {
523         if ( (*se_list_it)->IsSeq() ) {
524             const CSeq_inst& seq_inst = (*se_list_it)->GetSeq().GetInst();
525 
526             if ( mol == CSeq_inst::eMol_not_set  ||
527                  mol == CSeq_inst::eMol_other ) {
528                 mol = seq_inst.GetMol();
529             } else  {
530                 seq_inst_mol = seq_inst.GetMol();
531                 if ( seq_inst_mol != CSeq_inst::eMol_other) {
532                     if ( seq_inst.IsNa() != CSeq_inst::IsNa(mol) ) {
533                         PostErr(eDiag_Critical, eErr_SEQ_PKG_PartsSetMixedBioseqs,
534                                  "Parts set contains mixture of nucleotides "
535                                  "and proteins", seqset);
536                     }
537                 }
538             }
539         } else if ( (*se_list_it)->IsSet() ) {
540             const CBioseq_set& set = (*se_list_it)->GetSet();
541             const CEnumeratedTypeValues* tv =
542                 CBioseq_set::GetTypeInfo_enum_EClass();
543             const string& set_class_str =
544                 tv->FindName(set.GetClass(), true);
545 
546             PostErr(eDiag_Critical, eErr_SEQ_PKG_PartsSetHasSets,
547                     "Parts set contains unwanted Bioseq-set, "
548                     "its class is \"" + set_class_str + "\".", set);
549         } // else if
550     } // for
551 }
552 
553 
ValidateGenbankSet(const CBioseq_set & seqset)554 void CValidError_bioseqset::ValidateGenbankSet(const CBioseq_set& seqset)
555 {
556 }
557 
558 
ValidateSetTitle(const CBioseq_set & seqset)559 void CValidError_bioseqset::ValidateSetTitle(const CBioseq_set& seqset)
560 {
561     bool has_title = false;
562     bool needs_title = seqset.NeedsDocsumTitle();
563     if (seqset.IsSetDescr()) {
564         for (auto it : seqset.GetDescr().Get()) {
565             if (it->IsTitle()) {
566                 if (!needs_title) {
567                     CSeq_entry* parent = seqset.GetParentEntry();
568                     if (parent) {
569                         PostErr(eDiag_Error, eErr_SEQ_DESCR_TitleNotAppropriateForSet,
570                             "Only Pop/Phy/Mut/Eco sets should have titles",
571                             *parent, *it);
572                     } else {
573                         PostErr(eDiag_Error, eErr_SEQ_DESCR_TitleNotAppropriateForSet,
574                             "Only Pop/Phy/Mut/Eco sets should have titles",
575                             seqset);
576                     }
577                 }
578                 has_title = true;
579             }
580         }
581     }
582 
583 
584     if (needs_title && !has_title && (m_Imp.IsRefSeq() || m_Imp.IsEmbl() || m_Imp.IsDdbj() || m_Imp.IsGenbank())) {
585         PostErr(eDiag_Warning, eErr_SEQ_PKG_MissingSetTitle,
586             "Pop/Phy/Mut/Eco set does not have title",
587             seqset);
588     }
589 }
590 
591 
ValidateSetElements(const CBioseq_set & seqset)592 void CValidError_bioseqset::ValidateSetElements(const CBioseq_set& seqset)
593 {
594     if (!seqset.IsSetClass()) {
595         return;
596     }
597     if (seqset.GetClass() == CBioseq_set::eClass_eco_set ||
598         seqset.GetClass() == CBioseq_set::eClass_phy_set ||
599         seqset.GetClass() == CBioseq_set::eClass_pop_set ||
600         seqset.GetClass() == CBioseq_set::eClass_mut_set) {
601 
602         if (!seqset.IsSetSeq_set() || seqset.GetSeq_set().size() == 0) {
603             PostErr(eDiag_Warning, eErr_SEQ_PKG_EmptySet,
604                 "Pop/Phy/Mut/Eco set has no components",
605                 seqset);
606         } else if (seqset.GetSeq_set().size() == 1) {
607             bool has_alignment = false;
608             CSeq_annot_CI annot_it (m_Scope->GetBioseq_setHandle(seqset));
609             while (annot_it && !has_alignment) {
610                 if (annot_it->IsAlign()) {
611                     has_alignment = true;
612                 }
613                 ++annot_it;
614             }
615             if (!has_alignment) {
616                 PostErr(eDiag_Warning, eErr_SEQ_PKG_SingleItemSet,
617                     "Pop/Phy/Mut/Eco set has only one component and no alignments",
618                     seqset);
619             }
620         }
621     }
622     if (m_Imp.IsIndexerVersion()) {
623         if (seqset.GetClass() == CBioseq_set::eClass_eco_set ||
624             seqset.GetClass() == CBioseq_set::eClass_phy_set ||
625             seqset.GetClass() == CBioseq_set::eClass_pop_set ||
626             seqset.GetClass() == CBioseq_set::eClass_mut_set) {
627             CBioseq_CI b_i(m_Scope->GetBioseq_setHandle(seqset));
628             while (b_i) {
629                 if (b_i->IsNa()) {
630                     const CBioseq& seq = *(b_i->GetCompleteBioseq());
631                     bool has_title = false;
632                     FOR_EACH_DESCRIPTOR_ON_BIOSEQ (d_i, seq) {
633                         if ((*d_i)->IsTitle()) {
634                             has_title = true;
635                             break;
636                         }
637                     }
638                     if (!has_title && (m_Imp.IsRefSeq() || m_Imp.IsEmbl() || m_Imp.IsDdbj() || m_Imp.IsGenbank())) {
639                         PostErr(eDiag_Warning, eErr_SEQ_PKG_ComponentMissingTitle,
640                             "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title",
641                             seq);
642                     }
643                 }
644                 ++b_i;
645             }
646         }
647     }
648 }
649 
650 
SetShouldNotHaveMolInfo(const CBioseq_set & seqset)651 void CValidError_bioseqset::SetShouldNotHaveMolInfo(const CBioseq_set& seqset)
652 {
653     string class_name;
654     switch (seqset.GetClass()) {
655         case CBioseq_set::eClass_pop_set:
656             class_name = "Pop set";
657             break;
658         case CBioseq_set::eClass_mut_set:
659             class_name = "Mut set";
660             break;
661         case CBioseq_set::eClass_genbank:
662             class_name = "Genbank set";
663             break;
664         case CBioseq_set::eClass_phy_set:
665         case CBioseq_set::eClass_wgs_set:
666         case CBioseq_set::eClass_eco_set:
667             class_name = "Phy/eco/wgs set";
668             break;
669         case CBioseq_set::eClass_gen_prod_set:
670             class_name = "GenProd set";
671             break;
672         case CBioseq_set::eClass_small_genome_set:
673             class_name = "Small genome set";
674             break;
675         case CBioseq_set::eClass_nuc_prot:
676             class_name = "Nuc-prot set";
677             break;
678         default:
679             return;
680             break;
681     }
682 
683     FOR_EACH_DESCRIPTOR_ON_SEQSET (it, seqset) {
684         if ((*it)->IsMolinfo()) {
685             PostErr(eDiag_Warning, eErr_SEQ_PKG_MisplacedMolInfo,
686                     class_name + " has MolInfo on set", seqset);
687             return;
688         }
689     }
690 }
691 
692 
ValidatePopSet(const CBioseq_set & seqset)693 void CValidError_bioseqset::ValidatePopSet(const CBioseq_set& seqset)
694 {
695     static const string sp = " sp. ";
696 
697     if (m_Imp.IsRefSeq()) {
698         PostErr (eDiag_Critical, eErr_SEQ_PKG_RefSeqPopSet,
699                 "RefSeq record should not be a Pop-set", seqset);
700     }
701 
702     CTypeConstIterator<CBioseq> seqit(ConstBegin(seqset));
703     string first_taxname;
704     bool is_first = true;
705     for (; seqit; ++seqit) {
706         string taxname;
707         CBioseq_Handle bsh = m_Scope->GetBioseqHandle (*seqit);
708         // Will get the first biosource either from the descriptor
709         // or feature.
710         CSeqdesc_CI d(bsh, CSeqdesc::e_Source);
711         if (d) {
712             if (d->GetSource().IsSetOrg() && d->GetSource().GetOrg().IsSetTaxname()) {
713                 taxname = d->GetSource().GetOrg().GetTaxname();
714             }
715         } else {
716             CFeat_CI f(bsh, CSeqFeatData::e_Biosrc);
717             if (f && f->GetData().GetBiosrc().IsSetOrg() && f->GetData().GetBiosrc().GetOrg().IsSetTaxname()) {
718                 taxname = f->GetData().GetBiosrc().GetOrg().GetTaxname();
719             }
720         }
721 
722         if (is_first) {
723             first_taxname = taxname;
724             is_first = false;
725             continue;
726         }
727 
728         // Make sure all the taxnames in the set are the same.
729         if ( NStr::CompareNocase(first_taxname, taxname) == 0 ) {
730             continue;
731         }
732 
733         // drops severity if first mismatch is same up to sp.
734         EDiagSev sev = eDiag_Error;
735         SIZE_TYPE pos = NStr::Find(taxname, sp);
736         if ( pos != NPOS ) {
737             SIZE_TYPE len = pos + sp.length();
738             if ( NStr::strncasecmp(first_taxname.c_str(),
739                                    taxname.c_str(),
740                                    len) == 0 ) {
741                 sev = eDiag_Warning;
742             }
743         }
744         // drops severity if one name is subset of the other
745         SIZE_TYPE comp_len = min (taxname.length(), first_taxname.length());
746         if (NStr::EqualCase(taxname, 0, comp_len, first_taxname)) {
747             sev = eDiag_Warning;
748         }
749 
750         PostErr(sev, eErr_SEQ_DESCR_InconsistentTaxNameSet,
751             "Population set contains inconsistent organism names.",
752             seqset);
753         break;
754     }
755     CheckForInconsistentBiomols (seqset);
756 }
757 
758 
ValidatePhyMutEcoWgsSet(const CBioseq_set & seqset)759 void CValidError_bioseqset::ValidatePhyMutEcoWgsSet(const CBioseq_set& seqset)
760 {
761     CheckForInconsistentBiomols (seqset);
762 }
763 
764 
ValidateGenProdSet(const CBioseq_set & seqset)765 void CValidError_bioseqset::ValidateGenProdSet(
766     const CBioseq_set& seqset)
767 {
768     bool                id_no_good = false;
769     CSeq_id::E_Choice   id_type = CSeq_id::e_not_set;
770 
771     // genprodset should not have annotations directly on set
772     if (seqset.IsSetAnnot()) {
773         PostErr(eDiag_Critical,
774             eErr_SEQ_PKG_GenomicProductPackagingProblem,
775             "Seq-annot packaged directly on genomic product set", seqset);
776     }
777 
778     CBioseq_set::TSeq_set::const_iterator se_list_it =
779         seqset.GetSeq_set().begin();
780 
781     if ( !(**se_list_it).IsSeq() ) {
782         return;
783     }
784 
785     const CBioseq& seq = (*se_list_it)->GetSeq();
786     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
787 
788     CFeat_CI fi(bsh, CSeqFeatData::e_Rna);
789     for (; fi; ++fi) {
790         if ( fi->GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA ) {
791             if ( fi->IsSetProduct() ) {
792                 CBioseq_Handle cdna = GetCache().GetBioseqHandleFromLocation(
793                     m_Scope, fi->GetProduct(), bsh.GetTSE_Handle());
794                  if ( !cdna ) {
795                     try {
796                         const CSeq_id& id = GetId(fi->GetProduct(), m_Scope);
797                         id_type = id.Which();
798                     } catch (CException ) {
799                         id_no_good = true;
800                     } catch (std::exception ) {
801                         id_no_good = true;
802                     }
803 
804                     // okay to have far RefSeq product
805                     if ( id_no_good  ||  (id_type != CSeq_id::e_Other) ) {
806                         string loc_label;
807                         fi->GetProduct().GetLabel(&loc_label);
808 
809                         if (loc_label.empty()) {
810                             loc_label = "?";
811                         }
812 
813                         PostErr(eDiag_Warning,
814                             eErr_SEQ_PKG_GenomicProductPackagingProblem,
815                             "Product of mRNA feature (" + loc_label +
816                             ") not packaged in genomic product set", seq);
817                     }
818                 } // if (cdna == 0)
819             } else if (!sequence::IsPseudo(*(fi->GetSeq_feat()), *m_Scope)) {
820                 PostErr(eDiag_Warning,
821                     eErr_SEQ_PKG_GenomicProductPackagingProblem,
822                     "Product of mRNA feature (?) not packaged in "
823                     "genomic product set", seq);
824             }
825         }
826     } // for
827 }
828 
829 
CheckForImproperlyNestedSets(const CBioseq_set & seqset)830 void CValidError_bioseqset::CheckForImproperlyNestedSets (const CBioseq_set& seqset)
831 {
832     FOR_EACH_SEQENTRY_ON_SEQSET (it, seqset) {
833         if ((*it)->IsSet()) {
834             if (!(*it)->GetSet().IsSetClass()
835                 || ((*it)->GetSet().GetClass() != CBioseq_set::eClass_nuc_prot
836                     && (*it)->GetSet().GetClass() != CBioseq_set::eClass_segset
837                     && (*it)->GetSet().GetClass() != CBioseq_set::eClass_parts)) {
838                 PostErr(eDiag_Warning,
839                     eErr_SEQ_PKG_ImproperlyNestedSets,
840                     "Nested sets within Pop/Phy/Mut/Eco/Wgs set", (*it)->GetSet());
841             }
842             CheckForImproperlyNestedSets((*it)->GetSet());
843         }
844     }
845 }
846 
ShouldHaveNoDblink(const CBioseq_set & seqset)847 void CValidError_bioseqset::ShouldHaveNoDblink (const CBioseq_set& seqset)
848 {
849     if (!seqset.IsSetDescr()) return;
850     for (auto it : seqset.GetDescr().Get()) {
851         const CSeqdesc& desc = *it;
852         if (desc.IsUser() && desc.GetUser().GetObjectType() == CUser_object::eObjectType_DBLink) {
853             PostErr(eDiag_Error,
854                 eErr_SEQ_DESCR_DBLinkOnSet,
855                 "DBLink user object should not be on this set", seqset);
856         }
857     }
858 }
859 
860 
861 END_SCOPE(validator)
862 END_SCOPE(objects)
863 END_NCBI_SCOPE
864